datly 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,685 +0,0 @@
1
/**
 * AutoAnalyzer - automatic analysis module for StatLibrary.
 *
 * Classifies the variables of a tabular dataset and derives descriptive
 * statistics, correlations, regressions, distribution checks, outlier
 * reports, temporal summaries, insights and chart suggestions from them.
 * Every statistical computation is delegated to the injected `stats` object.
 */
class AutoAnalyzer {
  /**
   * @param {Object} statsInstance - Object providing the statistical
   *   primitives called by this class (validateData, mean, median,
   *   standardDeviation, quartiles, skewness, kurtosis, frequencyTable,
   *   correlationMatrix, linearRegression, shapiroWilkTest, detectOutliers).
   */
  constructor(statsInstance) {
    this.stats = statsInstance;
    // Kept for backward compatibility; no method of this class writes to them.
    this.insights = [];
    this.visualizations = [];
  }

  /**
   * Runs the complete automatic analysis of a dataset.
   *
   * @param {Object} dataset - { headers: string[], data: Array<Record<string, any>> }
   * @param {Object} [options] - Overrides for the default configuration
   *   (minCorrelationThreshold, significanceLevel, generateVisualizations,
   *   includeAdvancedAnalysis). The last two are echoed in the report
   *   metadata but are not consulted by this class.
   * @returns {Object} Full report (metadata, classification, analyses,
   *   insights, visualization suggestions and executive summary).
   * @throws {Error} When `stats.validateData` reports the dataset as invalid.
   */
  autoAnalyze(dataset, options = {}) {
    const config = {
      minCorrelationThreshold: 0.3,
      significanceLevel: 0.05,
      generateVisualizations: true,
      includeAdvancedAnalysis: true,
      ...options
    };

    console.log('🔍 Iniciando análise automática...');

    // 1) Validate
    const validation = this.stats.validateData(dataset);
    if (!validation.valid) {
      throw new Error(`Dados inválidos: ${validation.errors.join(', ')}`);
    }

    // 2) Classify variables
    const variableTypes = this.classifyVariables(dataset);
    console.log(`📊 Identificadas ${variableTypes.quantitative.length} variáveis quantitativas e ${variableTypes.qualitative.length} qualitativas`);

    // 3) Analyses
    const descriptiveAnalysis = this.performDescriptiveAnalysis(dataset, variableTypes);
    const correlationAnalysis = this.performCorrelationAnalysis(dataset, variableTypes.quantitative, config);
    const regressionAnalysis = this.performRegressionAnalysis(dataset, variableTypes.quantitative, correlationAnalysis, config);
    const distributionAnalysis = this.performDistributionAnalysis(dataset, variableTypes);
    const outlierAnalysis = this.performOutlierAnalysis(dataset, variableTypes.quantitative);
    const temporalAnalysis = this.performTemporalAnalysis(dataset, variableTypes);

    // 4) Insights + visualizations
    const insights = this.generateAutoInsights(
      dataset,
      {
        variableTypes,
        descriptiveAnalysis,
        correlationAnalysis,
        regressionAnalysis,
        distributionAnalysis,
        outlierAnalysis,
        temporalAnalysis
      },
      config
    );

    const visualizationSuggestions = this.suggestVisualizations(
      variableTypes,
      correlationAnalysis,
      distributionAnalysis
    );

    console.log('✅ Análise concluída!');

    return {
      metadata: {
        analysisDate: new Date().toISOString(),
        // The dataset is a { headers, data } object, so the row count lives on `data`.
        datasetSize: dataset.data.length,
        columnsAnalyzed: dataset.headers.length,
        configuration: config
      },
      variableClassification: variableTypes,
      descriptiveStatistics: descriptiveAnalysis,
      correlationAnalysis,
      regressionAnalysis,
      distributionAnalysis,
      outlierAnalysis,
      temporalAnalysis,
      insights,
      visualizationSuggestions,
      summary: this.generateExecutiveSummary(insights)
    };
  }

  // =======================
  // Variable classification
  // =======================

  /**
   * Classifies every column listed in `dataset.headers`.
   *
   * @param {Object} dataset - { headers, data }
   * @returns {Object} { quantitative, qualitative, datetime, binary, ordinal },
   *   each an array of classification descriptors. Columns with no non-null
   *   value are reported under `qualitative` with type 'empty'.
   */
  classifyVariables(dataset) {
    const quantitative = [];
    const qualitative = [];
    const datetime = [];
    const binary = [];
    const ordinal = [];

    dataset.headers.forEach(header => {
      const column = dataset.data.map(row => row[header]);
      const nonNullValues = column.filter(val => val != null);

      if (nonNullValues.length === 0) {
        qualitative.push({
          name: header,
          type: 'empty',
          description: 'Coluna vazia'
        });
        return;
      }

      const classification = this.classifyVariable(nonNullValues, header);

      switch (classification.type) {
        case 'quantitative':
          quantitative.push(classification);
          break;
        case 'datetime':
          datetime.push(classification);
          break;
        case 'binary':
          binary.push(classification);
          break;
        case 'ordinal':
          ordinal.push(classification);
          break;
        default:
          qualitative.push(classification);
      }
    });

    return { quantitative, qualitative, datetime, binary, ordinal };
  }

  /**
   * Classifies a single column from its non-null values. Checks run in this
   * order: datetime, quantitative (> 80% numbers), binary (exactly two
   * distinct values), ordinal (known label patterns), then nominal.
   *
   * @param {Array<any>} values - Non-null values of the column.
   * @param {string} name - Column name.
   * @returns {Object} Classification descriptor with at least `name`, `type`
   *   and `description`.
   */
  classifyVariable(values, name) {
    const uniqueValues = [...new Set(values)];
    const numericValues = values.filter(v => this.isNumericValue(v));
    const numericRatio = numericValues.length / values.length;

    // datetime?
    if (this.isDateTimeColumn(values)) {
      return {
        name,
        type: 'datetime',
        uniqueCount: uniqueValues.length,
        description: 'Variável temporal'
      };
    }

    // quantitative
    if (numericRatio > 0.8) {
      const subtype = this.determineQuantitativeSubtype(numericValues);
      return {
        name,
        type: 'quantitative',
        subtype,
        uniqueCount: uniqueValues.length,
        description: `Variável quantitativa ${subtype}`,
        range: this.numericRange(numericValues)
      };
    }

    // binary
    if (uniqueValues.length === 2) {
      return {
        name,
        type: 'binary',
        categories: uniqueValues,
        // Needed by performDescriptiveAnalysis and suggestVisualizations,
        // which read `uniqueCount` from binary variables as well.
        uniqueCount: uniqueValues.length,
        description: 'Variável binária/dicotômica'
      };
    }

    // ordinal
    if (this.isOrdinalVariable(uniqueValues)) {
      return {
        name,
        type: 'ordinal',
        categories: uniqueValues,
        uniqueCount: uniqueValues.length,
        description: 'Variável ordinal'
      };
    }

    // qualitative (nominal)
    return {
      name,
      type: 'qualitative',
      subtype: uniqueValues.length > 10 ? 'nominal_many' : 'nominal',
      categories: uniqueValues.slice(0, 20),
      uniqueCount: uniqueValues.length,
      description: `Variável qualitativa nominal (${uniqueValues.length} categorias)`
    };
  }

  // =======================
  // Descriptive statistics
  // =======================

  /**
   * Computes descriptive statistics per variable.
   *
   * @param {Object} dataset - { headers, data }
   * @param {Object} variableTypes - Result of `classifyVariables`.
   * @returns {Object} Map of variable name to its descriptive summary.
   */
  performDescriptiveAnalysis(dataset, variableTypes) {
    const results = {};

    // Quantitative variables
    variableTypes.quantitative.forEach(variable => {
      const values = this.numericColumn(dataset, variable.name);
      if (values.length === 0) return;

      const n = values.length;
      const { min, max } = this.numericRange(values);

      results[variable.name] = {
        type: 'quantitative',
        count: n,
        mean: this.stats.mean(values),
        median: this.stats.median(values),
        standardDeviation: this.stats.standardDeviation(values),
        min,
        max,
        quartiles: this.stats.quartiles(values),
        // Skewness is only requested with at least 3 values, kurtosis with at least 4.
        skewness: n >= 3 ? this.stats.skewness(values) : null,
        kurtosis: n >= 4 ? this.stats.kurtosis(values) : null
      };
    });

    // Qualitative variables (binary ones included)
    [...variableTypes.qualitative, ...variableTypes.binary].forEach(variable => {
      const values = dataset.data
        .map(row => row[variable.name])
        .filter(v => v != null);
      if (values.length === 0) return;

      const frequencyTable = this.stats.frequencyTable(values);

      results[variable.name] = {
        type: 'qualitative',
        count: values.length,
        uniqueValues: variable.uniqueCount,
        frequencyTable: frequencyTable.slice(0, 10), // top 10 categories
        mostFrequent: frequencyTable[0],
        concentration: this.calculateConcentration(frequencyTable)
      };
    });

    return results;
  }

  // =======================
  // Correlation
  // =======================

  /**
   * Builds the correlation report from `stats.correlationMatrix(dataset)`.
   *
   * @param {Object} dataset - { headers, data }
   * @param {Array<Object>} quantitativeVars - Quantitative classifications.
   * @param {Object} config - Uses `minCorrelationThreshold` and `significanceLevel`.
   * @returns {Object} `{ message }` when fewer than two quantitative variables
   *   exist, otherwise `{ matrix, strongCorrelations, insights, summary }`.
   */
  performCorrelationAnalysis(dataset, quantitativeVars, config) {
    if (quantitativeVars.length < 2) {
      return { message: 'Insuficientes variáveis quantitativas para análise de correlação' };
    }

    const correlationMatrix = this.stats.correlationMatrix(dataset);
    const strongCorrelations = (correlationMatrix.strongCorrelations || [])
      .filter(corr => Math.abs(corr.correlation) >= config.minCorrelationThreshold);

    const insights = strongCorrelations.map(corr => {
      const magnitude = Math.abs(corr.correlation);
      const strength = this.getCorrelationStrength(magnitude);
      const direction = corr.correlation > 0 ? 'positiva' : 'negativa';

      return {
        type: 'correlation',
        // `category` is what generateExecutiveSummary and
        // generateTopRecommendations group by; `type` is kept for existing consumers.
        category: 'correlation',
        priority: magnitude > 0.7 ? 'high' : 'medium',
        title: `Correlação ${strength} entre ${corr.variable1} e ${corr.variable2}`,
        description: `Correlação ${direction} de ${corr.correlation.toFixed(3)}`,
        variables: [corr.variable1, corr.variable2],
        correlation: corr.correlation,
        significance: corr.pValue != null ? (corr.pValue < config.significanceLevel) : undefined
      };
    });

    return {
      matrix: correlationMatrix.correlations || correlationMatrix,
      strongCorrelations,
      insights,
      summary: `Encontradas ${strongCorrelations.length} correlações ≥ ${config.minCorrelationThreshold}`
    };
  }

  // =======================
  // Regression
  // =======================

  /**
   * Fits simple linear regressions for up to five correlations with
   * |r| > 0.5, using only rows where both variables are valid numbers.
   *
   * @param {Object} dataset - { headers, data }
   * @param {Array<Object>} quantitativeVars - Unused; kept for interface compatibility.
   * @param {Object} correlationAnalysis - Result of `performCorrelationAnalysis`.
   * @param {Object} config - Uses `significanceLevel`.
   * @returns {Object} { models, summary }
   */
  performRegressionAnalysis(dataset, quantitativeVars, correlationAnalysis, config) {
    const regressionResults = [];

    if (correlationAnalysis.strongCorrelations) {
      correlationAnalysis.strongCorrelations
        .filter(corr => Math.abs(corr.correlation) > 0.5)
        .slice(0, 5)
        .forEach(corr => {
          try {
            // Filter row by row so x[i] and y[i] always come from the same
            // record; filtering each column on its own misaligns the pairs
            // as soon as one of them has a missing value.
            const pairs = dataset.data
              .map(row => [row[corr.variable1], row[corr.variable2]])
              .filter(([x, y]) => this.isNumericValue(x) && this.isNumericValue(y));

            if (pairs.length > 10) {
              const xValues = pairs.map(([x]) => x);
              const yValues = pairs.map(([, y]) => y);
              const regression = this.stats.linearRegression(xValues, yValues);

              regressionResults.push({
                independent: corr.variable1,
                dependent: corr.variable2,
                equation: regression.equation,
                rSquared: regression.rSquared,
                significant: regression.pValueModel < config.significanceLevel,
                interpretation: this.interpretRegressionResult(regression, config.significanceLevel),
                details: regression
              });
            }
          } catch (error) {
            // Best effort: one failing model must not abort the whole report.
            console.warn(`Erro na regressão ${corr.variable1} -> ${corr.variable2}:`, error.message);
          }
        });
    }

    return {
      models: regressionResults,
      summary: `${regressionResults.length} modelos de regressão analisados`
    };
  }

  // =======================
  // Distributions / normality
  // =======================

  /**
   * Tests normality and shape for quantitative variables with more than
   * 10 numeric values.
   *
   * @param {Object} dataset - { headers, data }
   * @param {Object} variableTypes - Result of `classifyVariables`.
   * @returns {Object} Map of variable name to distribution report, or to
   *   `{ error, reason }` when a statistic throws.
   */
  performDistributionAnalysis(dataset, variableTypes) {
    const results = {};

    variableTypes.quantitative.forEach(variable => {
      const values = this.numericColumn(dataset, variable.name);
      if (values.length <= 10) return;

      try {
        const normalityTest = this.stats.shapiroWilkTest(values);
        const skewness = this.stats.skewness(values);
        const kurtosis = this.stats.kurtosis(values);

        results[variable.name] = {
          isNormal: normalityTest.isNormal,
          normalityPValue: normalityTest.pValue,
          skewness,
          kurtosis,
          distributionType: this.classifyDistributionType(skewness, kurtosis, normalityTest.isNormal),
          recommendation: this.getDistributionRecommendation(skewness, kurtosis, normalityTest.isNormal)
        };
      } catch (error) {
        results[variable.name] = {
          error: 'Não foi possível analisar a distribuição',
          reason: error.message
        };
      }
    });

    return results;
  }

  // =======================
  // Outliers
  // =======================

  /**
   * Runs `stats.detectOutliers(values, 'iqr')` for quantitative variables
   * with more than 5 numeric values.
   *
   * @param {Object} dataset - { headers, data }
   * @param {Array<Object>} quantitativeVars - Quantitative classifications.
   * @returns {Object} Map of variable name to outlier report.
   */
  performOutlierAnalysis(dataset, quantitativeVars) {
    const results = {};

    quantitativeVars.forEach(variable => {
      const values = this.numericColumn(dataset, variable.name);
      if (values.length <= 5) return;

      const outliers = this.stats.detectOutliers(values, 'iqr');
      results[variable.name] = {
        count: outliers.count,
        percentage: outliers.percentage,
        severity: this.classifyOutlierSeverity(outliers.percentage),
        values: outliers.outliers.slice(0, 10),
        recommendation: this.getOutlierRecommendation(outliers.percentage)
      };
    });

    return results;
  }

  // =======================
  // Temporal
  // =======================

  /**
   * Summarizes span and cadence of every datetime variable that has more
   * than two parseable dates.
   *
   * @param {Object} dataset - { headers, data }
   * @param {Object} variableTypes - Result of `classifyVariables`.
   * @returns {Object} `{ message }` when no datetime variable exists,
   *   otherwise a map of variable name to temporal summary.
   */
  performTemporalAnalysis(dataset, variableTypes) {
    if (variableTypes.datetime.length === 0) {
      return { message: 'Nenhuma variável temporal detectada' };
    }

    const results = {};

    variableTypes.datetime.forEach(dateVar => {
      const dates = dataset.data
        .map(row => row[dateVar.name])
        // Drop missing cells first: `new Date(null)` is the valid epoch date
        // (1970-01-01) and would corrupt span, earliest and dataPoints.
        .filter(value => value != null)
        .map(value => new Date(value))
        .filter(date => !Number.isNaN(date.getTime()))
        .sort((a, b) => a - b);

      if (dates.length > 2) {
        const first = dates[0];
        const last = dates[dates.length - 1];
        const timeSpan = last - first;
        const avgInterval = timeSpan / (dates.length - 1);

        results[dateVar.name] = {
          span: `${Math.floor(timeSpan / (1000 * 60 * 60 * 24))} dias`,
          frequency: this.determineFrequency(avgInterval),
          earliest: first.toISOString().split('T')[0],
          latest: last.toISOString().split('T')[0],
          dataPoints: dates.length
        };
      }
    });

    return results;
  }

  // =======================
  // Insights / visualizations / summary
  // =======================

  /**
   * Turns the individual analyses into a list of insights sorted by priority.
   *
   * @param {Object} dataset - { headers, data }
   * @param {Object} analyses - Bundle built by `autoAnalyze`.
   * @param {Object} config - Unused; kept for interface compatibility.
   * @returns {Array<Object>} Insights, highest priority first.
   */
  generateAutoInsights(dataset, analyses, config) {
    const insights = [];

    const { quantitative, qualitative } = analyses.variableTypes;
    insights.push({
      category: 'overview',
      priority: 'high',
      title: 'Composição do Dataset',
      description: `Dataset com ${dataset.data.length} registros, ${quantitative.length} variáveis numéricas e ${qualitative.length} categóricas`,
      icon: '📊'
    });

    if (analyses.correlationAnalysis.insights) {
      insights.push(...analyses.correlationAnalysis.insights);
    }

    Object.entries(analyses.distributionAnalysis).forEach(([variable, analysis]) => {
      if (analysis.distributionType && analysis.distributionType !== 'normal') {
        insights.push({
          category: 'distribution',
          priority: 'medium',
          title: `Distribuição não-normal: ${variable}`,
          description: analysis.recommendation,
          variable,
          icon: '📈'
        });
      }
    });

    Object.entries(analyses.outlierAnalysis).forEach(([variable, analysis]) => {
      if (analysis.severity === 'high') {
        insights.push({
          category: 'quality',
          priority: 'high',
          title: `Outliers significativos em ${variable}`,
          description: `${analysis.count} outliers (${analysis.percentage.toFixed(1)}%) detectados`,
          recommendation: analysis.recommendation,
          variable,
          icon: '⚠️'
        });
      }
    });

    analyses.regressionAnalysis.models?.forEach(model => {
      if (model.significant && model.rSquared > 0.5) {
        insights.push({
          category: 'modeling',
          priority: 'high',
          title: `Modelo preditivo viável: ${model.dependent}`,
          description: `${model.independent} explica ${(model.rSquared * 100).toFixed(1)}% da variação em ${model.dependent}`,
          variables: [model.independent, model.dependent],
          rSquared: model.rSquared,
          icon: '🎯'
        });
      }
    });

    return this.sortByPriority(insights);
  }

  /**
   * Suggests charts for the classified variables, sorted by priority.
   *
   * @param {Object} variableTypes - Result of `classifyVariables`.
   * @param {Object} correlationAnalysis - Result of `performCorrelationAnalysis`.
   * @param {Object} distributionAnalysis - Unused; kept for interface compatibility.
   * @returns {Array<Object>} Suggestions, highest priority first.
   */
  suggestVisualizations(variableTypes, correlationAnalysis, distributionAnalysis) {
    const suggestions = [];

    variableTypes.quantitative.forEach(variable => {
      suggestions.push({
        type: 'histogram',
        variable: variable.name,
        title: `Distribuição de ${variable.name}`,
        description: 'Histogram mostrando a distribuição dos valores',
        priority: 'medium'
      });
    });

    if (correlationAnalysis.strongCorrelations) {
      correlationAnalysis.strongCorrelations
        .filter(corr => Math.abs(corr.correlation) > 0.5)
        .slice(0, 3)
        .forEach(corr => {
          suggestions.push({
            type: 'scatter',
            variables: [corr.variable1, corr.variable2],
            title: `${corr.variable1} vs ${corr.variable2}`,
            description: `Scatter plot mostrando correlação ${corr.correlation > 0 ? 'positiva' : 'negativa'}`,
            priority: 'high'
          });
        });
    }

    [...variableTypes.qualitative, ...variableTypes.binary].forEach(variable => {
      // Entries without `uniqueCount` (empty columns) fail this check and get no chart.
      if (variable.uniqueCount <= 20) {
        suggestions.push({
          type: 'bar',
          variable: variable.name,
          title: `Frequência de ${variable.name}`,
          description: 'Gráfico de barras mostrando a distribuição das categorias',
          priority: 'medium'
        });
      }
    });

    variableTypes.quantitative.forEach(variable => {
      suggestions.push({
        type: 'boxplot',
        variable: variable.name,
        title: `Box Plot de ${variable.name}`,
        description: 'Box plot para identificar outliers e quartis',
        priority: 'low'
      });
    });

    return this.sortByPriority(suggestions);
  }

  /**
   * Condenses the insights into an executive summary.
   *
   * @param {Array<Object>} insights - Result of `generateAutoInsights`.
   * @returns {Object} { totalInsights, highPriorityInsights,
   *   categoriesCovered, keyFindings, recommendations }
   */
  generateExecutiveSummary(insights) {
    const highPriority = insights.filter(i => i.priority === 'high');
    const categories = [...new Set(insights.map(i => i.category))];

    return {
      totalInsights: insights.length,
      highPriorityInsights: highPriority.length,
      categoriesCovered: categories,
      keyFindings: highPriority.slice(0, 3).map(i => ({
        title: i.title,
        description: i.description
      })),
      recommendations: this.generateTopRecommendations(insights)
    };
  }

  // =======================
  // Helpers
  // =======================

  /**
   * @param {any} value
   * @returns {boolean} True for values of type number other than NaN
   *   (Infinity is accepted, as in the original filters).
   */
  isNumericValue(value) {
    return typeof value === 'number' && !Number.isNaN(value);
  }

  /**
   * Extracts the numeric (non-NaN) values of one column.
   *
   * @param {Object} dataset - { headers, data }
   * @param {string} name - Column name.
   * @returns {number[]}
   */
  numericColumn(dataset, name) {
    return dataset.data
      .map(row => row[name])
      .filter(v => this.isNumericValue(v));
  }

  /**
   * Single-pass min/max. Avoids `Math.min(...values)`, which can throw a
   * RangeError when a large array is spread into the argument list.
   *
   * @param {number[]} values
   * @returns {{min: number, max: number}} `{ min: Infinity, max: -Infinity }`
   *   for an empty array, matching `Math.min()` / `Math.max()`.
   */
  numericRange(values) {
    let min = Infinity;
    let max = -Infinity;
    for (const value of values) {
      if (value < min) min = value;
      if (value > max) max = value;
    }
    return { min, max };
  }

  /**
   * Sorts items in place by `priority` (high, medium, low) and returns them.
   *
   * @param {Array<Object>} items
   * @returns {Array<Object>} The same array, sorted.
   */
  sortByPriority(items) {
    const priorityOrder = { high: 3, medium: 2, low: 1 };
    return items.sort((a, b) => priorityOrder[b.priority] - priorityOrder[a.priority]);
  }

  /**
   * Heuristic: more than 70% of the first (up to) 20 values are strings
   * that `new Date()` can parse.
   * NOTE(review): `Date` parsing of non-ISO strings is engine-dependent and
   * lenient, so short numeric strings may be accepted as dates - confirm this
   * is acceptable for the expected inputs.
   *
   * @param {Array<any>} values
   * @returns {boolean}
   */
  isDateTimeColumn(values) {
    const sampleSize = Math.min(values.length, 20);
    const sample = values.slice(0, sampleSize);
    const dateCount = sample.filter(val => {
      if (typeof val !== 'string') return false;
      return !Number.isNaN(new Date(val).getTime());
    }).length;

    return dateCount / sampleSize > 0.7;
  }

  /**
   * @param {number[]} values
   * @returns {string} 'discrete' when more than 90% of the values are
   *   integers, otherwise 'continuous'.
   */
  determineQuantitativeSubtype(values) {
    const integers = values.filter(v => Number.isInteger(v));
    const integerRatio = integers.length / values.length;
    return integerRatio > 0.9 ? 'discrete' : 'continuous';
  }

  /**
   * @param {Array<any>} uniqueValues - Distinct values of a column.
   * @returns {boolean} True when every value matches one and the same
   *   known ordinal label pattern.
   */
  isOrdinalVariable(uniqueValues) {
    const ordinalPatterns = [
      /^(baixo|médio|alto)$/i,
      /^(pequeno|grande)$/i,
      /^(ruim|regular|bom|ótimo)$/i,
      /^[1-5]$/,
      /^(primeiro|segundo|terceiro)$/i
    ];
    return ordinalPatterns.some(pattern =>
      uniqueValues.every(val => pattern.test(String(val)))
    );
  }

  /**
   * @param {Array<Object>} frequencyTable - Entries with a `percentage` field.
   * @returns {number} `percentage` of the first entry, or 0 for an empty table.
   */
  calculateConcentration(frequencyTable) {
    if (frequencyTable.length === 0) return 0;
    return frequencyTable[0].percentage;
  }

  /**
   * @param {number} correlation - Absolute correlation coefficient.
   * @returns {string} Strength label.
   */
  getCorrelationStrength(correlation) {
    if (correlation >= 0.8) return 'muito forte';
    if (correlation >= 0.6) return 'forte';
    if (correlation >= 0.4) return 'moderada';
    if (correlation >= 0.2) return 'fraca';
    return 'muito fraca';
  }

  /**
   * @param {Object} regression - Object with `rSquared` and `pValueModel`.
   * @param {number} [significanceLevel=0.05] - Threshold for `isSignificant`;
   *   `performRegressionAnalysis` passes `config.significanceLevel`.
   * @returns {Object} { quality, explanation, isSignificant }
   */
  interpretRegressionResult(regression, significanceLevel = 0.05) {
    const r2Percent = (regression.rSquared * 100).toFixed(1);
    const significant = regression.pValueModel < significanceLevel;
    return {
      quality: regression.rSquared > 0.7 ? 'excelente'
        : regression.rSquared > 0.5 ? 'boa'
        : regression.rSquared > 0.3 ? 'moderada' : 'fraca',
      explanation: `O modelo explica ${r2Percent}% da variação`,
      isSignificant: significant
    };
  }

  /**
   * @param {number} skewness
   * @param {number} kurtosis
   * @param {boolean} isNormal - Outcome of the normality test.
   * @returns {string} Distribution type label.
   */
  classifyDistributionType(skewness, kurtosis, isNormal) {
    if (isNormal) return 'normal';
    if (Math.abs(skewness) > 1) {
      return skewness > 0 ? 'assimétrica_direita' : 'assimétrica_esquerda';
    }
    if (Math.abs(kurtosis) > 1) {
      return kurtosis > 0 ? 'leptocúrtica' : 'platicúrtica';
    }
    return 'aproximadamente_normal';
  }

  /**
   * @param {number} skewness
   * @param {number} kurtosis
   * @param {boolean} isNormal - Outcome of the normality test.
   * @returns {string} Recommendation text.
   */
  getDistributionRecommendation(skewness, kurtosis, isNormal) {
    if (isNormal) return 'Distribuição normal - ideal para testes paramétricos';
    if (Math.abs(skewness) > 1) return 'Considere transformação logarítmica para normalizar';
    if (Math.abs(kurtosis) > 1) return 'Distribuição com caudas atípicas - use testes robustos';
    return 'Distribuição aproximadamente normal';
  }

  /**
   * @param {number} percentage - Share of outliers, in percent.
   * @returns {string} 'high' (> 10), 'medium' (> 5) or 'low'.
   */
  classifyOutlierSeverity(percentage) {
    if (percentage > 10) return 'high';
    if (percentage > 5) return 'medium';
    return 'low';
  }

  /**
   * @param {number} percentage - Share of outliers, in percent.
   * @returns {string} Recommendation text.
   */
  getOutlierRecommendation(percentage) {
    if (percentage > 10) return 'Investigar e possivelmente remover outliers';
    if (percentage > 5) return 'Verificar se outliers são valores legítimos';
    return 'Poucos outliers - monitorar';
  }

  /**
   * Maps an average interval between observations to a cadence label.
   * Bounds are inclusive so that an exact cadence lands in its own bucket
   * (exactly one day is 'diária', exactly seven days is 'semanal'); the
   * monthly bound is 31 days to cover the longest calendar month.
   *
   * @param {number} avgInterval - Average interval in milliseconds.
   * @returns {string} 'diária', 'semanal', 'mensal' or 'anual'.
   */
  determineFrequency(avgInterval) {
    const day = 24 * 60 * 60 * 1000;
    if (avgInterval <= day) return 'diária';
    if (avgInterval <= day * 7) return 'semanal';
    if (avgInterval <= day * 31) return 'mensal';
    return 'anual';
  }

  /**
   * Derives at most one recommendation per insight category present.
   *
   * @param {Array<Object>} insights
   * @returns {string[]} Recommendation texts.
   */
  generateTopRecommendations(insights) {
    const recommendations = [];

    if (insights.some(i => i.category === 'correlation')) {
      recommendations.push('Explore as correlações identificadas para possível modelagem preditiva');
    }

    if (insights.some(i => i.category === 'quality' && i.priority === 'high')) {
      recommendations.push('Trate os outliers identificados antes de prosseguir com análises');
    }

    if (insights.some(i => i.category === 'distribution')) {
      recommendations.push('Considere transformações para normalizar distribuições assimétricas');
    }

    return recommendations;
  }
}
684
-
685
- export default AutoAnalyzer;