autoscholar-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +212 -0
- package/dist/agents/euler.js +261 -0
- package/dist/agents/fisher.js +348 -0
- package/dist/agents/gauss.js +177 -0
- package/dist/agents/governor.js +201 -0
- package/dist/agents/newton.js +207 -0
- package/dist/agents/turing.js +307 -0
- package/dist/cli/banner.js +136 -0
- package/dist/cli/configCommand.js +125 -0
- package/dist/cli/interactive.js +115 -0
- package/dist/cli/outputsCommand.js +191 -0
- package/dist/cli/resumeCommand.js +78 -0
- package/dist/cli/runCommand.js +91 -0
- package/dist/config/loader.js +154 -0
- package/dist/config/setup.js +179 -0
- package/dist/connectors/academic.js +307 -0
- package/dist/connectors/eodhd.js +90 -0
- package/dist/connectors/firecrawl.js +94 -0
- package/dist/connectors/fmp.js +115 -0
- package/dist/connectors/fred.js +82 -0
- package/dist/connectors/index.js +24 -0
- package/dist/connectors/websearch.js +117 -0
- package/dist/index.js +72 -0
- package/dist/latex/generator.js +413 -0
- package/dist/python/runner.js +141 -0
- package/dist/utils/llm.js +73 -0
- package/dist/utils/logger.js +83 -0
- package/dist/utils/project.js +100 -0
- package/package.json +63 -0
- package/python/analysis/garch_template.py +131 -0
- package/python/clients/eodhd_client.py +78 -0
- package/python/clients/fmp_client.py +64 -0
- package/python/clients/fred_client.py +57 -0
- package/python/clients/macro_clients.py +81 -0
- package/python/requirements.txt +23 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.GovernorEngine = void 0;
|
|
4
|
+
// Known failure signatures mapped to recovery advice for the retrying agent.
// Order matters: evaluate() uses the FIRST matching pattern. Note the
// data-quality entries here ("empty dataframe", "all NaN") shadow the generic
// data-quality branch in evaluate() and are charged to the *python* budget.
const ERROR_PATTERNS = [
    { pattern: /ModuleNotFoundError/i, advice: 'Install missing Python package or use alternative library' },
    { pattern: /MemoryError/i, advice: 'Reduce dataset size, sample data, or process in chunks' },
    { pattern: /KeyError/i, advice: 'Check column names match actual DataFrame columns' },
    { pattern: /ValueError.*shapes/i, advice: 'Check array dimensions match for operations' },
    { pattern: /TimeoutError/i, advice: 'Reduce API call scope or add retry logic' },
    { pattern: /SingularMatrix/i, advice: 'Remove collinear variables or use regularization' },
    { pattern: /ConvergenceWarning/i, advice: 'Increase max_iter or simplify model' },
    { pattern: /empty dataframe/i, advice: 'Check API returned data; try different date range or ticker' },
    { pattern: /all NaN/i, advice: 'Check data quality; use forward/backward fill or drop NA columns' },
    { pattern: /zero variance/i, advice: 'Remove constant columns before regression' },
];
// Ordinal position of each pipeline stage (gauss -> ... -> complete).
// Not referenced by the visible code in this module; presumably consumed by
// resume/progress logic elsewhere — TODO confirm before removing.
const STAGE_ORDER = {
    gauss: 0,
    turing: 1,
    newton: 2,
    fisher: 3,
    euler_write: 4,
    euler_compile: 5,
    popper: 6,
    complete: 7,
};
/**
 * Rule-based supervisor for the AutoScholar agent pipeline.
 *
 * Tracks the current stage, the active prompt profile, a scalar confidence
 * clamped to [0, 1], per-category retry budgets, and an append-only decision
 * log. Each call to evaluate()/evaluatePopper() returns a decision record:
 * { stage, trigger, action, reasoning, profileBefore, profileAfter,
 *   confidenceBefore, confidenceAfter }.
 */
class GovernorEngine {
    constructor() {
        this.currentStage = 'gauss';
        this.currentProfile = 'default';
        this.confidence = 0.8; // starts optimistic; adjusted per event
        this.decisions = []; // append-only decision log
        this.figureCount = 0;
        this.wordCounts = {}; // sectionName -> latest reported word count
        // NOTE(review): only the python budget triggers force_finalize below;
        // data/latex budgets are incremented but never enforced here —
        // confirm whether the caller enforces them via getState().
        this.retryBudget = {
            python: { used: 0, max: 5 },
            data: { used: 0, max: 3 },
            latex: { used: 0, max: 3 },
        };
    }
    /**
     * Evaluate a pipeline event and decide the next action.
     *
     * @param {object} event - { stage?, success, output?, figuresGenerated?,
     *                           sectionName?, wordCount? }
     * @returns {object} decision record (also appended to the log)
     */
    evaluate(event) {
        const prevProfile = this.currentProfile;
        const prevConfidence = this.confidence;
        if (event.stage) {
            this.currentStage = event.stage;
        }
        if (event.figuresGenerated) {
            this.figureCount += event.figuresGenerated;
        }
        // FIX: was a truthy check on event.wordCount, which silently dropped
        // legitimate zero word counts; only null/undefined should be skipped.
        if (event.sectionName && event.wordCount != null) {
            this.wordCounts[event.sectionName] = event.wordCount;
        }
        let action = { type: 'continue' };
        let trigger = '';
        let reasoning = '';
        if (event.success) {
            this.confidence = Math.min(1.0, this.confidence + 0.02);
            // Stage-entry profile switches (one-shot: skipped if already active).
            if (event.stage === 'fisher' && this.currentProfile !== 'academic_figures') {
                action = { type: 'switch_profile', profile: 'academic_figures' };
                this.currentProfile = 'academic_figures';
                trigger = 'Entered Fisher stage';
                reasoning = 'Switch to academic_figures profile for publication-quality outputs';
            }
            else if (event.stage === 'euler_write' && this.currentProfile !== 'jf_rewrite') {
                action = { type: 'switch_profile', profile: 'jf_rewrite' };
                this.currentProfile = 'jf_rewrite';
                trigger = 'Entered Euler write stage';
                reasoning = 'Switch to Journal of Finance writing style';
            }
            else {
                trigger = `${event.stage || this.currentStage} succeeded`;
                reasoning = 'Continue normally';
            }
        }
        else {
            const output = event.output || '';
            // First matching pattern wins; every ERROR_PATTERNS hit is charged
            // against the python retry budget.
            const matchedError = ERROR_PATTERNS.find((ep) => ep.pattern.test(output));
            if (matchedError) {
                this.confidence = Math.max(0, this.confidence - 0.05);
                this.retryBudget.python.used++;
                if (this.retryBudget.python.used >= this.retryBudget.python.max) {
                    action = { type: 'force_finalize', reason: 'Python retry budget exhausted' };
                    trigger = 'Python errors exceeded budget';
                    reasoning = `${this.retryBudget.python.used} Python errors — forcing finalization`;
                }
                else {
                    action = {
                        type: 'switch_profile',
                        profile: 'debug_recovery',
                        directive: matchedError.advice,
                    };
                    this.currentProfile = 'debug_recovery';
                    trigger = `Python error: ${matchedError.pattern.source}`;
                    reasoning = matchedError.advice;
                }
            }
            else if (/empty|no data|0 rows|NaN/i.test(output)) {
                // Data-quality failure: steepest confidence penalty short of review.
                this.confidence = Math.max(0, this.confidence - 0.10);
                this.retryBudget.data.used++;
                action = {
                    type: 'switch_profile',
                    profile: 'conservative',
                    directive: 'Widen date range, try alternate data source, or sample smaller subset',
                };
                this.currentProfile = 'conservative';
                trigger = 'Data quality issue detected';
                reasoning = 'Switching to conservative data retrieval approach';
            }
            else if (/LaTeX|pdflatex|Undefined control|Missing/i.test(output)) {
                this.confidence = Math.max(0, this.confidence - 0.05);
                this.retryBudget.latex.used++;
                action = {
                    type: 'switch_profile',
                    profile: 'latex_fallback',
                    directive: 'Simplify LaTeX: remove complex environments, use basic tables',
                };
                this.currentProfile = 'latex_fallback';
                trigger = 'LaTeX compilation error';
                reasoning = 'Simplifying LaTeX constructs for reliable compilation';
            }
            else {
                // Unrecognized failure: mild penalty, no budget charged, retry as-is.
                this.confidence = Math.max(0, this.confidence - 0.03);
                trigger = `Unknown error in ${this.currentStage}`;
                reasoning = 'Generic failure — will retry with caution';
            }
        }
        const decision = {
            stage: this.currentStage,
            trigger,
            action,
            reasoning,
            profileBefore: prevProfile,
            profileAfter: this.currentProfile,
            confidenceBefore: prevConfidence,
            confidenceAfter: this.confidence,
        };
        this.decisions.push(decision);
        return decision;
    }
    /**
     * Evaluate the Popper (peer-review) verdict on the finished paper.
     * Thresholds: >= 70 and not rejected -> continue; >= 50 -> force minor
     * revision; otherwise block publication.
     *
     * @param {object} review - { overall: number (0-100), verdict: string }
     * @returns {object} decision record (also appended to the log)
     */
    evaluatePopper(review) {
        const prevProfile = this.currentProfile;
        const prevConfidence = this.confidence;
        this.currentStage = 'popper';
        let action;
        let trigger;
        let reasoning;
        if (review.overall >= 70 && review.verdict !== 'reject') {
            action = { type: 'continue' };
            trigger = `Popper review: ${review.overall}/100 — ${review.verdict}`;
            reasoning = 'Paper passes quality threshold';
            this.confidence = Math.min(1.0, this.confidence + 0.05);
        }
        else if (review.overall >= 50) {
            action = {
                type: 'force_revision',
                severity: 'minor',
                issues: [`Overall score: ${review.overall}/100`, `Verdict: ${review.verdict}`],
            };
            trigger = `Popper review: ${review.overall}/100 — needs revision`;
            reasoning = 'Paper needs minor revisions before publication';
            this.confidence = Math.max(0, this.confidence - 0.15);
        }
        else {
            action = {
                type: 'block_publication',
                reason: `Quality too low: ${review.overall}/100`,
            };
            trigger = `Popper review: ${review.overall}/100 — reject`;
            reasoning = 'Paper quality below minimum threshold';
            this.confidence = Math.max(0, this.confidence - 0.25);
        }
        const decision = {
            stage: 'popper',
            trigger,
            action,
            reasoning,
            profileBefore: prevProfile,
            profileAfter: this.currentProfile,
            confidenceBefore: prevConfidence,
            confidenceAfter: this.confidence,
        };
        this.decisions.push(decision);
        return decision;
    }
    /**
     * Snapshot of the governor's mutable state. wordCounts and retryBudget
     * are copied so callers cannot mutate internal state.
     * @returns {object}
     */
    getState() {
        return {
            stage: this.currentStage,
            profile: this.currentProfile,
            confidence: this.confidence,
            figureCount: this.figureCount,
            wordCounts: { ...this.wordCounts },
            retryBudget: JSON.parse(JSON.stringify(this.retryBudget)),
        };
    }
    /** @returns {Array<object>} shallow copy of the decision log. */
    getDecisionLog() {
        return [...this.decisions];
    }
    /** Reset the active profile to 'default' (confidence/budgets untouched). */
    resetProfile() {
        this.currentProfile = 'default';
    }
}
|
|
201
|
+
exports.GovernorEngine = GovernorEngine;
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// ---- TypeScript compiler interop helpers (auto-generated by tsc) ----
// Re-exports property k of module m onto object o as k2, preferring a live
// getter binding when Object.create is available; plain copy otherwise.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// Attaches a CommonJS module as the `default` export of a namespace object.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// Emulates `import * as ns from "mod"`: returns ES modules unchanged;
// otherwise copies every own key of the CommonJS module onto a fresh
// namespace object and sets its default export.
var __importStar = (this && this.__importStar) || (function () {
    // Lazily resolved key-enumeration strategy (getOwnPropertyNames with a
    // for-in fallback), memoized on first call.
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
// Emulates `import mod from "mod"` for CommonJS modules.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
|
|
38
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
+
exports.runNewton = runNewton;
|
|
40
|
+
const ora_1 = __importDefault(require("ora"));
|
|
41
|
+
const fs = __importStar(require("fs"));
|
|
42
|
+
const path = __importStar(require("path"));
|
|
43
|
+
const llm_1 = require("../utils/llm");
|
|
44
|
+
const banner_1 = require("../cli/banner");
|
|
45
|
+
const project_1 = require("../utils/project");
|
|
46
|
+
const runner_1 = require("../python/runner");
|
|
47
|
+
/**
 * Newton agent: dataset construction.
 *
 * Asks the LLM to write a pandas script that merges the raw source CSVs
 * into one analysis-ready dataset, executes it, and — if that fails —
 * retries once with a deterministic (non-LLM) merge script from
 * generateSimplifiedMerge(). Output parsing is delegated to
 * parseNewtonOutput().
 *
 * @param {string} topic - research topic used in the LLM prompt
 * @param {string} projectId - current project id (for file persistence)
 * @param {object} config - LLM configuration passed to callLLM
 * @param {object} logger - project logger (info/error)
 * @param {object} dataInfo - { datasets: [{ filename, rows, columns, filePath }] }
 * @returns {Promise<object>} dataset result payload (success flag, path, rows, ...)
 */
async function runNewton(topic, projectId, config, logger, dataInfo) {
    const spinner = (0, ora_1.default)({ text: 'Newton: Designing dataset construction...', indent: 2 }).start();
    logger.info(`[NEWTON] Starting dataset construction with ${dataInfo.datasets.length} source files`);
    const projectDir = (0, project_1.getProjectDir)(projectId);
    const dataDir = path.join(projectDir, 'data');
    // One human-readable stanza per source file, fed to the LLM as context.
    const dataDescription = dataInfo.datasets.map((d) => `File: ${d.filename} (${d.rows} rows)\n Columns: ${d.columns.join(', ')}\n Path: ${d.filePath}`).join('\n\n');
    spinner.text = 'Newton: Planning data transformations...';
    // System prompt pins the script contract: pandas/numpy/scipy only, exact
    // input paths, fixed output path, machine-readable JSON summary on stdout
    // (parseNewtonOutput depends on that summary).
    const plan = await (0, llm_1.callLLM)(config, `You are Newton, the dataset construction agent of AutoScholar.
Your job is to write a Python script that:
1. Loads all source CSV files
2. Cleans missing values (forward fill, then drop remaining NaN)
3. Merges data sources on date/time columns
4. Creates derived variables (returns, log returns, rolling stats, lags)
5. Handles outliers (winsorize at 1st/99th percentile)
6. Constructs the final analysis-ready dataset
7. Saves the result as final_dataset.csv

IMPORTANT RULES:
- Use only pandas, numpy, scipy (always available)
- Read CSVs from the exact paths given
- Save final dataset to: ${path.join(dataDir, 'final_dataset.csv')}
- Print a JSON summary at the end with keys: rows, columns, transformations, dataDictionary

Write ONLY the Python code, nothing else. Start with imports.`, `Research topic: ${topic}\n\nAvailable datasets:\n${dataDescription}`, { maxTokens: 8192 });
    // Strip an optional ```python fence from the reply; otherwise use the
    // reply verbatim (the startsWith branch is redundant but harmless —
    // pythonCode already defaults to the full reply).
    let pythonCode = plan;
    const codeMatch = plan.match(/```python\s*\n?([\s\S]*?)```/);
    if (codeMatch) {
        pythonCode = codeMatch[1];
    }
    else if (plan.startsWith('import') || plan.startsWith('#')) {
        pythonCode = plan;
    }
    // Persist the generated script alongside the project for reproducibility.
    (0, project_1.saveProjectFile)(projectId, 'code/newton_build_dataset.py', pythonCode);
    spinner.text = 'Newton: Building dataset...';
    logger.info(`[NEWTON] Executing dataset construction script`);
    const pyResult = await (0, runner_1.runPython)(pythonCode, projectDir);
    if (!pyResult.success) {
        // Stop the spinner so the error log renders cleanly, then retry once
        // with the deterministic fallback merge script.
        spinner.stop();
        logger.error(`[NEWTON] Python error: ${pyResult.error}`);
        spinner.start();
        spinner.text = 'Newton: Retrying with simplified approach...';
        const simplifiedCode = generateSimplifiedMerge(dataInfo.datasets, dataDir);
        (0, project_1.saveProjectFile)(projectId, 'code/newton_build_dataset_v2.py', simplifiedCode);
        const retryResult = await (0, runner_1.runPython)(simplifiedCode, projectDir);
        if (!retryResult.success) {
            spinner.stop();
            logger.error(`[NEWTON] Retry failed: ${retryResult.error}`);
            // Give up: empty failure payload with the same shape as success.
            return {
                success: false,
                finalDatasetPath: '',
                rows: 0,
                columns: [],
                description: 'Dataset construction failed',
                transformations: [],
                dataDictionary: {},
            };
        }
        return parseNewtonOutput(retryResult.output, projectId, dataDir, logger, spinner);
    }
    return parseNewtonOutput(pyResult.output, projectId, dataDir, logger, spinner);
}
|
|
108
|
+
/**
 * Interpret the stdout of a Newton dataset-construction run.
 *
 * Resolution order:
 *   1. Parse the JSON summary the generated script prints (an object
 *      containing a "rows" key).
 *   2. If that fails, inspect final_dataset.csv on disk directly.
 *   3. Otherwise report failure.
 *
 * @param {string} output - captured stdout of the Python run
 * @param {string} projectId - current project id (kept for signature parity)
 * @param {string} dataDir - directory expected to contain final_dataset.csv
 * @param {object} logger - project logger (info/error)
 * @param {object} spinner - active ora spinner; stopped before printing
 * @returns {object} dataset result payload (success flag, path, rows, ...)
 */
function parseNewtonOutput(output, projectId, dataDir, logger, spinner) {
    spinner.stop();
    // Greedy match from the first "{" to the last "}" bracketing "rows".
    // NOTE(review): stray braces printed after the summary widen the capture
    // and make JSON.parse fail — the CSV fallback below then applies.
    const jsonMatch = output.match(/\{[\s\S]*"rows"[\s\S]*\}/);
    if (jsonMatch) {
        try {
            const summary = JSON.parse(jsonMatch[0]);
            const finalPath = path.join(dataDir, 'final_dataset.csv');
            (0, banner_1.printSuccess)(`Dataset built: ${summary.rows} rows, ${(summary.columns || []).length} columns`);
            if (summary.transformations) {
                for (const t of summary.transformations) {
                    (0, banner_1.printInfo)(` Transform: ${t}`);
                }
            }
            logger.info(`[NEWTON] Complete — ${summary.rows} rows, ${(summary.columns || []).length} columns`);
            return {
                success: true,
                finalDatasetPath: finalPath,
                rows: summary.rows || 0,
                columns: summary.columns || [],
                description: summary.description || 'Merged analysis dataset',
                transformations: summary.transformations || [],
                dataDictionary: summary.dataDictionary || {},
            };
        }
        catch (err) {
            // FIX: was a silent `catch { }`. Keep the CSV fallback behavior,
            // but record why the summary was unusable instead of swallowing it.
            logger.error(`[NEWTON] Failed to parse JSON summary: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    const finalPath = path.join(dataDir, 'final_dataset.csv');
    if (fs.existsSync(finalPath)) {
        // Fallback: naive CSV inspection. Assumes no embedded newlines or
        // quoted commas — acceptable for a row/column estimate.
        const content = fs.readFileSync(finalPath, 'utf-8');
        const lines = content.split('\n').filter((l) => l.trim());
        const headers = lines[0]?.split(',') || [];
        const rows = Math.max(0, lines.length - 1); // exclude the header line
        (0, banner_1.printSuccess)(`Dataset built: ${rows} rows, ${headers.length} columns`);
        logger.info(`[NEWTON] Complete (fallback) — ${rows} rows`);
        return {
            success: true,
            finalDatasetPath: finalPath,
            rows,
            columns: headers,
            description: 'Merged analysis dataset',
            transformations: ['merge', 'clean'],
            dataDictionary: {},
        };
    }
    // No summary and no file on disk: the construction genuinely failed.
    return {
        success: false,
        finalDatasetPath: '',
        rows: 0,
        columns: [],
        description: 'Dataset construction failed',
        transformations: [],
        dataDictionary: {},
    };
}
|
|
162
|
+
/**
 * Build a deterministic fallback Python script that merges all source CSVs
 * without any LLM involvement. Used when Newton's generated code fails.
 *
 * @param {Array<{filePath: string, filename: string}>} datasets - source file descriptors
 * @param {string} dataDir - directory where final_dataset.csv is written
 * @returns {string} self-contained pandas script that prints a JSON summary
 */
function generateSimplifiedMerge(datasets, dataDir) {
    // FIX: paths/filenames were interpolated raw into Python string literals,
    // so Windows backslash paths (C:\data\x.csv) produced invalid escape
    // sequences in the generated script. JSON.stringify yields a
    // double-quoted, escaped literal that is also valid Python syntax.
    const pyStr = (s) => JSON.stringify(s);
    const loads = datasets.map((d, i) => {
        return `df${i} = pd.read_csv(${pyStr(d.filePath)})
print("Loaded " + ${pyStr(d.filename)} + ": " + str(df${i}.shape))`;
    }).join('\n');
    return `import pandas as pd
import numpy as np
import json

${loads}

# Find common date column
dfs = [${datasets.map((_, i) => `df${i}`).join(', ')}]
merged = dfs[0]
for df in dfs[1:]:
    # Try to find common column for merge
    common_cols = set(merged.columns) & set(df.columns)
    date_cols = [c for c in common_cols if 'date' in c.lower() or 'time' in c.lower()]
    if date_cols:
        merged = pd.merge(merged, df, on=date_cols[0], how='outer', suffixes=('', '_dup'))
    else:
        # Concatenate if no common date
        merged = pd.concat([merged, df], axis=1)

# Drop duplicate columns
merged = merged.loc[:, ~merged.columns.duplicated()]

# Clean
merged = merged.dropna(how='all')
for col in merged.select_dtypes(include=[np.number]).columns:
    merged[col] = merged[col].ffill().bfill()

# Save
output_path = ${pyStr(path.join(dataDir, 'final_dataset.csv'))}
merged.to_csv(output_path, index=False)

summary = {
    "rows": len(merged),
    "columns": list(merged.columns),
    "transformations": ["merge", "ffill", "bfill", "dropna"],
    "dataDictionary": {col: str(merged[col].dtype) for col in merged.columns},
    "description": "Merged dataset from ${datasets.length} sources"
}
print(json.dumps(summary))
`;
}
|