ds-agent-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ds-agent.js +451 -0
- package/ds_agent/__init__.py +8 -0
- package/package.json +28 -0
- package/requirements.txt +126 -0
- package/setup.py +35 -0
- package/src/__init__.py +7 -0
- package/src/_compress_tool_result.py +118 -0
- package/src/api/__init__.py +4 -0
- package/src/api/app.py +1626 -0
- package/src/cache/__init__.py +5 -0
- package/src/cache/cache_manager.py +561 -0
- package/src/cli.py +2886 -0
- package/src/dynamic_prompts.py +281 -0
- package/src/orchestrator.py +4799 -0
- package/src/progress_manager.py +139 -0
- package/src/reasoning/__init__.py +332 -0
- package/src/reasoning/business_summary.py +431 -0
- package/src/reasoning/data_understanding.py +356 -0
- package/src/reasoning/model_explanation.py +383 -0
- package/src/reasoning/reasoning_trace.py +239 -0
- package/src/registry/__init__.py +3 -0
- package/src/registry/tools_registry.py +3 -0
- package/src/session_memory.py +448 -0
- package/src/session_store.py +370 -0
- package/src/storage/__init__.py +19 -0
- package/src/storage/artifact_store.py +620 -0
- package/src/storage/helpers.py +116 -0
- package/src/storage/huggingface_storage.py +694 -0
- package/src/storage/r2_storage.py +0 -0
- package/src/storage/user_files_service.py +288 -0
- package/src/tools/__init__.py +335 -0
- package/src/tools/advanced_analysis.py +823 -0
- package/src/tools/advanced_feature_engineering.py +708 -0
- package/src/tools/advanced_insights.py +578 -0
- package/src/tools/advanced_preprocessing.py +549 -0
- package/src/tools/advanced_training.py +906 -0
- package/src/tools/agent_tool_mapping.py +326 -0
- package/src/tools/auto_pipeline.py +420 -0
- package/src/tools/autogluon_training.py +1480 -0
- package/src/tools/business_intelligence.py +860 -0
- package/src/tools/cloud_data_sources.py +581 -0
- package/src/tools/code_interpreter.py +390 -0
- package/src/tools/computer_vision.py +614 -0
- package/src/tools/data_cleaning.py +614 -0
- package/src/tools/data_profiling.py +593 -0
- package/src/tools/data_type_conversion.py +268 -0
- package/src/tools/data_wrangling.py +433 -0
- package/src/tools/eda_reports.py +284 -0
- package/src/tools/enhanced_feature_engineering.py +241 -0
- package/src/tools/feature_engineering.py +302 -0
- package/src/tools/matplotlib_visualizations.py +1327 -0
- package/src/tools/model_training.py +520 -0
- package/src/tools/nlp_text_analytics.py +761 -0
- package/src/tools/plotly_visualizations.py +497 -0
- package/src/tools/production_mlops.py +852 -0
- package/src/tools/time_series.py +507 -0
- package/src/tools/tools_registry.py +2133 -0
- package/src/tools/visualization_engine.py +559 -0
- package/src/utils/__init__.py +42 -0
- package/src/utils/error_recovery.py +313 -0
- package/src/utils/parallel_executor.py +402 -0
- package/src/utils/polars_helpers.py +248 -0
- package/src/utils/schema_extraction.py +132 -0
- package/src/utils/semantic_layer.py +392 -0
- package/src/utils/token_budget.py +411 -0
- package/src/utils/validation.py +377 -0
- package/src/workflow_state.py +154 -0
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Time Series & Forecasting Tools
|
|
3
|
+
Tools for time series analysis, forecasting, seasonality detection, and feature engineering.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import polars as pl
|
|
7
|
+
import numpy as np
|
|
8
|
+
from typing import Dict, Any, List, Optional
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import sys
|
|
11
|
+
import os
|
|
12
|
+
import warnings
|
|
13
|
+
|
|
14
|
+
warnings.filterwarnings('ignore')
|
|
15
|
+
|
|
16
|
+
# Add parent directory to path for imports
|
|
17
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
18
|
+
|
|
19
|
+
# Lazy imports - only import when needed to avoid blocking app startup
|
|
20
|
+
# from statsmodels.tsa.arima.model import ARIMA
|
|
21
|
+
# from statsmodels.tsa.statespace.sarimax import SARIMAX
|
|
22
|
+
# from statsmodels.tsa.holtwinters import ExponentialSmoothing
|
|
23
|
+
# from statsmodels.tsa.seasonal import seasonal_decompose, STL
|
|
24
|
+
# from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
|
|
25
|
+
# from prophet import Prophet
|
|
26
|
+
import pandas as pd
|
|
27
|
+
|
|
28
|
+
from ds_agent.utils.polars_helpers import load_dataframe, save_dataframe
|
|
29
|
+
from ds_agent.utils.validation import validate_file_exists, validate_file_format, validate_dataframe, validate_column_exists
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def forecast_time_series(
    file_path: str,
    time_col: str,
    target_col: str,
    forecast_horizon: int = 30,
    method: str = "prophet",
    seasonal_period: Optional[int] = None,
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Forecast time series using ARIMA, auto-ARIMA, SARIMA, Prophet, or Exponential Smoothing.

    Args:
        file_path: Path to time series dataset
        time_col: Time/date column name
        target_col: Target variable to forecast
        forecast_horizon: Number of periods to forecast ahead
        method: Forecasting method ('arima', 'auto_arima', 'sarima', 'prophet',
            'exponential_smoothing')
        seasonal_period: Seasonal period (e.g., 7 for weekly, 12 for monthly)
        output_path: Path to save forecast results

    Returns:
        Dictionary with forecast values and metrics. If an optional forecasting
        library is not installed, returns {'status': 'error', 'message': ...}
        instead of raising.

    Raises:
        ValueError: If `method` is not a supported forecasting method.
    """
    # Fail fast on an unknown method BEFORE loading data. The original code
    # only detected this at the bottom of the dispatch chain, after the
    # dataset had already been loaded, validated, sorted, and converted.
    valid_methods = {"prophet", "arima", "auto_arima", "sarima", "exponential_smoothing"}
    if method not in valid_methods:
        raise ValueError(f"Unsupported method: {method}")

    # Load and validate data
    df = load_dataframe(file_path)
    validate_dataframe(df)
    validate_column_exists(df, time_col)
    validate_column_exists(df, target_col)

    # Sort chronologically — the models assume ordered observations
    df = df.sort(time_col)

    # Lazy import of time series libraries (avoids blocking app startup).
    # 'auto_arima' imports pmdarima inside its own branch below.
    try:
        if method == "prophet":
            from prophet import Prophet
        elif method in ["arima", "sarima"]:
            from statsmodels.tsa.arima.model import ARIMA
            from statsmodels.tsa.statespace.sarimax import SARIMAX
        elif method == "exponential_smoothing":
            from statsmodels.tsa.holtwinters import ExponentialSmoothing
    except ImportError as e:
        return {
            'status': 'error',
            'message': f"Required library not installed for {method}: {str(e)}"
        }

    print(f"📈 Forecasting with {method} (horizon={forecast_horizon})...")

    # Convert to pandas — prophet/statsmodels/pmdarima operate on pandas objects
    df_pd = df.to_pandas()

    if method == "prophet":
        # Prophet requires columns named 'ds' (datestamp) and 'y' (target)
        prophet_df = pd.DataFrame({
            'ds': pd.to_datetime(df_pd[time_col]),
            'y': df_pd[target_col]
        })

        model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
        model.fit(prophet_df)

        # Extend the timeline and predict over history + horizon
        future = model.make_future_dataframe(periods=forecast_horizon)
        forecast = model.predict(future)

        # Keep only the out-of-sample rows (the last `forecast_horizon`)
        forecast_values = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(forecast_horizon)

        result = {
            'method': 'prophet',
            'forecast': forecast_values.to_dict('records'),
            'model_components': {
                'trend': forecast['trend'].tail(forecast_horizon).tolist(),
                # 'weekly' column is absent when Prophet disables weekly seasonality
                'weekly': forecast.get('weekly', pd.Series([0]*forecast_horizon)).tail(forecast_horizon).tolist()
            }
        }

    elif method == "auto_arima":
        # Auto ARIMA using pmdarima - automatically finds best (p,d,q) order
        try:
            import pmdarima as pm
        except ImportError:
            return {
                'status': 'error',
                'message': 'pmdarima not installed. Install with: pip install pmdarima>=2.0'
            }

        ts_data = df_pd.set_index(time_col)[target_col]

        print("🔧 Running auto_arima to find optimal ARIMA order...")
        auto_model = pm.auto_arima(
            ts_data,
            seasonal=bool(seasonal_period),
            m=seasonal_period or 1,
            stepwise=True,               # stepwise search is far faster than exhaustive
            suppress_warnings=True,
            error_action='ignore',       # skip candidate orders that fail to converge
            max_p=5, max_q=5, max_d=2,
            max_P=2, max_Q=2, max_D=1,
            trace=False
        )

        # Forecast with confidence intervals
        forecast_vals, conf_int = auto_model.predict(
            n_periods=forecast_horizon,
            return_conf_int=True
        )
        # NOTE(review): a daily frequency ('D') is assumed for the forecast
        # index here and in the branches below — confirm against the input
        # data's actual frequency.
        forecast_index = pd.date_range(start=ts_data.index[-1], periods=forecast_horizon+1, freq='D')[1:]

        result = {
            'method': 'auto_arima',
            'order': str(auto_model.order),
            'seasonal_order': str(auto_model.seasonal_order) if seasonal_period else None,
            'forecast': [
                {
                    'date': str(date),
                    'value': float(val),
                    'lower_ci': float(ci[0]),
                    'upper_ci': float(ci[1])
                }
                for date, val, ci in zip(forecast_index, forecast_vals, conf_int)
            ],
            'aic': float(auto_model.aic()),
            'bic': float(auto_model.bic()),
            'model_summary': str(auto_model.summary())
        }
        print(f" ✅ Best order: {auto_model.order} | AIC: {auto_model.aic():.2f}")

    elif method == "arima":
        # ARIMA with a fixed (1,1,1) order — a simple, commonly reasonable default.
        # Use method='auto_arima' for a data-driven order search.
        ts_data = df_pd.set_index(time_col)[target_col]

        model = ARIMA(ts_data, order=(1, 1, 1))
        fitted_model = model.fit()

        # Forecast
        forecast = fitted_model.forecast(steps=forecast_horizon)
        forecast_index = pd.date_range(start=ts_data.index[-1], periods=forecast_horizon+1, freq='D')[1:]

        result = {
            'method': 'arima',
            'order': '(1,1,1)',
            'forecast': [{'date': str(date), 'value': float(val)} for date, val in zip(forecast_index, forecast)],
            'aic': float(fitted_model.aic),
            'bic': float(fitted_model.bic)
        }

    elif method == "sarima":
        if not seasonal_period:
            seasonal_period = 7  # Default weekly

        ts_data = df_pd.set_index(time_col)[target_col]

        # SARIMA with fixed (1,1,1)x(1,1,1,s) orders
        model = SARIMAX(ts_data, order=(1, 1, 1), seasonal_order=(1, 1, 1, seasonal_period))
        fitted_model = model.fit(disp=False)

        # Forecast
        forecast = fitted_model.forecast(steps=forecast_horizon)
        forecast_index = pd.date_range(start=ts_data.index[-1], periods=forecast_horizon+1, freq='D')[1:]

        result = {
            'method': 'sarima',
            'order': '(1,1,1)',
            'seasonal_order': f'(1,1,1,{seasonal_period})',
            'forecast': [{'date': str(date), 'value': float(val)} for date, val in zip(forecast_index, forecast)],
            'aic': float(fitted_model.aic)
        }

    else:  # exponential_smoothing (method already validated above)
        ts_data = df_pd.set_index(time_col)[target_col]

        # Additive trend; seasonality only when a period was supplied
        model = ExponentialSmoothing(
            ts_data,
            seasonal_periods=seasonal_period if seasonal_period else 12,
            trend='add',
            seasonal='add' if seasonal_period else None
        )
        fitted_model = model.fit()

        # Forecast
        forecast = fitted_model.forecast(steps=forecast_horizon)
        forecast_index = pd.date_range(start=ts_data.index[-1], periods=forecast_horizon+1, freq='D')[1:]

        result = {
            'method': 'exponential_smoothing',
            'forecast': [{'date': str(date), 'value': float(val)} for date, val in zip(forecast_index, forecast)]
        }

    # Persist the forecast rows if requested
    if output_path:
        forecast_df = pl.DataFrame(result['forecast'])
        save_dataframe(forecast_df, output_path)
        print(f"💾 Forecast saved to: {output_path}")

    result['status'] = 'success'
    result['forecast_horizon'] = forecast_horizon
    result['output_path'] = output_path

    return result
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def detect_seasonality_trends(
    file_path: str,
    time_col: str,
    target_col: str,
    period: Optional[int] = None,
    method: str = "stl",
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Detect seasonality and trends in time series using STL or classical decomposition.

    Args:
        file_path: Path to time series dataset
        time_col: Time/date column
        target_col: Target variable
        period: Seasonal period (None = auto-detect via FFT)
        method: Decomposition method ('stl', 'classical')
        output_path: Path to save decomposition results

    Returns:
        Dictionary with seasonality/trend strengths, interpretation, and
        autocorrelation values. If statsmodels is not installed, returns
        {'status': 'error', 'message': ...} instead of raising.
    """
    # Load and validate data
    df = load_dataframe(file_path)
    validate_dataframe(df)
    validate_column_exists(df, time_col)
    validate_column_exists(df, target_col)

    # Sort chronologically before decomposing
    df = df.sort(time_col)

    # Lazy import of time series libraries (avoids blocking app startup)
    try:
        if method == "stl":
            from statsmodels.tsa.seasonal import STL
        else:
            from statsmodels.tsa.seasonal import seasonal_decompose
    except ImportError as e:
        return {
            'status': 'error',
            'message': f"Required library not installed: {str(e)}"
        }

    print(f"🔍 Detecting seasonality and trends using {method}...")

    # Convert to pandas
    df_pd = df.to_pandas()
    ts_data = df_pd.set_index(time_col)[target_col]

    # Auto-detect period using FFT if not provided
    if period is None:
        from scipy.fft import fft
        from scipy.signal import find_peaks

        # Detrend with a centered rolling mean so the FFT sees mostly the
        # periodic component
        detrended = ts_data - ts_data.rolling(window=min(len(ts_data)//10, 30), center=True).mean()
        # fillna(method=...) is deprecated in pandas 2.x; bfill()/ffill() are
        # the equivalent replacements
        detrended = detrended.bfill().ffill()

        # FFT magnitude spectrum
        fft_vals = np.abs(fft(detrended.values))
        freqs = np.fft.fftfreq(len(fft_vals))

        # Peaks in the positive-frequency half of the spectrum
        peaks, _ = find_peaks(fft_vals[:len(fft_vals)//2], height=np.max(fft_vals)*0.1)

        if len(peaks) > 0:
            # BUGFIX: take the peak with the largest magnitude (the dominant
            # frequency), not merely the first peak found — the first peak is
            # just the lowest-frequency one above the height threshold.
            dominant_freq = freqs[peaks[np.argmax(fft_vals[peaks])]]
            period = int(1 / abs(dominant_freq)) if dominant_freq != 0 else 7
        else:
            period = 7  # Default weekly

        # Keep the auto-detected period usable by the decomposition routines:
        # at least 2, and at most half the series length.
        period = max(2, min(period, max(2, len(ts_data) // 2)))

        print(f"📊 Auto-detected period: {period}")

    # Perform decomposition
    if method == "stl":
        # STL decomposition (more robust than classical).
        # BUGFIX: pass `period` explicitly — without it STL tries to infer the
        # period from the index frequency and raises for plain integer or
        # irregular datetime indexes, silently discarding the auto-detection
        # work above. `seasonal` and `trend` smoother lengths must be odd;
        # `trend` must exceed `period` — both hold by construction here.
        stl = STL(ts_data, period=period, seasonal=period*2+1, trend=period*4+1)
        result_decomp = stl.fit()

        trend = result_decomp.trend
        seasonal = result_decomp.seasonal
        residual = result_decomp.resid

    else:
        # Classical decomposition
        result_decomp = seasonal_decompose(ts_data, model='additive', period=period)
        trend = result_decomp.trend
        seasonal = result_decomp.seasonal
        residual = result_decomp.resid

    # Seasonality strength: 1 - Var(resid) / Var(seasonal + resid)
    # (Hyndman-style strength measure; ~1 means strong seasonality)
    var_resid = np.var(residual.dropna())
    var_seasonal_resid = np.var((seasonal + residual).dropna())
    seasonality_strength = 1 - (var_resid / var_seasonal_resid) if var_seasonal_resid > 0 else 0

    # Trend strength: 1 - Var(resid) / Var(detrended series)
    var_detrended = np.var((ts_data - trend).dropna())
    trend_strength = 1 - (var_resid / var_detrended) if var_detrended > 0 else 0

    # Autocorrelation analysis (capped at 40 lags or half the series length)
    from statsmodels.tsa.stattools import acf
    acf_values = acf(ts_data.dropna(), nlags=min(40, len(ts_data)//2))

    # Per-observation decomposition table
    decomp_df = pl.DataFrame({
        'time': df[time_col].to_list(),
        'original': ts_data.values,
        'trend': trend.fillna(0).values,
        'seasonal': seasonal.fillna(0).values,
        'residual': residual.fillna(0).values
    })

    # Save if output path provided
    if output_path:
        save_dataframe(decomp_df, output_path)
        print(f"💾 Decomposition saved to: {output_path}")

    return {
        'status': 'success',
        'method': method,
        'detected_period': period,
        'seasonality_strength': float(seasonality_strength),
        'trend_strength': float(trend_strength),
        'interpretation': {
            'seasonality': 'strong' if seasonality_strength > 0.6 else 'moderate' if seasonality_strength > 0.3 else 'weak',
            'trend': 'strong' if trend_strength > 0.6 else 'moderate' if trend_strength > 0.3 else 'weak'
        },
        'autocorrelation': acf_values[:min(10, len(acf_values))].tolist(),
        'output_path': output_path
    }
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def create_time_series_features(
    file_path: str,
    time_col: str,
    target_col: str,
    lag_periods: Optional[List[int]] = None,
    rolling_windows: Optional[List[int]] = None,
    add_holiday_features: bool = True,
    country: str = "US",
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Create comprehensive time series features including lags, rolling stats, and calendar features.

    Args:
        file_path: Path to time series dataset
        time_col: Time/date column
        target_col: Target variable
        lag_periods: Lag periods to create (default [1, 7, 14, 30])
        rolling_windows: Rolling window sizes (default [7, 14, 30])
        add_holiday_features: Add holiday indicators (best-effort; failures
            only print a warning)
        country: Country code for the holiday calendar
        output_path: Path to save dataset with new features

    Returns:
        Dictionary with feature engineering results (count and names of the
        features created).
    """
    # Load and validate data
    df = load_dataframe(file_path)
    validate_dataframe(df)
    validate_column_exists(df, time_col)
    validate_column_exists(df, target_col)

    # Sort chronologically — lags/rolling stats are order-dependent
    df = df.sort(time_col)

    print("⏰ Creating time series features...")

    # Convert to pandas for easier datetime handling; index by time
    df_pd = df.to_pandas()
    df_pd[time_col] = pd.to_datetime(df_pd[time_col])
    df_pd = df_pd.set_index(time_col)

    created_features = []

    # Lag features (leading rows become NaN by design)
    if lag_periods is None:
        lag_periods = [1, 7, 14, 30]

    for lag in lag_periods:
        df_pd[f'{target_col}_lag_{lag}'] = df_pd[target_col].shift(lag)
        created_features.append(f'{target_col}_lag_{lag}')

    # Rolling window statistics
    if rolling_windows is None:
        rolling_windows = [7, 14, 30]

    for window in rolling_windows:
        df_pd[f'{target_col}_rolling_mean_{window}'] = df_pd[target_col].rolling(window=window).mean()
        df_pd[f'{target_col}_rolling_std_{window}'] = df_pd[target_col].rolling(window=window).std()
        df_pd[f'{target_col}_rolling_min_{window}'] = df_pd[target_col].rolling(window=window).min()
        df_pd[f'{target_col}_rolling_max_{window}'] = df_pd[target_col].rolling(window=window).max()

        created_features.extend([
            f'{target_col}_rolling_mean_{window}',
            f'{target_col}_rolling_std_{window}',
            f'{target_col}_rolling_min_{window}',
            f'{target_col}_rolling_max_{window}'
        ])

    # Exponential moving averages (short- and long-horizon)
    df_pd[f'{target_col}_ema_7'] = df_pd[target_col].ewm(span=7).mean()
    df_pd[f'{target_col}_ema_30'] = df_pd[target_col].ewm(span=30).mean()
    created_features.extend([f'{target_col}_ema_7', f'{target_col}_ema_30'])

    # Calendar features derived from the datetime index
    df_pd['year'] = df_pd.index.year
    df_pd['month'] = df_pd.index.month
    df_pd['day'] = df_pd.index.day
    df_pd['dayofweek'] = df_pd.index.dayofweek
    df_pd['dayofyear'] = df_pd.index.dayofyear
    df_pd['quarter'] = df_pd.index.quarter
    df_pd['is_weekend'] = (df_pd.index.dayofweek >= 5).astype(int)
    df_pd['is_month_start'] = df_pd.index.is_month_start.astype(int)
    df_pd['is_month_end'] = df_pd.index.is_month_end.astype(int)

    # Cyclical (sin/cos) encoding so periodic features wrap smoothly
    # (e.g. December is close to January)
    df_pd['month_sin'] = np.sin(2 * np.pi * df_pd['month'] / 12)
    df_pd['month_cos'] = np.cos(2 * np.pi * df_pd['month'] / 12)
    df_pd['day_sin'] = np.sin(2 * np.pi * df_pd['day'] / 31)
    df_pd['day_cos'] = np.cos(2 * np.pi * df_pd['day'] / 31)
    df_pd['dayofweek_sin'] = np.sin(2 * np.pi * df_pd['dayofweek'] / 7)
    df_pd['dayofweek_cos'] = np.cos(2 * np.pi * df_pd['dayofweek'] / 7)

    created_features.extend([
        'year', 'month', 'day', 'dayofweek', 'dayofyear', 'quarter',
        'is_weekend', 'is_month_start', 'is_month_end',
        'month_sin', 'month_cos', 'day_sin', 'day_cos',
        'dayofweek_sin', 'dayofweek_cos'
    ])

    # Holiday features (best-effort: any failure prints a warning and skips)
    if add_holiday_features:
        try:
            import holidays
            # BUGFIX: the `holidays` calendar is populated lazily, one year at
            # a time, only when a date in that year is looked up. Iterating it
            # without pre-populating can therefore miss years entirely.
            # Pre-populate every year the data covers, plus one extra year so
            # 'days_to_next_holiday' can see holidays past the end of the data
            # instead of falling back to the 365 default near the tail.
            country_holidays = holidays.country_holidays(
                country,
                years=range(df_pd.index.min().year, df_pd.index.max().year + 2)
            )
            df_pd['is_holiday'] = df_pd.index.map(lambda x: 1 if x in country_holidays else 0)

            # Days until next holiday
            holiday_dates = sorted([date for date in country_holidays if date >= df_pd.index.min()])
            df_pd['days_to_next_holiday'] = df_pd.index.map(
                lambda x: min([abs((hol - x).days) for hol in holiday_dates if hol >= x], default=365)
            )

            created_features.extend(['is_holiday', 'days_to_next_holiday'])
        except Exception as e:
            print(f"⚠️ Could not add holiday features: {str(e)}")

    # Convert back to polars, restoring the time column
    df_pd = df_pd.reset_index()
    df_result = pl.from_pandas(df_pd)

    # Save if output path provided
    if output_path:
        save_dataframe(df_result, output_path)
        print(f"💾 Dataset with time series features saved to: {output_path}")

    return {
        'status': 'success',
        'features_created': len(created_features),
        'feature_names': created_features,
        'lag_periods': lag_periods,
        'rolling_windows': rolling_windows,
        'holiday_features_added': add_holiday_features,
        'output_path': output_path
    }
|