oracle-ads 2.13.17__py3-none-any.whl → 2.13.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. ads/aqua/cli.py +7 -5
  2. ads/aqua/common/entities.py +88 -29
  3. ads/aqua/common/enums.py +6 -0
  4. ads/aqua/common/errors.py +5 -0
  5. ads/aqua/common/utils.py +49 -7
  6. ads/aqua/constants.py +3 -0
  7. ads/aqua/extension/deployment_handler.py +36 -0
  8. ads/aqua/modeldeployment/constants.py +1 -0
  9. ads/aqua/modeldeployment/deployment.py +83 -12
  10. ads/aqua/modeldeployment/entities.py +3 -0
  11. ads/aqua/resources/gpu_shapes_index.json +315 -26
  12. ads/aqua/shaperecommend/__init__.py +6 -0
  13. ads/aqua/shaperecommend/constants.py +116 -0
  14. ads/aqua/shaperecommend/estimator.py +384 -0
  15. ads/aqua/shaperecommend/llm_config.py +283 -0
  16. ads/aqua/shaperecommend/recommend.py +493 -0
  17. ads/aqua/shaperecommend/shape_report.py +233 -0
  18. ads/aqua/version.json +1 -1
  19. ads/cli.py +9 -1
  20. ads/jobs/builders/infrastructure/dsc_job.py +1 -0
  21. ads/jobs/builders/infrastructure/dsc_job_runtime.py +9 -1
  22. ads/model/service/oci_datascience_model_deployment.py +46 -19
  23. ads/opctl/operator/lowcode/common/data.py +7 -2
  24. ads/opctl/operator/lowcode/common/transformations.py +207 -0
  25. ads/opctl/operator/lowcode/common/utils.py +8 -0
  26. ads/opctl/operator/lowcode/forecast/__init__.py +3 -0
  27. ads/opctl/operator/lowcode/forecast/__main__.py +53 -3
  28. ads/opctl/operator/lowcode/forecast/const.py +2 -0
  29. ads/opctl/operator/lowcode/forecast/errors.py +5 -0
  30. ads/opctl/operator/lowcode/forecast/meta_selector.py +310 -0
  31. ads/opctl/operator/lowcode/forecast/model/automlx.py +1 -1
  32. ads/opctl/operator/lowcode/forecast/model/base_model.py +119 -30
  33. ads/opctl/operator/lowcode/forecast/model/factory.py +33 -2
  34. ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +54 -17
  35. ads/opctl/operator/lowcode/forecast/model_evaluator.py +6 -1
  36. ads/opctl/operator/lowcode/forecast/schema.yaml +1 -0
  37. ads/pipeline/ads_pipeline.py +13 -9
  38. {oracle_ads-2.13.17.dist-info → oracle_ads-2.13.18.dist-info}/METADATA +1 -1
  39. {oracle_ads-2.13.17.dist-info → oracle_ads-2.13.18.dist-info}/RECORD +42 -35
  40. {oracle_ads-2.13.17.dist-info → oracle_ads-2.13.18.dist-info}/WHEEL +0 -0
  41. {oracle_ads-2.13.17.dist-info → oracle_ads-2.13.18.dist-info}/entry_points.txt +0 -0
  42. {oracle_ads-2.13.17.dist-info → oracle_ads-2.13.18.dist-info}/licenses/LICENSE.txt +0 -0
ads/aqua/shaperecommend/shape_report.py ADDED
@@ -0,0 +1,233 @@
+ #!/usr/bin/env python
+ # Copyright (c) 2025 Oracle and/or its affiliates.
+ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+ from typing import List, Optional
+
+ from pydantic import BaseModel, Field
+
+ from ads.aqua.common.entities import ComputeShapeSummary
+ from ads.aqua.shaperecommend.constants import QUANT_MAPPING
+ from ads.aqua.shaperecommend.estimator import MemoryEstimator
+ from ads.config import COMPARTMENT_OCID
+
+
+ class RequestRecommend(BaseModel):
+     """
+     A request to recommend compute shapes and parameters for a given model.
+     """
+
+     model_id: str = Field(
+         ..., description="The OCID of the model to recommend feasible compute shapes."
+     )
+     generate_table: Optional[bool] = (
+         Field(
+             True,
+             description="True - to generate the rich diff Table, False - generate the JSON response",
+         ),
+     )
+     compartment_id: Optional[str] = Field(
+         COMPARTMENT_OCID, description="The OCID of user's compartment"
+     )
+
+     class Config:
+         protected_namespaces = ()
+
+
+ class DeploymentParams(BaseModel):  # noqa: N801
+     """
+     Recommended parameters for deployment and model inferencing (specific to compute shape & model).
+     """
+
+     quantization: Optional[str] = Field(
+         None, description="Type of quantization (e.g. 4bit)."
+     )
+     max_model_len: int = Field(..., description="Maximum length of input sequence.")
+     params: str = Field(
+         ..., description="Runtime parameters for deployment with vLLM, etc."
+     )
+
+
+ class ModelDetail(BaseModel):
+     """
+     The estimated memory footprint of a model, KV cache, and its total (model + KV cache).
+     """
+
+     model_size_gb: float = Field(..., description="Size of the model in GB.")
+     kv_cache_size_gb: float = Field(..., description="Size of KV cache in GB.")
+     total_model_gb: float = Field(
+         ..., description="Total size of model and cache in GB."
+     )
+
+     class Config:
+         protected_namespaces = ()
+
+
+ class ModelConfig(BaseModel):
+     """
+     The configuration for a model based on specific set of deployment parameters and memory capacity of shape.
+     """
+
+     model_details: ModelDetail = Field(..., description="Details about the model.")
+     deployment_params: DeploymentParams = Field(
+         ..., description="Parameters for deployment."
+     )
+     recommendation: str = Field(..., description="GPU recommendation for the model.")
+
+     class Config:
+         protected_namespaces = ()
+
+     @classmethod
+     def constuct_model_config(
+         cls, estimator: MemoryEstimator, allowed_gpu_memory: float
+     ) -> "ModelConfig":
+         """
+         Assembles a complete ModelConfig, including model details, deployment parameters (vLLM), and recommendations.
+
+         Parameters
+         ----------
+         shape_quantization : set[str]
+             Allowed quantization methods for the compute shape
+
+         Returns
+         -------
+         ModelConfig
+             Contains round-tripped model size, kv cache, total, vLLM parameters, and recommendations.
+
+         Notes
+         -----
+         - Rounds all sizes to 3 decimal digits.
+         - Computes a recommendation string using `limiting_factor`.
+         """
+         c = estimator.llm_config
+         deployment_params = DeploymentParams(
+             quantization=c.quantization or c.in_flight_quantization or c.weight_dtype,
+             max_model_len=getattr(estimator, "seq_len", None),
+             params=estimator.construct_deployment_params(),
+         )
+         model_detail = ModelDetail(
+             model_size_gb=round(getattr(estimator, "model_memory", 0.0), 2),
+             kv_cache_size_gb=round(getattr(estimator, "kv_cache_memory", 0.0), 2),
+             total_model_gb=round(getattr(estimator, "total_memory", 0.0), 2),
+         )
+         return ModelConfig(
+             model_details=model_detail,
+             deployment_params=deployment_params,
+             recommendation=estimator.limiting_factor(allowed_gpu_memory),
+         )
+
+
+ class ShapeReport(BaseModel):
+     """
+     The feasible deployment configurations for the model per shape.
+     """
+
+     shape_details: "ComputeShapeSummary" = Field(
+         ..., description="Details about the compute shape (ex. VM.GPU.A10.2)."
+     )
+     configurations: List["ModelConfig"] = Field(
+         default_factory=list, description="List of model configurations."
+     )
+
+     def is_dominated(self, others: List["ShapeReport"]) -> bool:
+         """
+         Determines whether this shape is dominated by any other shape in a Pareto sense.
+
+         Parameters
+         ----------
+         others : list of ShapeReport
+             List of other shape/deployment configurations to compare against.
+
+         Returns
+         -------
+         bool
+             True if this shape is dominated by at least one other, False otherwise.
+
+         Notes
+         -----
+         A shape is dominated if there exists another configuration that is
+         at least as good in all criteria and strictly better in at least one.
+         Criteria:
+         - Cost (to be minimized)
+         - Performance, quantization level, max sequence length (to be maximized)
+         """
+         try:
+             cand_cost = self.shape_details.gpu_specs.ranking.cost
+             cand_perf = self.shape_details.gpu_specs.ranking.performance
+             cand_quant = QUANT_MAPPING.get(
+                 self.configurations[0].deployment_params.quantization, 0
+             )
+             cand_maxlen = self.configurations[0].deployment_params.max_model_len
+
+             for other in others:
+                 other_cost = other.shape_details.gpu_specs.ranking.cost
+                 other_perf = other.shape_details.gpu_specs.ranking.performance
+                 other_quant = QUANT_MAPPING.get(
+                     other.configurations[0].deployment_params.quantization, 0
+                 )
+                 other_maxlen = other.configurations[0].deployment_params.max_model_len
+                 if (
+                     other_cost <= cand_cost
+                     and other_perf >= cand_perf
+                     and other_quant >= cand_quant
+                     and other_maxlen >= cand_maxlen
+                     and (
+                         other_cost < cand_cost
+                         or other_perf > cand_perf
+                         or other_quant > cand_quant
+                         or other_maxlen > cand_maxlen
+                     )
+                 ):
+                     return True
+             return False
+         except AttributeError:
+             return False
+
+     @classmethod
+     def pareto_front(cls, shapes: List["ShapeReport"]) -> List["ShapeReport"]:
+         """
+         Filters a list of shapes/configurations to those on the Pareto frontier.
+
+         Parameters
+         ----------
+         shapes : list of ShapeReport
+             List of candidate shape/configuration reports to evaluate.
+
+         Returns
+         -------
+         list of ShapeReport
+             Subset of input shapes that are not dominated by any other (the Pareto front).
+
+         Notes
+         -----
+         The returned set contains non-dominated deployments for maximizing
+         performance, quantization, and model length, while minimizing cost.
+         """
+         return [
+             shape
+             for shape in shapes
+             if not shape.is_dominated([s for s in shapes if s != shape])
+         ]
+
+
+ class ShapeRecommendationReport(BaseModel):
+     """
+     Full report of shape fit recommendations and troubleshooting, if applicable.
+
+     Attributes:
+         recommendations (List[DeploymentShapeSummary]): Recommended deployment shapes
+             for each tested batch size and max sequence length combination.
+         troubleshoot (Optional[TroubleshootShapeSummary]): Troubleshooting information
+             if no valid deployment shapes are available.
+     """
+
+     display_name: Optional[str] = Field(
+         "", description="Name of the model used for recommendations."
+     )
+     recommendations: List[ShapeReport] = Field(
+         default_factory=list, description="List of shape fit recommendations."
+     )
+     troubleshoot: Optional[str] = Field(
+         None,
+         description="Details for troubleshooting if no shapes fit the current model.",
+     )
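The `is_dominated`/`pareto_front` pair above is a plain Pareto filter over four criteria: cost (minimized) plus performance, quantization level, and max sequence length (all maximized). A minimal standalone sketch of the same rule, using bare tuples in place of the pydantic models; all names below are illustrative, not part of the package:

```python
# Pareto-dominance filter over (cost, perf, quant, maxlen) tuples.
# Cost is minimized; the other three criteria are maximized.
from typing import List, Tuple

Candidate = Tuple[float, float, int, int]  # (cost, perf, quant, maxlen)

def dominates(a: Candidate, b: Candidate) -> bool:
    """True if `a` is at least as good as `b` everywhere and strictly better somewhere."""
    as_good = a[0] <= b[0] and a[1] >= b[1] and a[2] >= b[2] and a[3] >= b[3]
    better = a[0] < b[0] or a[1] > b[1] or a[2] > b[2] or a[3] > b[3]
    return as_good and better

def pareto_front(cands: List[Candidate]) -> List[Candidate]:
    return [c for c in cands if not any(dominates(o, c) for o in cands if o != c)]

# The middle candidate costs the same as the first but is worse on every
# other axis, so it is dominated and filtered out.
print(pareto_front([(2.0, 9, 4, 32768), (2.0, 7, 2, 8192), (1.0, 5, 2, 4096)]))
# -> [(2.0, 9, 4, 32768), (1.0, 5, 2, 4096)]
```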
ads/aqua/version.json CHANGED
@@ -1,3 +1,3 @@
  {
- "aqua": "1.0.7a"
+ "aqua": "1.0.8"
  }
ads/cli.py CHANGED
@@ -7,6 +7,8 @@ import logging
  import sys
  import traceback
  import uuid
+ from rich.console import Console
+ from rich.table import Table
 
  import fire
  from pydantic import BaseModel
@@ -92,6 +94,12 @@ def serialize(data):
          print(str(item))
      elif isinstance(data, BaseModel):
          print(json.dumps(data.dict(), indent=4))
+     elif isinstance(data, Table):
+         console = Console()
+         console.print(data)
+         return
+     elif data is None:
+         return
      else:
          print(str(data))
 
@@ -131,7 +139,7 @@ def exit_program(ex: Exception, logger: "logging.Logger") -> None:
 
      request_id = str(uuid.uuid4())
      logger.debug(f"Error Request ID: {request_id}\nError: {traceback.format_exc()}")
-     logger.error(f"Error Request ID: {request_id}\n" f"Error: {str(ex)}")
+     logger.error(f"Error Request ID: {request_id}\nError: {str(ex)}")
 
      exit_code = getattr(ex, "exit_code", 1)
      logger.error(f"Exit code: {exit_code}")
ads/jobs/builders/infrastructure/dsc_job.py CHANGED
@@ -1751,6 +1751,7 @@ class DataScienceJob(Infrastructure):
          return (
              MULTI_NODE_JOB_SUPPORT
              and isinstance(runtime, MultiNodeRuntime)
+             and runtime.replica
              and runtime.replica > 1
          )
 
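The added `and runtime.replica` clause is a None-guard: `replica` may be unset, and ordering comparisons against `None` raise in Python 3, so the truthiness check must short-circuit first:

```python
# Why the guard is needed: None does not support ordering comparisons.
replica = None
# replica > 1  # TypeError: '>' not supported between instances of 'NoneType' and 'int'
print(bool(replica and replica > 1))  # False, short-circuits before comparing
```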
ads/jobs/builders/infrastructure/dsc_job_runtime.py CHANGED
@@ -365,6 +365,11 @@ class RuntimeHandler:
              dsc_job,
              "job_node_configuration_details.job_node_group_configuration_details_list",
          )
+         if node_groups is None:
+             node_groups = get_value(
+                 dsc_job,
+                 "job_node_configuration_details.jobNodeGroupConfigurationDetailsList",
+             )
          if node_groups and len(node_groups) == 1:
              return node_groups[0]
          return None
@@ -373,6 +378,7 @@ class RuntimeHandler:
          node_group = self._get_node_group(dsc_job)
          if node_group:
              replica = get_value(node_group, "replicas")
+             envs.pop(self.CONST_NODE_COUNT, None)
          elif not envs:
              replica = None
          elif self.CONST_WORKER_COUNT in envs:
@@ -399,7 +405,9 @@ class RuntimeHandler:
          env_attr = "job_configuration_details.environment_variables"
          node_group = self._get_node_group(dsc_job)
          if node_group:
-             envs = get_value(node_group, env_attr)
+             envs = get_value(node_group, env_attr) or get_value(
+                 node_group, "jobConfigurationDetails.environment_variables"
+             )
          else:
              envs = get_value(dsc_job, env_attr)
          if envs:
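The first and third hunks apply the same fix: read the snake_case attribute path first, then fall back to the camelCase key, since OCI job payloads can carry either form depending on how they were deserialized. A sketch of that dual lookup; `lookup` here is a hypothetical stand-in for the `get_value` helper used above:

```python
# Hypothetical stand-in for ads' get_value: walk a dotted path across
# dicts/attributes, returning None on the first miss.
from typing import Any, Optional

def lookup(obj: Any, dotted_path: str) -> Optional[Any]:
    cur = obj
    for part in dotted_path.split("."):
        cur = cur.get(part) if isinstance(cur, dict) else getattr(cur, part, None)
        if cur is None:
            return None
    return cur

payload = {"job_node_configuration_details": {"jobNodeGroupConfigurationDetailsList": [{"replicas": 2}]}}
node_groups = lookup(
    payload, "job_node_configuration_details.job_node_group_configuration_details_list"
) or lookup(payload, "job_node_configuration_details.jobNodeGroupConfigurationDetailsList")
print(node_groups)  # [{'replicas': 2}] -- found via the camelCase fallback
```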
ads/model/service/oci_datascience_model_deployment.py CHANGED
@@ -1,23 +1,24 @@
  #!/usr/bin/env python
- # -*- coding: utf-8; -*-
 
- # Copyright (c) 2024 Oracle and/or its affiliates.
+ # Copyright (c) 2024, 2025 Oracle and/or its affiliates.
  # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
- from functools import wraps
  import logging
- from typing import Callable, List
- from ads.common.oci_datascience import OCIDataScienceMixin
- from ads.common.work_request import DataScienceWorkRequest
- from ads.config import PROJECT_OCID
- from ads.model.deployment.common.utils import OCIClientManager, State
- import oci
+ from functools import wraps
+ from typing import Callable, List, Optional
 
+ import oci
  from oci.data_science.models import (
      CreateModelDeploymentDetails,
+     ModelDeploymentShapeSummary,
      UpdateModelDeploymentDetails,
  )
 
+ from ads.common.oci_datascience import OCIDataScienceMixin
+ from ads.common.work_request import DataScienceWorkRequest
+ from ads.config import COMPARTMENT_OCID, PROJECT_OCID
+ from ads.model.deployment.common.utils import OCIClientManager, State
+
  DEFAULT_WAIT_TIME = 1200
  DEFAULT_POLL_INTERVAL = 10
  ALLOWED_STATUS = [
@@ -185,14 +186,13 @@ class OCIDataScienceModelDeployment(
              self.id,
          )
-
          self.workflow_req_id = response.headers.get("opc-work-request-id", None)
          if wait_for_completion:
              try:
                  DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
                      progress_bar_description="Activating model deployment",
-                     max_wait_time=max_wait_time,
-                     poll_interval=poll_interval
+                     max_wait_time=max_wait_time,
+                     poll_interval=poll_interval,
                  )
              except Exception as e:
                  logger.error(
@@ -239,8 +239,8 @@ class OCIDataScienceModelDeployment(
          try:
              DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
                  progress_bar_description="Creating model deployment",
-                 max_wait_time=max_wait_time,
-                 poll_interval=poll_interval
+                 max_wait_time=max_wait_time,
+                 poll_interval=poll_interval,
              )
          except Exception as e:
              logger.error("Error while trying to create model deployment: " + str(e))
@@ -290,8 +290,8 @@ class OCIDataScienceModelDeployment(
          try:
              DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
                  progress_bar_description="Deactivating model deployment",
-                 max_wait_time=max_wait_time,
-                 poll_interval=poll_interval
+                 max_wait_time=max_wait_time,
+                 poll_interval=poll_interval,
              )
          except Exception as e:
              logger.error(
@@ -351,14 +351,14 @@ class OCIDataScienceModelDeployment(
          response = self.client.delete_model_deployment(
              self.id,
          )
-
+
          self.workflow_req_id = response.headers.get("opc-work-request-id", None)
          if wait_for_completion:
              try:
                  DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
                      progress_bar_description="Deleting model deployment",
-                     max_wait_time=max_wait_time,
-                     poll_interval=poll_interval
+                     max_wait_time=max_wait_time,
+                     poll_interval=poll_interval,
                  )
              except Exception as e:
                  logger.error("Error while trying to delete model deployment: " + str(e))
@@ -493,3 +493,30 @@ class OCIDataScienceModelDeployment(
          An instance of `OCIDataScienceModelDeployment`.
          """
          return super().from_ocid(model_deployment_id)
+
+     @classmethod
+     def shapes(
+         cls,
+         compartment_id: Optional[str] = None,
+         **kwargs,
+     ) -> List[ModelDeploymentShapeSummary]:
+         """
+         Retrieves all available model deployment shapes in the given compartment.
+
+         This method uses OCI's pagination utility to fetch all pages of model
+         deployment shape summaries available in the specified compartment.
+
+         Args:
+             compartment_id (Optional[str]): The OCID of the compartment. If not provided,
+                 the default COMPARTMENT_ID extracted form env variables is used.
+             **kwargs: Additional keyword arguments to pass to the list_model_deployments call.
+
+         Returns:
+             List[ModelDeploymentShapeSummary]: A list of all model deployment shape summaries.
+         """
+         client = cls().client
+         compartment_id = compartment_id or COMPARTMENT_OCID
+
+         return oci.pagination.list_call_get_all_results(
+             client.list_model_deployment_shapes, compartment_id, **kwargs
+         ).data
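A hedged usage sketch for the new `shapes()` classmethod, assuming a configured OCI environment; the compartment OCID is a placeholder, and the printed fields are standard attributes of OCI's `ModelDeploymentShapeSummary`:

```python
# List every model-deployment shape visible in a compartment (placeholder OCID).
from ads.model.service.oci_datascience_model_deployment import (
    OCIDataScienceModelDeployment,
)

shapes = OCIDataScienceModelDeployment.shapes(
    compartment_id="ocid1.compartment.oc1..aaaa..."  # placeholder
)
for shape in shapes:
    print(shape.name, shape.core_count, shape.memory_in_gbs)
```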
ads/opctl/operator/lowcode/common/data.py CHANGED
@@ -19,16 +19,21 @@ from .transformations import Transformations
 
 
  class AbstractData(ABC):
-     def __init__(self, spec, name="input_data", data=None):
+     def __init__(self, spec, name="input_data", data=None, subset=None):
          self.Transformations = Transformations
          self.data = None
          self._data_dict = dict()
          self.name = name
          self.spec = spec
+         self.subset = subset
          if data is not None:
              self.data = data
          else:
              self.load_transform_ingest_data(spec)
+         # Subset by series if requested
+         # if self.subset is not None and hasattr(self, 'data') and self.data is not None:
+         #     subset_str = [str(s) for s in self.subset]
+         #     self.data = self.data[self.data.index.get_level_values(DataColumns.Series).isin(subset_str)]
 
      def get_raw_data_by_cat(self, category):
          mapping = self._data_transformer.get_target_category_columns_map()
@@ -72,7 +77,7 @@ class AbstractData(ABC):
      def _load_data(self, data_spec, **kwargs):
          loading_start_time = time.time()
          try:
-             raw_data = load_data(data_spec)
+             raw_data = load_data(data_spec, subset=self.subset if self.subset else None, target_category_columns=self.spec.target_category_columns)
          except InvalidParameterError as e:
              e.args = e.args + (f"Invalid Parameter: {self.name}",)
              raise e
ads/opctl/operator/lowcode/common/transformations.py CHANGED
@@ -294,3 +294,210 @@ class Transformations(ABC):
      def _fill_na(self, df: pd.DataFrame, na_value=0) -> pd.DataFrame:
          """Fill nans in dataframe"""
          return df.fillna(value=na_value)
+
+     def build_fforms_meta_features(self, data, target_col=None, group_cols=None):
+         """
+         Build meta-features for time series based on FFORMS paper and add them to the original DataFrame.
+
+         Parameters
+         ----------
+         data : pandas.DataFrame
+             Input DataFrame containing time series data
+         target_col : str, optional
+             Name of the target column to calculate meta-features for.
+             If None, uses the target column specified in dataset_info.
+         group_cols : list of str, optional
+             List of columns to group by before calculating meta-features.
+             If None, calculates features for the entire series.
+
+         Returns
+         -------
+         pandas.DataFrame
+             Original DataFrame with additional meta-feature columns
+
+         References
+         ----------
+         Talagala, T. S., Hyndman, R. J., & Athanasopoulos, G. (2023).
+         Meta-learning how to forecast time series. Journal of Forecasting, 42(6), 1476-1501.
+         """
+         if not isinstance(data, pd.DataFrame):
+             raise ValueError("Input must be a pandas DataFrame")
+
+         # Use target column from dataset_info if not specified
+         if target_col is None:
+             target_col = self.target_column_name
+         if target_col not in data.columns:
+             raise ValueError(f"Target column '{target_col}' not found in DataFrame")
+
+         # Check if group_cols are provided and valid
+         if group_cols is not None:
+             if not isinstance(group_cols, list):
+                 raise ValueError("group_cols must be a list of column names")
+             for col in group_cols:
+                 if col not in data.columns:
+                     raise ValueError(f"Group column '{col}' not found in DataFrame")
+
+         # If no group_cols, get the target_category_columns else treat the entire DataFrame as a single series
+         if not group_cols:
+             group_cols = self.target_category_columns if self.target_category_columns else []
+
+         # Calculate meta-features for each series
+         def calculate_series_features(series):
+             """Calculate features for a single series"""
+             n = len(series)
+             values = series.values
+
+             # Basic statistics
+             mean = series.mean()
+             std = series.std()
+             variance = series.var()
+             skewness = series.skew()
+             kurtosis = series.kurtosis()
+             cv = std / mean if mean != 0 else np.inf
+
+             # Trend features
+             X = np.vstack([np.arange(n), np.ones(n)]).T
+             trend_coef = np.linalg.lstsq(X, values, rcond=None)[0][0]
+             trend_pred = X.dot(np.linalg.lstsq(X, values, rcond=None)[0])
+             residuals = values - trend_pred
+             std_residuals = np.std(residuals)
+
+             # Turning points
+             turning_points = 0
+             for i in range(1, n-1):
+                 if (values[i-1] < values[i] and values[i] > values[i+1]) or \
+                    (values[i-1] > values[i] and values[i] < values[i+1]):
+                     turning_points += 1
+             turning_points_rate = turning_points / (n-2) if n > 2 else 0
+
+             # Serial correlation
+             acf1 = series.autocorr(lag=1) if n > 1 else 0
+             acf2 = series.autocorr(lag=2) if n > 2 else 0
+             acf10 = series.autocorr(lag=10) if n > 10 else 0
+
+             # Seasonality features
+             seasonal_strength = 0
+             seasonal_peak_strength = 0
+             if n >= 12:
+                 seasonal_lags = [12, 24, 36]
+                 seasonal_acfs = []
+                 for lag in seasonal_lags:
+                     if n > lag:
+                         acf_val = series.autocorr(lag=lag)
+                         seasonal_acfs.append(abs(acf_val))
+                 seasonal_peak_strength = max(seasonal_acfs) if seasonal_acfs else 0
+
+                 ma = series.rolling(window=12, center=True).mean()
+                 seasonal_comp = series - ma
+                 seasonal_strength = 1 - np.var(seasonal_comp.dropna()) / np.var(series)
+
+             # Stability and volatility features
+             values_above_mean = values >= mean
+             crossing_points = np.sum(values_above_mean[1:] != values_above_mean[:-1])
+             crossing_rate = crossing_points / (n - 1) if n > 1 else 0
+
+             # First and second differences
+             diff1 = np.diff(values)
+             diff2 = np.diff(diff1) if len(diff1) > 1 else np.array([])
+
+             diff1_mean = np.mean(np.abs(diff1)) if len(diff1) > 0 else 0
+             diff1_var = np.var(diff1) if len(diff1) > 0 else 0
+             diff2_mean = np.mean(np.abs(diff2)) if len(diff2) > 0 else 0
+             diff2_var = np.var(diff2) if len(diff2) > 0 else 0
+
+             # Nonlinearity features
+             if n > 3:
+                 X = values[:-1].reshape(-1, 1)
+                 y = values[1:]
+                 X2 = X * X
+                 X3 = X * X * X
+                 X_aug = np.hstack([X, X2, X3])
+                 nonlinearity = np.linalg.lstsq(X_aug, y, rcond=None)[1][0] if len(y) > 0 else 0
+             else:
+                 nonlinearity = 0
+
+             # Long-term trend features
+             if n >= 10:
+                 mid = n // 2
+                 trend_change = np.mean(values[mid:]) - np.mean(values[:mid])
+             else:
+                 trend_change = 0
+
+             # Step changes and spikes
+             step_changes = np.abs(diff1).max() if len(diff1) > 0 else 0
+             spikes = np.sum(np.abs(values - mean) > 2 * std) / n if std != 0 else 0
+
+             # Hurst exponent and entropy
+             lag = min(10, n // 2)
+             variance_ratio = np.var(series.diff(lag)) / (lag * np.var(series.diff())) if n > lag else 0
+             hurst = np.log(variance_ratio) / (2 * np.log(lag)) if variance_ratio > 0 and lag > 1 else 0
+
+             hist, _ = np.histogram(series, bins='auto', density=True)
+             entropy = -np.sum(hist[hist > 0] * np.log(hist[hist > 0]))
+
+             return pd.Series({
+                 'ts_n_obs': n,
+                 'ts_mean': mean,
+                 'ts_std': std,
+                 'ts_variance': variance,
+                 'ts_cv': cv,
+                 'ts_skewness': skewness,
+                 'ts_kurtosis': kurtosis,
+                 'ts_trend': trend_coef,
+                 'ts_trend_change': trend_change,
+                 'ts_std_residuals': std_residuals,
+                 'ts_turning_points_rate': turning_points_rate,
+                 'ts_seasonal_strength': seasonal_strength,
+                 'ts_seasonal_peak_strength': seasonal_peak_strength,
+                 'ts_acf1': acf1,
+                 'ts_acf2': acf2,
+                 'ts_acf10': acf10,
+                 'ts_crossing_rate': crossing_rate,
+                 'ts_diff1_mean': diff1_mean,
+                 'ts_diff1_variance': diff1_var,
+                 'ts_diff2_mean': diff2_mean,
+                 'ts_diff2_variance': diff2_var,
+                 'ts_nonlinearity': nonlinearity,
+                 'ts_step_max': step_changes,
+                 'ts_spikes_rate': spikes,
+                 'ts_hurst': hurst,
+                 'ts_entropy': entropy
+             })
+
+         # Create copy of input DataFrame
+         result_df = data.copy()
+
+         if group_cols:
+             # Calculate features for each group
+             features = []
+             # Sort by date within each group if date column exists
+             date_col = self.dt_column_name if self.dt_column_name else 'Date'
+             if date_col in data.columns:
+                 data = data.sort_values([date_col] + group_cols)
+
+             for name, group in data.groupby(group_cols):
+                 # Sort group by date if exists
+                 if date_col in group.columns:
+                     group = group.sort_values(date_col)
+                 group_features = calculate_series_features(group[target_col])
+                 if isinstance(name, tuple):
+                     feature_row = dict(zip(group_cols, name))
+                 else:
+                     feature_row = {group_cols[0]: name}
+                 feature_row.update(group_features)
+                 features.append(feature_row)
+
+             # Create features DataFrame without merging
+             features_df = pd.DataFrame(features)
+             # Return only the meta-features DataFrame with group columns
+             return features_df
+         else:
+             # Sort by date if exists and calculate features for entire series
+             date_col = self.dt_column_name if self.dt_column_name else 'Date'
+             if date_col in data.columns:
+                 data = data.sort_values(date_col)
+             features = calculate_series_features(data[target_col])
+             # Return single row DataFrame with meta-features
+             return pd.DataFrame([features])
+
+         return result_df
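To make the new feature columns concrete, here are two of them recomputed standalone on a toy series, following the formulas above: `ts_trend` is the OLS slope of the values against time, and `ts_acf1` is the lag-1 autocorrelation:

```python
# Recompute ts_trend and ts_acf1 for a toy series, using the same math as above.
import numpy as np
import pandas as pd

series = pd.Series([3.0, 4.0, 6.0, 5.0, 8.0, 9.0, 11.0, 12.0])
n = len(series)

# ts_trend: slope of a least-squares line fit against a 0..n-1 time index.
X = np.vstack([np.arange(n), np.ones(n)]).T
trend_coef = np.linalg.lstsq(X, series.values, rcond=None)[0][0]

# ts_acf1: lag-1 autocorrelation.
acf1 = series.autocorr(lag=1)

print(round(trend_coef, 3), round(acf1, 3))  # slope ~1.31, acf1 close to 1 for this trending series
```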
ads/opctl/operator/lowcode/common/utils.py CHANGED
@@ -124,6 +124,14 @@ def load_data(data_spec, storage_options=None, **kwargs):
          data = data[columns]
      if limit:
          data = data[:limit]
+     # Filtering by subset if provided
+     subset = kwargs.get('subset', None)
+     if subset is not None:
+         target_category_columns = kwargs.get('target_category_columns', None)
+         mask = False
+         for col in target_category_columns:
+             mask = mask | data[col].isin(subset)
+         data = data[mask]
      return data
 
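The subset filter keeps a row when any target-category column matches a requested series; starting the mask at `False` works because `False | Series` broadcasts to an elementwise OR. A small sketch with invented column names (note the loop assumes `target_category_columns` accompanies `subset`, as the `AbstractData._load_data` call above provides):

```python
# Boolean-mask subset filtering as added to load_data, on a toy frame.
import pandas as pd

data = pd.DataFrame({"store": ["s1", "s2", "s3"], "y": [10, 20, 30]})
subset = ["s1", "s3"]
target_category_columns = ["store"]

mask = False
for col in target_category_columns:
    mask = mask | data[col].isin(subset)  # False | Series -> elementwise OR
print(data[mask])  # rows for s1 and s3 only
```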