oracle-ads 2.13.17__py3-none-any.whl → 2.13.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ads/aqua/cli.py +7 -5
- ads/aqua/common/entities.py +88 -29
- ads/aqua/common/enums.py +6 -0
- ads/aqua/common/errors.py +5 -0
- ads/aqua/common/utils.py +49 -7
- ads/aqua/constants.py +3 -0
- ads/aqua/extension/deployment_handler.py +36 -0
- ads/aqua/modeldeployment/constants.py +1 -0
- ads/aqua/modeldeployment/deployment.py +83 -12
- ads/aqua/modeldeployment/entities.py +3 -0
- ads/aqua/resources/gpu_shapes_index.json +315 -26
- ads/aqua/shaperecommend/__init__.py +6 -0
- ads/aqua/shaperecommend/constants.py +116 -0
- ads/aqua/shaperecommend/estimator.py +384 -0
- ads/aqua/shaperecommend/llm_config.py +283 -0
- ads/aqua/shaperecommend/recommend.py +493 -0
- ads/aqua/shaperecommend/shape_report.py +233 -0
- ads/aqua/version.json +1 -1
- ads/cli.py +9 -1
- ads/jobs/builders/infrastructure/dsc_job.py +1 -0
- ads/jobs/builders/infrastructure/dsc_job_runtime.py +9 -1
- ads/model/service/oci_datascience_model_deployment.py +46 -19
- ads/opctl/operator/lowcode/common/data.py +7 -2
- ads/opctl/operator/lowcode/common/transformations.py +207 -0
- ads/opctl/operator/lowcode/common/utils.py +8 -0
- ads/opctl/operator/lowcode/forecast/__init__.py +3 -0
- ads/opctl/operator/lowcode/forecast/__main__.py +53 -3
- ads/opctl/operator/lowcode/forecast/const.py +2 -0
- ads/opctl/operator/lowcode/forecast/errors.py +5 -0
- ads/opctl/operator/lowcode/forecast/meta_selector.py +310 -0
- ads/opctl/operator/lowcode/forecast/model/automlx.py +1 -1
- ads/opctl/operator/lowcode/forecast/model/base_model.py +119 -30
- ads/opctl/operator/lowcode/forecast/model/factory.py +33 -2
- ads/opctl/operator/lowcode/forecast/model/forecast_datasets.py +54 -17
- ads/opctl/operator/lowcode/forecast/model_evaluator.py +6 -1
- ads/opctl/operator/lowcode/forecast/schema.yaml +1 -0
- ads/pipeline/ads_pipeline.py +13 -9
- {oracle_ads-2.13.17.dist-info → oracle_ads-2.13.18.dist-info}/METADATA +1 -1
- {oracle_ads-2.13.17.dist-info → oracle_ads-2.13.18.dist-info}/RECORD +42 -35
- {oracle_ads-2.13.17.dist-info → oracle_ads-2.13.18.dist-info}/WHEEL +0 -0
- {oracle_ads-2.13.17.dist-info → oracle_ads-2.13.18.dist-info}/entry_points.txt +0 -0
- {oracle_ads-2.13.17.dist-info → oracle_ads-2.13.18.dist-info}/licenses/LICENSE.txt +0 -0
ads/aqua/shaperecommend/shape_report.py NEW
@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+# Copyright (c) 2025 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+from typing import List, Optional
+
+from pydantic import BaseModel, Field
+
+from ads.aqua.common.entities import ComputeShapeSummary
+from ads.aqua.shaperecommend.constants import QUANT_MAPPING
+from ads.aqua.shaperecommend.estimator import MemoryEstimator
+from ads.config import COMPARTMENT_OCID
+
+
+class RequestRecommend(BaseModel):
+    """
+    A request to recommend compute shapes and parameters for a given model.
+    """
+
+    model_id: str = Field(
+        ..., description="The OCID of the model to recommend feasible compute shapes."
+    )
+    generate_table: Optional[bool] = (
+        Field(
+            True,
+            description="True - to generate the rich diff Table, False - generate the JSON response",
+        ),
+    )
+    compartment_id: Optional[str] = Field(
+        COMPARTMENT_OCID, description="The OCID of user's compartment"
+    )
+
+    class Config:
+        protected_namespaces = ()
+
+
+class DeploymentParams(BaseModel):  # noqa: N801
+    """
+    Recommended parameters for deployment and model inferencing (specific to compute shape & model).
+    """
+
+    quantization: Optional[str] = Field(
+        None, description="Type of quantization (e.g. 4bit)."
+    )
+    max_model_len: int = Field(..., description="Maximum length of input sequence.")
+    params: str = Field(
+        ..., description="Runtime parameters for deployment with vLLM, etc."
+    )
+
+
+class ModelDetail(BaseModel):
+    """
+    The estimated memory footprint of a model, KV cache, and its total (model + KV cache).
+    """
+
+    model_size_gb: float = Field(..., description="Size of the model in GB.")
+    kv_cache_size_gb: float = Field(..., description="Size of KV cache in GB.")
+    total_model_gb: float = Field(
+        ..., description="Total size of model and cache in GB."
+    )
+
+    class Config:
+        protected_namespaces = ()
+
+
+class ModelConfig(BaseModel):
+    """
+    The configuration for a model based on specific set of deployment parameters and memory capacity of shape.
+    """
+
+    model_details: ModelDetail = Field(..., description="Details about the model.")
+    deployment_params: DeploymentParams = Field(
+        ..., description="Parameters for deployment."
+    )
+    recommendation: str = Field(..., description="GPU recommendation for the model.")
+
+    class Config:
+        protected_namespaces = ()
+
+    @classmethod
+    def constuct_model_config(
+        cls, estimator: MemoryEstimator, allowed_gpu_memory: float
+    ) -> "ModelConfig":
+        """
+        Assembles a complete ModelConfig, including model details, deployment parameters (vLLM), and recommendations.
+
+        Parameters
+        ----------
+        shape_quantization : set[str]
+            Allowed quantization methods for the compute shape
+
+        Returns
+        -------
+        ModelConfig
+            Contains round-tripped model size, kv cache, total, vLLM parameters, and recommendations.
+
+        Notes
+        -----
+        - Rounds all sizes to 3 decimal digits.
+        - Computes a recommendation string using `limiting_factor`.
+        """
+        c = estimator.llm_config
+        deployment_params = DeploymentParams(
+            quantization=c.quantization or c.in_flight_quantization or c.weight_dtype,
+            max_model_len=getattr(estimator, "seq_len", None),
+            params=estimator.construct_deployment_params(),
+        )
+        model_detail = ModelDetail(
+            model_size_gb=round(getattr(estimator, "model_memory", 0.0), 2),
+            kv_cache_size_gb=round(getattr(estimator, "kv_cache_memory", 0.0), 2),
+            total_model_gb=round(getattr(estimator, "total_memory", 0.0), 2),
+        )
+        return ModelConfig(
+            model_details=model_detail,
+            deployment_params=deployment_params,
+            recommendation=estimator.limiting_factor(allowed_gpu_memory),
+        )
+
+
+class ShapeReport(BaseModel):
+    """
+    The feasible deployment configurations for the model per shape.
+    """
+
+    shape_details: "ComputeShapeSummary" = Field(
+        ..., description="Details about the compute shape (ex. VM.GPU.A10.2)."
+    )
+    configurations: List["ModelConfig"] = Field(
+        default_factory=list, description="List of model configurations."
+    )
+
+    def is_dominated(self, others: List["ShapeReport"]) -> bool:
+        """
+        Determines whether this shape is dominated by any other shape in a Pareto sense.
+
+        Parameters
+        ----------
+        others : list of ShapeReport
+            List of other shape/deployment configurations to compare against.
+
+        Returns
+        -------
+        bool
+            True if this shape is dominated by at least one other, False otherwise.
+
+        Notes
+        -----
+        A shape is dominated if there exists another configuration that is
+        at least as good in all criteria and strictly better in at least one.
+        Criteria:
+        - Cost (to be minimized)
+        - Performance, quantization level, max sequence length (to be maximized)
+        """
+        try:
+            cand_cost = self.shape_details.gpu_specs.ranking.cost
+            cand_perf = self.shape_details.gpu_specs.ranking.performance
+            cand_quant = QUANT_MAPPING.get(
+                self.configurations[0].deployment_params.quantization, 0
+            )
+            cand_maxlen = self.configurations[0].deployment_params.max_model_len
+
+            for other in others:
+                other_cost = other.shape_details.gpu_specs.ranking.cost
+                other_perf = other.shape_details.gpu_specs.ranking.performance
+                other_quant = QUANT_MAPPING.get(
+                    other.configurations[0].deployment_params.quantization, 0
+                )
+                other_maxlen = other.configurations[0].deployment_params.max_model_len
+                if (
+                    other_cost <= cand_cost
+                    and other_perf >= cand_perf
+                    and other_quant >= cand_quant
+                    and other_maxlen >= cand_maxlen
+                    and (
+                        other_cost < cand_cost
+                        or other_perf > cand_perf
+                        or other_quant > cand_quant
+                        or other_maxlen > cand_maxlen
+                    )
+                ):
+                    return True
+            return False
+        except AttributeError:
+            return False
+
+    @classmethod
+    def pareto_front(cls, shapes: List["ShapeReport"]) -> List["ShapeReport"]:
+        """
+        Filters a list of shapes/configurations to those on the Pareto frontier.
+
+        Parameters
+        ----------
+        shapes : list of ShapeReport
+            List of candidate shape/configuration reports to evaluate.
+
+        Returns
+        -------
+        list of ShapeReport
+            Subset of input shapes that are not dominated by any other (the Pareto front).
+
+        Notes
+        -----
+        The returned set contains non-dominated deployments for maximizing
+        performance, quantization, and model length, while minimizing cost.
+        """
+        return [
+            shape
+            for shape in shapes
+            if not shape.is_dominated([s for s in shapes if s != shape])
+        ]
+
+
+class ShapeRecommendationReport(BaseModel):
+    """
+    Full report of shape fit recommendations and troubleshooting, if applicable.
+
+    Attributes:
+        recommendations (List[DeploymentShapeSummary]): Recommended deployment shapes
+            for each tested batch size and max sequence length combination.
+        troubleshoot (Optional[TroubleshootShapeSummary]): Troubleshooting information
+            if no valid deployment shapes are available.
+    """
+
+    display_name: Optional[str] = Field(
+        "", description="Name of the model used for recommendations."
+    )
+    recommendations: List[ShapeReport] = Field(
+        default_factory=list, description="List of shape fit recommendations."
+    )
+    troubleshoot: Optional[str] = Field(
+        None,
+        description="Details for troubleshooting if no shapes fit the current model.",
+    )
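The `is_dominated` / `pareto_front` pair above is standard Pareto filtering over four criteria. A minimal, self-contained sketch of the same rule, using hypothetical (cost, performance, quantization, max_model_len) tuples rather than the ads objects:

# Pareto-dominance sketch: cost is minimized; performance, quantization
# level, and max sequence length are maximized. Illustrative only.
from typing import List, Tuple

Candidate = Tuple[float, float, int, int]  # (cost, perf, quant, max_len)

def dominates(a: Candidate, b: Candidate) -> bool:
    # `a` dominates `b` if it is at least as good everywhere
    # and strictly better in at least one criterion.
    at_least_as_good = (
        a[0] <= b[0] and a[1] >= b[1] and a[2] >= b[2] and a[3] >= b[3]
    )
    strictly_better = a[0] < b[0] or a[1] > b[1] or a[2] > b[2] or a[3] > b[3]
    return at_least_as_good and strictly_better

def pareto_front(cands: List[Candidate]) -> List[Candidate]:
    return [c for c in cands if not any(dominates(o, c) for o in cands if o != c)]

# The middle candidate costs more with no upside, so it is filtered out.
print(pareto_front([(2.0, 5.0, 8, 4096), (3.0, 5.0, 8, 4096), (1.0, 3.0, 4, 2048)]))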
ads/aqua/version.json CHANGED
ads/cli.py CHANGED
@@ -7,6 +7,8 @@ import logging
 import sys
 import traceback
 import uuid
+from rich.console import Console
+from rich.table import Table
 
 import fire
 from pydantic import BaseModel
@@ -92,6 +94,12 @@ def serialize(data):
         print(str(item))
     elif isinstance(data, BaseModel):
         print(json.dumps(data.dict(), indent=4))
+    elif isinstance(data, Table):
+        console = Console()
+        console.print(data)
+        return
+    elif data is None:
+        return
     else:
         print(str(data))
 
@@ -131,7 +139,7 @@ def exit_program(ex: Exception, logger: "logging.Logger") -> None:
 
     request_id = str(uuid.uuid4())
     logger.debug(f"Error Request ID: {request_id}\nError: {traceback.format_exc()}")
-    logger.error(f"Error Request ID: {request_id}\
+    logger.error(f"Error Request ID: {request_id}\nError: {str(ex)}")
 
     exit_code = getattr(ex, "exit_code", 1)
     logger.error(f"Exit code: {exit_code}")
ads/jobs/builders/infrastructure/dsc_job_runtime.py CHANGED
@@ -365,6 +365,11 @@ class RuntimeHandler:
             dsc_job,
             "job_node_configuration_details.job_node_group_configuration_details_list",
         )
+        if node_groups is None:
+            node_groups = get_value(
+                dsc_job,
+                "job_node_configuration_details.jobNodeGroupConfigurationDetailsList",
+            )
         if node_groups and len(node_groups) == 1:
             return node_groups[0]
         return None
@@ -373,6 +378,7 @@ class RuntimeHandler:
         node_group = self._get_node_group(dsc_job)
         if node_group:
             replica = get_value(node_group, "replicas")
+            envs.pop(self.CONST_NODE_COUNT, None)
         elif not envs:
             replica = None
         elif self.CONST_WORKER_COUNT in envs:
@@ -399,7 +405,9 @@ class RuntimeHandler:
         env_attr = "job_configuration_details.environment_variables"
         node_group = self._get_node_group(dsc_job)
         if node_group:
-            envs = get_value(node_group, env_attr)
+            envs = get_value(node_group, env_attr) or get_value(
+                node_group, "jobConfigurationDetails.environment_variables"
+            )
         else:
             envs = get_value(dsc_job, env_attr)
         if envs:
ads/model/service/oci_datascience_model_deployment.py CHANGED
@@ -1,23 +1,24 @@
 #!/usr/bin/env python
-# -*- coding: utf-8; -*-
 
-# Copyright (c) 2024 Oracle and/or its affiliates.
+# Copyright (c) 2024, 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
-from functools import wraps
 import logging
-from
-from
-from ads.common.work_request import DataScienceWorkRequest
-from ads.config import PROJECT_OCID
-from ads.model.deployment.common.utils import OCIClientManager, State
-import oci
+from functools import wraps
+from typing import Callable, List, Optional
 
+import oci
 from oci.data_science.models import (
     CreateModelDeploymentDetails,
+    ModelDeploymentShapeSummary,
     UpdateModelDeploymentDetails,
 )
 
+from ads.common.oci_datascience import OCIDataScienceMixin
+from ads.common.work_request import DataScienceWorkRequest
+from ads.config import COMPARTMENT_OCID, PROJECT_OCID
+from ads.model.deployment.common.utils import OCIClientManager, State
+
 DEFAULT_WAIT_TIME = 1200
 DEFAULT_POLL_INTERVAL = 10
 ALLOWED_STATUS = [
@@ -185,14 +186,13 @@ class OCIDataScienceModelDeployment(
             self.id,
         )
 
-
         self.workflow_req_id = response.headers.get("opc-work-request-id", None)
         if wait_for_completion:
             try:
                 DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
                     progress_bar_description="Activating model deployment",
-                    max_wait_time=max_wait_time,
-                    poll_interval=poll_interval
+                    max_wait_time=max_wait_time,
+                    poll_interval=poll_interval,
                 )
             except Exception as e:
                 logger.error(
@@ -239,8 +239,8 @@ class OCIDataScienceModelDeployment(
             try:
                 DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
                     progress_bar_description="Creating model deployment",
-                    max_wait_time=max_wait_time,
-                    poll_interval=poll_interval
+                    max_wait_time=max_wait_time,
+                    poll_interval=poll_interval,
                 )
             except Exception as e:
                 logger.error("Error while trying to create model deployment: " + str(e))
@@ -290,8 +290,8 @@ class OCIDataScienceModelDeployment(
             try:
                 DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
                     progress_bar_description="Deactivating model deployment",
-                    max_wait_time=max_wait_time,
-                    poll_interval=poll_interval
+                    max_wait_time=max_wait_time,
+                    poll_interval=poll_interval,
                 )
             except Exception as e:
                 logger.error(
@@ -351,14 +351,14 @@ class OCIDataScienceModelDeployment(
         response = self.client.delete_model_deployment(
             self.id,
         )
-
+
         self.workflow_req_id = response.headers.get("opc-work-request-id", None)
         if wait_for_completion:
             try:
                 DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
                     progress_bar_description="Deleting model deployment",
-                    max_wait_time=max_wait_time,
-                    poll_interval=poll_interval
+                    max_wait_time=max_wait_time,
+                    poll_interval=poll_interval,
                 )
             except Exception as e:
                 logger.error("Error while trying to delete model deployment: " + str(e))
@@ -493,3 +493,30 @@ class OCIDataScienceModelDeployment(
             An instance of `OCIDataScienceModelDeployment`.
         """
         return super().from_ocid(model_deployment_id)
+
+    @classmethod
+    def shapes(
+        cls,
+        compartment_id: Optional[str] = None,
+        **kwargs,
+    ) -> List[ModelDeploymentShapeSummary]:
+        """
+        Retrieves all available model deployment shapes in the given compartment.
+
+        This method uses OCI's pagination utility to fetch all pages of model
+        deployment shape summaries available in the specified compartment.
+
+        Args:
+            compartment_id (Optional[str]): The OCID of the compartment. If not provided,
+                the default COMPARTMENT_ID extracted from env variables is used.
+            **kwargs: Additional keyword arguments to pass to the list_model_deployments call.
+
+        Returns:
+            List[ModelDeploymentShapeSummary]: A list of all model deployment shape summaries.
+        """
+        client = cls().client
+        compartment_id = compartment_id or COMPARTMENT_OCID
+
+        return oci.pagination.list_call_get_all_results(
+            client.list_model_deployment_shapes, compartment_id, **kwargs
+        ).data
ads/opctl/operator/lowcode/common/data.py CHANGED
@@ -19,16 +19,21 @@ from .transformations import Transformations
 
 
 class AbstractData(ABC):
-    def __init__(self, spec, name="input_data", data=None):
+    def __init__(self, spec, name="input_data", data=None, subset=None):
         self.Transformations = Transformations
         self.data = None
         self._data_dict = dict()
         self.name = name
         self.spec = spec
+        self.subset = subset
         if data is not None:
             self.data = data
         else:
             self.load_transform_ingest_data(spec)
+        # Subset by series if requested
+        # if self.subset is not None and hasattr(self, 'data') and self.data is not None:
+        #     subset_str = [str(s) for s in self.subset]
+        #     self.data = self.data[self.data.index.get_level_values(DataColumns.Series).isin(subset_str)]
 
     def get_raw_data_by_cat(self, category):
         mapping = self._data_transformer.get_target_category_columns_map()
@@ -72,7 +77,7 @@ class AbstractData(ABC):
     def _load_data(self, data_spec, **kwargs):
         loading_start_time = time.time()
         try:
-            raw_data = load_data(data_spec)
+            raw_data = load_data(data_spec, subset=self.subset if self.subset else None, target_category_columns=self.spec.target_category_columns)
         except InvalidParameterError as e:
             e.args = e.args + (f"Invalid Parameter: {self.name}",)
             raise e
ads/opctl/operator/lowcode/common/transformations.py CHANGED
@@ -294,3 +294,210 @@ class Transformations(ABC):
     def _fill_na(self, df: pd.DataFrame, na_value=0) -> pd.DataFrame:
         """Fill nans in dataframe"""
         return df.fillna(value=na_value)
+
+    def build_fforms_meta_features(self, data, target_col=None, group_cols=None):
+        """
+        Build meta-features for time series based on FFORMS paper and add them to the original DataFrame.
+
+        Parameters
+        ----------
+        data : pandas.DataFrame
+            Input DataFrame containing time series data
+        target_col : str, optional
+            Name of the target column to calculate meta-features for.
+            If None, uses the target column specified in dataset_info.
+        group_cols : list of str, optional
+            List of columns to group by before calculating meta-features.
+            If None, calculates features for the entire series.
+
+        Returns
+        -------
+        pandas.DataFrame
+            Original DataFrame with additional meta-feature columns
+
+        References
+        ----------
+        Talagala, T. S., Hyndman, R. J., & Athanasopoulos, G. (2023).
+        Meta-learning how to forecast time series. Journal of Forecasting, 42(6), 1476-1501.
+        """
+        if not isinstance(data, pd.DataFrame):
+            raise ValueError("Input must be a pandas DataFrame")
+
+        # Use target column from dataset_info if not specified
+        if target_col is None:
+            target_col = self.target_column_name
+        if target_col not in data.columns:
+            raise ValueError(f"Target column '{target_col}' not found in DataFrame")
+
+        # Check if group_cols are provided and valid
+        if group_cols is not None:
+            if not isinstance(group_cols, list):
+                raise ValueError("group_cols must be a list of column names")
+            for col in group_cols:
+                if col not in data.columns:
+                    raise ValueError(f"Group column '{col}' not found in DataFrame")
+
+        # If no group_cols, get the target_category_columns else treat the entire DataFrame as a single series
+        if not group_cols:
+            group_cols = self.target_category_columns if self.target_category_columns else []
+
+        # Calculate meta-features for each series
+        def calculate_series_features(series):
+            """Calculate features for a single series"""
+            n = len(series)
+            values = series.values
+
+            # Basic statistics
+            mean = series.mean()
+            std = series.std()
+            variance = series.var()
+            skewness = series.skew()
+            kurtosis = series.kurtosis()
+            cv = std / mean if mean != 0 else np.inf
+
+            # Trend features
+            X = np.vstack([np.arange(n), np.ones(n)]).T
+            trend_coef = np.linalg.lstsq(X, values, rcond=None)[0][0]
+            trend_pred = X.dot(np.linalg.lstsq(X, values, rcond=None)[0])
+            residuals = values - trend_pred
+            std_residuals = np.std(residuals)
+
+            # Turning points
+            turning_points = 0
+            for i in range(1, n-1):
+                if (values[i-1] < values[i] and values[i] > values[i+1]) or \
+                   (values[i-1] > values[i] and values[i] < values[i+1]):
+                    turning_points += 1
+            turning_points_rate = turning_points / (n-2) if n > 2 else 0
+
+            # Serial correlation
+            acf1 = series.autocorr(lag=1) if n > 1 else 0
+            acf2 = series.autocorr(lag=2) if n > 2 else 0
+            acf10 = series.autocorr(lag=10) if n > 10 else 0
+
+            # Seasonality features
+            seasonal_strength = 0
+            seasonal_peak_strength = 0
+            if n >= 12:
+                seasonal_lags = [12, 24, 36]
+                seasonal_acfs = []
+                for lag in seasonal_lags:
+                    if n > lag:
+                        acf_val = series.autocorr(lag=lag)
+                        seasonal_acfs.append(abs(acf_val))
+                seasonal_peak_strength = max(seasonal_acfs) if seasonal_acfs else 0
+
+                ma = series.rolling(window=12, center=True).mean()
+                seasonal_comp = series - ma
+                seasonal_strength = 1 - np.var(seasonal_comp.dropna()) / np.var(series)
+
+            # Stability and volatility features
+            values_above_mean = values >= mean
+            crossing_points = np.sum(values_above_mean[1:] != values_above_mean[:-1])
+            crossing_rate = crossing_points / (n - 1) if n > 1 else 0
+
+            # First and second differences
+            diff1 = np.diff(values)
+            diff2 = np.diff(diff1) if len(diff1) > 1 else np.array([])
+
+            diff1_mean = np.mean(np.abs(diff1)) if len(diff1) > 0 else 0
+            diff1_var = np.var(diff1) if len(diff1) > 0 else 0
+            diff2_mean = np.mean(np.abs(diff2)) if len(diff2) > 0 else 0
+            diff2_var = np.var(diff2) if len(diff2) > 0 else 0
+
+            # Nonlinearity features
+            if n > 3:
+                X = values[:-1].reshape(-1, 1)
+                y = values[1:]
+                X2 = X * X
+                X3 = X * X * X
+                X_aug = np.hstack([X, X2, X3])
+                nonlinearity = np.linalg.lstsq(X_aug, y, rcond=None)[1][0] if len(y) > 0 else 0
+            else:
+                nonlinearity = 0
+
+            # Long-term trend features
+            if n >= 10:
+                mid = n // 2
+                trend_change = np.mean(values[mid:]) - np.mean(values[:mid])
+            else:
+                trend_change = 0
+
+            # Step changes and spikes
+            step_changes = np.abs(diff1).max() if len(diff1) > 0 else 0
+            spikes = np.sum(np.abs(values - mean) > 2 * std) / n if std != 0 else 0
+
+            # Hurst exponent and entropy
+            lag = min(10, n // 2)
+            variance_ratio = np.var(series.diff(lag)) / (lag * np.var(series.diff())) if n > lag else 0
+            hurst = np.log(variance_ratio) / (2 * np.log(lag)) if variance_ratio > 0 and lag > 1 else 0
+
+            hist, _ = np.histogram(series, bins='auto', density=True)
+            entropy = -np.sum(hist[hist > 0] * np.log(hist[hist > 0]))
+
+            return pd.Series({
+                'ts_n_obs': n,
+                'ts_mean': mean,
+                'ts_std': std,
+                'ts_variance': variance,
+                'ts_cv': cv,
+                'ts_skewness': skewness,
+                'ts_kurtosis': kurtosis,
+                'ts_trend': trend_coef,
+                'ts_trend_change': trend_change,
+                'ts_std_residuals': std_residuals,
+                'ts_turning_points_rate': turning_points_rate,
+                'ts_seasonal_strength': seasonal_strength,
+                'ts_seasonal_peak_strength': seasonal_peak_strength,
+                'ts_acf1': acf1,
+                'ts_acf2': acf2,
+                'ts_acf10': acf10,
+                'ts_crossing_rate': crossing_rate,
+                'ts_diff1_mean': diff1_mean,
+                'ts_diff1_variance': diff1_var,
+                'ts_diff2_mean': diff2_mean,
+                'ts_diff2_variance': diff2_var,
+                'ts_nonlinearity': nonlinearity,
+                'ts_step_max': step_changes,
+                'ts_spikes_rate': spikes,
+                'ts_hurst': hurst,
+                'ts_entropy': entropy
+            })
+
+        # Create copy of input DataFrame
+        result_df = data.copy()
+
+        if group_cols:
+            # Calculate features for each group
+            features = []
+            # Sort by date within each group if date column exists
+            date_col = self.dt_column_name if self.dt_column_name else 'Date'
+            if date_col in data.columns:
+                data = data.sort_values([date_col] + group_cols)
+
+            for name, group in data.groupby(group_cols):
+                # Sort group by date if exists
+                if date_col in group.columns:
+                    group = group.sort_values(date_col)
+                group_features = calculate_series_features(group[target_col])
+                if isinstance(name, tuple):
+                    feature_row = dict(zip(group_cols, name))
+                else:
+                    feature_row = {group_cols[0]: name}
+                feature_row.update(group_features)
+                features.append(feature_row)
+
+            # Create features DataFrame without merging
+            features_df = pd.DataFrame(features)
+            # Return only the meta-features DataFrame with group columns
+            return features_df
+        else:
+            # Sort by date if exists and calculate features for entire series
+            date_col = self.dt_column_name if self.dt_column_name else 'Date'
+            if date_col in data.columns:
+                data = data.sort_values(date_col)
+            features = calculate_series_features(data[target_col])
+            # Return single row DataFrame with meta-features
+            return pd.DataFrame([features])
+
+        return result_df
ads/opctl/operator/lowcode/common/utils.py CHANGED
@@ -124,6 +124,14 @@ def load_data(data_spec, storage_options=None, **kwargs):
         data = data[columns]
     if limit:
         data = data[:limit]
+    # Filtering by subset if provided
+    subset = kwargs.get('subset', None)
+    if subset is not None:
+        target_category_columns = kwargs.get('target_category_columns', None)
+        mask = False
+        for col in target_category_columns:
+            mask = mask | data[col].isin(subset)
+        data = data[mask]
     return data
 
 
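The subset filter keeps any row whose target-category value appears in `subset`, OR-ing one boolean mask per category column. The same logic on a toy frame (column names hypothetical):

import pandas as pd

data = pd.DataFrame({"Store": ["A", "A", "B", "C"], "Sales": [10, 12, 7, 5]})
subset, target_category_columns = ["A", "C"], ["Store"]

mask = False
for col in target_category_columns:  # same OR-mask as the load_data change
    mask = mask | data[col].isin(subset)
print(data[mask])  # rows for stores A and C only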