experiment-configuration-agent 0.1.6__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- experiment_config_agent/config.py +2 -2
- experiment_config_agent/constants.py +119 -2
- experiment_config_agent/models.py +10 -6
- {experiment_configuration_agent-0.1.6.dist-info → experiment_configuration_agent-0.1.9.dist-info}/METADATA +1 -1
- experiment_configuration_agent-0.1.9.dist-info/RECORD +9 -0
- {experiment_configuration_agent-0.1.6.dist-info → experiment_configuration_agent-0.1.9.dist-info}/WHEEL +1 -1
- experiment_configuration_agent-0.1.6.dist-info/RECORD +0 -9
- {experiment_configuration_agent-0.1.6.dist-info → experiment_configuration_agent-0.1.9.dist-info}/top_level.txt +0 -0
experiment_config_agent/config.py
@@ -19,8 +19,8 @@ class GluonConfig(BaseSettings):
 
 
     # This will now read from LM_PROVIDER environment variable
-    provider: str = Field(default=os.getenv('
-    model: str = Field(default=os.getenv('LLM_MODEL_NAME', 'gpt-
+    provider: str = Field(default=os.getenv('LLM_SERVICE_PROVIDER', 'openai'), description="AI provider to use")
+    model: str = Field(default=os.getenv('LLM_MODEL_NAME', 'gpt-4.1-mini'), description="AI model to use")
     temperature: float = Field(default=0.3, ge=0.0, le=0.5, description="AI model temperature")
     max_tokens: int = Field(default=4000, ge=0, le=8000, description="Maximum tokens for AI response")
 
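The config.py change swaps the provider and model defaults to environment-driven values. Below is a minimal sketch of that pattern, assuming pydantic v2 with the separate pydantic-settings package; the class name, field names, and environment variables come from the diff, while the import location and the demo under __main__ are assumptions, not package code.

```python
# Minimal sketch of the pattern shown above, assuming pydantic v2 plus the
# pydantic-settings package (the wheel may pin different versions).
import os

from pydantic import Field
from pydantic_settings import BaseSettings


class GluonConfig(BaseSettings):
    """LLM settings; defaults are resolved from the environment."""

    # os.getenv runs when the class is defined, so these environment
    # variables must be set before this module is imported.
    provider: str = Field(default=os.getenv('LLM_SERVICE_PROVIDER', 'openai'),
                          description="AI provider to use")
    model: str = Field(default=os.getenv('LLM_MODEL_NAME', 'gpt-4.1-mini'),
                       description="AI model to use")
    temperature: float = Field(default=0.3, ge=0.0, le=0.5,
                               description="AI model temperature")
    max_tokens: int = Field(default=4000, ge=0, le=8000,
                            description="Maximum tokens for AI response")


if __name__ == "__main__":
    # With no environment overrides this prints provider='openai'
    # and model='gpt-4.1-mini'.
    print(GluonConfig())
```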
experiment_config_agent/constants.py
@@ -27,6 +27,121 @@ CORE MODEL CONCEPTS:
 - If bagging is enabled (num_bag_folds > 0), 'split_test_size' is ignored as CV is used.
 - If bagging is 0, 'split_test_size' (e.g., 0.1 to 0.2) is mandatory to monitor overfitting.
 
+
+5. TRAINING TIME LIMIT CALCULATION — MANDATORY EXECUTION
+
+You MUST calculate time_limit exactly using the following steps:
+
+STEP 1: Calculate BASE TIME from row count
+------------------------------------------
+Read rows from dataset_insights["rows"]
+
+IF 0 < rows ≤ 500000:
+    base_time = 60 + (rows / 500000) * 540    → Range: 60-600 seconds
+
+ELSE IF 500000 < rows ≤ 7000000:
+    base_time = 600 + (rows / 7000000) * 1200    → Range: 600-1800 seconds
+
+ELSE (rows > 7000000):
+    base_time = 1800 + (rows / 7000000) * 1800    → Range: 1800-3600 seconds
+
+STEP 2: Apply COLUMN COUNT multiplier
+--------------------------------------
+Count columns from dataset_insights["feature_columns"]
+
+IF 0 < columns ≤ 50:
+    column_multiplier = 1.0
+ELSE IF 50 < columns ≤ 100:
+    column_multiplier = 1.2
+ELSE IF 100 < columns ≤ 150:
+    column_multiplier = 1.4
+ELSE IF 150 < columns ≤ 200:
+    column_multiplier = 1.6
+ELSE (columns > 200):
+    column_multiplier = 1.8
+
+STEP 3: Apply MODEL COUNT multiplier
+------------------------------------
+Count the number of models selected in the 'models' list:
+
+IF num_models = 1:
+    model_multiplier = 0.6
+ELSE IF num_models = 2:
+    model_multiplier = 0.8
+ELSE IF num_models = 3:
+    model_multiplier = 1.0
+ELSE IF num_models = 4:
+    model_multiplier = 1.2
+ELSE IF num_models = 5:
+    model_multiplier = 1.4
+ELSE (num_models >= 6):
+    model_multiplier = 1.5
+
+STEP 4: Apply ENSEMBLE STRATEGY multiplier
+------------------------------------------
+Based on num_bag_folds, num_bag_sets, and num_stack_levels:
+
+A. Bagging folds multiplier (from num_bag_folds):
+    IF num_bag_folds = 0:
+        bag_folds_multiplier = 1.0
+    ELSE IF num_bag_folds <= 5:
+        bag_folds_multiplier = 1.1
+    ELSE IF num_bag_folds <= 8:
+        bag_folds_multiplier = 1.3
+    ELSE (num_bag_folds > 8):
+        bag_folds_multiplier = 1.5
+
+B. Bagging sets multiplier (from num_bag_sets):
+    IF num_bag_sets = 1:
+        bag_sets_multiplier = 1.0
+    ELSE IF num_bag_sets = 2:
+        bag_sets_multiplier = 1.2
+    ELSE (num_bag_sets >= 3):
+        bag_sets_multiplier = 1.4
+
+C. Stacking multiplier (from num_stack_levels):
+    IF num_stack_levels = 0:
+        stack_multiplier = 1.0
+    ELSE IF num_stack_levels = 1:
+        stack_multiplier = 1.5
+    ELSE (num_stack_levels >= 2):
+        stack_multiplier = 2.0
+
+ensemble_multiplier = bag_folds_multiplier * bag_sets_multiplier * stack_multiplier
+
+STEP 5: Calculate FINAL time_limit
+----------------------------------
+time_limit = base_time * column_multiplier * model_multiplier * ensemble_multiplier
+
+STEP 6: Apply bounds and round
+------------------------------
+MIN_TIME = 60
+MAX_TIME = 3600
+
+IF time_limit < MIN_TIME → time_limit = MIN_TIME
+IF time_limit > MAX_TIME → time_limit = MAX_TIME
+
+Round UP to nearest multiple of 5.
+
+EXAMPLE CALCULATION:
+-------------------
+Given: rows=250000, columns=45, models=['GBM','CAT','XGB'], num_bag_folds=5, num_bag_sets=2, num_stack_levels=1
+
+base_time = 60 + (250000/500000) * 540 = 330 seconds
+column_multiplier = 1.2 (60 columns)
+model_multiplier = 1.0 (3 models)
+bag_folds_multiplier = 1.3 (5 folds)
+bag_sets_multiplier = 1.5 (2 sets)
+stack_multiplier = 1.5 (1 level)
+ensemble_multiplier = 1.3 * 1.5 * 1.5 = 2.925
+
+time_limit = 330 * 1.2 * 1.0 * 2.925 = 1158.3 seconds
+Rounded up to: 1160 seconds
+
+- You are NOT allowed to choose a different value than time_limit.
+- You are NOT allowed to approximate, proceed with the calculated value.
+- You MUST output the computed value exactly.
+
 PRESET SELECTION LOGIC (Ordered by Quality/Complexity):
 ======================================================
 
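The added prompt section is a fully deterministic formula, so it can be reproduced in ordinary code as a reference check on the agent's output. The sketch below implements steps 1-6 as the tables state them; the function name and signature are invented for this example. Note that the prompt's own worked example plugs in multipliers (1.2 for columns, 1.3 for folds, 1.5 for sets) that differ from its tables for the stated inputs, so following the tables literally yields 655 rather than 1160 for those values.

```python
# Illustrative reference implementation of the six steps above; the function
# name and signature are my own, not part of the package.
import math


def compute_time_limit(rows: int, columns: int, num_models: int,
                       num_bag_folds: int, num_bag_sets: int,
                       num_stack_levels: int) -> int:
    # STEP 1: base time from row count
    if rows <= 500_000:
        base_time = 60 + (rows / 500_000) * 540
    elif rows <= 7_000_000:
        base_time = 600 + (rows / 7_000_000) * 1200
    else:
        base_time = 1800 + (rows / 7_000_000) * 1800

    # STEP 2: column-count multiplier
    if columns <= 50:
        column_multiplier = 1.0
    elif columns <= 100:
        column_multiplier = 1.2
    elif columns <= 150:
        column_multiplier = 1.4
    elif columns <= 200:
        column_multiplier = 1.6
    else:
        column_multiplier = 1.8

    # STEP 3: model-count multiplier (1 -> 0.6, 2 -> 0.8, ..., >= 6 -> 1.5)
    model_multiplier = min(0.6 + 0.2 * (num_models - 1), 1.5)

    # STEP 4: ensemble-strategy multiplier
    if num_bag_folds == 0:
        bag_folds_multiplier = 1.0
    elif num_bag_folds <= 5:
        bag_folds_multiplier = 1.1
    elif num_bag_folds <= 8:
        bag_folds_multiplier = 1.3
    else:
        bag_folds_multiplier = 1.5

    if num_bag_sets <= 1:
        bag_sets_multiplier = 1.0
    elif num_bag_sets == 2:
        bag_sets_multiplier = 1.2
    else:
        bag_sets_multiplier = 1.4

    if num_stack_levels == 0:
        stack_multiplier = 1.0
    elif num_stack_levels == 1:
        stack_multiplier = 1.5
    else:
        stack_multiplier = 2.0

    ensemble_multiplier = bag_folds_multiplier * bag_sets_multiplier * stack_multiplier

    # STEP 5: combine
    time_limit = base_time * column_multiplier * model_multiplier * ensemble_multiplier

    # STEP 6: clamp to [60, 3600], then round up to the nearest multiple of 5
    time_limit = max(60, min(3600, time_limit))
    return math.ceil(time_limit / 5) * 5


# Inputs from the prompt's worked example. Per the tables (column 1.0,
# folds 1.1, sets 1.2) this prints 655; the prompt's own walkthrough uses
# 1.2/1.3/1.5 and lands on 1160 instead.
print(compute_time_limit(rows=250_000, columns=45, num_models=3,
                         num_bag_folds=5, num_bag_sets=2, num_stack_levels=1))
```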
@@ -45,7 +160,9 @@ CONSTRAINTS:
 - You MUST only use the 6 allowed models ('GBM', 'CAT', 'XGB', 'RF', 'XT', 'KNN').
 - You MUST only use the 5 allowed presets ('extreme_quality', 'best_quality', 'high_quality', 'good_quality', 'medium_quality').
 - You MUST only use the 6 allowed metrics for both eval_metric and additional_metrics.
-
+- Do NOT choose mid-range values by default.
+- You MUST calculate time_limit using the formula in Section 5. The minimum is 60 seconds and maximum is 3600 seconds.
+- For each scenario, you MUST show the time_limit calculation steps clearly before stating the final value.
 Provide three distinct scenarios: Max Accuracy (Heavy), Production-Ready (Balanced), and Fast-Track (Speed)."""
 
 
@@ -201,7 +318,7 @@ Based on the above information, recommend an optimal AutoGluon configuration tha
 
 
 Consider multiple scenarios:
-- Scenario A: Maximum accuracy (accepting longer training time)
+- Scenario A: Maximum accuracy (accepting longer training time upto the relevant time limit for the given dataset rows)
 - Scenario B: Balanced accuracy and speed (production-ready)
 - Scenario C: Fast training and inference (prototyping/deployment constrained)
 
experiment_config_agent/models.py
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, Field
-from typing import List, Literal
+from typing import List, Literal, Optional
 
 class AutoGluonConfig(BaseModel):
     eval_metric: str = Field(
@@ -7,11 +7,15 @@ class AutoGluonConfig(BaseModel):
         description="Primary metric to optimize. Allowed: 'accuracy', 'log_loss', 'f1', 'roc_auc', 'precision', 'recall'."
     )
 
-    preset: Literal[
+    preset: Optional[Literal[
         'best_quality', 'high_quality', 'good_quality', 'medium_quality'
-    ] = Field(
-
-
+    ]] = Field(
+        default='good_quality',
+        description=(
+            "Optional preset configuration. "
+            "'best_quality' enables bagging/stacking for maximum accuracy. "
+            "If not provided, AutoGluon default behavior will be used."
+        )
     )
 
     additional_metrics: List[str] = Field(
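The preset field now accepts None and carries a 'good_quality' default. A standalone sketch of just that field, assuming pydantic v2; the stand-in class name is invented, and the shipped AutoGluonConfig contains the other fields visible in this diff.

```python
# Standalone sketch of the new preset field, trimmed to this one field.
from typing import Literal, Optional

from pydantic import BaseModel, Field, ValidationError


class PresetOnly(BaseModel):  # illustrative stand-in for AutoGluonConfig
    preset: Optional[Literal[
        'best_quality', 'high_quality', 'good_quality', 'medium_quality'
    ]] = Field(
        default='good_quality',
        description="Optional preset configuration."
    )


print(PresetOnly().preset)             # omitted -> 'good_quality'
print(PresetOnly(preset=None).preset)  # explicitly None is now accepted

try:
    PresetOnly(preset='extreme_quality')  # not in the Literal -> rejected
except ValidationError:
    print("invalid preset rejected")
```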
@@ -21,7 +25,7 @@ class AutoGluonConfig(BaseModel):
 
     time_limit: int = Field(
         ...,
-        description="Total training time in seconds. AutoGluon will distribute this across models. Small datasets:
+        description="Total training time in seconds. AutoGluon will distribute this across models. Small datasets (upto 500000 rows): Value between 60 seconds to 600 seconds, Medium (upto 7000000 rows): Value between 601 seconds to 1800 seconds, Large (7000000+ rows): Value between 1800 seconds to 3600 seconds."
     )
 
     num_bag_folds: int = Field(
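The expanded description documents the row-count bands, but within this hunk time_limit itself remains a plain required int. A small illustrative sketch, again assuming pydantic v2, showing one way the 60-3600 second window from the constraints could be enforced with ge/le bounds; these bounds are an assumption for illustration, not part of the package.

```python
# Hypothetical sketch: time_limit stays required (Field(...)); the ge/le
# bounds below are illustrative, not something this diff adds.
from pydantic import BaseModel, Field, ValidationError


class TimeLimitOnly(BaseModel):  # illustrative stand-in for AutoGluonConfig
    time_limit: int = Field(..., ge=60, le=3600,
                            description="Total training time in seconds.")


print(TimeLimitOnly(time_limit=655).time_limit)  # within the documented window

try:
    TimeLimitOnly(time_limit=10_000)             # above the 3600 s ceiling
except ValidationError:
    print("time_limit outside 60-3600 rejected")
```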
experiment_configuration_agent-0.1.9.dist-info/RECORD
@@ -0,0 +1,9 @@
+experiment_config_agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+experiment_config_agent/agent.py,sha256=bjFBjyR0_0hRUePfCDICitGfG7fVuZxBhPkwZZ0gsxM,3700
+experiment_config_agent/config.py,sha256=bzxTH9QQ5ds3SyvxELeDjZg8WKNe7DikOE5-H-CjjN8,1141
+experiment_config_agent/constants.py,sha256=9EYOuiDzaD4Pd-yxiC0pj1t67pxHf7EXZqQcqqhp2OQ,12271
+experiment_config_agent/models.py,sha256=eGftcqwXazqvArSifQNg2XlF1DW3Za8WZ0KCxDlHWRw,2572
+experiment_configuration_agent-0.1.9.dist-info/METADATA,sha256=o7AnkR64hwDtup9-Rk6G-V3Ot71fkyNZjVbBA2MrUw0,4453
+experiment_configuration_agent-0.1.9.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+experiment_configuration_agent-0.1.9.dist-info/top_level.txt,sha256=5c9CyVEjFUlvEf08vJIvi6BkzGuS4wdwtjdmCk2uL2U,24
+experiment_configuration_agent-0.1.9.dist-info/RECORD,,
experiment_configuration_agent-0.1.6.dist-info/RECORD
@@ -1,9 +0,0 @@
-experiment_config_agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-experiment_config_agent/agent.py,sha256=bjFBjyR0_0hRUePfCDICitGfG7fVuZxBhPkwZZ0gsxM,3700
-experiment_config_agent/config.py,sha256=yAS1XWdOklcpuHwq5F3u-j2zQmX-ErLowj9IqcgnqH4,1138
-experiment_config_agent/constants.py,sha256=O7fsJQXVmt8Zs-A3sYxTafyNdpQv2H4tbL3E--rJ7Ug,8167
-experiment_config_agent/models.py,sha256=u8bANPWUUE2hlH8S7ZA2N9bRKuN6vp7vGRpVRwXn-aE,2271
-experiment_configuration_agent-0.1.6.dist-info/METADATA,sha256=y30snhqZgHUF5XXxG8DCPR9-1uQ2XplYdu-h6TEfu-I,4453
-experiment_configuration_agent-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-experiment_configuration_agent-0.1.6.dist-info/top_level.txt,sha256=5c9CyVEjFUlvEf08vJIvi6BkzGuS4wdwtjdmCk2uL2U,24
-experiment_configuration_agent-0.1.6.dist-info/RECORD,,

{experiment_configuration_agent-0.1.6.dist-info → experiment_configuration_agent-0.1.9.dist-info}/top_level.txt
File without changes