ml_approach_suggestion_agent 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ml_approach_suggestion_agent might be problematic. See the package's registry page for more details.

@@ -17,7 +17,7 @@ class MLApproachDecisionAgent:
17
17
  self.config = config or MethodologyConfig()
18
18
  self.ai_handler = SFNAIHandler()
19
19
 
20
- def suggest_approach(self, domain_name, domain_description, use_case, column_descriptions, column_insights, target_column_name, target_column_insights, max_try=1) -> Tuple[MethodologyRecommendation, Dict[str, Any]]:
20
+ def suggest_approach(self, domain_name, domain_description, use_case, column_insights, max_try=1) -> Tuple[MethodologyRecommendation, Dict[str, Any]]:
21
21
  """
22
22
  Suggests a machine learning approach based on the provided domain, use case, and column descriptions.
23
23
  Args:
@@ -36,7 +36,7 @@ class MLApproachDecisionAgent:
36
36
 
37
37
 
38
38
  """
39
- system_prompt, user_prompt = format_approach_prompt(domain_name=domain_name, domain_description=domain_description, use_case=use_case, column_descriptions=column_descriptions, column_insights=column_insights, target_column_name=target_column_name, target_column_insights=target_column_insights)
39
+ system_prompt, user_prompt = format_approach_prompt(domain_name=domain_name, domain_description=domain_description, use_case=use_case, column_insights=column_insights)
40
40
  for _ in range(max_try):
41
41
  try:
42
42
  response, cost_summary = self.ai_handler.route_to(
@@ -66,14 +66,11 @@ class MLApproachDecisionAgent:
66
66
 
67
67
  def execute_task(self, task_data: Dict[str, Any]) -> Dict[str, Any]:
68
68
  self.logger.info("Executing data quality assessment task.")
69
- domain_name, domain_description, use_case, column_descriptions, column_insights, target_column_name, target_column_insights = (
69
+ domain_name, domain_description, use_case, column_insights = (
70
70
  task_data["domain_name"],
71
71
  task_data["domain_description"],
72
72
  task_data["use_case"],
73
- task_data["column_descriptions"],
74
73
  task_data["column_insights"],
75
- task_data["target_column_name"],
76
- task_data["target_column_insights"],
77
74
  )
78
75
 
79
76
  # Suggest an approach
@@ -81,10 +78,7 @@ class MLApproachDecisionAgent:
81
78
  domain_name=domain_name,
82
79
  domain_description=domain_description,
83
80
  use_case=use_case,
84
- column_descriptions=column_descriptions,
85
81
  column_insights=column_insights,
86
- target_column_name=target_column_name,
87
- target_column_insights=target_column_insights
88
82
  )
89
83
  if not result:
90
84
  return {
@@ -13,7 +13,7 @@ class MethodologyConfig(BaseSettings):
13
13
  )
14
14
 
15
15
  methodology_ai_provider: str = Field(default="openai", description="AI provider to use")
16
- methodology_ai_model: str = Field(default="gpt-4o-mini", description="AI model to use")
16
+ methodology_ai_model: str = Field(default="gpt-5-mini", description="AI model to use")
17
17
  methodology_temperature: float = Field(default=0.3, ge=0.0, le=0.5, description="AI model temperature")
18
18
  methodology_max_tokens: int = Field(default=4000, ge=100, le=8000, description="Maximum tokens for AI response")
19
19
 
@@ -1,102 +1,84 @@
1
- METHODOLOGY_SELECTION_SYSTEM_PROMPT = """
2
- You are an ML methodology advisor. Your task is to analyze the user's problem and recommend the single most appropriate approach.
1
+ METHODOLOGY_SELECTION_SYSTEM_PROMPT = """You are an ML methodology advisor. Analyze the problem and select ONE methodology: binary_classification, time_series_forecasting, or not_applicable.
3
2
 
4
- **Decision Framework:**
3
+ **Simple Decision Rules:**
5
4
 
6
- 1. **Understand the Business Goal:** Start with the `use_case_description` to grasp what the user is trying to achieve.
5
+ 1. **Binary Classification** - Choose when:
6
+ - Use case asks "predict whether", "will X happen", "classify if"
7
+ - Answer is YES/NO, TRUE/FALSE, or 1/0
8
+ - Example: "predict if machine fails", "detect fraud", "identify churn"
7
9
 
8
- 2. **Examine the Target Variable:** Use `target_column_insights` to understand the nature of what needs to be predicted:
9
- - Check `unique_count` to determine if it's binary (2 values), multiclass (>2 values), or continuous
10
- - Check `data_type` to see if it's numerical (int/float) or categorical (str)
11
- - Review `sample_values` to understand the actual values
10
+ 2. **Time Series Forecasting** - Choose when:
11
+ - Use case asks to "forecast", "predict future value", "estimate next"
12
+ - Answer is a NUMERICAL value in the FUTURE
13
+ - Example: "forecast next month sales", "predict tomorrow's temperature"
12
14
 
13
- 3. **Check for Temporal Dependencies:** Look for these indicators in `column_insights`:
14
- - Presence of timestamp/date columns with high unique_count (near row_count)
15
- - Column descriptions mentioning "time", "date", "timestamp", "sequential"
16
- - Use case description mentioning "over time", "forecast", "predict future", "trend", "sequential patterns"
17
- - If temporal column exists AND the prediction depends on historical patterns, consider time series methods
18
-
19
- 4. **Critical Time Series Distinction:**
20
- - **Time Series Forecasting**: Target is NUMERICAL and goal is predicting FUTURE VALUES (e.g., "predict next month's sales", "forecast temperature")
21
- - **Time Series Classification**: Target is CATEGORICAL (even if binary 0/1) and data is SEQUENTIAL (e.g., "classify failure from sensor patterns", "detect activity type from accelerometer sequence")
22
- - **Binary/Multiclass Classification**: Target is categorical BUT data points are INDEPENDENT (no temporal ordering matters)
23
-
24
- 5. **Assess Data Structure:** Review `column_insights`:
25
- - If no target specified or target_column_name is "Not specified" → likely `not_applicable`
26
- - If use case is purely computational/rule-based → `not_applicable`
27
-
28
- 6. **Select ONE Methodology:**
29
- - `binary_classification`: Target has exactly 2 unique values (categorical/binary) AND samples are independent (no temporal dependency)
30
- - `multiclass_classification`: Target has >2 unique categories AND samples are independent
31
- - `time_series_forecasting`: Target is NUMERICAL AND prediction involves FUTURE time periods based on historical patterns
32
- - `time_series_classification`: Target is CATEGORICAL (binary or multiclass) AND data has TEMPORAL ORDERING where sequential patterns are critical for prediction
33
- - `not_applicable`: No clear ML objective, purely rule-based problem, or insufficient information
34
-
35
- 7. **Key Rules to Avoid Mistakes:**
36
- - Binary target (0/1) with timestamp does NOT automatically mean binary_classification - check if SEQUENCE matters
37
- - If use case mentions "based on sensor readings over time", "sequential patterns", "time-series data" → it's time_series_classification
38
- - If use case is just "predict X" with no temporal context and independent samples → it's binary/multiclass_classification
39
- - Presence of timestamp column alone doesn't mean time series - check if the PREDICTION depends on temporal patterns
40
- - If target is numerical but goal is "classify into categories" → still classification, not regression
41
-
42
- 8. **Justify Your Choice:**
43
- - State the business goal clearly
44
- - Identify the target variable type (binary/multiclass/numerical)
45
- - Explain whether temporal dependencies exist and matter for prediction
46
- - Connect these factors to show why the chosen methodology fits
15
+ 3. **Not Applicable** - Choose when:
16
+ - No prediction needed
17
+ - Just data analysis, reporting, or calculations
18
+ - Not enough information
19
+ **Required Output:**
20
+ 1. Select the single best ML methodology from: binary_classification, time_series_forecasting, or not_applicable
21
+ 2. Provide a clear justification explaining:
22
+ - What you understand the business goal to be
23
+ - What type of prediction is needed (binary outcome, numerical forecast, or none)
24
+ - Whether temporal patterns are critical for this prediction
25
+ - Why the selected methodology is the best fit
47
26
 
27
+ **Important:**
28
+ - Having timestamps doesn't mean it's time series forecasting
29
+ - Check WHAT is being predicted: binary outcome OR future number
30
+ - The dataset may contain 1-4 tables - analyze all provided tables together"""
48
31
 
49
- """
50
32
 
51
33
 
52
-
53
- METHODOLOGY_SELECTION_USER_PROMPT = """
54
- **Business Context:**
34
+ METHODOLOGY_SELECTION_USER_PROMPT = """**Business Context:**
55
35
  Domain: {domain_name}
56
36
  {domain_description}
57
37
 
58
38
  **Use Case:**
59
39
  {use_case_description}
60
40
 
61
- **Data Overview:**
62
- Columns:
63
- {column_descriptions}
64
41
 
65
42
  Dataset Characteristics:
66
43
  {column_insights}
67
44
 
68
- **Target Information:**
69
- Target Column: {target_column_name}
70
- Target Details: {target_column_insights}
71
-
72
- **Required Output:**
73
- 1. Select the single best ML methodology
74
- 2. Provide a brief justification explaining why this methodology fits the problem
75
45
  """
76
46
 
77
47
 
78
-
79
- def format_approach_prompt(domain_name, domain_description, use_case, column_descriptions, column_insights, target_column_name, target_column_insights):
48
+ def format_approach_prompt(
49
+ domain_name: str,
50
+ domain_description: str,
51
+ use_case: str,
52
+ column_insights: str
53
+ ) -> tuple[str, str]:
80
54
  """
55
+ Format the methodology selection prompts for the LLM.
56
+
81
57
  Args:
82
- domain (str): The domain of the data.
83
- use_case (str): The use case of the data.
84
- column_descriptions (List[str]): A list of column descriptions.
85
- column_insights (List[str]): A list of column insights.
86
-
58
+ domain_name: The domain of the data (e.g., "Healthcare", "Finance")
59
+ domain_description: Detailed description of the domain context
60
+ use_case: Description of what the user wants to achieve
61
+ column_descriptions: Description of the columns in the dataset
62
+ column_insights: Statistical insights about the columns (data types,
63
+ unique counts, distributions, etc.)
64
+
87
65
  Returns:
88
- Tuple[str, str]: The formatted system prompt and user prompt.
89
- TODO:
90
- - Change prompt write new prompt and involve the supported approaches in new prompt.
66
+ tuple[str, str]: The formatted system prompt and user prompt
67
+
68
+ Example:
69
+ system_prompt, user_prompt = format_approach_prompt(
70
+ domain_name="E-commerce",
71
+ domain_description="Online retail platform with customer transactions",
72
+ use_case="Predict if a customer will make a purchase",
73
+ column_descriptions="user_id, page_views, cart_additions, timestamp",
74
+ column_insights="4 columns, 10000 rows, mixed types"
75
+ )
91
76
  """
92
-
93
77
  user_prompt = METHODOLOGY_SELECTION_USER_PROMPT.format(
94
78
  domain_name=domain_name,
95
79
  domain_description=domain_description,
96
80
  use_case_description=use_case,
97
- column_descriptions=column_descriptions,
98
- column_insights=column_insights,
99
- target_column_name=target_column_name,
100
- target_column_insights=target_column_insights,
81
+ column_insights=column_insights
101
82
  )
102
- return METHODOLOGY_SELECTION_SYSTEM_PROMPT, user_prompt
83
+
84
+ return METHODOLOGY_SELECTION_SYSTEM_PROMPT, user_prompt
@@ -2,6 +2,6 @@ from pydantic import BaseModel, Field
2
2
  from typing import Literal
3
3
 
4
4
  class MethodologyRecommendation(BaseModel):
5
- selected_methodology: Literal["binary_classification", "multiclass_classification", "time_series_forecasting", "time_series_classification", "not_applicable" ] = Field(..., description="The most appropriate ML approach for this problem")
5
+ selected_methodology: Literal[ "binary_classification", "time_series_forecasting", "not_applicable"] = Field(..., description="The most appropriate ML approach for this problem")
6
6
 
7
- justification: str = Field(..., description="Clear explanation connecting the business goal and target variable to the chosen methodology")
7
+ justification: str = Field( ..., description="Clear explanation connecting the business goal and data characteristics to the chosen methodology")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ml_approach_suggestion_agent
3
- Version: 0.1.0
3
+ Version: 0.1.3
4
4
  Summary: Add your description here
5
5
  License-Expression: MIT
6
6
  Classifier: Programming Language :: Python :: 3
@@ -12,7 +12,7 @@ Classifier: Operating System :: OS Independent
12
12
  Requires-Python: >=3.11
13
13
  Description-Content-Type: text/markdown
14
14
  Requires-Dist: pydantic-settings
15
- Requires-Dist: sfn-blueprint>=0.6.15
15
+ Requires-Dist: sfn-blueprint>=0.6.16
16
16
  Provides-Extra: dev
17
17
  Requires-Dist: pytest; extra == "dev"
18
18
  Requires-Dist: pytest-mock; extra == "dev"
@@ -0,0 +1,9 @@
1
+ ml_approach_suggestion_agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ ml_approach_suggestion_agent/agent.py,sha256=HdISZ15vU5K7pzHXJpncrCWpgPvEGJLn7_1jTt7Zeuw,5010
3
+ ml_approach_suggestion_agent/config.py,sha256=19zO13zQ2Fmf_0wKqOfc2KfM-WD6EM4YPJLjRrG_jTY,703
4
+ ml_approach_suggestion_agent/constants.py,sha256=EDLzMHXM8lwxb-lVLjkVElCJYChPntXW06WfCQHglJ8,3139
5
+ ml_approach_suggestion_agent/models.py,sha256=cfZbMZPMAcNxeWTuIb_TKmNSr0C99Z4ZMicYPxlScyA,448
6
+ ml_approach_suggestion_agent-0.1.3.dist-info/METADATA,sha256=tdnDGRvv6_VEkgkvLKTe6XcV_qfLiKNB2IE5LOdqD4c,8451
7
+ ml_approach_suggestion_agent-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ ml_approach_suggestion_agent-0.1.3.dist-info/top_level.txt,sha256=3-KHls6umFXtNFJoP7OFCLvb4zd12AWH71PVKNd5Aok,29
9
+ ml_approach_suggestion_agent-0.1.3.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- ml_approach_suggestion_agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- ml_approach_suggestion_agent/agent.py,sha256=4FgDbiFXY1BRQN3hToif2Y-uaceHPdPrwyG3GEVRWMU,5569
3
- ml_approach_suggestion_agent/config.py,sha256=YOAXgdzgY72yB0mNEt9J2w1bdavx-VSCANBJ_4YR4Bo,704
4
- ml_approach_suggestion_agent/constants.py,sha256=znbt7x6xAxJL4eAZXcr8njerD84c8QuKK2ZlsFhsloo,5011
5
- ml_approach_suggestion_agent/models.py,sha256=i4vxGvA9vB3JOAh8IySMfcyAEQ7CWQe9ArgG7Rqo_cU,501
6
- ml_approach_suggestion_agent-0.1.0.dist-info/METADATA,sha256=qha4KlPfYbnoPlkCSJM_1L0COCFeh0CxBC_sr2WLYU0,8451
7
- ml_approach_suggestion_agent-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
- ml_approach_suggestion_agent-0.1.0.dist-info/top_level.txt,sha256=3-KHls6umFXtNFJoP7OFCLvb4zd12AWH71PVKNd5Aok,29
9
- ml_approach_suggestion_agent-0.1.0.dist-info/RECORD,,