promptlearn 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptlearn/__init__.py +5 -0
- promptlearn/base.py +130 -0
- promptlearn/classifier.py +142 -0
- promptlearn/regressor.py +79 -0
- promptlearn/version.py +2 -0
- promptlearn-0.1.0.dist-info/METADATA +223 -0
- promptlearn-0.1.0.dist-info/RECORD +13 -0
- promptlearn-0.1.0.dist-info/WHEEL +5 -0
- promptlearn-0.1.0.dist-info/licenses/LICENSE +21 -0
- promptlearn-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/test_classifier.py +39 -0
- tests/test_regressor.py +12 -0
promptlearn/__init__.py
ADDED
promptlearn/base.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import openai
|
|
3
|
+
import logging
|
|
4
|
+
from typing import List, Union
|
|
5
|
+
from io import StringIO
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import warnings
|
|
9
|
+
|
|
10
|
+
from sklearn.base import BaseEstimator
|
|
11
|
+
from sklearn.utils.validation import check_X_y
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BasePromptEstimator(BaseEstimator):
    """Common plumbing for estimators whose fitted "model" is an LLM-written heuristic.

    Subclasses call :meth:`_fit_common`, which renders the training data as a
    tab-separated table, substitutes it into ``prompt_template`` and stores
    the chat model's reply in ``heuristic_``.

    Parameters
    ----------
    model : str
        Model name passed to the chat-completions endpoint.
    prompt_template : str
        Training prompt; it is formatted with ``str.format(data=<table>)``.
    verbose : bool, default=False
        When true, prompts and replies are emitted through ``logging``.
    """

    model: str
    prompt_template: str
    verbose: bool
    feature_names_in_: List[str]
    heuristic_: str
    target_name_: str

    def __init__(self, model: str, prompt_template: str, verbose: bool = False):
        self.model = model
        self.prompt_template = prompt_template
        self.verbose = verbose

        openai.api_key = os.getenv("OPENAI_API_KEY")
        self.llm_client = openai.OpenAI()

    def _get_feature_names(self, X: Union[np.ndarray, pd.DataFrame]) -> List[str]:
        """Return column names for a DataFrame, or ``x1..xn`` for an array."""
        if isinstance(X, pd.DataFrame):
            # Column labels may be ints or other objects; they are joined with
            # "\t" later, so they must be strings.
            return [str(column) for column in X.columns]
        return [f"x{i + 1}" for i in range(X.shape[1])]

    def _get_target_name(self, y: Union[np.ndarray, pd.Series]) -> str:
        """Return the Series name as a string, or ``"target"`` when unnamed."""
        # ``is not None`` rather than truthiness so a name such as 0 is kept.
        if isinstance(y, pd.Series) and y.name is not None:
            return str(y.name)
        return "target"

    def _format_training_data(self, X: np.ndarray, y: Union[np.ndarray, List], feature_names: List[str], target_name: str) -> str:
        """Render rows of ``X`` with their ``y`` value as a TSV table with a header line."""
        header = "\t".join(feature_names + [target_name])
        body = ("\t".join([*map(str, xi), str(yi)]) for xi, yi in zip(X, y))
        return "\n".join([header, *body])

    @staticmethod
    def _format_value(name, value) -> str:
        """Format one ``name=value`` pair: numbers with 3 decimals, anything else quoted."""
        # NumPy integer scalars are not instances of ``int``; without the
        # explicit np.integer check they would be rendered as quoted strings.
        if isinstance(value, (int, float, np.integer, np.floating)):
            return f"{name}={value:.3f}"
        return f"{name}='{value}'"

    def _format_features(self, x: Union[np.ndarray, pd.Series]) -> str:
        """Render a single sample as ``name=value, name=value, ...``.

        A Series supplies its own labels; any other sequence is paired with
        ``feature_names_in_`` positionally.
        """
        if isinstance(x, pd.Series):
            pairs = x.items()
        else:
            pairs = zip(self.feature_names_in_, x)
        return ", ".join(self._format_value(name, value) for name, value in pairs)

    def _call_llm(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the stripped reply text.

        Raises
        ------
        RuntimeError
            If the client cannot be created or the request fails; the original
            exception is attached as ``__cause__``.
        """
        if self.verbose:
            logging.debug(f"LLM prompt:\n{prompt}")
        try:
            # The client is dropped when pickling and may have been left unset
            # on load (no API key at that time), so create it on demand.
            if getattr(self, "llm_client", None) is None:
                self.llm_client = openai.OpenAI()
            response = self.llm_client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}]
            )
            result = (response.choices[0].message.content or "").strip()
        except Exception as e:
            raise RuntimeError(f"LLM call failed: {e}") from e

        if self.verbose:
            logging.info(f"LLM result: {result}")
        return result

    def _fit_common(self, X, y) -> None:
        """Build the training prompt from ``X``/``y`` and store the LLM's heuristic.

        Sets ``feature_names_in_``, ``target_name_``, ``training_prompt_`` and
        ``heuristic_``.
        """
        # Read the name first: check_X_y returns y as a plain array, which
        # would discard the name of a Series passed alongside an array X.
        target_name = self._get_target_name(y)
        if not isinstance(X, pd.DataFrame):
            X, y = check_X_y(X, y)

        self.feature_names_in_ = self._get_feature_names(X)
        self.target_name_ = target_name
        X_values = X.values if isinstance(X, pd.DataFrame) else X

        formatted_data = self._format_training_data(X_values, y, self.feature_names_in_, self.target_name_)
        self.training_prompt_ = self.prompt_template.format(data=formatted_data)
        self.heuristic_ = self._call_llm(self.training_prompt_)

    def __getstate__(self):
        """Return the instance dict without the LLM client (dropped as non-pickleable)."""
        state = self.__dict__.copy()
        state.pop("llm_client", None)
        return state

    def __setstate__(self, state):
        """Restore attributes and re-create the LLM client when a key is available."""
        self.__dict__.update(state)
        api_key = os.getenv("OPENAI_API_KEY")
        openai.api_key = api_key

        if api_key:
            self.llm_client = openai.OpenAI()
        else:
            warnings.warn(
                "OPENAI_API_KEY is not set. "
                "This PromptEstimator cannot make predictions until the key is available.",
                RuntimeWarning
            )
            # Constructing the client without a configured key raises, which
            # would turn the warning above into a failed load. Leave it unset;
            # _call_llm creates it once a key is available.
            self.llm_client = None

    @staticmethod
    def _is_code_fence(line: str) -> bool:
        """Return True for a Markdown fence line such as "```" or "```tsv"."""
        marker = line.strip()
        # A tab means real data shares the line with the fence, so keep it.
        if not marker.startswith("```") or "\t" in marker:
            return False
        tag = marker[3:].strip()
        return tag == "" or tag.isalnum()

    def _parse_tsv(self, tsv: str) -> pd.DataFrame:
        """Parse tab-separated values (TSV) into a pandas DataFrame.

        Markdown code fences are removed first; a fence line carrying only a
        language tag is dropped entirely so the tag is not read as the header.

        Raises
        ------
        ValueError
            If the cleaned text cannot be parsed.
        """
        try:
            kept = [
                line.replace("```", "")
                for line in tsv.strip().splitlines()
                if not self._is_code_fence(line)
            ]
            tsv_cleaned = "\n".join(kept).strip()

            # StringIO lets read_csv treat the string like a file.
            df = pd.read_csv(StringIO(tsv_cleaned), sep="\t")
            df.columns = df.columns.str.strip()
            return df
        except Exception as e:
            raise ValueError(f"Failed to parse TSV output:\n{tsv}\nError: {e}") from e

    def sample(self, n: int = 5) -> pd.DataFrame:
        """Generate n synthetic examples that illustrate the heuristic.

        Requires ``heuristic_``, ``feature_names_in_`` and ``target_name_`` to
        be set; the LLM's tab-separated reply is parsed into a DataFrame.
        """
        prompt = (
            f"{self.heuristic_}\n\n"
            f"Please generate {n} example rows in tabular format with the following columns:\n"
            f"{', '.join(self.feature_names_in_ + [self.target_name_])}.\n"
            f"Use tab-separated format. Do not explain."
        )
        text = self._call_llm(prompt)
        return self._parse_tsv(text)
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
from typing import Optional, List
|
|
2
|
+
import re
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from sklearn.base import ClassifierMixin
|
|
5
|
+
from sklearn.metrics import accuracy_score
|
|
6
|
+
from sklearn.utils.validation import check_array
|
|
7
|
+
from .base import BasePromptEstimator
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
# Training prompt for PromptClassifier; "{data}" is replaced by the TSV training table.
# The trailing backslash after the opening quotes is a line continuation, so
# the prompt starts directly with the first sentence.
DEFAULT_PROMPT_TEMPLATE = """\
You are a seasoned data scientist tasked with building a classification prompt for an LLM.

Treat the data as a sample of a much larger problem domain, so don't just memorize the data as-is. Try to minimize prediction error, not just describe dominant patterns. Include exceptions and counterexamples in your logic.

Look at the name of the target column and figure out its meaning. If the input features seem to be text or text entities, it is OK to output a prompt that will ask the LLM to reason by itself what the target value could be, but it should always result in an integer value (if it's a boolean then True=1 and False=0).

Conduct an analysis based on the following data, and output only the final trained classifier (like a decision tree, human-readable instructions, etc) that will be conveyed in the form of an LLM prompt to another system. The rules will be executed as given so you need to have all the weights, equations, thresholds, etc in your output. The classifier should be able to accurately predict the value (class) of the last column based on the data in the other columns.

If you find that a single rule is too broad, break it down into more specific cases to reduce false positives and false negatives.

Note that if you can predict the target value using your own logic or built-in knowledge, output a text prompt that will direct the prediction LLM to do so.

Data:
{data}
"""
|
|
26
|
+
|
|
27
|
+
class PromptClassifier(ClassifierMixin, BasePromptEstimator):
    """Classifier whose fitted model is a plain-text heuristic written by an LLM.

    ``fit`` asks the LLM to describe a classifier for the data; ``predict``
    sends that description plus one sample per request and parses an integer
    class from the reply. Targets are cast to ``int``.

    The mixin is listed first so that it precedes the estimator base in the
    MRO, as scikit-learn expects for estimator tag resolution.

    Parameters
    ----------
    model : str, default="o4-mini"
        Chat model name.
    prompt_template : str or None, default=None
        Training prompt; ``DEFAULT_PROMPT_TEMPLATE`` is used when falsy.
    verbose : bool, default=False
        Print progress and heuristics while fitting.
    chunk_threshold : int, default=300
        DataFrames with more rows than this are fitted with ``fit_chunked``.
    force_chunking : bool, default=False
        Always use ``fit_chunked`` from ``fit``.
    max_chunks : int or None, default=None
        Upper bound on the number of chunks processed by ``fit``.
    """

    # Matches stand-alone runs of digits in a free-text LLM reply.
    _INT_TOKEN = re.compile(r"\b\d+\b")

    def __init__(self, model: str = "o4-mini", prompt_template: Optional[str] = None,
                 verbose: bool = False, chunk_threshold: int = 300,
                 force_chunking: bool = False, max_chunks: Optional[int] = None):
        super().__init__(model, prompt_template or DEFAULT_PROMPT_TEMPLATE, verbose)
        self.chunk_threshold = chunk_threshold
        self.force_chunking = force_chunking
        self.max_chunks = max_chunks
        self.heuristic_history_: List[str] = []

    def _normalize_target_values(self, y):
        """Cast ``y`` to an int Series and record the sorted classes in ``allowed_classes_``."""
        values = pd.Series(y)
        classes = sorted(values.unique())
        self.allowed_classes_ = list(map(int, classes))
        return values.astype(int)

    def _instruction_suffix(self) -> str:
        """Return the answer-format instructions appended to every prediction prompt."""
        # allowed_classes_ is missing on hand-assembled estimators and empty
        # after a zero-row fit; fall back to a generic instruction instead of
        # failing or emitting an empty class list.
        allowed = getattr(self, "allowed_classes_", None)
        if not allowed:
            return (
                f"Use only numeric values when predicting {self.target_name_}.\n"
                "Respond only with a single integer."
            )
        class_str = ", ".join(str(c) for c in allowed)
        return (
            f"Use only numeric values when predicting {self.target_name_}.\n"
            f"For example: predict {class_str}.\n"
            f"Respond only with one of: {class_str}."
        )

    def fit(self, X, y) -> "PromptClassifier":
        """Learn a heuristic from ``X``/``y``; switches to chunked fitting for large frames."""
        self.target_name_ = self._get_target_name(y)
        y = self._normalize_target_values(y)
        # Drop snapshots left over from a previous chunked fit.
        self.heuristic_history_ = []

        if self.force_chunking or (isinstance(X, pd.DataFrame) and len(X) > self.chunk_threshold):
            if self.verbose:
                print(f"🌀 Switching to chunked fit: {len(X)} rows > threshold {self.chunk_threshold}")
            return self.fit_chunked(X, y, max_chunks=self.max_chunks)

        self._fit_common(X, y)
        if self.verbose:
            print(f"🧠 Final heuristic:\n{self.heuristic_}")
        return self

    def fit_chunked(self, X, y, chunk_size: int = 100, max_chunks: Optional[int] = None) -> "PromptClassifier":
        """Fit by showing the LLM consecutive row windows and letting it revise a scratchpad.

        Each window's reply is appended to ``heuristic_history_``; the last
        reply becomes ``heuristic_``.

        Raises
        ------
        ValueError
            If ``X`` is not a DataFrame or ``chunk_size`` is not positive.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("fit_chunked requires a pandas DataFrame for X")
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        # Establish fitted state here as well so the method works when called
        # directly; both steps are no-ops on values already prepared by fit().
        self.target_name_ = self._get_target_name(y)
        y = self._normalize_target_values(y)
        # Needed by sample() and by array-based prediction.
        self.feature_names_in_ = self._get_feature_names(X)

        df = X.copy()
        # Positional assignment: ignore y's index so rows are not re-aligned.
        df[self.target_name_] = y.values

        total_rows = len(df)
        # Ceil division, with at least one (possibly header-only) window so a
        # zero-row fit still produces a heuristic.
        num_chunks = max(1, (total_rows - 1) // chunk_size + 1)
        if max_chunks:
            num_chunks = min(num_chunks, max_chunks)

        self.heuristic_history_ = []
        scratchpad = ""

        for i in tqdm(range(num_chunks), desc="🧠 Chunked training"):
            start = i * chunk_size
            stop = min(start + chunk_size, total_rows)
            chunk_csv = df.iloc[start:stop].to_csv(index=False)
            # Report the rows actually present; the last window may be short.
            window = f"rows {start} to {stop - 1}" if stop > start else "no rows, header only"

            prompt = (
                "You are analyzing a large dataset in sequential windows.\n"
                "\n"
                f"This is the current window ({window}):\n"
                "\n"
                f"{chunk_csv}\n"
                "\n"
                "Your current scratchpad (classifier-in-progress):\n"
                "\n"
                f"{scratchpad}\n"
                "\n"
                "Update the classifier description based on this window. Use rules, thresholds, decision structures, or conditions.\n"
                "Do not output predictions for this window. Instead, revise the general classifier logic.\n"
                "Respond only with the full classifier description in plain text. The classifier should always output an integer value for each possible path (no string returns).\n"
            )

            scratchpad = self._call_llm(prompt)
            self.heuristic_history_.append(scratchpad)
            if self.verbose:
                print(f"🧠 Updated heuristic after chunk {i + 1}:\n{scratchpad}\n")

        if self.verbose:
            print(f"🧠 Final heuristic:\n{scratchpad}")
        self.heuristic_ = scratchpad
        return self

    def _predict_one(self, x) -> int:
        """Ask the LLM for the class of one sample and parse an integer from the reply.

        Raises
        ------
        ValueError
            If the reply contains no integer.
        """
        feature_string = self._format_features(x)
        prompt = (
            self.heuristic_ + "\n\n"
            f"Given: {feature_string}\n"
            f"What is the predicted {self.target_name_}?\n"
            f"{self._instruction_suffix()}"
        )
        response = self._call_llm(prompt).strip()

        # Preferred case: the reply is nothing but the number.
        try:
            return int(response)
        except ValueError:
            pass

        candidates = [int(token) for token in self._INT_TOKEN.findall(response)]
        if not candidates:
            raise ValueError(f"⚠️ Could not parse numeric prediction from LLM response: {response}")

        # A wordy reply can mention other numbers (rule ids, thresholds) before
        # the answer, so prefer the first integer that is a known class.
        allowed = getattr(self, "allowed_classes_", None) or []
        for value in candidates:
            if value in allowed:
                return value
        return candidates[0]

    def predict(self, X) -> List[int]:
        """Predict one integer class per row of ``X`` (one LLM request per row)."""
        if isinstance(X, pd.DataFrame):
            return [self._predict_one(row) for _, row in tqdm(X.iterrows(), total=len(X), desc="🔮 Predicting")]
        X_checked = check_array(X)
        return [self._predict_one(x) for x in tqdm(X_checked, desc="🔮 Predicting")]

    def score(self, X, y, sample_weight=None) -> float:
        """Return the accuracy of ``predict(X)`` against ``y``."""
        y_pred = self.predict(X)
        return float(accuracy_score(y, y_pred, sample_weight=sample_weight))

    def show_heuristic_evolution(self):
        """Print every heuristic snapshot recorded by the last chunked fit."""
        print("🧠 Heuristic Evolution:\n")
        for i, h in enumerate(self.heuristic_history_):
            print(f"--- After chunk {i+1} ---")
            print(h.strip())
            print()
|
promptlearn/regressor.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from typing import Optional, List
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import re
|
|
5
|
+
from sklearn.base import RegressorMixin
|
|
6
|
+
from sklearn.metrics import mean_squared_error
|
|
7
|
+
from sklearn.utils.validation import check_array
|
|
8
|
+
from .base import BasePromptEstimator
|
|
9
|
+
from tqdm import tqdm
|
|
10
|
+
|
|
11
|
+
# Training prompt for PromptRegressor; "{data}" is replaced by the TSV training table.
DEFAULT_PROMPT_TEMPLATE = """\
Analyze the following data and output only the final trained regression function (e.g., a linear or nonlinear equation) that best fits the data. The data has one or more features as input and the last column is the target value.

Consider if there are known Laws of Physics or well-established relationships to the target variable and use these to inform or completely determine the output.

The function will be evaluated by an LLM, so just have an equation with the names of the variables in the dataset.

Your answer should not include explanations, only the final equation. Use ascii characters only.

Data:
{data}
"""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PromptRegressor(RegressorMixin, BasePromptEstimator):
    """Regressor whose fitted model is an equation (or other heuristic) written by an LLM.

    ``fit`` asks the LLM for a regression function; ``predict`` sends that
    function plus one sample per request and parses a number from the reply.
    Replies without a number yield ``nan`` and are recorded in
    ``failed_predictions_``.

    The mixin is listed first so that it precedes the estimator base in the
    MRO, as scikit-learn expects for estimator tag resolution.

    Parameters
    ----------
    model : str, default="o4-mini"
        Chat model name.
    prompt_template : str or None, default=None
        Training prompt; ``DEFAULT_PROMPT_TEMPLATE`` is used when falsy.
    verbose : bool, default=False
        Print progress, prompts and parsed predictions.
    """

    # Optional sign, digits with optional fraction (or a bare ".5"), and an
    # optional exponent, so "1.5e-3" is read as one number rather than 1.5.
    _NUMBER = re.compile(r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?")

    def __init__(self, model: str = "o4-mini", prompt_template: Optional[str] = None, verbose: bool = False):
        super().__init__(model, prompt_template or DEFAULT_PROMPT_TEMPLATE, verbose)
        self.failed_predictions_: List[tuple] = []
        self.explanation_: Optional[str] = None

    def fit(self, X, y) -> "PromptRegressor":
        """Learn a regression heuristic from ``X``/``y``; also stored as ``explanation_``."""
        if self.verbose:
            print("🔧 Fitting PromptRegressor...")
        self._fit_common(X, y)
        self.explanation_ = self.heuristic_
        if self.verbose:
            print("\n📜 Final regression function:")
            print(self.heuristic_)
        return self

    def _predict_one(self, x) -> float:
        """Ask the LLM to evaluate the heuristic for one sample.

        Returns the first number found in the reply, or ``nan`` when there is
        none (the sample and raw reply are then appended to
        ``failed_predictions_``).
        """
        feature_string = self._format_features(x)
        prompt = (
            f"Given: {feature_string}\n"
            f"Answer this question: What is the predicted {self.target_name_}?\n"
            f"Use the following heuristic to calculate it: {self.heuristic_}\n"
            "Respond only with a single number like 13.2 — no units, no formula, no explanation."
        )
        if self.verbose:
            print(prompt)
        raw = self._call_llm(prompt)

        match = self._NUMBER.search(raw)
        if match:
            # Every string the pattern can match is a valid float literal.
            result = float(match.group())
            if self.verbose:
                print(f"Predicted: {result}")
            return result

        print("⚠️ Non-numeric LLM response:\n", raw)
        self.failed_predictions_.append((feature_string, raw))
        return np.nan

    def predict(self, X) -> List[float]:
        """Predict one float per row of ``X`` (one LLM request per row)."""
        if isinstance(X, pd.DataFrame):
            return [self._predict_one(row) for _, row in tqdm(X.iterrows(), total=len(X), desc="🔮 Predicting")]
        X_checked = check_array(X)
        return [self._predict_one(x) for x in tqdm(X_checked, desc="🔮 Predicting")]

    def score(self, X, y, sample_weight=None) -> float:
        """Return the negative mean squared error over rows with a numeric prediction.

        Raises
        ------
        ValueError
            If every prediction is ``nan``.
        """
        y_pred = np.asarray(self.predict(X), dtype=float)
        y_true = np.asarray(y)

        mask = ~np.isnan(y_pred)
        if not mask.any():
            raise ValueError("All predictions were NaN — check LLM output or prompt format.")

        # Weights must be filtered with the same mask as the targets, otherwise
        # their length no longer matches once any prediction is dropped.
        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)[mask]

        return -float(mean_squared_error(y_true[mask], y_pred[mask], sample_weight=sample_weight))
|
promptlearn/version.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: promptlearn
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLM-powered estimators for scikit-learn pipelines
|
|
5
|
+
Home-page: https://github.com/frlinaker/promptlearn
|
|
6
|
+
Author: Fredrik Linaker
|
|
7
|
+
Author-email: fredrik.linaker@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.8
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: scikit-learn
|
|
17
|
+
Requires-Dist: openai
|
|
18
|
+
Requires-Dist: pandas
|
|
19
|
+
Requires-Dist: numpy
|
|
20
|
+
Requires-Dist: joblib
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Dynamic: author
|
|
23
|
+
Dynamic: author-email
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: requires-python
|
|
32
|
+
Dynamic: summary
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ⚡️ promptlearn
|
|
36
|
+
|
|
37
|
+
**promptlearn** brings large language models into your scikit-learn workflow.
|
|
38
|
+
It replaces traditional estimators with language-native reasoning systems that learn, adapt, and describe patterns using natural language as the model substrate.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
### 📊 Outperforming Traditional Models with Built-In Knowledge
|
|
43
|
+
|
|
44
|
+
`promptlearn` allows LLMs to internalize both structure and semantics during training. As a result, the models often exceed the capabilities of classical estimators when the task requires reasoning, real-world knowledge, or symbolic understanding.
|
|
45
|
+
|
|
46
|
+
Consider a simple binary classification task: predicting whether an [animal is a mammal](examples/data/mammal_train.csv) based on its name, weight, and lifespan.
|
|
47
|
+
|
|
48
|
+
Traditional models depend solely on the input features. But `promptlearn` models can use their internal understanding of zoology to form highly accurate rules. Even when an animal name like `"Whale"` is never seen during training, the model knows it belongs to the mammal class.
|
|
49
|
+
|
|
50
|
+
| Model | Accuracy |
|
|
51
|
+
|-----------------------|----------|
|
|
52
|
+
| `promptlearn-o4-mini` | **1.00** |
|
|
53
|
+
| `promptlearn-gpt-4o` | 0.97 |
|
|
54
|
+
| `logistic_regression`| 0.60 |
|
|
55
|
+
| `random_forest` | 0.46 |
|
|
56
|
+
| `dummy` | 0.34 |
|
|
57
|
+
|
|
58
|
+
This type of semantic generalization is a powerful advantage for LLM-backed models.
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
Now compare performance on a regression task where the data contains samples of objects falling from different heights, under different gravity. This is a classic physics problem, with a well-known equation:
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
fall_time_s = sqrt((2 * height_m) / gravity_mps2)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
`promptlearn` estimators backed by recent models are able to recover this exact formula and use it to generate near-perfect predictions:
|
|
69
|
+
|
|
70
|
+
| Model | MSE |
|
|
71
|
+
|------------------------|-----------|
|
|
72
|
+
| `promptlearn-o4-mini` | **0.00006** |
|
|
73
|
+
| `promptlearn-gpt-4o` | 0.00006 |
|
|
74
|
+
| `gradient_boosting` | 0.035 |
|
|
75
|
+
| `linear_regression` | 0.498 |
|
|
76
|
+
| `dummy` | 5.27 |
|
|
77
|
+
| `promptlearn-gpt-4` | 43.17 |
|
|
78
|
+
|
|
79
|
+
No feature engineering was performed. No physics constants were added. The model discovered the rule and applied it directly. Classical regressors, by contrast, approximated a curve but missed the exact structure.
|
|
80
|
+
|
|
81
|
+
These results highlight the practical benefit of reasoning models: they learn compact, expressive heuristics and can outperform traditional systems when symbolic insight or background knowledge is essential.
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
### 🤖 Estimators Powered by Language
|
|
86
|
+
|
|
87
|
+
`promptlearn` provides scikit-learn-compatible estimators that use LLMs as the modeling engine:
|
|
88
|
+
|
|
89
|
+
- **`PromptClassifier`** – for predicting classes through generalized reasoning
|
|
90
|
+
- **`PromptRegressor`** – for modeling numeric relationships in data
|
|
91
|
+
|
|
92
|
+
These estimators follow the same API as other `scikit-learn` models (`fit`, `predict`, `score`) but operate via dynamic prompt construction and few-shot abstraction.
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
### 📘 What it Learns: The Heuristic
|
|
97
|
+
|
|
98
|
+
When you call `.fit()`, the LLM reviews your data and writes out an internal heuristic — a compact representation of what it has inferred. This heuristic might describe:
|
|
99
|
+
|
|
100
|
+
- A relationship between age, hours worked, and income
|
|
101
|
+
- How education, gender, and occupation relate to survival rates
|
|
102
|
+
- Why one row differs from another
|
|
103
|
+
|
|
104
|
+
The result is a plain-text model. It is readable, portable, and expressive. This is stored in `.heuristic_`, and it powers all predictions.
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
### 🧠 Language-Aware Reasoning
|
|
109
|
+
|
|
110
|
+
Because the models are backed by LLMs, they can reason across both structure and semantics:
|
|
111
|
+
|
|
112
|
+
- Names of columns matter
|
|
113
|
+
- Missing data can be explained or inferred
|
|
114
|
+
- World knowledge is available by default
|
|
115
|
+
|
|
116
|
+
A trained model might use context like:
|
|
117
|
+
|
|
118
|
+
> “Bachelors” typically correlates with medium income
|
|
119
|
+
> “Private” workclass often means lower capital gain
|
|
120
|
+
> Rows with missing `native-country` likely default to “United States”
|
|
121
|
+
|
|
122
|
+
This allows reasoning across incomplete, skewed, or lightly structured data without hand-tuning features.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
### 🧬 Background Knowledge Included
|
|
127
|
+
|
|
128
|
+
The LLM brings its internal knowledge graph to the modeling task. For instance:
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
Input: country = "Norway"
|
|
132
|
+
Output: has_blue_in_flag = 1
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Even if there is no signal in the data, the model may still predict correctly by referencing background information. This creates a kind of ambient “web join” during training and inference.
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
### 🕳 Zero-Example Learning
|
|
140
|
+
|
|
141
|
+
If you call `.fit()` with no rows — just column names — `promptlearn` will still return a working model.
|
|
142
|
+
|
|
143
|
+
This is possible because the LLM can hallucinate a plausible mapping based on:
|
|
144
|
+
|
|
145
|
+
- Column names
|
|
146
|
+
- Prior knowledge
|
|
147
|
+
- Type hints or value patterns
|
|
148
|
+
|
|
149
|
+
This makes rapid prototyping and conceptual modeling trivial.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
### 🧠 Scaling with Chunked Training
|
|
154
|
+
|
|
155
|
+
To support large datasets, `promptlearn` uses a sliding window training mechanism.
|
|
156
|
+
|
|
157
|
+
During `.fit()`:
|
|
158
|
+
- The dataset is processed in batches (“chunks”)
|
|
159
|
+
- The current heuristic is passed forward like a scratchpad
|
|
160
|
+
- Each chunk contributes feedback and refinement
|
|
161
|
+
- The model evolves with each window
|
|
162
|
+
|
|
163
|
+
This allows training on limitless rows using a fixed memory budget. The process is transparent. If the dataset is large, chunked training activates automatically.
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
### 🧪 Native `.sample()` Support
|
|
168
|
+
|
|
169
|
+
You can generate synthetic rows directly from any trained model using `.sample(n)`:
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
>>> model.sample(3)
|
|
173
|
+
fruit is_citrus
|
|
174
|
+
Lime 1
|
|
175
|
+
Banana 0
|
|
176
|
+
Orange 1
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
This is useful for:
|
|
180
|
+
|
|
181
|
+
- Understanding what the model believes
|
|
182
|
+
- Creating test sets or bootstrapped data
|
|
183
|
+
- Building readable examples from internal logic
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
### 💾 Save and Reload with `joblib`
|
|
188
|
+
|
|
189
|
+
Like any scikit-learn model, `promptlearn` estimators can be serialized:
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
import joblib
|
|
193
|
+
|
|
194
|
+
joblib.dump(model, "model.joblib")
|
|
195
|
+
model = joblib.load("model.joblib")
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
The LLM client is excluded from the saved file and re-initialized on load. The heuristic remains intact, interpretable, and ready to use.
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## 📚 Related Work
|
|
203
|
+
|
|
204
|
+
### Scikit-LLM
|
|
205
|
+
|
|
206
|
+
[Scikit-LLM](https://github.com/BeastByteAI/scikit-llm) provides zero- and few-shot classification through template-based prompting.
|
|
207
|
+
It is lightweight and NLP-focused.
|
|
208
|
+
|
|
209
|
+
**promptlearn** offers a broader modeling philosophy:
|
|
210
|
+
|
|
211
|
+
| Capability | Scikit-LLM | promptlearn |
|
|
212
|
+
|-----------------------------|--------------------|----------------------------|
|
|
213
|
+
| Prompt generated during fit | ❌ No | ✅ Yes |
|
|
214
|
+
| Regression support | ❌ No | ✅ Yes |
|
|
215
|
+
| Produces textual heuristics | ❌ No | ✅ Yes |
|
|
216
|
+
| Works on tabular data | ✅ Partial | ✅ Full |
|
|
217
|
+
| Generates sample rows | ❌ No | ✅ `.sample()` |
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## 📁 License
|
|
222
|
+
|
|
223
|
+
MIT © 2025 Fredrik Linaker
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
promptlearn/__init__.py,sha256=MEwDOUKS-eT2qTeJXvs79OykaCVj4nq8TKwwCM--zAY,169
|
|
2
|
+
promptlearn/base.py,sha256=M5NOlqNw_-_YsgEo3TrF1MlYo0fJYnlzRZs4xPlXTbk,4885
|
|
3
|
+
promptlearn/classifier.py,sha256=EiuMHdXAIKPVB9LODyc0u_Zh6NEbCBmm9v5v5DEP0v8,6643
|
|
4
|
+
promptlearn/regressor.py,sha256=pQWmtlJjuVaSIyvwc-_XTYHd1BgjtfUQANUpH3R-DKA,3272
|
|
5
|
+
promptlearn/version.py,sha256=3MtN4yLt02e1-GxJiwHyA3zL1tcu_6CWpXKHAD1zLAI,49
|
|
6
|
+
promptlearn-0.1.0.dist-info/licenses/LICENSE,sha256=wem7QBzjXI8wmdMKsM3ndt4O2Lpa59rOpQcylMoWVc8,1101
|
|
7
|
+
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
tests/test_classifier.py,sha256=11apdCumJ5DSZPEk0RFpW9SAB1rmwokd0FzNncrQUwM,1254
|
|
9
|
+
tests/test_regressor.py,sha256=ncJtxGNfUNR8oivvqacwjsNhCimwl-5OOloiB4g_kRM,353
|
|
10
|
+
promptlearn-0.1.0.dist-info/METADATA,sha256=ovVNkZk8TRL71ImEwbfBQ6tGeO4EINQYZ2pvcJxSkLE,8084
|
|
11
|
+
promptlearn-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
+
promptlearn-0.1.0.dist-info/top_level.txt,sha256=ddd47vS9AWtb98x34_g8I04FSHN5HH9Yk9S-Wt1d-Ak,18
|
|
13
|
+
promptlearn-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Fredrik Linaker
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the “Software”), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_classifier.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import joblib
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from promptlearn import PromptClassifier
|
|
4
|
+
|
|
5
|
+
def test_zero_row_classifier_runs():
    """Fitting on column names alone must still produce an integer prediction."""
    features = pd.DataFrame(columns=["country_name"])
    target = pd.Series(name="has_blue_in_flag", dtype=int)

    model = PromptClassifier(verbose=False)
    model.fit(features, target)

    query = pd.DataFrame([{"country_name": "France"}])
    predictions = model.predict(query)
    assert isinstance(predictions[0], int)
|
|
14
|
+
|
|
15
|
+
def test_promptclassifier_joblib_roundtrip(tmp_path):
    """A hand-assembled classifier survives joblib dump/load and can still predict."""
    # Define a minimal "trained" classifier manually
    clf = PromptClassifier(model="o4-mini", verbose=False)
    clf.feature_names_in_ = ["name"]
    clf.target_name_ = "is_animal"
    clf.heuristic_ = "IF name in {cat, dog, tiger} THEN is_animal = 1 ELSE 0"
    # predict() builds its answer-format instructions from allowed_classes_,
    # which fit() would normally set; without it prediction cannot run.
    clf.allowed_classes_ = [0, 1]

    # Save to disk
    model_path = tmp_path / "clf.joblib"
    joblib.dump(clf, model_path)

    # Load it back
    loaded = joblib.load(model_path)

    # Confirm state round-tripped
    assert loaded.model == "o4-mini"
    assert loaded.target_name_ == "is_animal"
    assert loaded.heuristic_.startswith("IF name")
    assert loaded.allowed_classes_ == [0, 1]

    # Run prediction
    X = pd.DataFrame([{"name": "cat"}, {"name": "car"}])
    preds = loaded.predict(X)

    assert isinstance(preds, list)
    assert all(isinstance(p, int) for p in preds)
|
tests/test_regressor.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from promptlearn import PromptRegressor
|
|
3
|
+
|
|
4
|
+
def test_zero_row_regressor_runs():
    """Fitting on column names alone must still produce a float prediction."""
    features = pd.DataFrame(columns=["length"])
    target = pd.Series(name="mass", dtype=float)

    model = PromptRegressor(verbose=False)
    model.fit(features, target)

    query = pd.DataFrame([{"length": 2.5}])
    predictions = model.predict(query)
    assert isinstance(predictions[0], float)
|