openaivec 0.10.0__py3-none-any.whl → 1.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/__init__.py +13 -4
- openaivec/_cache/__init__.py +12 -0
- openaivec/_cache/optimize.py +109 -0
- openaivec/_cache/proxy.py +806 -0
- openaivec/_di.py +326 -0
- openaivec/_embeddings.py +203 -0
- openaivec/{log.py → _log.py} +2 -2
- openaivec/_model.py +113 -0
- openaivec/{prompt.py → _prompt.py} +95 -28
- openaivec/_provider.py +207 -0
- openaivec/_responses.py +511 -0
- openaivec/_schema/__init__.py +9 -0
- openaivec/_schema/infer.py +340 -0
- openaivec/_schema/spec.py +350 -0
- openaivec/_serialize.py +234 -0
- openaivec/{util.py → _util.py} +25 -85
- openaivec/pandas_ext.py +1635 -425
- openaivec/spark.py +604 -335
- openaivec/task/__init__.py +27 -29
- openaivec/task/customer_support/__init__.py +9 -15
- openaivec/task/customer_support/customer_sentiment.py +51 -41
- openaivec/task/customer_support/inquiry_classification.py +86 -61
- openaivec/task/customer_support/inquiry_summary.py +44 -45
- openaivec/task/customer_support/intent_analysis.py +56 -41
- openaivec/task/customer_support/response_suggestion.py +49 -43
- openaivec/task/customer_support/urgency_analysis.py +76 -71
- openaivec/task/nlp/__init__.py +4 -4
- openaivec/task/nlp/dependency_parsing.py +19 -20
- openaivec/task/nlp/keyword_extraction.py +22 -24
- openaivec/task/nlp/morphological_analysis.py +25 -25
- openaivec/task/nlp/named_entity_recognition.py +26 -28
- openaivec/task/nlp/sentiment_analysis.py +29 -21
- openaivec/task/nlp/translation.py +24 -30
- openaivec/task/table/__init__.py +3 -0
- openaivec/task/table/fillna.py +183 -0
- openaivec-1.0.10.dist-info/METADATA +399 -0
- openaivec-1.0.10.dist-info/RECORD +39 -0
- {openaivec-0.10.0.dist-info → openaivec-1.0.10.dist-info}/WHEEL +1 -1
- openaivec/embeddings.py +0 -172
- openaivec/responses.py +0 -392
- openaivec/serialize.py +0 -225
- openaivec/task/model.py +0 -84
- openaivec-0.10.0.dist-info/METADATA +0 -546
- openaivec-0.10.0.dist-info/RECORD +0 -29
- {openaivec-0.10.0.dist-info → openaivec-1.0.10.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,22 +5,22 @@ identifies and classifies named entities in text using OpenAI's language models.
|
|
|
5
5
|
|
|
6
6
|
Example:
|
|
7
7
|
Basic usage with BatchResponses:
|
|
8
|
-
|
|
8
|
+
|
|
9
9
|
```python
|
|
10
10
|
from openai import OpenAI
|
|
11
|
-
from openaivec
|
|
11
|
+
from openaivec import BatchResponses
|
|
12
12
|
from openaivec.task import nlp
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
client = OpenAI()
|
|
15
15
|
analyzer = BatchResponses.of_task(
|
|
16
16
|
client=client,
|
|
17
|
-
model_name="gpt-
|
|
17
|
+
model_name="gpt-4.1-mini",
|
|
18
18
|
task=nlp.NAMED_ENTITY_RECOGNITION
|
|
19
19
|
)
|
|
20
|
-
|
|
20
|
+
|
|
21
21
|
texts = ["John works at Microsoft in Seattle", "The meeting is on March 15th"]
|
|
22
22
|
analyses = analyzer.parse(texts)
|
|
23
|
-
|
|
23
|
+
|
|
24
24
|
for analysis in analyses:
|
|
25
25
|
print(f"Persons: {analysis.persons}")
|
|
26
26
|
print(f"Organizations: {analysis.organizations}")
|
|
@@ -28,31 +28,29 @@ Example:
|
|
|
28
28
|
```
|
|
29
29
|
|
|
30
30
|
With pandas integration:
|
|
31
|
-
|
|
31
|
+
|
|
32
32
|
```python
|
|
33
33
|
import pandas as pd
|
|
34
34
|
from openaivec import pandas_ext # Required for .ai accessor
|
|
35
35
|
from openaivec.task import nlp
|
|
36
|
-
|
|
36
|
+
|
|
37
37
|
df = pd.DataFrame({"text": ["John works at Microsoft in Seattle", "The meeting is on March 15th"]})
|
|
38
38
|
df["entities"] = df["text"].ai.task(nlp.NAMED_ENTITY_RECOGNITION)
|
|
39
|
-
|
|
39
|
+
|
|
40
40
|
# Extract entity components
|
|
41
41
|
extracted_df = df.ai.extract("entities")
|
|
42
42
|
print(extracted_df[["text", "entities_persons", "entities_organizations", "entities_locations"]])
|
|
43
43
|
```
|
|
44
44
|
|
|
45
45
|
Attributes:
|
|
46
|
-
NAMED_ENTITY_RECOGNITION (PreparedTask): A prepared task instance
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
NAMED_ENTITY_RECOGNITION (PreparedTask): A prepared task instance configured for named
|
|
47
|
+
entity recognition. Provide ``temperature=0.0`` and ``top_p=1.0`` to API calls for
|
|
48
|
+
deterministic output.
|
|
49
49
|
"""
|
|
50
50
|
|
|
51
|
-
from
|
|
52
|
-
from pydantic import BaseModel
|
|
53
|
-
from pydantic import Field
|
|
51
|
+
from pydantic import BaseModel, Field
|
|
54
52
|
|
|
55
|
-
from
|
|
53
|
+
from openaivec._model import PreparedTask
|
|
56
54
|
|
|
57
55
|
__all__ = ["NAMED_ENTITY_RECOGNITION"]
|
|
58
56
|
|
|
@@ -62,22 +60,22 @@ class NamedEntity(BaseModel):
|
|
|
62
60
|
label: str = Field(description="Entity type label")
|
|
63
61
|
start: int = Field(description="Start position in the original text")
|
|
64
62
|
end: int = Field(description="End position in the original text")
|
|
65
|
-
confidence:
|
|
63
|
+
confidence: float | None = Field(description="Confidence score (0.0-1.0)")
|
|
66
64
|
|
|
67
65
|
|
|
68
66
|
class NamedEntityRecognition(BaseModel):
|
|
69
|
-
persons:
|
|
70
|
-
organizations:
|
|
71
|
-
locations:
|
|
72
|
-
dates:
|
|
73
|
-
money:
|
|
74
|
-
percentages:
|
|
75
|
-
miscellaneous:
|
|
67
|
+
persons: list[NamedEntity] = Field(description="Person entities")
|
|
68
|
+
organizations: list[NamedEntity] = Field(description="Organization entities")
|
|
69
|
+
locations: list[NamedEntity] = Field(description="Location entities")
|
|
70
|
+
dates: list[NamedEntity] = Field(description="Date and time entities")
|
|
71
|
+
money: list[NamedEntity] = Field(description="Money and currency entities")
|
|
72
|
+
percentages: list[NamedEntity] = Field(description="Percentage entities")
|
|
73
|
+
miscellaneous: list[NamedEntity] = Field(description="Other named entities")
|
|
76
74
|
|
|
77
75
|
|
|
78
76
|
NAMED_ENTITY_RECOGNITION = PreparedTask(
|
|
79
|
-
instructions="Identify and classify named entities in the following text. Extract persons,
|
|
77
|
+
instructions="Identify and classify named entities in the following text. Extract persons, "
|
|
78
|
+
"organizations, locations, dates, money, percentages, and other miscellaneous entities "
|
|
79
|
+
"with their positions and confidence scores.",
|
|
80
80
|
response_format=NamedEntityRecognition,
|
|
81
|
-
|
|
82
|
-
top_p=1.0
|
|
83
|
-
)
|
|
81
|
+
)
|
|
@@ -5,22 +5,22 @@ sentiment and emotions in text using OpenAI's language models.
|
|
|
5
5
|
|
|
6
6
|
Example:
|
|
7
7
|
Basic usage with BatchResponses:
|
|
8
|
-
|
|
8
|
+
|
|
9
9
|
```python
|
|
10
10
|
from openai import OpenAI
|
|
11
|
-
from openaivec
|
|
11
|
+
from openaivec import BatchResponses
|
|
12
12
|
from openaivec.task import nlp
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
client = OpenAI()
|
|
15
15
|
analyzer = BatchResponses.of_task(
|
|
16
16
|
client=client,
|
|
17
|
-
model_name="gpt-
|
|
17
|
+
model_name="gpt-4.1-mini",
|
|
18
18
|
task=nlp.SENTIMENT_ANALYSIS
|
|
19
19
|
)
|
|
20
|
-
|
|
20
|
+
|
|
21
21
|
texts = ["I love this product!", "This is terrible and disappointing."]
|
|
22
22
|
analyses = analyzer.parse(texts)
|
|
23
|
-
|
|
23
|
+
|
|
24
24
|
for analysis in analyses:
|
|
25
25
|
print(f"Sentiment: {analysis.sentiment}")
|
|
26
26
|
print(f"Confidence: {analysis.confidence}")
|
|
@@ -28,46 +28,54 @@ Example:
|
|
|
28
28
|
```
|
|
29
29
|
|
|
30
30
|
With pandas integration:
|
|
31
|
-
|
|
31
|
+
|
|
32
32
|
```python
|
|
33
33
|
import pandas as pd
|
|
34
34
|
from openaivec import pandas_ext # Required for .ai accessor
|
|
35
35
|
from openaivec.task import nlp
|
|
36
|
-
|
|
36
|
+
|
|
37
37
|
df = pd.DataFrame({"text": ["I love this product!", "This is terrible and disappointing."]})
|
|
38
38
|
df["sentiment"] = df["text"].ai.task(nlp.SENTIMENT_ANALYSIS)
|
|
39
|
-
|
|
39
|
+
|
|
40
40
|
# Extract sentiment components
|
|
41
41
|
extracted_df = df.ai.extract("sentiment")
|
|
42
42
|
print(extracted_df[["text", "sentiment_sentiment", "sentiment_confidence", "sentiment_polarity"]])
|
|
43
43
|
```
|
|
44
44
|
|
|
45
45
|
Attributes:
|
|
46
|
-
SENTIMENT_ANALYSIS (PreparedTask): A prepared task instance
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
SENTIMENT_ANALYSIS (PreparedTask): A prepared task instance configured for sentiment
|
|
47
|
+
analysis. Provide ``temperature=0.0`` and ``top_p=1.0`` to API calls for
|
|
48
|
+
deterministic output.
|
|
49
49
|
"""
|
|
50
50
|
|
|
51
|
-
from typing import
|
|
51
|
+
from typing import Literal
|
|
52
|
+
|
|
52
53
|
from pydantic import BaseModel, Field
|
|
53
54
|
|
|
54
|
-
from
|
|
55
|
+
from openaivec._model import PreparedTask
|
|
55
56
|
|
|
56
57
|
__all__ = ["SENTIMENT_ANALYSIS"]
|
|
57
58
|
|
|
58
59
|
|
|
59
60
|
class SentimentAnalysis(BaseModel):
|
|
60
|
-
sentiment: Literal["positive", "negative", "neutral"] = Field(
|
|
61
|
+
sentiment: Literal["positive", "negative", "neutral"] = Field(
|
|
62
|
+
description="Overall sentiment (positive, negative, neutral)"
|
|
63
|
+
)
|
|
61
64
|
confidence: float = Field(description="Confidence score for sentiment (0.0-1.0)")
|
|
62
|
-
emotions:
|
|
63
|
-
|
|
65
|
+
emotions: list[Literal["joy", "sadness", "anger", "fear", "surprise", "disgust"]] = Field(
|
|
66
|
+
description="Detected emotions (joy, sadness, anger, fear, surprise, disgust)"
|
|
67
|
+
)
|
|
68
|
+
emotion_scores: list[float] = Field(description="Confidence scores for each emotion (0.0-1.0)")
|
|
64
69
|
polarity: float = Field(description="Polarity score from -1.0 (negative) to 1.0 (positive)")
|
|
65
70
|
subjectivity: float = Field(description="Subjectivity score from 0.0 (objective) to 1.0 (subjective)")
|
|
66
71
|
|
|
67
72
|
|
|
68
73
|
SENTIMENT_ANALYSIS = PreparedTask(
|
|
69
|
-
instructions="Analyze the sentiment and emotions in the following text. Provide overall
|
|
74
|
+
instructions="Analyze the sentiment and emotions in the following text. Provide overall "
|
|
75
|
+
"sentiment classification, confidence scores, detected emotions, polarity, and subjectivity "
|
|
76
|
+
"measures.\n\nIMPORTANT: Provide all analysis in the same language as the input text, except "
|
|
77
|
+
"for the predefined categorical fields (sentiment, emotions) which must use the exact "
|
|
78
|
+
"English values specified (positive/negative/neutral for sentiment, and "
|
|
79
|
+
"joy/sadness/anger/fear/surprise/disgust for emotions).",
|
|
70
80
|
response_format=SentimentAnalysis,
|
|
71
|
-
|
|
72
|
-
top_p=1.0
|
|
73
|
-
)
|
|
81
|
+
)
|
|
@@ -10,22 +10,22 @@ provides structured output with consistent language code naming.
|
|
|
10
10
|
|
|
11
11
|
Example:
|
|
12
12
|
Basic usage with BatchResponses:
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
```python
|
|
15
15
|
from openai import OpenAI
|
|
16
|
-
from openaivec
|
|
16
|
+
from openaivec import BatchResponses
|
|
17
17
|
from openaivec.task import nlp
|
|
18
|
-
|
|
18
|
+
|
|
19
19
|
client = OpenAI()
|
|
20
20
|
translator = BatchResponses.of_task(
|
|
21
21
|
client=client,
|
|
22
|
-
model_name="gpt-
|
|
22
|
+
model_name="gpt-4.1-mini",
|
|
23
23
|
task=nlp.MULTILINGUAL_TRANSLATION
|
|
24
24
|
)
|
|
25
|
-
|
|
25
|
+
|
|
26
26
|
texts = ["Hello", "Good morning", "Thank you"]
|
|
27
27
|
translations = translator.parse(texts)
|
|
28
|
-
|
|
28
|
+
|
|
29
29
|
for translation in translations:
|
|
30
30
|
print(f"English: {translation.en}")
|
|
31
31
|
print(f"Japanese: {translation.ja}")
|
|
@@ -33,15 +33,15 @@ Example:
|
|
|
33
33
|
```
|
|
34
34
|
|
|
35
35
|
With pandas integration:
|
|
36
|
-
|
|
36
|
+
|
|
37
37
|
```python
|
|
38
38
|
import pandas as pd
|
|
39
39
|
from openaivec import pandas_ext # Required for .ai accessor
|
|
40
40
|
from openaivec.task import nlp
|
|
41
|
-
|
|
41
|
+
|
|
42
42
|
df = pd.DataFrame({"text": ["Hello", "Goodbye"]})
|
|
43
43
|
df["translations"] = df["text"].ai.task(nlp.MULTILINGUAL_TRANSLATION)
|
|
44
|
-
|
|
44
|
+
|
|
45
45
|
# Extract specific languages
|
|
46
46
|
extracted_df = df.ai.extract("translations")
|
|
47
47
|
print(extracted_df[["text", "translations_en", "translations_ja", "translations_fr"]])
|
|
@@ -49,8 +49,8 @@ Example:
|
|
|
49
49
|
|
|
50
50
|
Attributes:
|
|
51
51
|
MULTILINGUAL_TRANSLATION (PreparedTask): A prepared task instance configured
|
|
52
|
-
for multilingual translation
|
|
53
|
-
deterministic output.
|
|
52
|
+
for multilingual translation. Provide ``temperature=0.0`` and ``top_p=1.0``
|
|
53
|
+
to the calling API wrapper for deterministic output.
|
|
54
54
|
|
|
55
55
|
Note:
|
|
56
56
|
The translation covers 58 languages across major language families. All field
|
|
@@ -72,10 +72,9 @@ Note:
|
|
|
72
72
|
- Other: Basque, Maltese
|
|
73
73
|
"""
|
|
74
74
|
|
|
75
|
-
from
|
|
76
|
-
from pydantic import Field
|
|
75
|
+
from pydantic import BaseModel, Field
|
|
77
76
|
|
|
78
|
-
from
|
|
77
|
+
from openaivec._model import PreparedTask
|
|
79
78
|
|
|
80
79
|
__all__ = ["MULTILINGUAL_TRANSLATION"]
|
|
81
80
|
|
|
@@ -88,7 +87,7 @@ class TranslatedString(BaseModel):
|
|
|
88
87
|
sv: str = Field(description="Translated text in Swedish")
|
|
89
88
|
da: str = Field(description="Translated text in Danish")
|
|
90
89
|
no: str = Field(description="Translated text in Norwegian")
|
|
91
|
-
|
|
90
|
+
|
|
92
91
|
# Romance languages
|
|
93
92
|
es: str = Field(description="Translated text in Spanish")
|
|
94
93
|
fr: str = Field(description="Translated text in French")
|
|
@@ -96,7 +95,7 @@ class TranslatedString(BaseModel):
|
|
|
96
95
|
pt: str = Field(description="Translated text in Portuguese")
|
|
97
96
|
ro: str = Field(description="Translated text in Romanian")
|
|
98
97
|
ca: str = Field(description="Translated text in Catalan")
|
|
99
|
-
|
|
98
|
+
|
|
100
99
|
# Slavic languages
|
|
101
100
|
ru: str = Field(description="Translated text in Russian")
|
|
102
101
|
pl: str = Field(description="Translated text in Polish")
|
|
@@ -106,37 +105,37 @@ class TranslatedString(BaseModel):
|
|
|
106
105
|
bg: str = Field(description="Translated text in Bulgarian")
|
|
107
106
|
hr: str = Field(description="Translated text in Croatian")
|
|
108
107
|
sr: str = Field(description="Translated text in Serbian")
|
|
109
|
-
|
|
108
|
+
|
|
110
109
|
# East Asian languages
|
|
111
110
|
ja: str = Field(description="Translated text in Japanese")
|
|
112
111
|
ko: str = Field(description="Translated text in Korean")
|
|
113
112
|
zh: str = Field(description="Translated text in Chinese (Simplified)")
|
|
114
113
|
zh_tw: str = Field(description="Translated text in Chinese (Traditional)")
|
|
115
|
-
|
|
114
|
+
|
|
116
115
|
# South Asian languages
|
|
117
116
|
hi: str = Field(description="Translated text in Hindi")
|
|
118
117
|
bn: str = Field(description="Translated text in Bengali")
|
|
119
118
|
te: str = Field(description="Translated text in Telugu")
|
|
120
119
|
ta: str = Field(description="Translated text in Tamil")
|
|
121
120
|
ur: str = Field(description="Translated text in Urdu")
|
|
122
|
-
|
|
121
|
+
|
|
123
122
|
# Southeast Asian languages
|
|
124
123
|
th: str = Field(description="Translated text in Thai")
|
|
125
124
|
vi: str = Field(description="Translated text in Vietnamese")
|
|
126
125
|
id: str = Field(description="Translated text in Indonesian")
|
|
127
126
|
ms: str = Field(description="Translated text in Malay")
|
|
128
127
|
tl: str = Field(description="Translated text in Filipino")
|
|
129
|
-
|
|
128
|
+
|
|
130
129
|
# Middle Eastern languages
|
|
131
130
|
ar: str = Field(description="Translated text in Arabic")
|
|
132
131
|
he: str = Field(description="Translated text in Hebrew")
|
|
133
132
|
fa: str = Field(description="Translated text in Persian")
|
|
134
133
|
tr: str = Field(description="Translated text in Turkish")
|
|
135
|
-
|
|
134
|
+
|
|
136
135
|
# African languages
|
|
137
136
|
sw: str = Field(description="Translated text in Swahili")
|
|
138
137
|
am: str = Field(description="Translated text in Amharic")
|
|
139
|
-
|
|
138
|
+
|
|
140
139
|
# Other European languages
|
|
141
140
|
fi: str = Field(description="Translated text in Finnish")
|
|
142
141
|
hu: str = Field(description="Translated text in Hungarian")
|
|
@@ -144,10 +143,10 @@ class TranslatedString(BaseModel):
|
|
|
144
143
|
lv: str = Field(description="Translated text in Latvian")
|
|
145
144
|
lt: str = Field(description="Translated text in Lithuanian")
|
|
146
145
|
el: str = Field(description="Translated text in Greek")
|
|
147
|
-
|
|
146
|
+
|
|
148
147
|
# Nordic languages
|
|
149
148
|
is_: str = Field(description="Translated text in Icelandic")
|
|
150
|
-
|
|
149
|
+
|
|
151
150
|
# Other languages
|
|
152
151
|
eu: str = Field(description="Translated text in Basque")
|
|
153
152
|
cy: str = Field(description="Translated text in Welsh")
|
|
@@ -157,9 +156,4 @@ class TranslatedString(BaseModel):
|
|
|
157
156
|
|
|
158
157
|
instructions = "Translate the following text into multiple languages. "
|
|
159
158
|
|
|
160
|
-
MULTILINGUAL_TRANSLATION = PreparedTask(
|
|
161
|
-
instructions=instructions,
|
|
162
|
-
response_format=TranslatedString,
|
|
163
|
-
temperature=0.0,
|
|
164
|
-
top_p=1.0
|
|
165
|
-
)
|
|
159
|
+
MULTILINGUAL_TRANSLATION = PreparedTask(instructions=instructions, response_format=TranslatedString)
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Missing value imputation task for DataFrame columns.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to intelligently fill missing values in DataFrame
|
|
4
|
+
columns using AI-powered analysis. The task analyzes existing data patterns to
|
|
5
|
+
generate contextually appropriate values for missing entries.
|
|
6
|
+
|
|
7
|
+
Example:
|
|
8
|
+
Basic usage with pandas DataFrame:
|
|
9
|
+
|
|
10
|
+
```python
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from openaivec import pandas_ext # Required for .ai accessor
|
|
13
|
+
from openaivec.task.table import fillna
|
|
14
|
+
|
|
15
|
+
# Create DataFrame with missing values
|
|
16
|
+
df = pd.DataFrame({
|
|
17
|
+
"name": ["Alice", "Bob", None, "David"],
|
|
18
|
+
"age": [25, 30, 35, None],
|
|
19
|
+
"city": ["New York", "London", "Tokyo", "Paris"],
|
|
20
|
+
"salary": [50000, 60000, 70000, None]
|
|
21
|
+
})
|
|
22
|
+
|
|
23
|
+
# Fill missing values in the 'salary' column
|
|
24
|
+
task = fillna(df, "salary")
|
|
25
|
+
filled_salaries = df[df["salary"].isna()].ai.task(task)
|
|
26
|
+
|
|
27
|
+
# Apply filled values back to DataFrame
|
|
28
|
+
for result in filled_salaries:
|
|
29
|
+
df.loc[result.index, "salary"] = result.output
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
With BatchResponses for more control:
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from openai import OpenAI
|
|
36
|
+
from openaivec import BatchResponses
|
|
37
|
+
from openaivec.task.table import fillna
|
|
38
|
+
|
|
39
|
+
client = OpenAI()
|
|
40
|
+
df = pd.DataFrame({...}) # Your DataFrame with missing values
|
|
41
|
+
|
|
42
|
+
# Create fillna task for target column
|
|
43
|
+
task = fillna(df, "target_column")
|
|
44
|
+
|
|
45
|
+
# Get rows with missing values in target column
|
|
46
|
+
missing_rows = df[df["target_column"].isna()]
|
|
47
|
+
|
|
48
|
+
# Process with BatchResponses
|
|
49
|
+
filler = BatchResponses.of_task(
|
|
50
|
+
client=client,
|
|
51
|
+
model_name="gpt-4.1-mini",
|
|
52
|
+
task=task
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# Generate inputs for missing rows
|
|
56
|
+
inputs = []
|
|
57
|
+
for idx, row in missing_rows.iterrows():
|
|
58
|
+
inputs.append({
|
|
59
|
+
"index": idx,
|
|
60
|
+
"input": {k: v for k, v in row.items() if k != "target_column"}
|
|
61
|
+
})
|
|
62
|
+
|
|
63
|
+
filled_values = filler.parse(inputs)
|
|
64
|
+
```
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
import json
|
|
68
|
+
|
|
69
|
+
import pandas as pd
|
|
70
|
+
from pydantic import BaseModel, Field
|
|
71
|
+
|
|
72
|
+
from openaivec._model import PreparedTask
|
|
73
|
+
from openaivec._prompt import FewShotPromptBuilder
|
|
74
|
+
|
|
75
|
+
__all__ = ["fillna", "FillNaResponse"]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def get_examples(df: pd.DataFrame, target_column_name: str, max_examples: int) -> list[dict]:
    """Collect few-shot examples from rows whose target column is populated.

    The rows are shuffled, renumbered from zero, de-duplicated and stripped of
    rows with a missing target value. At most ``max_examples`` of the remaining
    rows are converted into example records.

    Args:
        df (pd.DataFrame): Source data to draw examples from.
        target_column_name (str): Column whose value becomes the example output.
        max_examples (int): Upper bound on the number of examples returned.

    Returns:
        list[dict]: One dict per example with the keys ``"index"`` (row label
        after shuffling and renumbering), ``"input"`` (every other column
        mapped to its value) and ``"output"`` (the target column value).
    """
    # Shuffle first so the examples are a random sample rather than the leading rows.
    samples: pd.DataFrame = df.sample(frac=1).reset_index(drop=True).drop_duplicates()
    samples = samples.dropna(subset=[target_column_name])

    examples: list[dict] = []
    for i, row in samples.head(max_examples).iterrows():
        # Series.items() yields builtin Python scalars for NumPy-backed rows, whereas
        # row[label] returns NumPy scalars (e.g. numpy.int64) that json.dumps cannot
        # serialize. Reading the target through items() as well keeps the record
        # JSON-compatible when every column shares an integer or boolean dtype.
        values = dict(row.items())
        output = values.pop(target_column_name)
        examples.append({"index": i, "input": values, "output": output})

    return examples
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def get_instructions(df: pd.DataFrame, target_column_name: str, max_examples: int) -> str:
    """Build the few-shot prompt text used to impute the target column.

    Examples are gathered with ``get_examples`` and registered on a
    ``FewShotPromptBuilder`` as JSON-encoded input/output pairs that share the
    same ``index`` value.

    Args:
        df (pd.DataFrame): Source data the examples are drawn from.
        target_column_name (str): Column whose missing values are to be filled.
        max_examples (int): Upper bound on the number of examples embedded.

    Returns:
        str: The value produced by ``improve().build()`` on the builder.
    """
    shots = get_examples(df, target_column_name, max_examples)

    prompt = FewShotPromptBuilder().purpose(
        "Fill missing values in the target column based on the context provided by other columns."
    )
    prompt = prompt.caution("Ensure that the filled values are consistent with the data in other columns.")

    for shot in shots:
        position = shot["index"]
        # ensure_ascii=False keeps non-ASCII cell values readable in the prompt.
        shot_input = json.dumps({"index": position, "input": shot["input"]}, ensure_ascii=False)
        shot_output = json.dumps({"index": position, "output": shot["output"]}, ensure_ascii=False)
        prompt.example(input_value=shot_input, output_value=shot_output)

    # NOTE(review): improve() is defined elsewhere; presumably it refines the prompt
    # before build() renders it - confirm whether it performs a remote call.
    improved = prompt.improve()
    return improved.build()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class FillNaResponse(BaseModel):
    """Structured result of imputing one missing cell.

    Pairs the row index with the value proposed for that row's entry in the
    target column.
    """

    # Row identifier echoed back so the value can be matched to its source row.
    index: int = Field(description="Index of the row in the original DataFrame")
    # Union of JSON scalar types; None is allowed as an output value.
    output: int | float | str | bool | None = Field(
        description=(
            "Filled value for the target column. This value should be JSON-compatible "
            "and match the target column type in the original DataFrame."
        )
    )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def fillna(df: pd.DataFrame, target_column_name: str, max_examples: int = 500) -> PreparedTask:
    """Create a prepared task for filling missing values in a DataFrame column.

    Validates the inputs, then builds few-shot instructions from rows of
    ``df`` whose target column is populated and wraps them in a
    ``PreparedTask`` that returns ``FillNaResponse`` objects.

    Args:
        df (pd.DataFrame): Source DataFrame containing the data with missing values.
        target_column_name (str): Name of the column to fill missing values for.
            This column must exist in the DataFrame and contain at least one
            non-null value to serve as an example.
        max_examples (int): Maximum number of example rows to use for few-shot
            learning. Defaults to 500. Higher values provide more context
            but increase token usage and processing time.

    Returns:
        PreparedTask configured for missing value imputation with:
        - Instructions built from example rows of the DataFrame
        - FillNaResponse format for structured output

        No sampling parameters are stored on the task. Provide
        ``temperature=0.0`` and ``top_p=1.0`` to the API call for
        deterministic output.

    Raises:
        ValueError: If the DataFrame is empty, max_examples is not a positive
            integer, target_column_name doesn't exist in the DataFrame, or the
            target column contains no non-null values for training examples.

    Example:
        ```python
        import pandas as pd
        from openaivec import pandas_ext  # Required for .ai accessor
        from openaivec.task.table import fillna

        df = pd.DataFrame({
            "product": ["laptop", "phone", "tablet", "laptop"],
            "brand": ["Apple", "Samsung", None, "Dell"],
            "price": [1200, 800, 600, 1000]
        })

        # Create task to fill missing brand values
        task = fillna(df, "brand")

        # Use with pandas AI accessor
        missing_brands = df[df["brand"].isna()].ai.task(task)
        ```
    """
    # Validate cheaply before any prompt construction takes place.
    if df.empty:
        raise ValueError("DataFrame is empty.")
    if not isinstance(max_examples, int) or max_examples <= 0:
        raise ValueError("max_examples must be a positive integer.")
    if target_column_name not in df.columns:
        raise ValueError(f"Column '{target_column_name}' does not exist in the DataFrame.")
    # Without at least one populated target cell there is nothing to learn from.
    if not df[target_column_name].notna().any():
        raise ValueError(f"Column '{target_column_name}' contains no non-null values for training examples.")

    instructions = get_instructions(df, target_column_name, max_examples)
    return PreparedTask(instructions=instructions, response_format=FillNaResponse)
|