openaivec 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec-0.2.1/LICENSE +21 -0
- openaivec-0.2.1/PKG-INFO +153 -0
- openaivec-0.2.1/README.md +131 -0
- openaivec-0.2.1/openaivec/__init__.py +7 -0
- openaivec-0.2.1/openaivec/embedding.py +23 -0
- openaivec-0.2.1/openaivec/spark.py +91 -0
- openaivec-0.2.1/openaivec/test_util.py +76 -0
- openaivec-0.2.1/openaivec/util.py +38 -0
- openaivec-0.2.1/openaivec/vectorize.py +108 -0
- openaivec-0.2.1/pyproject.toml +27 -0
openaivec-0.2.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 anaregdesign
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
openaivec-0.2.1/PKG-INFO
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: openaivec
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary:
|
|
5
|
+
Home-page: https://github.com/anaregdesign/vectorize-openai
|
|
6
|
+
License: MIT
|
|
7
|
+
Author: Hiroki Mizukami
|
|
8
|
+
Author-email: hmizukami@microsoft.com
|
|
9
|
+
Requires-Python: >=3.10,<4.0
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Dist: httpx[http2] (>=0.28.1,<0.29.0)
|
|
16
|
+
Requires-Dist: openai (>=1.57.2,<2.0.0)
|
|
17
|
+
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
|
18
|
+
Requires-Dist: pyspark (>=3.5.1,<4.0.0)
|
|
19
|
+
Project-URL: Repository, https://github.com/anaregdesign/vectorize-openai
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# vectorize-openai
|
|
23
|
+
|
|
24
|
+
Simple wrapper of OpenAI that vectorizes multiple requests into a single API call.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install git+https://github.com/anaregdesign/vectorize-openai.git
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Uninstall
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip uninstall openaivec
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Basic Usage
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import os
|
|
42
|
+
from openai import AzureOpenAI
|
|
43
|
+
from openaivec import VectorizedOpenAI
|
|
44
|
+
|
|
45
|
+
os.environ["AZURE_OPENAI_API_KEY"] = "<your_api_key>"
|
|
46
|
+
api_version = "2024-10-21"
|
|
47
|
+
azure_endpoint = "https://<your_resource_name>.openai.azure.com"
|
|
48
|
+
deployment_name = "<your_deployment_name>"
|
|
49
|
+
|
|
50
|
+
client = VectorizedOpenAI(
|
|
51
|
+
client=AzureOpenAI(
|
|
52
|
+
api_version=api_version,
|
|
53
|
+
azure_endpoint=azure_endpoint
|
|
54
|
+
),
|
|
55
|
+
temperature=0.0,
|
|
56
|
+
top_p=1.0,
|
|
57
|
+
model_name=deployment_name,
|
|
58
|
+
system_message="Please answer simply with a simple “xx family” and do not output anything else."
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
client.predict(["panda", "rabit", "koala"]) # => ['bear family', 'rabbit family', 'koala family']
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Usage, process with pandas
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import pandas as pd
|
|
68
|
+
|
|
69
|
+
...
|
|
70
|
+
|
|
71
|
+
df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
|
|
72
|
+
|
|
73
|
+
df.assign(
|
|
74
|
+
kind=lambda df: client.predict(df.name)
|
|
75
|
+
)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
the result is:
|
|
79
|
+
|
|
80
|
+
| name | kind |
|
|
81
|
+
|--------|---------------|
|
|
82
|
+
| panda | bear family |
|
|
83
|
+
| rabbit | rabbit family |
|
|
84
|
+
| koala | koala family |
|
|
85
|
+
|
|
86
|
+
## Using Azure OpenAI with Apache Spark UDF
|
|
87
|
+
|
|
88
|
+
Here's a simple example of parsing product names using OpenAI with an Apache Spark UDF.
|
|
89
|
+
|
|
90
|
+
You can use the `openaivec` package to create a UDF function to use with Apache Spark.
|
|
91
|
+
First, create a `UDFBuilder` object with the configuration of your OpenAI deployment.
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from openaivec.spark import UDFBuilder
|
|
95
|
+
|
|
96
|
+
udf = UDFBuilder(
|
|
97
|
+
api_key="<your-api-key>",
|
|
98
|
+
api_version="2024-10-21",
|
|
99
|
+
endpoint="https://<your-resource-name>.openai.azure.com",
|
|
100
|
+
    model_name="<your-deployment-name>"
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Here you can use the `completion` method to create a UDF and register it with Apache Spark.
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
spark.udf.register("parse_taste", udf.completion("""
|
|
109
|
+
- Extract flavor-related information included in the product name. Only output the flavor name concisely, and nothing else.
|
|
110
|
+
- Minimize unnecessary adjectives regarding the flavor as much as possible.
|
|
111
|
+
- Example:
|
|
112
|
+
- Hokkaido Milk → Milk
|
|
113
|
+
- Uji Matcha → Matcha
|
|
114
|
+
|
|
115
|
+
"""))
|
|
116
|
+
|
|
117
|
+
spark.udf.register("parse_product", udf.completion("""
|
|
118
|
+
- Extract the type of food included in the product name. Only output the food category and nothing else.
|
|
119
|
+
- Example output:
|
|
120
|
+
- Smoothie
|
|
121
|
+
- Milk Tea
|
|
122
|
+
- Protein Bar
|
|
123
|
+
"""))
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
and then you can use the UDF function in your queries.
|
|
127
|
+
|
|
128
|
+
```sparksql
|
|
129
|
+
select id,
|
|
130
|
+
product_name,
|
|
131
|
+
parse_taste(product_name) as taste,
|
|
132
|
+
parse_product(product_name) as product
|
|
133
|
+
from product_names
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Output:
|
|
137
|
+
|
|
138
|
+
| id | product_name | taste | product |
|
|
139
|
+
|---------------|--------------------------------------|-----------|-------------|
|
|
140
|
+
| 4414732714624 | Cafe Mocha Smoothie (Trial Size) | Mocha | Smoothie |
|
|
141
|
+
| 4200162318339 | Dark Chocolate Tea (New Product) | Chocolate | Tea |
|
|
142
|
+
| 4920122084098 | Cafe Mocha Protein Bar (Trial Size) | Mocha | Protein Bar |
|
|
143
|
+
| 4468864478874 | Dark Chocolate Smoothie (On Sale) | Chocolate | Smoothie |
|
|
144
|
+
| 4036242144725 | Uji Matcha Tea (New Product) | Matcha | Tea |
|
|
145
|
+
| 4847798245741 | Hokkaido Milk Tea (Trial Size) | Milk | Milk Tea |
|
|
146
|
+
| 4449574211957 | Dark Chocolate Smoothie (Trial Size) | Chocolate | Smoothie |
|
|
147
|
+
| 4127044426148 | Fruit Mix Tea (Trial Size) | Fruit | Tea |
|
|
148
|
+
| ... | ... | ... | ... |
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# vectorize-openai
|
|
2
|
+
|
|
3
|
+
Simple wrapper of OpenAI that vectorizes multiple requests into a single API call.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install git+https://github.com/anaregdesign/vectorize-openai.git
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Uninstall
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip uninstall openaivec
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Basic Usage
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
import os
|
|
21
|
+
from openai import AzureOpenAI
|
|
22
|
+
from openaivec import VectorizedOpenAI
|
|
23
|
+
|
|
24
|
+
os.environ["AZURE_OPENAI_API_KEY"] = "<your_api_key>"
|
|
25
|
+
api_version = "2024-10-21"
|
|
26
|
+
azure_endpoint = "https://<your_resource_name>.openai.azure.com"
|
|
27
|
+
deployment_name = "<your_deployment_name>"
|
|
28
|
+
|
|
29
|
+
client = VectorizedOpenAI(
|
|
30
|
+
client=AzureOpenAI(
|
|
31
|
+
api_version=api_version,
|
|
32
|
+
azure_endpoint=azure_endpoint
|
|
33
|
+
),
|
|
34
|
+
temperature=0.0,
|
|
35
|
+
top_p=1.0,
|
|
36
|
+
model_name=deployment_name,
|
|
37
|
+
system_message="Please answer simply with a simple “xx family” and do not output anything else."
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
client.predict(["panda", "rabit", "koala"]) # => ['bear family', 'rabbit family', 'koala family']
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage, process with pandas
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import pandas as pd
|
|
47
|
+
|
|
48
|
+
...
|
|
49
|
+
|
|
50
|
+
df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
|
|
51
|
+
|
|
52
|
+
df.assign(
|
|
53
|
+
kind=lambda df: client.predict(df.name)
|
|
54
|
+
)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
the result is:
|
|
58
|
+
|
|
59
|
+
| name | kind |
|
|
60
|
+
|--------|---------------|
|
|
61
|
+
| panda | bear family |
|
|
62
|
+
| rabbit | rabbit family |
|
|
63
|
+
| koala | koala family |
|
|
64
|
+
|
|
65
|
+
## Using Azure OpenAI with Apache Spark UDF
|
|
66
|
+
|
|
67
|
+
Here's a simple example of parsing product names using OpenAI with an Apache Spark UDF.
|
|
68
|
+
|
|
69
|
+
You can use the `openaivec` package to create a UDF function to use with Apache Spark.
|
|
70
|
+
First, create a `UDFBuilder` object with the configuration of your OpenAI deployment.
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from openaivec.spark import UDFBuilder
|
|
74
|
+
|
|
75
|
+
udf = UDFBuilder(
|
|
76
|
+
api_key="<your-api-key>",
|
|
77
|
+
api_version="2024-10-21",
|
|
78
|
+
endpoint="https://<your-resource-name>.openai.azure.com",
|
|
79
|
+
    model_name="<your-deployment-name>"
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Here you can use the `completion` method to create a UDF and register it with Apache Spark.
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
spark.udf.register("parse_taste", udf.completion("""
|
|
88
|
+
- Extract flavor-related information included in the product name. Only output the flavor name concisely, and nothing else.
|
|
89
|
+
- Minimize unnecessary adjectives regarding the flavor as much as possible.
|
|
90
|
+
- Example:
|
|
91
|
+
- Hokkaido Milk → Milk
|
|
92
|
+
- Uji Matcha → Matcha
|
|
93
|
+
|
|
94
|
+
"""))
|
|
95
|
+
|
|
96
|
+
spark.udf.register("parse_product", udf.completion("""
|
|
97
|
+
- Extract the type of food included in the product name. Only output the food category and nothing else.
|
|
98
|
+
- Example output:
|
|
99
|
+
- Smoothie
|
|
100
|
+
- Milk Tea
|
|
101
|
+
- Protein Bar
|
|
102
|
+
"""))
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
and then you can use the UDF function in your queries.
|
|
106
|
+
|
|
107
|
+
```sparksql
|
|
108
|
+
select id,
|
|
109
|
+
product_name,
|
|
110
|
+
parse_taste(product_name) as taste,
|
|
111
|
+
parse_product(product_name) as product
|
|
112
|
+
from product_names
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Output:
|
|
116
|
+
|
|
117
|
+
| id | product_name | taste | product |
|
|
118
|
+
|---------------|--------------------------------------|-----------|-------------|
|
|
119
|
+
| 4414732714624 | Cafe Mocha Smoothie (Trial Size) | Mocha | Smoothie |
|
|
120
|
+
| 4200162318339 | Dark Chocolate Tea (New Product) | Chocolate | Tea |
|
|
121
|
+
| 4920122084098 | Cafe Mocha Protein Bar (Trial Size) | Mocha | Protein Bar |
|
|
122
|
+
| 4468864478874 | Dark Chocolate Smoothie (On Sale) | Chocolate | Smoothie |
|
|
123
|
+
| 4036242144725 | Uji Matcha Tea (New Product) | Matcha | Tea |
|
|
124
|
+
| 4847798245741 | Hokkaido Milk Tea (Trial Size) | Milk | Milk Tea |
|
|
125
|
+
| 4449574211957 | Dark Chocolate Smoothie (Trial Size) | Chocolate | Smoothie |
|
|
126
|
+
| 4127044426148 | Fruit Mix Tea (Trial Size) | Fruit | Tea |
|
|
127
|
+
| ... | ... | ... | ... |
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from numpy.typing import NDArray
|
|
6
|
+
from openai import OpenAI
|
|
7
|
+
|
|
8
|
+
from openaivec.util import map_unique_minibatch
|
|
9
|
+
|
|
10
|
+
__ALL__ = ["EmbeddingOpenAI"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True)
class EmbeddingOpenAI:
    """Thin wrapper around the OpenAI embeddings endpoint.

    Attributes:
        client: An initialized OpenAI (or AzureOpenAI) client instance.
        model_name: Name of the embedding model (the deployment name on Azure).
    """

    client: OpenAI
    model_name: str

    def embed(self, sentences: List[str]) -> List[NDArray[np.float32]]:
        """Embed all sentences in a single API call.

        Args:
            sentences: Texts to embed.

        Returns:
            One float32 vector per input sentence, in input order.
        """
        response = self.client.embeddings.create(input=sentences, model=self.model_name)
        vectors: List[NDArray[np.float32]] = []
        for item in response.data:
            vectors.append(np.array(item.embedding, dtype=np.float32))
        return vectors

    def embed_minibatch(self, sentences: List[str], batch_size: int) -> List[NDArray[np.float32]]:
        """Embed sentences after deduplication, in chunks of at most `batch_size`."""
        return map_unique_minibatch(sentences, batch_size, self.embed)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Iterator
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from pyspark.sql.pandas.functions import pandas_udf
|
|
7
|
+
from pyspark.sql.types import StringType, ArrayType, FloatType
|
|
8
|
+
|
|
9
|
+
__ALL__ = ["UDFBuilder"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class UDFBuilder:
    """Factory for Spark pandas UDFs backed by Azure OpenAI.

    Attributes:
        api_key: Azure OpenAI API key.
        api_version: Azure OpenAI REST API version, e.g. "2024-10-21".
        endpoint: Base URL of the Azure OpenAI resource.
        model_name: Deployment name used for completion/embedding requests.
        batch_size: Maximum number of unique values sent per API request.
    """

    api_key: str
    api_version: str
    endpoint: str
    model_name: str
    batch_size: int = 256

    @classmethod
    def of_environment(cls) -> "UDFBuilder":
        """Alternate constructor that reads the AZURE_OPENAI_* environment variables."""
        return cls(
            api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
            api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "2024-10-21"),
            endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
            model_name=os.environ.get("AZURE_OPENAI_MODEL_NAME"),
        )

    def __post_init__(self):
        # NOTE(review): `assert` is stripped under `python -O`; kept as-is so the
        # exception type raised on missing config (AssertionError) is unchanged
        # for existing callers. Consider `raise ValueError` in a future release.
        assert self.api_key, "api_key must be set"
        assert self.api_version, "api_version must be set"
        assert self.endpoint, "endpoint must be set"
        assert self.model_name, "model_name must be set"

    def completion(self, system_message: str):
        """Build a StringType pandas UDF mapping text through a chat completion.

        Args:
            system_message: Instruction applied to every input value.

        Returns:
            A pandas UDF suitable for `spark.udf.register`.
        """

        @pandas_udf(StringType())
        def fn(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
            # Imports are local so the closure is self-contained when Spark
            # serializes it to executors.
            import httpx
            import pandas as pd
            from openai import AzureOpenAI

            from openaivec import VectorizedOpenAI

            # SECURITY(review): verify=False disables TLS certificate
            # verification — confirm this is intentional for your environment.
            client = AzureOpenAI(
                api_version=self.api_version,
                azure_endpoint=self.endpoint,
                http_client=httpx.Client(http2=True, verify=False),
                api_key=self.api_key,
            )

            client_vec = VectorizedOpenAI(
                client=client,
                model_name=self.model_name,
                system_message=system_message,
                top_p=1.0,
                temperature=0.0,
            )

            for part in col:
                # Deduplicated, batched prediction over each partition chunk.
                yield pd.Series(client_vec.predict_minibatch(part.tolist(), self.batch_size))

        return fn

    def embedding(self):
        """Build an ArrayType(FloatType()) pandas UDF producing embedding vectors.

        Returns:
            A pandas UDF suitable for `spark.udf.register`.
        """

        @pandas_udf(ArrayType(FloatType()))
        def fn(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
            # Fix: import pandas locally (as `completion` does) so the closure
            # does not depend on the driver module's globals once deserialized
            # on Spark executors.
            import httpx
            import pandas as pd
            from openai import AzureOpenAI

            from openaivec.embedding import EmbeddingOpenAI

            # SECURITY(review): verify=False disables TLS certificate
            # verification — confirm this is intentional for your environment.
            client = AzureOpenAI(
                api_version=self.api_version,
                azure_endpoint=self.endpoint,
                http_client=httpx.Client(http2=True, verify=False),
                api_key=self.api_key,
            )

            client_emb = EmbeddingOpenAI(
                client=client,
                model_name=self.model_name,
            )

            for part in col:
                yield pd.Series(client_emb.embed_minibatch(part.tolist(), self.batch_size))

        return fn
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from unittest import TestCase
|
|
3
|
+
|
|
4
|
+
from openaivec.util import split_to_minibatch, map_minibatch, map_unique, map_unique_minibatch
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TestMappingFunctions(TestCase):
    """Unit tests for the minibatch/unique mapping helpers in openaivec.util."""

    def test_split_to_minibatch_normal(self):
        # A list that does not divide evenly leaves a short trailing chunk.
        self.assertEqual(
            split_to_minibatch([1, 2, 3, 4, 5], 2),
            [[1, 2], [3, 4], [5]],
        )

    def test_split_to_minibatch_empty(self):
        # An empty input yields no chunks at all.
        empty: List[int] = []
        self.assertEqual(split_to_minibatch(empty, 3), [])

    def test_map_minibatch(self):
        # Chunks [1,2], [3,4], [5] are doubled and flattened back together.
        self.assertEqual(
            map_minibatch([1, 2, 3, 4, 5], 2, lambda chunk: [v * 2 for v in chunk]),
            [2, 4, 6, 8, 10],
        )

    def test_map_minibatch_batch_size_one(self):
        # With batch_size=1 each element travels through `f` alone.
        self.assertEqual(
            map_minibatch([1, 2, 3, 4], 1, lambda chunk: chunk),
            [1, 2, 3, 4],
        )

    def test_map_minibatch_batch_size_greater_than_list(self):
        # A batch size larger than the list produces a single batch.
        self.assertEqual(
            map_minibatch([1, 2, 3], 5, lambda chunk: chunk),
            [1, 2, 3],
        )

    def test_map_unique(self):
        # Unique values [3, 2, 1] are squared once, then fanned back out to
        # the original positions: [9, 4, 9, 1].
        self.assertEqual(
            map_unique([3, 2, 3, 1], lambda chunk: [v * v for v in chunk]),
            [9, 4, 9, 1],
        )

    def test_map_unique_minibatch(self):
        # Unique values [1, 2, 3] are processed in chunks [1,2] and [3],
        # giving [2, 4, 6], then mapped back onto the duplicates.
        self.assertEqual(
            map_unique_minibatch([1, 2, 1, 3], 2, lambda chunk: [v * 2 for v in chunk]),
            [2, 4, 2, 6],
        )
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from itertools import chain
|
|
2
|
+
from typing import List, TypeVar, Callable
|
|
3
|
+
|
|
4
|
+
T = TypeVar("T")
|
|
5
|
+
U = TypeVar("U")
|
|
6
|
+
|
|
7
|
+
def split_to_minibatch(b: List[T], batch_size: int) -> List[List[T]]:
    """Partition `b` into consecutive chunks of at most `batch_size` elements.

    The final chunk may be shorter; an empty input yields an empty list.
    Raises ValueError if `batch_size` is zero (range step must be non-zero).
    """
    chunks: List[List[T]] = []
    for offset in range(0, len(b), batch_size):
        chunks.append(b[offset:offset + batch_size])
    return chunks
|
|
10
|
+
|
|
11
|
+
def map_minibatch(b: List[T], batch_size: int, f: Callable[[List[T]], List[U]]) -> List[U]:
    """Apply `f` to consecutive chunks of `b` and concatenate the results.

    `b` is processed in slices of at most `batch_size` elements; each call to
    `f` returns a list, and the per-chunk lists are flattened into one result.
    """
    flattened: List[U] = []
    for start in range(0, len(b), batch_size):
        flattened.extend(f(b[start:start + batch_size]))
    return flattened
|
|
18
|
+
|
|
19
|
+
def map_unique(b: List[T], f: Callable[[List[T]], List[U]]) -> List[U]:
    """Call `f` once over the unique values of `b`, then fan results back out.

    Uniqueness preserves first-seen order, so `f` never sees a duplicate; each
    position of the original list receives the value computed for its element.
    Elements of `b` must be hashable.
    """
    # dict.fromkeys deduplicates while keeping insertion order.
    distinct = list(dict.fromkeys(b))
    computed = f(distinct)
    lookup = dict(zip(distinct, computed))
    return [lookup[item] for item in b]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def map_unique_minibatch(b: List[T], batch_size: int, f: Callable[[List[T]], List[U]]) -> List[U]:
    """Deduplicate `b`, run `f` over the unique values in `batch_size` chunks,
    and map the results back onto the original order of `b`.
    """

    def in_batches(values: List[T]) -> List[U]:
        # Chunked application of `f` over the already-deduplicated values.
        return map_minibatch(values, batch_size, f)

    return map_unique(b, in_batches)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
from openai import OpenAI
|
|
5
|
+
from openai.types.chat import ParsedChatCompletion
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from openaivec.util import map_unique_minibatch
|
|
9
|
+
|
|
10
|
+
__ALL__ = ["VectorizedOpenAI"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def vectorize_system_message(system_message: str) -> str:
    """Wrap a user-supplied system message in the batching protocol prompt.

    The returned prompt tells the model it will receive several user messages
    as one JSON payload and must answer each one individually, following the
    request/response JSON shapes shown in the embedded example. The doubled
    braces are f-string escapes that emit literal `{`/`}` in the prompt.
    """
    return f"""
    <SystemMessage>
        <Instructions>
            <Instruction>{system_message}</Instruction>
            <Instruction>
                You will receive multiple user messages at once.
                Please provide an appropriate response to each message individually.
            </Instruction>
        </Instructions>
        <Examples>
            <Example>
                <Input>
                    {{
                        "user_messages": [
                            {{
                                "id": 1,
                                "text": "{{user_message_1}}"
                            }},
                            {{
                                "id": 2,
                                "text": "{{user_message_2}}"
                            }}
                        ]
                    }}
                </Input>
                <Output>
                    {{
                        "assistant_messages": [
                            {{
                                "id": 1,
                                "text": "{{assistant_response_1}}"
                            }},
                            {{
                                "id": 2,
                                "text": "{{assistant_response_2}}"
                            }}
                        ]
                    }}
                </Output>
            </Example>
        </Examples>
    </SystemMessage>
    """
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class Message(BaseModel):
    """One (id, text) pair in the batched request/response protocol."""

    # Index used to pair an assistant response with its originating user message.
    id: int
    # Message content: a user prompt on input, an assistant answer on output.
    text: str
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class Request(BaseModel):
    """JSON body sent to the model: every user message in the current batch."""

    user_messages: List[Message]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class Response(BaseModel):
    """Structured output parsed from the model: one reply per user message."""

    assistant_messages: List[Message]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass(frozen=True)
class VectorizedOpenAI:
    """Chat-completion client that answers a batch of prompts in one request.

    Attributes:
        client: An initialized OpenAI (or AzureOpenAI) client.
        model_name: Model identifier (the deployment name on Azure).
        system_message: Task instruction applied to every user message.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling parameter forwarded to the API.
    """

    client: OpenAI
    model_name: str  # on Azure this is the deployment name
    system_message: str
    temperature: float = 0.0
    top_p: float = 1.0
    # Derived prompt that embeds the batching protocol; set in __post_init__.
    _vectorized_system_message: str = field(init=False)

    def __post_init__(self):
        # object.__setattr__ is required because the dataclass is frozen.
        object.__setattr__(
            self,
            "_vectorized_system_message",
            vectorize_system_message(self.system_message),
        )

    def request(self, user_messages: List[Message]) -> ParsedChatCompletion[Response]:
        """Send one structured chat request carrying the whole batch.

        The batch is serialized as a `Request` JSON payload; the model is asked
        to return a `Response` via structured-output parsing.
        """
        payload = Request(user_messages=user_messages).model_dump_json()
        return self.client.beta.chat.completions.parse(
            model=self.model_name,
            messages=[
                {"role": "system", "content": self._vectorized_system_message},
                {"role": "user", "content": payload},
            ],
            temperature=self.temperature,
            top_p=self.top_p,
            response_format=Response,
        )

    def predict(self, user_messages: List[str]) -> List[str]:
        """Answer each prompt in `user_messages`, preserving input order.

        Raises KeyError if the model omits an id from its response.
        """
        indexed = [Message(id=i, text=text) for i, text in enumerate(user_messages)]
        parsed = self.request(indexed).choices[0].message.parsed
        by_id = {m.id: m.text for m in parsed.assistant_messages}
        # Re-order by the ids we assigned so output lines up with input.
        return [by_id[m.id] for m in indexed]

    def predict_minibatch(self, user_messages: List[str], batch_size: int) -> List[str]:
        """Like `predict`, but deduplicates inputs and batches the API calls."""
        return map_unique_minibatch(user_messages, batch_size, self.predict)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "openaivec"
|
|
3
|
+
version = "0.2.1"
|
|
4
|
+
description = ""
|
|
5
|
+
authors = ["Hiroki Mizukami <hmizukami@microsoft.com>"]
|
|
6
|
+
license = "MIT License"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
homepage = "https://github.com/anaregdesign/vectorize-openai"
|
|
9
|
+
repository = "https://github.com/anaregdesign/vectorize-openai"
|
|
10
|
+
packages = [
|
|
11
|
+
{ include = "openaivec", from = "." }
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[tool.poetry.dependencies]
|
|
15
|
+
python = "^3.10"
|
|
16
|
+
pandas = "^2.2.3"
|
|
17
|
+
pyspark = "^3.5.1"
|
|
18
|
+
openai = "^1.57.2"
|
|
19
|
+
httpx = {extras = ["http2"], version = "^0.28.1"}
|
|
20
|
+
|
|
21
|
+
[tool.poetry.dev-dependencies]
|
|
22
|
+
pytest = "^8.3.4"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
[build-system]
|
|
26
|
+
requires = ["poetry-core"]
|
|
27
|
+
build-backend = "poetry.core.masonry.api"
|