openaivec 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 anaregdesign
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,153 @@
1
+ Metadata-Version: 2.1
2
+ Name: openaivec
3
+ Version: 0.2.1
4
+ Summary:
5
+ Home-page: https://github.com/anaregdesign/vectorize-openai
6
+ License: MIT
7
+ Author: Hiroki Mizukami
8
+ Author-email: hmizukami@microsoft.com
9
+ Requires-Python: >=3.10,<4.0
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Requires-Dist: httpx[http2] (>=0.28.1,<0.29.0)
16
+ Requires-Dist: openai (>=1.57.2,<2.0.0)
17
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
18
+ Requires-Dist: pyspark (>=3.5.1,<4.0.0)
19
+ Project-URL: Repository, https://github.com/anaregdesign/vectorize-openai
20
+ Description-Content-Type: text/markdown
21
+
22
+ # vectorize-openai
23
+
24
+ A simple wrapper around the OpenAI client that vectorizes many requests into a single API call.
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ pip install git+https://github.com/anaregdesign/vectorize-openai.git
30
+ ```
31
+
32
+ ## Uninstall
33
+
34
+ ```bash
35
+ pip uninstall openaivec
36
+ ```
37
+
38
+ ## Basic Usage
39
+
40
+ ```python
41
+ import os
42
+ from openai import AzureOpenAI
43
+ from openaivec import VectorizedOpenAI
44
+
45
+ os.environ["AZURE_OPENAI_API_KEY"] = "<your_api_key>"
46
+ api_version = "2024-10-21"
47
+ azure_endpoint = "https://<your_resource_name>.openai.azure.com"
48
+ deployment_name = "<your_deployment_name>"
49
+
50
+ client = VectorizedOpenAI(
51
+ client=AzureOpenAI(
52
+ api_version=api_version,
53
+ azure_endpoint=azure_endpoint
54
+ ),
55
+ temperature=0.0,
56
+ top_p=1.0,
57
+ model_name=deployment_name,
58
+ system_message="Please answer simply with a simple “xx family” and do not output anything else."
59
+ )
60
+
61
+ client.predict(["panda", "rabit", "koala"]) # => ['bear family', 'rabbit family', 'koala family']
62
+ ```
63
+
64
+ ## Usage, process with pandas
65
+
66
+ ```python
67
+ import pandas as pd
68
+
69
+ ...
70
+
71
+ df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
72
+
73
+ df.assign(
74
+ kind=lambda df: client.predict(df.name)
75
+ )
76
+ ```
77
+
78
+ the result is:
79
+
80
+ | name | kind |
81
+ |--------|---------------|
82
+ | panda | bear family |
83
+ | rabbit | rabbit family |
84
+ | koala | koala family |
85
+
86
+ ## Using Azure OpenAI with Apache Spark UDF
87
+
88
+ Here's a simple example of parsing product names using OpenAI with an Apache Spark UDF.
89
+
90
+ You can use the `openaivec` package to create a UDF function to use with Apache Spark.
91
+ First, you need to create a `UDFBuilder` object with the configuration of your OpenAI deployment.
92
+
93
+ ```python
94
+ from openaivec.spark import UDFBuilder
95
+
96
+ udf = UDFBuilder(
97
+ api_key="<your-api-key>",
98
+ api_version="2024-10-21",
99
+ endpoint="https://<your-resource-name>.openai.azure.com",
100
+ model_name="<your-deployment-name>"
101
+ )
102
+
103
+ ```
104
+
105
+ Here you can use the `completion` method to create a UDF function to use with Apache Spark.
106
+
107
+ ```python
108
+ spark.udf.register("parse_taste", udf.completion("""
109
+ - Extract flavor-related information included in the product name. Only output the flavor name concisely, and nothing else.
110
+ - Minimize unnecessary adjectives regarding the flavor as much as possible.
111
+ - Example:
112
+ - Hokkaido Milk → Milk
113
+ - Uji Matcha → Matcha
114
+
115
+ """))
116
+
117
+ spark.udf.register("parse_product", udf.completion("""
118
+ - Extract the type of food included in the product name. Only output the food category and nothing else.
119
+ - Example output:
120
+ - Smoothie
121
+ - Milk Tea
122
+ - Protein Bar
123
+ """))
124
+ ```
125
+
126
+ and then you can use the UDF function in your queries.
127
+
128
+ ```sparksql
129
+ select id,
130
+ product_name,
131
+ parse_taste(product_name) as taste,
132
+ parse_product(product_name) as product
133
+ from product_names
134
+ ```
135
+
136
+ Output:
137
+
138
+ | id | product_name | taste | product |
139
+ |---------------|--------------------------------------|-----------|-------------|
140
+ | 4414732714624 | Cafe Mocha Smoothie (Trial Size) | Mocha | Smoothie |
141
+ | 4200162318339 | Dark Chocolate Tea (New Product) | Chocolate | Tea |
142
+ | 4920122084098 | Cafe Mocha Protein Bar (Trial Size) | Mocha | Protein Bar |
143
+ | 4468864478874 | Dark Chocolate Smoothie (On Sale) | Chocolate | Smoothie |
144
+ | 4036242144725 | Uji Matcha Tea (New Product) | Matcha | Tea |
145
+ | 4847798245741 | Hokkaido Milk Tea (Trial Size) | Milk | Milk Tea |
146
+ | 4449574211957 | Dark Chocolate Smoothie (Trial Size) | Chocolate | Smoothie |
147
+ | 4127044426148 | Fruit Mix Tea (Trial Size) | Fruit | Tea |
148
+ | ... | ... | ... | ... |
149
+
150
+
151
+
152
+
153
+
@@ -0,0 +1,131 @@
1
+ # vectorize-openai
2
+
3
+ A simple wrapper around the OpenAI client that vectorizes many requests into a single API call.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install git+https://github.com/anaregdesign/vectorize-openai.git
9
+ ```
10
+
11
+ ## Uninstall
12
+
13
+ ```bash
14
+ pip uninstall openaivec
15
+ ```
16
+
17
+ ## Basic Usage
18
+
19
+ ```python
20
+ import os
21
+ from openai import AzureOpenAI
22
+ from openaivec import VectorizedOpenAI
23
+
24
+ os.environ["AZURE_OPENAI_API_KEY"] = "<your_api_key>"
25
+ api_version = "2024-10-21"
26
+ azure_endpoint = "https://<your_resource_name>.openai.azure.com"
27
+ deployment_name = "<your_deployment_name>"
28
+
29
+ client = VectorizedOpenAI(
30
+ client=AzureOpenAI(
31
+ api_version=api_version,
32
+ azure_endpoint=azure_endpoint
33
+ ),
34
+ temperature=0.0,
35
+ top_p=1.0,
36
+ model_name=deployment_name,
37
+ system_message="Please answer simply with a simple “xx family” and do not output anything else."
38
+ )
39
+
40
+ client.predict(["panda", "rabit", "koala"]) # => ['bear family', 'rabbit family', 'koala family']
41
+ ```
42
+
43
+ ## Usage, process with pandas
44
+
45
+ ```python
46
+ import pandas as pd
47
+
48
+ ...
49
+
50
+ df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
51
+
52
+ df.assign(
53
+ kind=lambda df: client.predict(df.name)
54
+ )
55
+ ```
56
+
57
+ the result is:
58
+
59
+ | name | kind |
60
+ |--------|---------------|
61
+ | panda | bear family |
62
+ | rabbit | rabbit family |
63
+ | koala | koala family |
64
+
65
+ ## Using Azure OpenAI with Apache Spark UDF
66
+
67
+ Here's a simple example of parsing product names using OpenAI with an Apache Spark UDF.
68
+
69
+ You can use the `openaivec` package to create a UDF function to use with Apache Spark.
70
+ First, you need to create a `UDFBuilder` object with the configuration of your OpenAI deployment.
71
+
72
+ ```python
73
+ from openaivec.spark import UDFBuilder
74
+
75
+ udf = UDFBuilder(
76
+ api_key="<your-api-key>",
77
+ api_version="2024-10-21",
78
+ endpoint="https://<your-resource-name>.openai.azure.com",
79
+ model_name="<your-deployment-name>"
80
+ )
81
+
82
+ ```
83
+
84
+ Here you can use the `completion` method to create a UDF function to use with Apache Spark.
85
+
86
+ ```python
87
+ spark.udf.register("parse_taste", udf.completion("""
88
+ - Extract flavor-related information included in the product name. Only output the flavor name concisely, and nothing else.
89
+ - Minimize unnecessary adjectives regarding the flavor as much as possible.
90
+ - Example:
91
+ - Hokkaido Milk → Milk
92
+ - Uji Matcha → Matcha
93
+
94
+ """))
95
+
96
+ spark.udf.register("parse_product", udf.completion("""
97
+ - Extract the type of food included in the product name. Only output the food category and nothing else.
98
+ - Example output:
99
+ - Smoothie
100
+ - Milk Tea
101
+ - Protein Bar
102
+ """))
103
+ ```
104
+
105
+ and then you can use the UDF function in your queries.
106
+
107
+ ```sparksql
108
+ select id,
109
+ product_name,
110
+ parse_taste(product_name) as taste,
111
+ parse_product(product_name) as product
112
+ from product_names
113
+ ```
114
+
115
+ Output:
116
+
117
+ | id | product_name | taste | product |
118
+ |---------------|--------------------------------------|-----------|-------------|
119
+ | 4414732714624 | Cafe Mocha Smoothie (Trial Size) | Mocha | Smoothie |
120
+ | 4200162318339 | Dark Chocolate Tea (New Product) | Chocolate | Tea |
121
+ | 4920122084098 | Cafe Mocha Protein Bar (Trial Size) | Mocha | Protein Bar |
122
+ | 4468864478874 | Dark Chocolate Smoothie (On Sale) | Chocolate | Smoothie |
123
+ | 4036242144725 | Uji Matcha Tea (New Product) | Matcha | Tea |
124
+ | 4847798245741 | Hokkaido Milk Tea (Trial Size) | Milk | Milk Tea |
125
+ | 4449574211957 | Dark Chocolate Smoothie (Trial Size) | Chocolate | Smoothie |
126
+ | 4127044426148 | Fruit Mix Tea (Trial Size) | Fruit | Tea |
127
+ | ... | ... | ... | ... |
128
+
129
+
130
+
131
+
@@ -0,0 +1,7 @@
1
from .embedding import EmbeddingOpenAI
from .vectorize import VectorizedOpenAI

# Use the conventional lowercase ``__all__``: Python's star-import machinery
# only recognizes that exact name; ``__ALL__`` is an ordinary (ignored) global.
__all__ = [
    "VectorizedOpenAI",
    "EmbeddingOpenAI",
]
@@ -0,0 +1,23 @@
1
+ from dataclasses import dataclass
2
+ from typing import List
3
+
4
+ import numpy as np
5
+ from numpy.typing import NDArray
6
+ from openai import OpenAI
7
+
8
+ from openaivec.util import map_unique_minibatch
9
+
10
# Lowercase ``__all__`` is the name Python recognizes for star-import control.
__all__ = ["EmbeddingOpenAI"]
11
+
12
+
13
@dataclass(frozen=True)
class EmbeddingOpenAI:
    """Thin embedding client that converts OpenAI embedding responses into
    numpy ``float32`` vectors, one vector per input sentence."""

    client: OpenAI   # a configured OpenAI (or AzureOpenAI) client
    model_name: str  # embedding model name (deployment name on Azure)

    def embed(self, sentences: List[str]) -> List[NDArray[np.float32]]:
        """Embed every sentence in a single API call, preserving input order."""
        response = self.client.embeddings.create(input=sentences, model=self.model_name)
        vectors: List[NDArray[np.float32]] = []
        for item in response.data:
            vectors.append(np.array(item.embedding, dtype=np.float32))
        return vectors

    def embed_minibatch(self, sentences: List[str], batch_size: int) -> List[NDArray[np.float32]]:
        """Embed ``sentences`` in chunks of ``batch_size``, computing each
        distinct sentence only once and fanning results back out."""
        return map_unique_minibatch(sentences, batch_size, self.embed)
@@ -0,0 +1,91 @@
1
+ import os
2
+ from dataclasses import dataclass
3
+ from typing import Iterator
4
+
5
+ import pandas as pd
6
+ from pyspark.sql.pandas.functions import pandas_udf
7
+ from pyspark.sql.types import StringType, ArrayType, FloatType
8
+
9
# Lowercase ``__all__`` is the name Python recognizes for star-import control.
__all__ = ["UDFBuilder"]
10
+
11
+
12
@dataclass(frozen=True)
class UDFBuilder:
    """Factory for pandas UDFs that call Azure OpenAI from Spark executors.

    Connection settings are captured once on the driver; each UDF builds its
    own client lazily inside the executor process the first time it runs.
    """

    api_key: str            # Azure OpenAI API key
    api_version: str        # e.g. "2024-10-21"
    endpoint: str           # e.g. https://<resource>.openai.azure.com
    model_name: str         # deployment name of the target model
    batch_size: int = 256   # rows sent to the API per request

    @classmethod
    def of_environment(cls) -> "UDFBuilder":
        """Build a ``UDFBuilder`` from the ``AZURE_OPENAI_*`` environment variables."""
        return cls(
            api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
            api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "2024-10-21"),
            endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
            model_name=os.environ.get("AZURE_OPENAI_MODEL_NAME"),
        )

    def __post_init__(self):
        # Raise explicitly instead of ``assert``: assertions are stripped when
        # Python runs with ``-O``, which would silently admit a misconfigured
        # builder (e.g. a missing environment variable in of_environment()).
        if not self.api_key:
            raise ValueError("api_key must be set")
        if not self.api_version:
            raise ValueError("api_version must be set")
        if not self.endpoint:
            raise ValueError("endpoint must be set")
        if not self.model_name:
            raise ValueError("model_name must be set")

    def completion(self, system_message: str):
        """Return a pandas UDF mapping a string column to chat completions
        generated under ``system_message``."""

        @pandas_udf(StringType())
        def fn(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
            # Imports run on the executor, where these packages must be installed.
            import httpx
            import pandas as pd
            from openai import AzureOpenAI

            from openaivec import VectorizedOpenAI

            client = AzureOpenAI(
                api_version=self.api_version,
                azure_endpoint=self.endpoint,
                # NOTE(review): verify=False disables TLS certificate checks;
                # confirm this is intentional for the target environment.
                http_client=httpx.Client(http2=True, verify=False),
                api_key=self.api_key,
            )

            client_vec = VectorizedOpenAI(
                client=client,
                model_name=self.model_name,
                system_message=system_message,
                top_p=1.0,
                temperature=0.0,
            )

            for part in col:
                yield pd.Series(
                    client_vec.predict_minibatch(part.tolist(), self.batch_size)
                )

        return fn

    def embedding(self):
        """Return a pandas UDF mapping a string column to float32 embedding arrays."""

        @pandas_udf(ArrayType(FloatType()))
        def fn(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
            # Imports run on the executor, where these packages must be installed.
            import httpx
            from openai import AzureOpenAI

            from openaivec.embedding import EmbeddingOpenAI

            client = AzureOpenAI(
                api_version=self.api_version,
                azure_endpoint=self.endpoint,
                # NOTE(review): verify=False disables TLS certificate checks;
                # confirm this is intentional for the target environment.
                http_client=httpx.Client(http2=True, verify=False),
                api_key=self.api_key,
            )

            client_emb = EmbeddingOpenAI(
                client=client,
                model_name=self.model_name,
            )

            for part in col:
                yield pd.Series(
                    client_emb.embed_minibatch(part.tolist(), self.batch_size)
                )

        return fn
@@ -0,0 +1,76 @@
1
+ from typing import List
2
+ from unittest import TestCase
3
+
4
+ from openaivec.util import split_to_minibatch, map_minibatch, map_unique, map_unique_minibatch
5
+
6
+
7
+ class TestMappingFunctions(TestCase):
8
+
9
+ def test_split_to_minibatch_normal(self):
10
+ b = [1, 2, 3, 4, 5]
11
+ batch_size = 2
12
+ expected = [[1, 2], [3, 4], [5]]
13
+ self.assertEqual(split_to_minibatch(b, batch_size), expected)
14
+
15
+ def test_split_to_minibatch_empty(self):
16
+ b: List[int] = []
17
+ batch_size = 3
18
+ expected: List[List[int]] = []
19
+ self.assertEqual(split_to_minibatch(b, batch_size), expected)
20
+
21
+ def test_map_minibatch(self):
22
+ # Function that doubles each element in the batch.
23
+ def double_list(lst: List[int]) -> List[int]:
24
+ return [x * 2 for x in lst]
25
+
26
+ b = [1, 2, 3, 4, 5]
27
+ batch_size = 2
28
+ # Batches: [1,2] -> [2,4], [3,4] -> [6,8], [5] -> [10]
29
+ expected = [2, 4, 6, 8, 10]
30
+ self.assertEqual(map_minibatch(b, batch_size, double_list), expected)
31
+
32
+ def test_map_minibatch_batch_size_one(self):
33
+ # Identity function: returns the list as is.
34
+ def identity(lst: List[int]) -> List[int]:
35
+ return lst
36
+
37
+ b = [1, 2, 3, 4]
38
+ batch_size = 1
39
+ expected = [1, 2, 3, 4]
40
+ self.assertEqual(map_minibatch(b, batch_size, identity), expected)
41
+
42
+ def test_map_minibatch_batch_size_greater_than_list(self):
43
+ def identity(lst: List[int]) -> List[int]:
44
+ return lst
45
+
46
+ b = [1, 2, 3]
47
+ batch_size = 5
48
+ expected = [1, 2, 3]
49
+ self.assertEqual(map_minibatch(b, batch_size, identity), expected)
50
+
51
+ def test_map_unique(self):
52
+ # Function that squares each element.
53
+ def square_list(lst: List[int]) -> List[int]:
54
+ return [x * x for x in lst]
55
+
56
+ b = [3, 2, 3, 1]
57
+ # Unique order preserved using dict.fromkeys: [3, 2, 1]
58
+ # After applying f: [9, 4, 1]
59
+ # Mapping back for original list: [9, 4, 9, 1]
60
+ expected = [9, 4, 9, 1]
61
+ self.assertEqual(map_unique(b, square_list), expected)
62
+
63
+ def test_map_unique_minibatch(self):
64
+ # Function that doubles each element.
65
+ def double_list(lst: List[int]) -> List[int]:
66
+ return [x * 2 for x in lst]
67
+
68
+ b = [1, 2, 1, 3]
69
+ batch_size = 2
70
+ # Unique order: [1, 2, 3]
71
+ # Using map_minibatch on unique values:
72
+ # Split [1,2,3] with batch_size=2 -> [[1,2], [3]]
73
+ # Apply function: [[2,4], [6]] -> flattened to [2,4,6]
74
+ # Mapping back for original list: [2, 4, 2, 6]
75
+ expected = [2, 4, 2, 6]
76
+ self.assertEqual(map_unique_minibatch(b, batch_size, double_list), expected)
@@ -0,0 +1,38 @@
1
+ from itertools import chain
2
+ from typing import List, TypeVar, Callable
3
+
4
+ T = TypeVar("T")
5
+ U = TypeVar("U")
6
+
7
def split_to_minibatch(b: List[T], batch_size: int) -> List[List[T]]:
    """Split ``b`` into consecutive sublists of at most ``batch_size`` items.

    The final batch may be shorter when ``len(b)`` is not a multiple of
    ``batch_size``; an empty input yields an empty list of batches.

    Raises:
        ValueError: if ``batch_size`` is not a positive integer. (Previously a
            negative ``batch_size`` silently returned ``[]``, dropping input.)
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    return [b[i:i + batch_size] for i in range(0, len(b), batch_size)]
10
+
11
def map_minibatch(b: List[T], batch_size: int, f: Callable[[List[T]], List[U]]) -> List[U]:
    """Apply ``f`` to ``b`` one chunk at a time and concatenate the results.

    ``b`` is partitioned into chunks of at most ``batch_size`` items; ``f``
    receives each chunk as a list and its per-chunk outputs are flattened
    into one list in order.
    """
    flattened: List[U] = []
    for chunk in split_to_minibatch(b, batch_size):
        flattened.extend(f(chunk))
    return flattened
18
+
19
def map_unique(b: List[T], f: Callable[[List[T]], List[U]]) -> List[U]:
    """Apply ``f`` once to the distinct values of ``b`` and fan the results
    back out so the output aligns element-wise with the original list.

    Duplicate inputs therefore never trigger duplicate work inside ``f``.
    ``f`` must return exactly one output per input, in the same order.
    """
    # dict.fromkeys deduplicates while keeping first-seen order.
    distinct = list(dict.fromkeys(b))
    position = {value: idx for idx, value in enumerate(distinct)}
    outputs = f(distinct)
    return [outputs[position[value]] for value in b]
30
+
31
+
32
def map_unique_minibatch(b: List[T], batch_size: int, f: Callable[[List[T]], List[U]]) -> List[U]:
    """Deduplicate ``b``, run ``f`` over the unique values in chunks of
    ``batch_size``, and map the results back onto the original ordering.
    """

    def in_batches(values: List[T]) -> List[U]:
        # Receives only the deduplicated values from map_unique.
        return map_minibatch(values, batch_size, f)

    return map_unique(b, in_batches)
@@ -0,0 +1,108 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List
3
+
4
+ from openai import OpenAI
5
+ from openai.types.chat import ParsedChatCompletion
6
+ from pydantic import BaseModel
7
+
8
+ from openaivec.util import map_unique_minibatch
9
+
10
# Lowercase ``__all__`` is the name Python recognizes for star-import control.
__all__ = ["VectorizedOpenAI"]
11
+
12
+
13
def vectorize_system_message(system_message: str) -> str:
    """Wrap the caller's system message in the XML prompt template that tells
    the model to answer a JSON batch of numbered user messages one by one.

    The doubled braces (``{{`` / ``}}``) are f-string escapes that render as
    literal braces in the JSON examples shown to the model.
    """
    return f"""
    <SystemMessage>
        <Instructions>
            <Instruction>{system_message}</Instruction>
            <Instruction>
                You will receive multiple user messages at once.
                Please provide an appropriate response to each message individually.
            </Instruction>
        </Instructions>
        <Examples>
            <Example>
                <Input>
                    {{
                        "user_messages": [
                            {{
                                "id": 1,
                                "text": "{{user_message_1}}"
                            }},
                            {{
                                "id": 2,
                                "text": "{{user_message_2}}"
                            }}
                        ]
                    }}
                </Input>
                <Output>
                    {{
                        "assistant_messages": [
                            {{
                                "id": 1,
                                "text": "{{assistant_response_1}}"
                            }},
                            {{
                                "id": 2,
                                "text": "{{assistant_response_2}}"
                            }}
                        ]
                    }}
                </Output>
            </Example>
        </Examples>
    </SystemMessage>
    """
+
58
+
59
class Message(BaseModel):
    """One numbered message; ``id`` ties an assistant reply back to its prompt."""

    id: int
    text: str
62
+
63
+
64
class Request(BaseModel):
    """JSON payload sent to the model: the whole batch of user messages."""

    user_messages: List[Message]
66
+
67
+
68
class Response(BaseModel):
    """Structured output parsed from the model: one reply per user message."""

    assistant_messages: List[Message]
70
+
71
+
72
@dataclass(frozen=True)
class VectorizedOpenAI:
    """Batches many prompts into one structured chat-completion request and
    returns the per-prompt answers in input order."""

    client: OpenAI
    model_name: str  # for Azure, this is the deployment name
    system_message: str
    temperature: float = 0.0
    top_p: float = 1.0
    _vectorized_system_message: str = field(init=False)

    def __post_init__(self):
        # Frozen dataclass: bypass immutability once to cache the derived prompt.
        object.__setattr__(
            self,
            "_vectorized_system_message",
            vectorize_system_message(self.system_message),
        )

    def request(self, user_messages: List[Message]) -> ParsedChatCompletion[Response]:
        """Send one batched request covering every message in ``user_messages``."""
        payload = Request(user_messages=user_messages).model_dump_json()
        return self.client.beta.chat.completions.parse(
            model=self.model_name,
            messages=[
                {"role": "system", "content": self._vectorized_system_message},
                {"role": "user", "content": payload},
            ],
            temperature=self.temperature,
            top_p=self.top_p,
            response_format=Response,
        )

    def predict(self, user_messages: List[str]) -> List[str]:
        """Return one assistant reply per input string, aligned with the input order."""
        numbered = [Message(id=index, text=body) for index, body in enumerate(user_messages)]
        completion = self.request(numbered)
        parsed = completion.choices[0].message.parsed
        # Replies may come back in any order; realign them by id.
        reply_by_id = {reply.id: reply.text for reply in parsed.assistant_messages}
        return [reply_by_id[message.id] for message in numbered]

    def predict_minibatch(self, user_messages: List[str], batch_size: int) -> List[str]:
        """``predict`` applied over the unique inputs in batches of ``batch_size``."""
        return map_unique_minibatch(user_messages, batch_size, self.predict)
@@ -0,0 +1,27 @@
1
+ [tool.poetry]
2
+ name = "openaivec"
3
+ version = "0.2.1"
4
+ description = ""
5
+ authors = ["Hiroki Mizukami <hmizukami@microsoft.com>"]
6
+ license = "MIT License"
7
+ readme = "README.md"
8
+ homepage = "https://github.com/anaregdesign/vectorize-openai"
9
+ repository = "https://github.com/anaregdesign/vectorize-openai"
10
+ packages = [
11
+ { include = "openaivec", from = "." }
12
+ ]
13
+
14
+ [tool.poetry.dependencies]
15
+ python = "^3.10"
16
+ pandas = "^2.2.3"
17
+ pyspark = "^3.5.1"
18
+ openai = "^1.57.2"
19
+ httpx = {extras = ["http2"], version = "^0.28.1"}
20
+
21
+ [tool.poetry.dev-dependencies]
22
+ pytest = "^8.3.4"
23
+
24
+
25
+ [build-system]
26
+ requires = ["poetry-core"]
27
+ build-backend = "poetry.core.masonry.api"