dactyl-generation 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dactyl_generation-0.0.1/LICENSE +21 -0
- dactyl_generation-0.0.1/PKG-INFO +47 -0
- dactyl_generation-0.0.1/README.md +21 -0
- dactyl_generation-0.0.1/dactyl_generation/__init__.py +0 -0
- dactyl_generation-0.0.1/dactyl_generation/anthropic_generation.py +146 -0
- dactyl_generation-0.0.1/dactyl_generation/bedrock_generation.py +185 -0
- dactyl_generation-0.0.1/dactyl_generation/constants.py +70 -0
- dactyl_generation-0.0.1/dactyl_generation/deepseek_generation.py +49 -0
- dactyl_generation-0.0.1/dactyl_generation/google_generation.py +81 -0
- dactyl_generation-0.0.1/dactyl_generation/mistral_generation.py +124 -0
- dactyl_generation-0.0.1/dactyl_generation/openai_generation.py +112 -0
- dactyl_generation-0.0.1/dactyl_generation/quick.py +138 -0
- dactyl_generation-0.0.1/dactyl_generation.egg-info/PKG-INFO +47 -0
- dactyl_generation-0.0.1/dactyl_generation.egg-info/SOURCES.txt +17 -0
- dactyl_generation-0.0.1/dactyl_generation.egg-info/dependency_links.txt +1 -0
- dactyl_generation-0.0.1/dactyl_generation.egg-info/requires.txt +13 -0
- dactyl_generation-0.0.1/dactyl_generation.egg-info/top_level.txt +1 -0
- dactyl_generation-0.0.1/pyproject.toml +36 -0
- dactyl_generation-0.0.1/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Shantanu Thorat
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dactyl_generation
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: LLM helper package to generate AI-generated texts.
|
|
5
|
+
Author: Shantanu Thorat
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: anthropic
|
|
13
|
+
Requires-Dist: litellm
|
|
14
|
+
Requires-Dist: mistralai
|
|
15
|
+
Requires-Dist: numpy==1.26.4
|
|
16
|
+
Requires-Dist: openai
|
|
17
|
+
Requires-Dist: pandas
|
|
18
|
+
Requires-Dist: protobuf
|
|
19
|
+
Requires-Dist: python-dotenv
|
|
20
|
+
Requires-Dist: Requests
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Requires-Dist: typing_extensions
|
|
23
|
+
Requires-Dist: google-generativeai
|
|
24
|
+
Requires-Dist: boto3
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# DACTYL-Generation
|
|
28
|
+
|
|
29
|
+
A Python package to generate LLM data from various APIs.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
```bash
|
|
33
|
+
pip install git+https://github.com/ShantanuT01/dactyl_generation.git
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
Load your environment variables (your API keys) first, before importing the library.
|
|
39
|
+
```python
|
|
40
|
+
# load environment variables first
|
|
41
|
+
from dotenv import load_dotenv
|
|
42
|
+
load_dotenv()
|
|
43
|
+
|
|
44
|
+
# now import library
|
|
45
|
+
from dactyl_generation.quick import *
|
|
46
|
+
|
|
47
|
+
```
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# DACTYL-Generation
|
|
2
|
+
|
|
3
|
+
A Python package to generate LLM data from various APIs.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
```bash
|
|
7
|
+
pip install git+https://github.com/ShantanuT01/dactyl_generation.git
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
## Usage
|
|
11
|
+
|
|
12
|
+
Load your environment variables (your API keys) first, before importing the library.
|
|
13
|
+
```python
|
|
14
|
+
# load environment variables first
|
|
15
|
+
from dotenv import load_dotenv
|
|
16
|
+
load_dotenv()
|
|
17
|
+
|
|
18
|
+
# now import library
|
|
19
|
+
from dactyl_generation.quick import *
|
|
20
|
+
|
|
21
|
+
```
|
|
File without changes
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generates texts using the Anthropic Batch API.
|
|
3
|
+
"""
|
|
4
|
+
import copy
|
|
5
|
+
|
|
6
|
+
import anthropic
|
|
7
|
+
import dotenv
|
|
8
|
+
import os
|
|
9
|
+
import numpy as np
|
|
10
|
+
from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
|
|
11
|
+
from anthropic.types.messages.batch_create_params import Request
|
|
12
|
+
import json
|
|
13
|
+
import requests
|
|
14
|
+
import pandas as pd
|
|
15
|
+
from typing import List
|
|
16
|
+
from datetime import datetime, timezone
|
|
17
|
+
from dactyl_generation.constants import *
|
|
18
|
+
dotenv.load_dotenv()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
ANTHROPIC_CLIENT = anthropic.Anthropic(
|
|
22
|
+
api_key = os.environ['ANTHROPIC_API_KEY'],
|
|
23
|
+
)
|
|
24
|
+
API_HEADERS = {"x-api-key": os.environ['ANTHROPIC_API_KEY'], "anthropic-version": "2023-06-01"}
|
|
25
|
+
|
|
26
|
+
def convert_openai_system_message_to_anthropic_system_message(openai_message: dict) -> dict:
    """
    Build an Anthropic system content block from an OpenAI system message.
    Cache control is not supported.

    Args:
        openai_message: OpenAI-style message dictionary; only its content is read.

    Returns:
        anthropic_system_prompt: text-type block carrying the message content.
    """
    return {TEXT: openai_message[CONTENT], TYPE: TEXT}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def convert_anthropic_system_message_to_openai_system_message(anthropic_message: dict) -> dict:
    """
    Build an OpenAI system message from an Anthropic system content block.
    Cache control is not supported.

    Args:
        anthropic_message: Anthropic-style block; only its text is read.

    Returns:
        openai_message: dictionary with the system role and the block's text as content.
    """
    return {ROLE: SYSTEM, CONTENT: anthropic_message[TEXT]}
|
|
56
|
+
|
|
57
|
+
def get_message_batch(prompts_df: pd.DataFrame) -> List[Request]:
    """
    Generate a batch of Anthropic requests from a DataFrame of prompts.

    Each row must hold a list of OpenAI-style messages in its prompt column.
    System messages are converted and passed as the ``system`` parameter, all
    other messages are passed as ``messages``, and every remaining column of
    the row is forwarded unchanged as a message-creation parameter.

    Args:
        prompts_df: DataFrame where each row is an API call to the Anthropic API.

    Returns:
        requests: list of requests with zero-padded ``request-<index>`` custom ids
            (empty when the DataFrame has no rows).
    """
    calls = prompts_df.to_dict(orient="records")
    # Number of decimal digits in the row count. Equal to int(log10(n)) + 1
    # for n >= 1, but also defined for an empty frame, where log10(0) is -inf
    # and converting it to int raises OverflowError.
    digits_length = len(str(len(calls)))
    requests = list()
    for i, call in enumerate(calls):
        system_messages = list()
        normal_messages = list()
        for message in call[PROMPT]:
            if message[ROLE] == SYSTEM:
                system_messages.append(convert_openai_system_message_to_anthropic_system_message(message))
            else:
                normal_messages.append(message)

        # Everything except the raw prompt is sent to the API as-is.
        message_parameters = {key: value for key, value in call.items() if key != PROMPT}
        message_parameters[SYSTEM] = system_messages
        message_parameters[MESSAGES] = normal_messages
        requests.append(
            Request(
                custom_id=f"request-{str(i).zfill(digits_length)}",
                params=MessageCreateParamsNonStreaming(**message_parameters),
            )
        )
    return requests
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def create_batch_job(prompts_df: pd.DataFrame) -> dict:
    """
    Submit the prompts as one Anthropic message batch and describe the job.

    Args:
        prompts_df: Dataframe containing prompts to run. A custom id column is
            added to this frame in place.

    Returns:
        request_data: dictionary with the batch id, the prompt rows (including
            their custom ids), the API name and a UTC timestamp string.
    """
    batch_requests = get_message_batch(prompts_df)
    custom_ids = [entry[CUSTOM_ID] for entry in batch_requests]
    message_batch = ANTHROPIC_CLIENT.messages.batches.create(requests=batch_requests)
    # Tag every prompt row with the id of the request built from it.
    prompts_df[CUSTOM_ID] = custom_ids

    job_info = dict()
    job_info[BATCH_ID] = message_batch.id
    job_info[PROMPTS] = prompts_df.to_dict(orient='records')
    job_info[API_CALL] = ANTHROPIC
    job_info[TIMESTAMP] = str(datetime.now(timezone.utc))
    return job_info
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def get_batch_job_output(file_path: str) -> pd.DataFrame:
    """
    Gets batch job results using saved metadata from a local JSON file.

    Args:
        file_path: local JSON file containing output of the `create_batch_job` function

    Returns:
        df: pandas DataFrame of generations, left-merged on the custom id with
            the saved prompt rows. The text is None for a result that carries
            no message.

    Raises:
        requests.HTTPError: if the results endpoint answers with an error status.
    """
    with open(file_path) as f:
        data = json.load(f)
    message_id = data[BATCH_ID]
    response = requests.get(f"https://api.anthropic.com/v1/messages/batches/{message_id}/results",headers=API_HEADERS)
    # Fail loudly on an HTTP error instead of parsing the error body as results.
    response.raise_for_status()

    generations = list()
    # Split on "\n" only: str.splitlines() also breaks on characters such as
    # U+2028, which JSON allows unescaped inside a string value.
    for line in response.text.split("\n"):
        if not line.strip():
            continue
        entry = json.loads(line)
        # NOTE(review): results that did not succeed are expected to have no
        # message; keep the row with an empty text rather than failing the
        # whole fetch - confirm against the batch results schema.
        message = entry[RESULT].get(MESSAGE)
        generations.append({
            CUSTOM_ID: entry[CUSTOM_ID],
            TEXT: message[CONTENT][0][TEXT] if message else None,
        })

    # Explicit columns keep the merge key present even when there are no results.
    generations = pd.DataFrame(generations, columns=[CUSTOM_ID, TEXT])
    generations[TIMESTAMP] = data[TIMESTAMP]
    prompt_rows = pd.DataFrame(data[PROMPTS])
    return generations.merge(prompt_rows, on=CUSTOM_ID, how='left')
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generates texts using AWS Bedrock APIs.
|
|
3
|
+
!!! note
|
|
4
|
+
Only supports AWS region US East 1!
|
|
5
|
+
"""
|
|
6
|
+
from litellm import completion
|
|
7
|
+
from typing import List
|
|
8
|
+
import os
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
from dactyl_generation.constants import *
|
|
12
|
+
os.environ['AWS_REGION']='us-east-1'
|
|
13
|
+
import boto3
|
|
14
|
+
import json
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def prompt(messages: List[dict], model: str, temperature: float, top_p: float, max_completion_tokens: int = 512) -> str:
    """
    Send one chat request through litellm and return the first choice's text.

    Args:
        messages: List of OpenAI messages
        model: name of model
        temperature: temperature parameter
        top_p: top p parameter
        max_completion_tokens: maximum number of tokens for completion

    Returns:
        response_content: string containing message content
    """
    sampling_options = {
        "temperature": temperature,
        "top_p": top_p,
        "max_completion_tokens": max_completion_tokens,
    }
    response = completion(model, messages, **sampling_options)
    first_choice = response.choices[0]
    return first_choice.message.content
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def format_llama_prompt(messages: List[dict]) -> str:
    """
    Formats OpenAI style messages into a Llama 3 style prompt string.

    The prompt starts with the begin-of-text token, contains one
    header/content/end-of-turn section per message, and ends with an open
    assistant header so the model continues as the assistant.

    Args:
        messages: list of dictionaries containing OpenAI style messages

    Returns:
        llama_prompt: formatted llama prompt
    """
    # The Llama 3 prompt format puts a blank line between each role header
    # and the content that follows it.
    header_break = "\n\n"
    turns = [
        LLAMA_START_HEADER + message[ROLE] + LLAMA_END_HEADER + header_break + message[CONTENT] + "<|eot_id|>"
        for message in messages
    ]
    assistant_header = f"{LLAMA_START_HEADER}assistant{LLAMA_END_HEADER}" + header_break
    return "<|begin_of_text|>" + "".join(turns) + assistant_header
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def create_jsonl_input_for_llama(prompts_df: pd.DataFrame, s3_path: str) -> pd.DataFrame:
    """
    Creates the JSONL batch input file and returns the prompts tagged with record ids.

    Every output line holds a ``CALL<7-digit index>`` record id and, as model
    input, the full DataFrame row with its prompt rendered as a Llama prompt
    string.

    Args:
        prompts_df: prompt dataframe containing OpenAI style messages; it is not modified
        s3_path: destination handed to ``DataFrame.to_json`` (e.g. a path in an S3 bucket)

    Returns:
        prompts_df_ret: copy of ``prompts_df`` (original prompts kept) with an
            added record id column.
    """
    # Work on explicit copies: pd.DataFrame(df) does not copy the underlying
    # data, so it is not a safe way to protect the caller's frame.
    formatted = prompts_df.copy()
    formatted[PROMPT] = formatted[PROMPT].apply(format_llama_prompt)

    rows = [
        {RECORDID: f"CALL{str(i).zfill(7)}", MODELINPUT: model_input}
        for i, model_input in enumerate(formatted.to_dict(orient="records"))
    ]
    # Explicit columns keep the record id column present for an empty frame.
    input_frame = pd.DataFrame(rows, columns=[RECORDID, MODELINPUT])
    # orient="records" never writes the index; index=False is omitted because
    # older pandas releases reject it for this orient.
    input_frame.to_json(s3_path, orient="records", lines=True)

    prompts_df_ret = prompts_df.copy()
    prompts_df_ret[RECORDID] = input_frame[RECORDID].to_list()
    return prompts_df_ret
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def create_batch_job(prompts_df: pd.DataFrame, s3_input_path: str, s3_output_path: str, model: str, role_arn: str, job_name: str) -> dict:
    """
    Write the batch input file and start a Bedrock model invocation job.

    !!! warning
        This function has not been tested yet!

    Args:
        prompts_df: Dataframe of OpenAI-style prompts.
        s3_input_path: Input data path.
        s3_output_path: Output data path.
        model: Bedrock model ID.
        role_arn: Role to run batch job.
        job_name: Name of job

    Returns:
        job_info: dictionary with the job ARN, the output path, the API name,
            the job name, the submitted rows and a UTC timestamp string.
    """
    inputted_frame = create_jsonl_input_for_llama(prompts_df, s3_input_path)
    bedrock = boto3.client(service_name="bedrock", region_name="us-east-1")
    response = bedrock.create_model_invocation_job(
        roleArn=role_arn,
        modelId=model,
        jobName=job_name,
        inputDataConfig={S3_INPUT_DATA_CONFIG: {S3URI: s3_input_path}},
        outputDataConfig={S3_OUTPUT_DATA_CONFIG: {S3URI: s3_output_path}},
    )

    # Record which model each submitted row was sent to.
    inputted_frame[MODEL] = model

    job_info = dict()
    job_info[JOB_ARN] = response.get(JOB_ARN)
    job_info[S3_OUTPUT_DATA_CONFIG] = s3_output_path
    job_info[API_CALL] = BEDROCK
    job_info[JOB_NAME] = job_name
    # Round-trip through JSON so the rows are plain JSON-serializable values.
    job_info[INPUT_FILE] = json.loads(inputted_frame.to_json(orient='records'))
    job_info[TIMESTAMP] = str(datetime.now(timezone.utc))
    return job_info
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def get_batch_job_output(file_path: str) -> pd.DataFrame:
    """
    Fetches batch job results given JSON file.

    Looks in the saved output location, under a folder named after the last
    segment of the job ARN, for the first object ending in ``.jsonl.out`` and
    joins its generations with the saved input rows.

    Args:
        file_path: JSON file containing jobArn.

    Returns:
        output_df: Dataframe containing generations.

    Raises:
        Exception: if no ``.jsonl.out`` object is found for the job.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)
    job_id = data[JOB_ARN].split("/")[-1]

    # "s3://bucket/some/prefix" -> ["s3:", "", "bucket", "some", "prefix"]
    uri_parts = data[S3_OUTPUT_DATA_CONFIG].split("/")
    bucket_name = uri_parts[2]
    key_prefix = "/".join(uri_parts[3:])
    # Make sure the prefix and the job folder are separated even when the
    # saved output URI was given without a trailing slash.
    if key_prefix and not key_prefix.endswith("/"):
        key_prefix += "/"
    folder_path = key_prefix + job_id + "/"

    bucket = boto3.resource('s3').Bucket(bucket_name)
    target_file = next(
        (
            object_summary.key
            for object_summary in bucket.objects.filter(Prefix=folder_path)
            if object_summary.key.endswith(".jsonl.out")
        ),
        None,
    )
    if not target_file:
        raise Exception(f"{bucket_name} does not contain .jsonl.out file! Please check if job has completed.")

    output_df = pd.read_json(f"s3://{bucket_name}/" + target_file, lines=True)
    rows = [
        {TEXT: row[MODEL_OUTPUT][GENERATION].strip(), RECORDID: row[RECORDID]}
        for _, row in output_df.iterrows()
    ]
    outputs = pd.DataFrame(rows)
    inputs = pd.DataFrame(data[INPUT_FILE])
    outputs = outputs.merge(inputs, how='left', on=RECORDID)
    outputs = outputs.drop(columns=RECORDID)
    outputs[TIMESTAMP] = data[TIMESTAMP]
    return outputs
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# constants to use
|
|
2
|
+
|
|
3
|
+
# OpenAI constants
|
|
4
|
+
SYSTEM = "system"
|
|
5
|
+
ROLE = "role"
|
|
6
|
+
USER = "user"
|
|
7
|
+
CONTENT = "content"
|
|
8
|
+
GPT_4o = "gpt-4o"
|
|
9
|
+
GPT_4o_MINI = "gpt-4o-mini"
|
|
10
|
+
DEEPSEEK_V3 = "deepseek-ai/DeepSeek-V3"
|
|
11
|
+
|
|
12
|
+
LLAMA_START_HEADER = "<|start_header_id|>"
|
|
13
|
+
LLAMA_END_HEADER = "<|end_header_id|>"
|
|
14
|
+
BODY = "body"
|
|
15
|
+
MESSAGE = "message"
|
|
16
|
+
MESSAGES = "messages"
|
|
17
|
+
TEMPERATURE = "temperature"
|
|
18
|
+
TOP_P = "top_p"
|
|
19
|
+
MODEL = "model"
|
|
20
|
+
MAX_COMPLETION_TOKENS = "max_completion_tokens"
|
|
21
|
+
CUSTOM_ID = "custom_id"
|
|
22
|
+
INPUT_FILE = "input_file"
|
|
23
|
+
RESPONSE = "response"
|
|
24
|
+
CHOICES = "choices"
|
|
25
|
+
RESULT_FILE_ID = "result_file_id"
|
|
26
|
+
PROMPTS = "prompts"
|
|
27
|
+
PROMPT = "prompt"
|
|
28
|
+
TEXT = "text"
|
|
29
|
+
RESULT = "result"
|
|
30
|
+
PARAMS = "params"
|
|
31
|
+
JOB_NAME = "job_name"
|
|
32
|
+
API_CALL = "api_call"
|
|
33
|
+
TARGET = "target"
|
|
34
|
+
BATCH_ID = "batch_id"
|
|
35
|
+
TYPE = "type"
|
|
36
|
+
MAX_TOKENS = "max_tokens"
|
|
37
|
+
BATCH = "batch"
|
|
38
|
+
BLOCK_NONE = "BLOCK_NONE"
|
|
39
|
+
MODELINPUT = "modelInput"
|
|
40
|
+
RECORDID = "recordId"
|
|
41
|
+
TIMESTAMP = "timestamp"
|
|
42
|
+
CREATED = "created"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
EXAMPLES = "examples"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
CLAUDE = "claude"
|
|
49
|
+
GPT = "gpt"
|
|
50
|
+
MISTRAL = "mistral"
|
|
51
|
+
ANTHROPIC = "anthropic"
|
|
52
|
+
OPENAI = "openai"
|
|
53
|
+
BEDROCK = "bedrock"
|
|
54
|
+
GEMINI = "gemini"
|
|
55
|
+
DEEPSEEK = "deepseek"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
PX = "p(x)"
|
|
60
|
+
ENTROPY = "entropy"
|
|
61
|
+
COUNT = "count"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
S3URI = "s3Uri"
|
|
65
|
+
JOB_ARN = "jobArn"
|
|
66
|
+
|
|
67
|
+
S3_OUTPUT_DATA_CONFIG = "s3OutputDataConfig"
|
|
68
|
+
S3_INPUT_DATA_CONFIG = "s3InputDataConfig"
|
|
69
|
+
MODEL_OUTPUT = "modelOutput"
|
|
70
|
+
GENERATION = "generation"
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generates texts with DeepSeek models using the Fireworks AI API.
|
|
3
|
+
"""
|
|
4
|
+
import os
|
|
5
|
+
from openai import OpenAI
|
|
6
|
+
from dotenv import load_dotenv
|
|
7
|
+
from dactyl_generation.constants import *
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from typing import List
|
|
10
|
+
load_dotenv()
|
|
11
|
+
|
|
12
|
+
DEEPSEEK_CLIENT = OpenAI(
|
|
13
|
+
api_key=os.environ["FIREWORKS_AI_API_KEY"],
|
|
14
|
+
base_url="https://api.fireworks.ai/inference/v1"
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def prompt(messages: List[dict], model: str, temperature: float, top_p: float, max_completion_tokens: int = 512, number_of_responses: int = 1) -> list:
    """
    Send one chat completion request and collect the text of every returned choice.

    Args:
        messages: List of messages to pass in.
        model: model name.
        temperature: temperature, value from 0 to 2.
        top_p: top-p value from 0 to 1.
        max_completion_tokens: maximum number of completion tokens
        number_of_responses: number of choices requested from the API.

    Returns:
        responses: List of responses, each stripped of surrounding whitespace.
    """
    request_options = {
        "messages": messages,
        "model": model,
        "temperature": temperature,
        "top_p": top_p,
        "max_completion_tokens": max_completion_tokens,
        "n": number_of_responses,
    }
    api_response = DEEPSEEK_CLIENT.chat.completions.create(**request_options)
    return [choice.message.content.strip() for choice in api_response.choices]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module helps in generating texts using the Gemini API.
|
|
3
|
+
|
|
4
|
+
!!! danger
|
|
5
|
+
For Gemini, all safety filters have been turned off!
|
|
6
|
+
"""
|
|
7
|
+
from google import genai
|
|
8
|
+
from google.genai import types
|
|
9
|
+
import os
|
|
10
|
+
from dotenv import load_dotenv
|
|
11
|
+
from typing import List
|
|
12
|
+
import typing_extensions as typing
|
|
13
|
+
from dactyl_generation.constants import *
|
|
14
|
+
from pydantic import BaseModel
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
load_dotenv()
|
|
18
|
+
GOOGLE_CLIENT = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
|
|
19
|
+
|
|
20
|
+
# One setting per harm category, each with blocking turned off.
GEMINI_SAFETY_SETTINGS = [
    types.SafetySetting(category=harm_category, threshold=BLOCK_NONE)
    for harm_category in (
        types.HarmCategory.HARM_CATEGORY_HARASSMENT,
        types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        types.HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY,
    )
]
|
|
42
|
+
|
|
43
|
+
# Pydantic model with a single string field holding generated text.
class GeneratedResponse(BaseModel):
    text: str
|
|
45
|
+
|
|
46
|
+
def prompt(messages: List[dict], model_name: str, temperature: float, top_p: float, max_completion_tokens: int) -> str:
    """
    Send a single generation request to a Gemini model.

    The content of system messages is passed as the system instruction; the
    content of every other message is passed, in order, as the request contents.

    Args:
        messages: List of OpenAI messages
        model_name (str): Name of model.
        temperature (float): Temperature to pass.
        top_p (float): Top-p value.
        max_completion_tokens: maximum number of tokens to generate

    Returns:
        text: Generation output.
    """
    system_instructions = [entry[CONTENT] for entry in messages if entry[ROLE] == SYSTEM]
    user_instructions = [entry[CONTENT] for entry in messages if entry[ROLE] != SYSTEM]

    prompt_config = types.GenerateContentConfig(
        system_instruction=system_instructions,
        max_output_tokens=max_completion_tokens,
        top_p=top_p,
        temperature=temperature,
        safety_settings=GEMINI_SAFETY_SETTINGS,
    )
    return GOOGLE_CLIENT.models.generate_content(
        model=model_name,
        contents=user_instructions,
        config=prompt_config,
    ).text
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generates texts using the Mistral Batch API.
|
|
3
|
+
"""
|
|
4
|
+
import copy
|
|
5
|
+
|
|
6
|
+
import mistralai.files
|
|
7
|
+
from mistralai import Mistral, File
|
|
8
|
+
from dotenv import load_dotenv
|
|
9
|
+
import os
|
|
10
|
+
from io import BytesIO
|
|
11
|
+
import json
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
from typing import List, Tuple
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from dactyl_generation.constants import *
|
|
17
|
+
|
|
18
|
+
load_dotenv()
|
|
19
|
+
|
|
20
|
+
MISTRAL_CLIENT = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def create_message_batch(file_name: str, prompts_df: pd.DataFrame) -> Tuple[List[dict], mistralai.models.UploadFileOut]:
    """
    Creates batch of messages to send to Mistral API.

    Each DataFrame row becomes the body of one request, identified by a
    zero-padded ``request-<index>`` custom id. The requests are serialized as
    JSON lines and uploaded as a batch file.

    Args:
        file_name: Name of file in Mistral API to save as.
        prompts_df: DataFrame containing prompts and generation parameters

    Returns:
        tuple: List of requests sent, UploadFileOut object

    Raises:
        ValueError: if ``prompts_df`` has no rows.
    """
    messages = prompts_df.to_dict(orient="records")
    if not messages:
        # Previously this surfaced as an OverflowError from int(log10(0)).
        raise ValueError("prompts_df must contain at least one row to create a batch.")
    # Number of decimal digits in the row count (same as int(log10(n)) + 1).
    digits_length = len(str(len(messages)))

    list_of_requests = [
        {CUSTOM_ID: f"request-{str(index).zfill(digits_length)}", BODY: message_batch}
        for index, message_batch in enumerate(messages)
    ]
    # One JSON document per line, each line newline-terminated.
    payload = "".join(json.dumps(request) + "\n" for request in list_of_requests).encode("utf-8")
    file = File(file_name=file_name, content=payload)
    return list_of_requests, MISTRAL_CLIENT.files.upload(file=file, purpose=BATCH)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def start_batch_job(input_file: mistralai.models.UploadFileOut, model: str) -> mistralai.models.BatchJobOut:
    """
    Create a chat-completions batch job for an uploaded input file.

    Args:
        input_file: input file object to create job with
        model: model name to use for generation

    Returns:
        batch_job: Batch job object
    """
    job_options = {
        "input_files": [input_file.id],
        "model": model,
        "endpoint": "/v1/chat/completions",
        # Jobs are tagged so get_batch_jobs can filter on the same metadata.
        "metadata": {"job_type": "testing"},
    }
    return MISTRAL_CLIENT.batch.jobs.create(**job_options)
|
|
69
|
+
|
|
70
|
+
def create_batch_job(file_name: str, prompts_df: pd.DataFrame) -> dict:
    """
    Creates batch job for set of prompts given file name to save Mistral prompts to.

    Args:
        file_name: name of file to upload to Mistral API.
        prompts_df: DataFrame containing generation prompts and parameters.
            All rows must use the same model.

    Returns:
        info: dictionary containing the batch job and input file (both dumped
            to JSON-compatible dictionaries), the requests sent and the API name.

    Raises:
        ValueError: if the model column does not contain exactly one distinct value.
    """
    models = prompts_df[MODEL].unique()
    # Explicit check instead of assert, which is skipped when Python runs with -O.
    if len(models) != 1:
        raise ValueError(
            f"prompts_df must use exactly one model per batch job, found {len(models)}: {list(models)}"
        )
    model = models[0]
    prompts, input_file = create_message_batch(file_name, prompts_df)
    batch_job = start_batch_job(input_file, model)
    input_file = input_file.model_dump(mode="json")
    batch_job = batch_job.model_dump(mode="json")
    return {"batch_job": batch_job, INPUT_FILE: input_file, PROMPTS: prompts, API_CALL: MISTRAL}
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def get_batch_jobs():
    """
    List Mistral batch jobs filtered by the ``job_type: testing`` metadata.

    Returns:
        jobs: result of the Mistral client's batch job listing call.
    """
    job_filter = {"job_type": "testing"}
    return MISTRAL_CLIENT.batch.jobs.list(metadata=job_filter)
|
|
94
|
+
|
|
95
|
+
def get_batch_job_output(file_path: str) -> pd.DataFrame:
    """
    Gets batch job results using saved metadata from a local JSON file.

    Args:
        file_path: local JSON file containing output of the `create_batch_job` function

    Returns:
        df: pandas DataFrame of generations, left-merged on the custom id with
            the saved request bodies.

    Raises:
        RuntimeError: if the batch job has no output file.
    """
    with open(file_path, "r") as f:
        data = json.load(f)
    job_id = data["batch_job"]["id"]
    output_file = MISTRAL_CLIENT.batch.jobs.get(job_id=job_id).output_file
    if not output_file:
        # Without an output file id there is nothing to download.
        raise RuntimeError(f"Batch job {job_id} has no output file! Please check if job has completed.")
    content = MISTRAL_CLIENT.files.download(file_id=output_file).read().decode("utf-8")

    # Parse the JSON lines one by one. Split on "\n" only: str.splitlines()
    # also breaks on characters such as U+2028, which JSON allows unescaped
    # inside a string value. Blank lines are skipped.
    responses = [json.loads(line) for line in content.split("\n") if line.strip()]

    rows = list()
    for response in responses:
        body = response[RESPONSE][BODY]
        rows.append({
            CUSTOM_ID: response[CUSTOM_ID],
            TEXT: body[CHOICES][0][MESSAGE][CONTENT],
            # The creation time is converted to a UTC timestamp string.
            TIMESTAMP: str(datetime.fromtimestamp(body[CREATED], tz=timezone.utc)),
        })

    raw_prompts = pd.DataFrame([{**saved[BODY], CUSTOM_ID: saved[CUSTOM_ID]} for saved in data[PROMPTS]])
    # Explicit columns keep the merge key present even when there are no responses.
    generations = pd.DataFrame(rows, columns=[CUSTOM_ID, TEXT, TIMESTAMP])
    return generations.merge(raw_prompts, on=CUSTOM_ID, how="left")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generates texts using the OpenAI Batch API.
|
|
3
|
+
"""
|
|
4
|
+
import os
|
|
5
|
+
from openai import OpenAI
|
|
6
|
+
from dotenv import load_dotenv
|
|
7
|
+
from dactyl_generation.constants import *
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import json
|
|
10
|
+
import numpy as np
|
|
11
|
+
from io import BytesIO
|
|
12
|
+
from typing import List, Any
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
|
|
15
|
+
load_dotenv()
|
|
16
|
+
|
|
17
|
+
OPENAI_CLIENT = OpenAI(
|
|
18
|
+
api_key=os.environ["OPENAI_API_KEY"] # This is the default and can be omitted
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
def create_individual_request(custom_id: str, message_body: dict) -> dict:
    """
    Wrap one request body as a batch entry for the chat completions endpoint.

    Args:
        custom_id: Custom ID of request; it is converted to a string.
        message_body: dictionary used as the request body (messages, max_completion_tokens, etc.).

    Returns:
        request: dictionary with the custom id, the POST method, the endpoint URL and the body.
    """
    return {
        CUSTOM_ID: str(custom_id),
        "method": "POST",
        "url": "/v1/chat/completions",
        BODY: message_body,
    }
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def create_batch_job(prompts_df: pd.DataFrame) -> dict:
    """
    Uploads the prompts as a JSONL file and starts an OpenAI batch job for them.

    Args:
        prompts_df: DataFrame where each row is the body of one chat-completions call.

    Returns:
        results: dictionary with the batch job ID (stored under RESULT_FILE_ID), the list
            of submitted requests (INPUT_FILE) and the provider name (API_CALL).

    Raises:
        ValueError: if prompts_df has no rows.
    """
    num_prompts = len(prompts_df)
    if num_prompts == 0:
        # The former width computation, int(np.log10(0)), failed with an opaque
        # OverflowError; reject an empty batch explicitly before any API call.
        raise ValueError("prompts_df must contain at least one prompt.")

    # Zero-pad the row index so custom IDs sort lexicographically in row order.
    # len(str(n)) is the same width as int(log10(n)) + 1 without floating point.
    digits_length = len(str(num_prompts))
    requests = [
        create_individual_request(f"request-{str(i).zfill(digits_length)}", record)
        for i, record in enumerate(prompts_df.to_dict("records"))
    ]

    # The batch input is JSONL: one JSON-encoded request per line, uploaded from memory.
    payload = "\n".join(json.dumps(request) for request in requests)
    buffer = BytesIO(payload.encode("utf-8"))

    batch_file = OPENAI_CLIENT.files.create(
        file=buffer,
        purpose="batch",
    )
    batch_job = OPENAI_CLIENT.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )

    return {
        # Despite the key name this is the batch job ID; get_batch_job_output passes
        # it to batches.retrieve to look the job up again.
        RESULT_FILE_ID: batch_job.id,
        INPUT_FILE: requests,
        API_CALL: OPENAI,
    }
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def get_batch_job_output(file_path: str) -> pd.DataFrame:
    """
    Gets batch job results using saved metadata from a local JSON file.

    Args:
        file_path: local JSON file containing output of the `create_batch_job` function

    Returns:
        df: pandas DataFrame of generations (text, custom ID, UTC timestamp) left-joined
            with the original requests on the custom ID.

    Raises:
        RuntimeError: if the batch job does not have an output file (for example because
            it has not finished yet).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    batch_job = OPENAI_CLIENT.batches.retrieve(data[RESULT_FILE_ID])
    if batch_job.output_file_id is None:
        # Without this check None is passed on to files.content and the failure
        # gives no hint that the job simply has no output to download.
        status = getattr(batch_job, "status", "unknown")
        raise RuntimeError(
            f"Batch job {data[RESULT_FILE_ID]} has no output file (status: {status})."
        )

    result = OPENAI_CLIENT.files.content(batch_job.output_file_id).content
    # The output file is JSONL: one result object per line.
    df = pd.read_json(BytesIO(result), lines=True)

    rows = list()
    for response, custom_id in zip(df[RESPONSE], df[CUSTOM_ID]):
        body = response[BODY]
        rows.append({
            TEXT: body[CHOICES][0][MESSAGE][CONTENT],
            CUSTOM_ID: custom_id,
            # CREATED is passed to fromtimestamp, i.e. treated as epoch seconds.
            TIMESTAMP: str(datetime.fromtimestamp(body[CREATED], tz=timezone.utc)),
        })
    generations = pd.DataFrame(rows)
    requests = pd.DataFrame(data[INPUT_FILE])

    return generations.merge(requests, on=CUSTOM_ID, how="left")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Generates texts quickly using wrapper functions to redirect to appropriate model functions.
|
|
3
|
+
"""
|
|
4
|
+
import json
|
|
5
|
+
from dactyl_generation import openai_generation, anthropic_generation, mistral_generation
|
|
6
|
+
from dactyl_generation import google_generation, bedrock_generation, deepseek_generation
|
|
7
|
+
from dactyl_generation.constants import *
|
|
8
|
+
import numpy as np
|
|
9
|
+
import tempfile
|
|
10
|
+
import time
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from tqdm import tqdm
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from typing import List
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def generate_texts_using_batch(output_path: str, prompts_df: pd.DataFrame, api_provider: str, aws_args: dict = None) -> None:
    """
    Submits the prompts to the batch API of the chosen provider.
    The job metadata returned by the provider module is saved to output_path as JSON.

    Args:
        output_path: output path to save the batch job metadata
        prompts_df: prompts for each generation
        api_provider: Batch API provider to route the request to.
        aws_args: dictionary of keyword arguments forwarded to the AWS Bedrock batch
            job creation; only used when api_provider is BEDROCK.

    Returns:
        None

    Raises:
        Exception: if api_provider is not a supported batch provider.
    """
    import uuid  # local import: only needed to name the Mistral upload file

    if api_provider == ANTHROPIC:
        parameters = anthropic_generation.create_batch_job(prompts_df)
    elif api_provider == OPENAI:
        parameters = openai_generation.create_batch_job(prompts_df)
    elif api_provider == MISTRAL:
        # Random file name; uuid replaces the private tempfile._get_candidate_names().
        file_name = f"{uuid.uuid4().hex}.jsonl"
        parameters = mistral_generation.create_batch_job(file_name, prompts_df)
    elif api_provider == BEDROCK:
        # aws_args defaults to None, and unpacking None with ** raises TypeError.
        parameters = bedrock_generation.create_batch_job(prompts_df, **(aws_args or {}))
    else:
        raise Exception("Model type not supported for batch inference.")

    # Every provider returns a JSON-serialisable dict, so it is written once here.
    with open(output_path, "w") as file:
        json.dump(parameters, file, indent=4)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_batch_job_results(file_path: str, output_path: str) -> None:
    """
    Fetches the results of a batch job and saves the generations as a JSON file.

    Args:
        file_path: File path containing batch metadata saved by `generate_texts_using_batch`.
        output_path: Output JSON path to save generations.

    Returns:
        None

    Raises:
        Exception: if the provider recorded in the metadata file is not supported.
    """
    with open(file_path) as file:
        data = json.load(file)

    # The metadata records which provider created the job; route to its reader.
    api_call = data[API_CALL]
    if api_call == ANTHROPIC:
        df = anthropic_generation.get_batch_job_output(file_path)
    elif api_call == MISTRAL:
        df = mistral_generation.get_batch_job_output(file_path)
    elif api_call == OPENAI:
        df = openai_generation.get_batch_job_output(file_path)
    elif api_call == BEDROCK:
        df = bedrock_generation.get_batch_job_output(file_path)
    else:
        raise Exception(f"API call {api_call} not supported")

    # orient="records" never writes the index, and passing index=False together with
    # it raises ValueError on older pandas releases, so the argument is left out.
    df.to_json(output_path, orient="records", indent=4)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def generate_texts_streaming(
    model: str,
    prompts_df: pd.DataFrame,
    output_path: str,
    max_completion_tokens: int = 512,
    category: str = "",
    wait_after_every: int = 20,
    sleep_time: int = 30,
) -> None:
    """
    Generates one text per row of prompts_df by calling the provider live (no batching).
    The provider is chosen from the model name (Bedrock, DeepSeek or Gemini).
    Outputs are saved as JSON; the file is rewritten after every generation so that
    partial results are kept if a later request fails.

    Args:
        model: name of model
        prompts_df: dataframe containing prompts; the MESSAGES, TEMPERATURE and TOP_P
            columns are read
        output_path: output path to save JSON file
        max_completion_tokens: maximum number of tokens per generation
        category: value stored in the "category" field of every output row
        wait_after_every: Pauses generation after this many requests; a value of 0 or
            less disables pausing
        sleep_time: Length of each pause in seconds

    Returns:
        None

    Raises:
        Exception: if the model name does not match a supported provider.
    """
    rows = list()
    total = len(prompts_df)
    prompt_settings = zip(
        prompts_df[MESSAGES].to_list(),
        prompts_df[TEMPERATURE].to_list(),
        prompts_df[TOP_P].to_list(),
    )

    for completed, (message_batch, temperature, top_p) in enumerate(tqdm(prompt_settings, total=total), start=1):
        row = dict()
        row[PROMPT] = message_batch
        row[TEMPERATURE] = temperature
        row[TOP_P] = top_p
        row[MODEL] = model
        row[TARGET] = 1
        row["category"] = category

        if BEDROCK in model:
            text = bedrock_generation.prompt(message_batch, model, temperature, top_p, max_completion_tokens=max_completion_tokens)
        elif DEEPSEEK in model:
            # Only the first element of the DeepSeek result is kept as the text.
            text = deepseek_generation.prompt(message_batch, model, temperature, top_p, max_completion_tokens=max_completion_tokens)[0]
        elif GEMINI in model:
            text = google_generation.prompt(message_batch, model, temperature, top_p, max_completion_tokens)
        else:
            raise Exception("Model type not supported")

        row[TEXT] = text
        row[TIMESTAMP] = str(datetime.now(timezone.utc))
        rows.append(row)

        # Checkpoint after every generation. index=False is omitted: it is redundant
        # for orient="records" and raises ValueError on older pandas releases.
        pd.DataFrame(rows).to_json(output_path, orient="records", indent=4)

        # Pause after every `wait_after_every` completed requests. Counting completed
        # requests (not the 0-based index) avoids pausing one request late, the guard
        # avoids a modulo by zero, and no pause is taken after the final request.
        if wait_after_every > 0 and completed % wait_after_every == 0 and completed < total:
            time.sleep(sleep_time)

    # Final write also covers the case of an empty prompts_df (writes an empty list).
    pd.DataFrame(rows).to_json(output_path, orient="records", indent=4)
|
|
137
|
+
|
|
138
|
+
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dactyl_generation
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: LLM helper package to generate AI-generated texts.
|
|
5
|
+
Author: Shantanu Thorat
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: anthropic
|
|
13
|
+
Requires-Dist: litellm
|
|
14
|
+
Requires-Dist: mistralai
|
|
15
|
+
Requires-Dist: numpy==1.26.4
|
|
16
|
+
Requires-Dist: openai
|
|
17
|
+
Requires-Dist: pandas
|
|
18
|
+
Requires-Dist: protobuf
|
|
19
|
+
Requires-Dist: python-dotenv
|
|
20
|
+
Requires-Dist: Requests
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Requires-Dist: typing_extensions
|
|
23
|
+
Requires-Dist: google-generativeai
|
|
24
|
+
Requires-Dist: boto3
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# DACTYL-Generation
|
|
28
|
+
|
|
29
|
+
A Python package to generate LLM data from various APIs.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
```bash
|
|
33
|
+
pip install git+https://github.com/ShantanuT01/dactyl_generation.git
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
Load environment variables (your API keys) first, before importing the library.
|
|
39
|
+
```python
|
|
40
|
+
# load environment variables first
|
|
41
|
+
from dotenv import load_dotenv
|
|
42
|
+
load_dotenv()
|
|
43
|
+
|
|
44
|
+
# now import library
|
|
45
|
+
from dactyl_generation.quick import *
|
|
46
|
+
|
|
47
|
+
```
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
dactyl_generation/__init__.py
|
|
5
|
+
dactyl_generation/anthropic_generation.py
|
|
6
|
+
dactyl_generation/bedrock_generation.py
|
|
7
|
+
dactyl_generation/constants.py
|
|
8
|
+
dactyl_generation/deepseek_generation.py
|
|
9
|
+
dactyl_generation/google_generation.py
|
|
10
|
+
dactyl_generation/mistral_generation.py
|
|
11
|
+
dactyl_generation/openai_generation.py
|
|
12
|
+
dactyl_generation/quick.py
|
|
13
|
+
dactyl_generation.egg-info/PKG-INFO
|
|
14
|
+
dactyl_generation.egg-info/SOURCES.txt
|
|
15
|
+
dactyl_generation.egg-info/dependency_links.txt
|
|
16
|
+
dactyl_generation.egg-info/requires.txt
|
|
17
|
+
dactyl_generation.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dactyl_generation
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dactyl_generation"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Shantanu Thorat"},
|
|
10
|
+
]
|
|
11
|
+
description = "LLM helper package to generate AI-generated texts."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
]
|
|
18
|
+
dependencies = [
|
|
19
|
+
"anthropic",
|
|
20
|
+
"litellm",
|
|
21
|
+
"mistralai",
|
|
22
|
+
"numpy==1.26.4",
|
|
23
|
+
"openai",
|
|
24
|
+
"pandas",
|
|
25
|
+
"protobuf",
|
|
26
|
+
"python-dotenv",
|
|
27
|
+
"Requests",
|
|
28
|
+
"tqdm",
|
|
29
|
+
"typing_extensions",
|
|
30
|
+
"google-generativeai",
|
|
31
|
+
"boto3"
|
|
32
|
+
]
|
|
33
|
+
license = {text = "MIT"}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
|