dactyl-generation 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Shantanu Thorat
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,47 @@
1
+ Metadata-Version: 2.4
2
+ Name: dactyl_generation
3
+ Version: 0.0.1
4
+ Summary: LLM helper package to generate AI-generated texts.
5
+ Author: Shantanu Thorat
6
+ License: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: anthropic
13
+ Requires-Dist: litellm
14
+ Requires-Dist: mistralai
15
+ Requires-Dist: numpy==1.26.4
16
+ Requires-Dist: openai
17
+ Requires-Dist: pandas
18
+ Requires-Dist: protobuf
19
+ Requires-Dist: python-dotenv
20
+ Requires-Dist: Requests
21
+ Requires-Dist: tqdm
22
+ Requires-Dist: typing_extensions
23
+ Requires-Dist: google-generativeai
24
+ Requires-Dist: boto3
25
+ Dynamic: license-file
26
+
27
+ # DACTYL-Generation
28
+
29
+ A Python package to generate LLM data from various APIs.
30
+
31
+ ## Installation
32
+ ```bash
33
+ pip install git+https://github.com/ShantanuT01/dactyl_generation.git
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ Load your environment variables (your API keys) first, before importing the library.
39
+ ```python
40
+ # load environment variables first
41
+ from dotenv import load_dotenv
42
+ load_dotenv()
43
+
44
+ # now import library
45
+ from dactyl_generation.quick import *
46
+
47
+ ```
@@ -0,0 +1,21 @@
1
+ # DACTYL-Generation
2
+
3
+ A Python package to generate LLM data from various APIs.
4
+
5
+ ## Installation
6
+ ```bash
7
+ pip install git+https://github.com/ShantanuT01/dactyl_generation.git
8
+ ```
9
+
10
+ ## Usage
11
+
12
+ Load your environment variables (your API keys) first, before importing the library.
13
+ ```python
14
+ # load environment variables first
15
+ from dotenv import load_dotenv
16
+ load_dotenv()
17
+
18
+ # now import library
19
+ from dactyl_generation.quick import *
20
+
21
+ ```
File without changes
@@ -0,0 +1,146 @@
1
+ """
2
+ Generates texts using the Anthropic Batch API.
3
+ """
4
+ import copy
5
+
6
+ import anthropic
7
+ import dotenv
8
+ import os
9
+ import numpy as np
10
+ from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
11
+ from anthropic.types.messages.batch_create_params import Request
12
+ import json
13
+ import requests
14
+ import pandas as pd
15
+ from typing import List
16
+ from datetime import datetime, timezone
17
+ from dactyl_generation.constants import *
18
+ dotenv.load_dotenv()  # load variables from a .env file before the API key lookups below
19
+ # Module-level Anthropic SDK client; os.environ[...] raises KeyError at import time if ANTHROPIC_API_KEY is unset.
20
+ # API_HEADERS carries the same key for the raw HTTP request made in get_batch_job_output.
21
+ ANTHROPIC_CLIENT = anthropic.Anthropic(
22
+ api_key = os.environ['ANTHROPIC_API_KEY'],
23
+ )
24
+ API_HEADERS = {"x-api-key": os.environ['ANTHROPIC_API_KEY'], "anthropic-version": "2023-06-01"}
25
+
26
def convert_openai_system_message_to_anthropic_system_message(openai_message: dict) -> dict:
    """
    Build an Anthropic system content block from an OpenAI-style system message.

    Cache control is not supported.

    Args:
        openai_message: OpenAI-style message dictionary; only its CONTENT entry is read.

    Returns:
        anthropic_system_prompt: dictionary holding the message content under TEXT,
            with TYPE set to TEXT.
    """
    return {TEXT: openai_message[CONTENT], TYPE: TEXT}
40
+
41
+
42
def convert_anthropic_system_message_to_openai_system_message(anthropic_message: dict) -> dict:
    """
    Build an OpenAI-style system message from an Anthropic system content block.

    Cache control is not supported.

    Args:
        anthropic_message: Anthropic system block; only its TEXT entry is read.

    Returns:
        openai_system_message: dictionary with ROLE set to SYSTEM and the block's
            text under CONTENT.
    """
    return {ROLE: SYSTEM, CONTENT: anthropic_message[TEXT]}
56
+
57
def get_message_batch(prompts_df: pd.DataFrame) -> List[Request]:
    """
    Build one Anthropic batch request per row of ``prompts_df``.

    Each row's PROMPT entry is a list of OpenAI-style messages. System messages are
    converted to Anthropic system blocks and passed under SYSTEM; every other message
    is passed through unchanged under MESSAGES. All remaining columns of the row are
    forwarded as keyword arguments to ``MessageCreateParamsNonStreaming``.

    Args:
        prompts_df: DataFrame where each row is an API call to the Anthropic API.

    Returns:
        requests: list of requests with custom IDs of the form
            ``request-<zero-padded row position>``; empty when ``prompts_df`` has no rows.
    """
    calls = prompts_df.to_dict(orient="records")
    # Pad IDs to the number of digits in the row count. len(str(n)) gives the same
    # width as int(log10(n)) + 1 for n >= 1 and, unlike log10, works for n == 0.
    digits_length = len(str(len(calls)))
    batch = list()
    for i, call in enumerate(calls):
        prompt_messages = call[PROMPT]
        system_messages = [
            convert_openai_system_message_to_anthropic_system_message(message)
            for message in prompt_messages
            if message[ROLE] == SYSTEM
        ]
        normal_messages = [message for message in prompt_messages if message[ROLE] != SYSTEM]

        # Everything except the raw prompt is passed on as a request parameter.
        message_parameters = {key: value for key, value in call.items() if key != PROMPT}
        message_parameters[SYSTEM] = system_messages
        message_parameters[MESSAGES] = normal_messages

        # Each individual request maps to one row (one few-shot set).
        batch.append(
            Request(
                custom_id=f"request-{str(i).zfill(digits_length)}",
                params=MessageCreateParamsNonStreaming(**message_parameters),
            )
        )
    return batch
92
+
93
+
94
def create_batch_job(prompts_df: pd.DataFrame) -> dict:
    """
    Submit the prompts in ``prompts_df`` to the Anthropic API as one message batch.

    A CUSTOM_ID column is assigned on ``prompts_df`` itself, so the frame passed in
    gains that column.

    Args:
        prompts_df: Dataframe containing prompts to run.

    Returns:
        request_data: dictionary with the batch ID, the prompt rows (including their
            custom IDs), the ANTHROPIC provider tag and a UTC timestamp string.
    """
    batch_requests = get_message_batch(prompts_df)
    custom_ids = [batch_request[CUSTOM_ID] for batch_request in batch_requests]
    message_batch = ANTHROPIC_CLIENT.messages.batches.create(requests=batch_requests)
    prompts_df[CUSTOM_ID] = custom_ids

    job_info = dict()
    job_info[BATCH_ID] = message_batch.id
    job_info[PROMPTS] = prompts_df.to_dict(orient='records')
    job_info[API_CALL] = ANTHROPIC
    job_info[TIMESTAMP] = str(datetime.now(timezone.utc))
    return job_info
116
+
117
+
118
+
119
def get_batch_job_output(file_path: str) -> pd.DataFrame:
    """
    Gets batch job results using saved metadata from a local JSON file.

    Args:
        file_path: local JSON file containing the dictionary returned by `create_batch_job`

    Returns:
        df: pandas DataFrame with one row per result (custom ID, generated text,
            submission timestamp) left-joined with the saved prompt rows. The text is
            None for results that carry no message.

    Raises:
        requests.HTTPError: if the results endpoint answers with an error status.
    """
    with open(file_path) as f:
        data = json.load(f)
    batch_id = data[BATCH_ID]
    response = requests.get(
        f"https://api.anthropic.com/v1/messages/batches/{batch_id}/results",
        headers=API_HEADERS,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error body as results.
    response.raise_for_status()

    generations = list()
    # Split on "\n" only: str.splitlines() also breaks on characters such as U+2028,
    # which can appear unescaped inside a JSON string and would cut a record in half.
    for line in response.text.split("\n"):
        if not line.strip():
            continue
        record = json.loads(line)
        # Results that did not succeed have no message; keep their row with no text
        # rather than aborting the whole download with a KeyError.
        message = record[RESULT].get(MESSAGE)
        text = message[CONTENT][0][TEXT] if message else None
        generations.append({CUSTOM_ID: record[CUSTOM_ID], TEXT: text})

    # Explicit columns keep the merge key present even when there are no results.
    generations = pd.DataFrame(generations, columns=[CUSTOM_ID, TEXT])
    generations[TIMESTAMP] = data[TIMESTAMP]
    prompt_rows = pd.DataFrame(data[PROMPTS])
    return generations.merge(prompt_rows, on=CUSTOM_ID, how='left')
@@ -0,0 +1,185 @@
1
+ """
2
+ Generates texts using AWS Bedrock APIs.
3
+ !!! note
4
+ Only supports AWS region US East 1!
5
+ """
6
+ from litellm import completion
7
+ from typing import List
8
+ import os
9
+ import pandas as pd
10
+
11
+ from dactyl_generation.constants import *
12
+ os.environ['AWS_REGION']='us-east-1'  # set before boto3 is imported below; the module docstring states only us-east-1 is supported
13
+ import boto3
14
+ import json
15
+ from datetime import datetime, timezone
16
+
17
+
18
def prompt(messages:List[dict], model: str, temperature: float, top_p: float, max_completion_tokens: int =512) -> str:
    """
    Send one list of messages to a model through litellm and return the reply text.

    Args:
        messages: List of OpenAI messages
        model: name of model
        temperature: temperature parameter
        top_p: top p parameter
        max_completion_tokens: maximum number of tokens for completion

    Returns:
        response_content: message content of the first returned choice
    """
    sampling_options = {
        "temperature": temperature,
        "top_p": top_p,
        "max_completion_tokens": max_completion_tokens,
    }
    response = completion(model, messages, **sampling_options)
    first_choice = response.choices[0]
    return first_choice.message.content
35
+
36
+
37
def format_llama_prompt(messages: List[dict]) -> str:
    """
    Render OpenAI-style messages as a single Llama-style prompt string.

    The result starts with the begin-of-text token, contains one
    header + content + end-of-turn segment per message, and ends with an open
    assistant header so the model continues with the assistant's reply.

    Args:
        messages: list of dictionaries containing OpenAI style messages

    Returns:
        llama_prompt: formatted llama prompt
    """
    # NOTE(review): no newline is inserted between the role header and the content;
    # check this against the model's reference chat template.
    turns = [
        LLAMA_START_HEADER + message[ROLE] + LLAMA_END_HEADER + message[CONTENT] + "<|eot_id|>"
        for message in messages
    ]
    assistant_header = f"{LLAMA_START_HEADER}assistant{LLAMA_END_HEADER}"
    return "<|begin_of_text|>" + "".join(turns) + assistant_header
52
+
53
+
54
def create_jsonl_input_for_llama(prompts_df: pd.DataFrame, s3_path: str) -> pd.DataFrame:
    """
    Writes the batch input as a JSON Lines file at ``s3_path``.

    Every row of ``prompts_df`` becomes one line holding a record ID
    (``CALL`` + 7-digit row position) and the row itself as the model input, with the
    PROMPT column rendered by `format_llama_prompt`.

    Args:
        prompts_df: prompt dataframe containing OpenAI style messages
        s3_path: path the JSON Lines file is written to

    Returns:
        prompts_df_ret: copy of ``prompts_df`` (prompts left in their original message
            form) with an added RECORDID column matching the written file.
    """
    # .copy() gives independent frames. pd.DataFrame(df) may share data with
    # prompts_df, so column assignments on it could leak into the caller's frame.
    model_inputs = prompts_df.copy()
    model_inputs[PROMPT] = model_inputs[PROMPT].apply(format_llama_prompt)
    records = model_inputs.to_dict(orient="records")

    rows = [
        {RECORDID: f"CALL{str(i).zfill(7)}", MODELINPUT: record}
        for i, record in enumerate(records)
    ]
    # Explicit columns keep RECORDID available even when there are no rows.
    input_frame = pd.DataFrame(rows, columns=[RECORDID, MODELINPUT])
    # orient="records" never writes the index, and pandas releases before 2.1
    # reject index=False for this orient, so the argument is not passed.
    input_frame.to_json(s3_path, orient="records", lines=True)

    prompts_df_ret = prompts_df.copy()
    prompts_df_ret[RECORDID] = input_frame[RECORDID].to_list()
    return prompts_df_ret
83
+
84
+
85
def create_batch_job(prompts_df: pd.DataFrame, s3_input_path: str, s3_output_path: str, model: str, role_arn: str, job_name: str) -> dict:
    """
    Writes the batch input file and starts a Bedrock model invocation job on it.

    !!! warning
        This function has not been tested yet!

    Args:
        prompts_df: Dataframe of OpenAI-style prompts.
        s3_input_path: Input data path.
        s3_output_path: Output data path.
        model: Bedrock model ID.
        role_arn: Role to run batch job.
        job_name: Name of job

    Returns:
        job_info: dictionary with the job ARN, the output path, the BEDROCK provider
            tag, the job name, the submitted rows (with record IDs and model) and a
            UTC timestamp string.
    """
    inputted_frame = create_jsonl_input_for_llama(prompts_df, s3_input_path)
    bedrock = boto3.client(service_name="bedrock", region_name="us-east-1")

    response = bedrock.create_model_invocation_job(
        roleArn=role_arn,
        modelId=model,
        jobName=job_name,
        inputDataConfig={S3_INPUT_DATA_CONFIG: {S3URI: s3_input_path}},
        outputDataConfig={S3_OUTPUT_DATA_CONFIG: {S3URI: s3_output_path}},
    )
    inputted_frame[MODEL] = model

    job_info = dict()
    job_info[JOB_ARN] = response.get(JOB_ARN)
    job_info[S3_OUTPUT_DATA_CONFIG] = s3_output_path
    job_info[API_CALL] = BEDROCK
    job_info[JOB_NAME] = job_name
    # Round-trip through JSON text so the stored rows contain only JSON-native values.
    job_info[INPUT_FILE] = json.loads(inputted_frame.to_json(orient='records'))
    job_info[TIMESTAMP] = str(datetime.now(timezone.utc))
    return job_info
137
+
138
+
139
def get_batch_job_output(file_path: str) -> pd.DataFrame:
    """
    Fetches batch job results given JSON file.

    Looks in the saved S3 output location, under a folder named after the last
    segment of the job ARN, for the first object ending in ``.jsonl.out``.

    Args:
        file_path: JSON file containing jobArn.

    Returns:
        output_df: Dataframe containing generations joined with the submitted rows.
            The text is None for output records that carry no model output object.

    Raises:
        Exception: if no ``.jsonl.out`` object is found for the job.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)
    job_id = data[JOB_ARN].split("/")[-1]

    # "s3://bucket/some/prefix/" -> ["s3:", "", "bucket", "some", "prefix", ""]
    uri_parts = data[S3_OUTPUT_DATA_CONFIG].split("/")
    bucket_name = uri_parts[2]
    prefix = "/".join(uri_parts[3:])
    # Make sure the prefix and the job folder are separated by exactly one slash,
    # even when the saved output URI has no trailing slash.
    if prefix and not prefix.endswith("/"):
        prefix += "/"
    folder_path = prefix + job_id + "/"

    bucket = boto3.resource('s3').Bucket(bucket_name)
    target_file = next(
        (
            object_summary.key
            for object_summary in bucket.objects.filter(Prefix=folder_path)
            if object_summary.key.endswith(".jsonl.out")
        ),
        None,
    )
    if not target_file:
        raise Exception(f"{bucket_name} does not contain .jsonl.out file! Please check if job has completed.")

    output_df = pd.read_json(f"s3://{bucket_name}/" + target_file, lines=True)
    rows = list()
    for model_output, record_id in zip(output_df[MODEL_OUTPUT], output_df[RECORDID]):
        # A record without a model output object (read back as a non-dict value)
        # keeps its row with no text instead of aborting the whole download.
        text = model_output[GENERATION].strip() if isinstance(model_output, dict) else None
        rows.append({TEXT: text, RECORDID: record_id})

    outputs = pd.DataFrame(rows, columns=[TEXT, RECORDID])
    inputs = pd.DataFrame(data[INPUT_FILE])
    outputs = outputs.merge(inputs, how='left', on=RECORDID)
    outputs = outputs.drop(columns=RECORDID)
    outputs[TIMESTAMP] = data[TIMESTAMP]
    return outputs
181
+
182
+
183
+
184
+
185
+
@@ -0,0 +1,70 @@
1
+ # String constants shared across the package: dictionary keys, model names and provider tags.
2
+
3
+ # OpenAI-style chat message keys and roles, plus model identifiers
4
+ SYSTEM = "system"
5
+ ROLE = "role"
6
+ USER = "user"
7
+ CONTENT = "content"
8
+ GPT_4o = "gpt-4o"
9
+ GPT_4o_MINI = "gpt-4o-mini"
10
+ DEEPSEEK_V3 = "deepseek-ai/DeepSeek-V3"
11
+ # Llama header markers placed around each role name by bedrock_generation.format_llama_prompt
12
+ LLAMA_START_HEADER = "<|start_header_id|>"
13
+ LLAMA_END_HEADER = "<|end_header_id|>"
14
+ BODY = "body"  # from here on: request/response keys and DataFrame column names
15
+ MESSAGE = "message"
16
+ MESSAGES = "messages"
17
+ TEMPERATURE = "temperature"
18
+ TOP_P = "top_p"
19
+ MODEL = "model"
20
+ MAX_COMPLETION_TOKENS = "max_completion_tokens"
21
+ CUSTOM_ID = "custom_id"
22
+ INPUT_FILE = "input_file"
23
+ RESPONSE = "response"
24
+ CHOICES = "choices"
25
+ RESULT_FILE_ID = "result_file_id"
26
+ PROMPTS = "prompts"
27
+ PROMPT = "prompt"
28
+ TEXT = "text"
29
+ RESULT = "result"
30
+ PARAMS = "params"
31
+ JOB_NAME = "job_name"
32
+ API_CALL = "api_call"
33
+ TARGET = "target"
34
+ BATCH_ID = "batch_id"
35
+ TYPE = "type"
36
+ MAX_TOKENS = "max_tokens"
37
+ BATCH = "batch"
38
+ BLOCK_NONE = "BLOCK_NONE"
39
+ MODELINPUT = "modelInput"
40
+ RECORDID = "recordId"
41
+ TIMESTAMP = "timestamp"
42
+ CREATED = "created"
43
+
44
+ # NOTE(review): EXAMPLES is not referenced by any module visible alongside this file - confirm it is still needed
45
+ EXAMPLES = "examples"
46
+
47
+ # Model-family and API-provider tags; quick.py compares api_provider / api_call values and model names against these
48
+ CLAUDE = "claude"
49
+ GPT = "gpt"
50
+ MISTRAL = "mistral"
51
+ ANTHROPIC = "anthropic"
52
+ OPENAI = "openai"
53
+ BEDROCK = "bedrock"
54
+ GEMINI = "gemini"
55
+ DEEPSEEK = "deepseek"
56
+
57
+
58
+ # NOTE(review): PX / ENTROPY / COUNT are not referenced by any module visible alongside this file; presumably statistics column names - confirm
59
+ PX = "p(x)"
60
+ ENTROPY = "entropy"
61
+ COUNT = "count"
62
+
63
+ # AWS Bedrock batch-job keys used by bedrock_generation (job request fields and output-record fields)
64
+ S3URI = "s3Uri"
65
+ JOB_ARN = "jobArn"
66
+
67
+ S3_OUTPUT_DATA_CONFIG = "s3OutputDataConfig"
68
+ S3_INPUT_DATA_CONFIG = "s3InputDataConfig"
69
+ MODEL_OUTPUT = "modelOutput"
70
+ GENERATION = "generation"
@@ -0,0 +1,49 @@
1
+ """
2
+ Generates texts with DeepSeek models using the Fireworks AI API (through the OpenAI client).
3
+ """
4
+ import os
5
+ from openai import OpenAI
6
+ from dotenv import load_dotenv
7
+ from dactyl_generation.constants import *
8
+ import pandas as pd
9
+ from typing import List
10
+ load_dotenv()  # load variables from a .env file before the API key lookup below
11
+ # OpenAI SDK client pointed at the Fireworks AI inference endpoint; os.environ[...] raises KeyError at import time if FIREWORKS_AI_API_KEY is unset.
12
+ DEEPSEEK_CLIENT = OpenAI(
13
+ api_key=os.environ["FIREWORKS_AI_API_KEY"],
14
+ base_url="https://api.fireworks.ai/inference/v1"
15
+ )
16
+
17
+
18
def prompt(messages: List[dict], model: str, temperature:float , top_p:float, max_completion_tokens:int=512,number_of_responses:int=1) -> list:
    """
    Send one list of messages to the chat completions endpoint and collect the replies.

    Args:
        messages: List of messages to pass in.
        model: model name.
        temperature: temperature, value from 0 to 2.
        top_p: top-p value from 0 to 1.
        max_completion_tokens: maximum number of completion tokens
        number_of_responses: number of completions requested (passed as ``n``).

    Returns:
        responses: List with the whitespace-stripped content of every returned choice.
    """
    request_options = {
        "messages": messages,
        "model": model,
        "temperature": temperature,
        "top_p": top_p,
        "max_completion_tokens": max_completion_tokens,
        "n": number_of_responses,
    }
    api_response = DEEPSEEK_CLIENT.chat.completions.create(**request_options)
    return [choice.message.content.strip() for choice in api_response.choices]
47
+
48
+
49
+
@@ -0,0 +1,81 @@
1
+ """
2
+ This module helps in generating texts using the Gemini API.
3
+
4
+ !!! danger
5
+ For Gemini, all safety filters have been turned off!
6
+ """
7
+ from google import genai
8
+ from google.genai import types
9
+ import os
10
+ from dotenv import load_dotenv
11
+ from typing import List
12
+ import typing_extensions as typing
13
+ from dactyl_generation.constants import *
14
+ from pydantic import BaseModel
15
+
16
+
17
+ load_dotenv()  # load variables from a .env file before the API key lookup below
18
+ GOOGLE_CLIENT = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))  # os.getenv yields None (no KeyError) when the variable is unset
19
+ # Safety settings passed with every request in prompt(): each listed harm category gets the BLOCK_NONE threshold.
20
+ GEMINI_SAFETY_SETTINGS = [
21
+ types.SafetySetting(
22
+ category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
23
+ threshold=BLOCK_NONE
24
+ ),
25
+ types.SafetySetting(
26
+ category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
27
+ threshold=BLOCK_NONE
28
+ ),
29
+ types.SafetySetting(
30
+ category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
31
+ threshold=BLOCK_NONE
32
+ ),
33
+ types.SafetySetting(
34
+ category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
35
+ threshold=BLOCK_NONE
36
+ ),
37
+ types.SafetySetting(
38
+ category=types.HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY,
39
+ threshold=BLOCK_NONE
40
+ )
41
+ ]
42
+
43
+ class GeneratedResponse(BaseModel):  # pydantic model with a single string field; NOTE(review): not referenced by prompt() in this module - confirm it is still needed
44
+ text: str
45
+
46
def prompt(messages: List[dict], model_name: str, temperature: float, top_p: float, max_completion_tokens: int) -> str:
    """
    Send one request to a Gemini model and return the generated text.

    The content of every SYSTEM message is passed as the system instruction; the
    content of every other message, in order, is passed as the request contents.
    The module-level safety settings are applied to the request.

    Args:
        messages: List of OpenAI messages
        model_name (str): Name of model.
        temperature (float): Temperature to pass.
        top_p (float): Top-p value.
        max_completion_tokens: maximum number of tokens to generate

    Returns:
        text: Generation output.
    """
    system_instructions = [message[CONTENT] for message in messages if message[ROLE] == SYSTEM]
    user_instructions = [message[CONTENT] for message in messages if message[ROLE] != SYSTEM]

    prompt_config = types.GenerateContentConfig(
        system_instruction=system_instructions,
        max_output_tokens=max_completion_tokens,
        top_p=top_p,
        temperature=temperature,
        safety_settings=GEMINI_SAFETY_SETTINGS,
    )
    response = GOOGLE_CLIENT.models.generate_content(
        model=model_name,
        contents=user_instructions,
        config=prompt_config,
    )
    return response.text
79
+
80
+
81
+
@@ -0,0 +1,124 @@
1
+ """
2
+ Generates texts using the Mistral Batch API.
3
+ """
4
+ import copy
5
+
6
+ import mistralai.files
7
+ from mistralai import Mistral, File
8
+ from dotenv import load_dotenv
9
+ import os
10
+ from io import BytesIO
11
+ import json
12
+ import numpy as np
13
+ import pandas as pd
14
+ from typing import List, Tuple
15
+ from datetime import datetime, timezone
16
+ from dactyl_generation.constants import *
17
+
18
+ load_dotenv()  # load variables from a .env file before the API key lookup below
19
+ # Module-level Mistral SDK client; os.environ[...] raises KeyError at import time if MISTRAL_API_KEY is unset.
20
+ MISTRAL_CLIENT = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
21
+
22
+
23
def create_message_batch(file_name: str, prompts_df: pd.DataFrame) -> Tuple[List[dict], mistralai.models.UploadFileOut]:
    """
    Builds the batch input (one JSON line per row) and uploads it to the Mistral API.

    Each row of ``prompts_df`` becomes the BODY of one request, identified by a custom
    ID of the form ``request-<zero-padded row position>``.

    Args:
        file_name: Name of file in Mistral API to save as.
        prompts_df: DataFrame containing prompts and generation parameters

    Returns:
        tuple: List of requests sent, UploadFileOut object

    Raises:
        ValueError: if ``prompts_df`` has no rows.
    """
    records = prompts_df.to_dict(orient="records")
    if not records:
        # Previously this surfaced as an OverflowError from log10(0).
        raise ValueError("prompts_df must contain at least one prompt")

    # Pad IDs to the number of digits in the row count (same width as
    # int(log10(n)) + 1 for n >= 1).
    digits_length = len(str(len(records)))
    list_of_requests = [
        {CUSTOM_ID: f"request-{str(index).zfill(digits_length)}", BODY: record}
        for index, record in enumerate(records)
    ]
    # One JSON document per line, each line terminated by a newline.
    payload = "".join(json.dumps(request) + "\n" for request in list_of_requests).encode("utf-8")
    file = File(file_name=file_name, content=payload)
    return list_of_requests, MISTRAL_CLIENT.files.upload(file=file, purpose=BATCH)
48
+
49
+
50
def start_batch_job(input_file: mistralai.models.UploadFileOut, model: str) -> mistralai.models.BatchJobOut:
    """
    Create a chat-completions batch job for an already uploaded input file.

    The job is tagged with the metadata ``{"job_type": "testing"}``.

    Args:
        input_file: input file object to create job with
        model: model name to use for generation

    Returns:
        batch_job: Batch job object
    """
    job_options = {
        "input_files": [input_file.id],
        "model": model,
        "endpoint": "/v1/chat/completions",
        "metadata": {"job_type": "testing"},
    }
    return MISTRAL_CLIENT.batch.jobs.create(**job_options)
69
+
70
def create_batch_job(file_name: str, prompts_df: pd.DataFrame) -> dict:
    """
    Creates batch job for set of prompts given file name to save Mistral prompts to.

    All rows must use the same model, because a batch job is created for one model.

    Args:
        file_name: name of file to upload to Mistral API.
        prompts_df: DataFrame containing generation prompts and parameters.

    Returns:
        info: dictionary containing the batch job and input file (both dumped to
            JSON-compatible dictionaries), the submitted requests and the MISTRAL
            provider tag.

    Raises:
        ValueError: if the MODEL column does not contain exactly one distinct value.
    """
    models = prompts_df[MODEL].unique()
    # Raise explicitly: an assert would be skipped when Python runs with -O.
    if len(models) != 1:
        raise ValueError(
            f"prompts_df must contain exactly one distinct model, found {len(models)}"
        )
    model = models[0]
    prompts, input_file = create_message_batch(file_name, prompts_df)
    batch_job = start_batch_job(input_file, model)
    return {
        "batch_job": batch_job.model_dump(mode="json"),
        INPUT_FILE: input_file.model_dump(mode="json"),
        PROMPTS: prompts,
        API_CALL: MISTRAL,
    }
87
+
88
+
89
+
90
def get_batch_jobs():
    """
    List the Mistral batch jobs tagged with the metadata ``{"job_type": "testing"}``.

    Returns:
        jobs: the listing returned by the Mistral client.
    """
    metadata_filter = {"job_type": "testing"}
    return MISTRAL_CLIENT.batch.jobs.list(metadata=metadata_filter)
94
+
95
def get_batch_job_output(file_path: str) -> pd.DataFrame:
    """
    Gets batch job results using saved metadata from a local JSON file.

    Args:
        file_path: local JSON file containing output of the `create_batch_job` function

    Returns:
        df: pandas DataFrame of generations (custom ID, text, creation timestamp)
            left-joined with the submitted request bodies.

    Raises:
        RuntimeError: if the batch job has no output file.
    """
    with open(file_path, "r") as f:
        data = json.load(f)
    job_id = data["batch_job"]["id"]
    output_file = MISTRAL_CLIENT.batch.jobs.get(job_id=job_id).output_file
    if not output_file:
        raise RuntimeError(
            f"Batch job {job_id} has no output file; please check if job has completed."
        )
    content = MISTRAL_CLIENT.files.download(file_id=output_file).read().decode("utf-8")

    rows = list()
    # Parse line by line, splitting on "\n" only: str.splitlines() also breaks on
    # characters such as U+2028 that can occur unescaped inside a JSON string.
    for line in content.split("\n"):
        if not line.strip():
            continue
        response = json.loads(line)
        body = response[RESPONSE][BODY]
        rows.append({
            CUSTOM_ID: response[CUSTOM_ID],
            TEXT: body[CHOICES][0][MESSAGE][CONTENT],
            TIMESTAMP: str(datetime.fromtimestamp(body[CREATED], tz=timezone.utc)),
        })

    # Flatten each saved request into one row: its body fields plus its custom ID.
    raw_prompts = pd.DataFrame(
        [{**request[BODY], CUSTOM_ID: request[CUSTOM_ID]} for request in data[PROMPTS]]
    )
    # Explicit columns keep the merge key present even when there are no results.
    generations = pd.DataFrame(rows, columns=[CUSTOM_ID, TEXT, TIMESTAMP])
    return generations.merge(raw_prompts, on=CUSTOM_ID, how="left")
122
+
123
+
124
+
@@ -0,0 +1,112 @@
1
+ """
2
+ Generates texts using the OpenAI Batch API.
3
+ """
4
+ import os
5
+ from openai import OpenAI
6
+ from dotenv import load_dotenv
7
+ from dactyl_generation.constants import *
8
+ import pandas as pd
9
+ import json
10
+ import numpy as np
11
+ from io import BytesIO
12
+ from typing import List, Any
13
+ from datetime import datetime, timezone
14
+
15
+ load_dotenv()  # load variables from a .env file before the API key lookup below
16
+ # Module-level OpenAI SDK client; os.environ[...] raises KeyError at import time if OPENAI_API_KEY is unset.
17
+ OPENAI_CLIENT = OpenAI(
18
+ api_key=os.environ["OPENAI_API_KEY"] # the SDK reads this variable by default, so the argument could be omitted
19
+ )
20
+
21
def create_individual_request(custom_id: str, message_body: dict) -> dict:
    """
    Wrap one request body in the batch-file entry expected for chat completions.

    Args:
        custom_id: Custom ID of request (converted to a string)
        message_body: dictionary of a single request. This includes the messages, max_completion_token parameters etc.

    Returns:
        request: dictionary with the custom ID, the POST method, the
            ``/v1/chat/completions`` URL and the given body.
    """
    return {
        CUSTOM_ID: str(custom_id),
        "method": "POST",
        "url": "/v1/chat/completions",
        BODY: message_body,
    }
33
+
34
+
35
def create_batch_job(prompts_df: pd.DataFrame) -> dict:
    """
    Uploads the prompts as a batch input file and creates a batch job for it.

    Each row of ``prompts_df`` becomes the body of one chat-completions request with a
    custom ID of the form ``request-<zero-padded row position>``.

    Args:
        prompts_df: DataFrame where each row corresponds to an OpenAI API call.

    Returns:
        results: dictionary containing the batch job ID, the submitted requests and
            the OPENAI provider tag.

    Raises:
        ValueError: if ``prompts_df`` has no rows.
    """
    records = prompts_df.to_dict("records")
    if not records:
        # Previously this surfaced as an OverflowError from log10(0).
        raise ValueError("prompts_df must contain at least one prompt")

    # Pad IDs to the number of digits in the row count (same width as
    # int(log10(n)) + 1 for n >= 1).
    digits_length = len(str(len(records)))
    batch_requests = [
        create_individual_request(f"request-{str(i).zfill(digits_length)}", record)
        for i, record in enumerate(records)
    ]
    # The batch input is JSON Lines, uploaded from memory.
    payload = "\n".join(json.dumps(request) for request in batch_requests)
    buffer = BytesIO(payload.encode("utf-8"))

    batch_file = OPENAI_CLIENT.files.create(
        file=buffer,
        purpose="batch"
    )
    batch_job = OPENAI_CLIENT.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    return {
        # Despite the key's name, the stored value is the batch job ID; it is what
        # get_batch_job_output passes to batches.retrieve.
        RESULT_FILE_ID: batch_job.id,
        INPUT_FILE: batch_requests,
        API_CALL: OPENAI
    }
77
+
78
+
79
def get_batch_job_output(file_path: str) -> pd.DataFrame:
    """
    Download the results of a saved batch job and join them with the submitted requests.

    Args:
        file_path: local JSON file containing output of the `create_batch_job` function

    Returns:
        df: pandas DataFrame with the generated text, custom ID and creation
            timestamp of every result, left-joined with the saved requests.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)

    batch_job = OPENAI_CLIENT.batches.retrieve(data[RESULT_FILE_ID])
    raw_output = OPENAI_CLIENT.files.content(batch_job.output_file_id).content
    output_frame = pd.read_json(BytesIO(raw_output), lines=True)

    records = [
        {
            TEXT: response[BODY][CHOICES][0][MESSAGE][CONTENT],
            CUSTOM_ID: custom_id,
            TIMESTAMP: str(datetime.fromtimestamp(response[BODY][CREATED], tz=timezone.utc)),
        }
        for response, custom_id in zip(output_frame[RESPONSE], output_frame[CUSTOM_ID])
    ]
    submitted = pd.DataFrame(data[INPUT_FILE])
    return pd.DataFrame(records).merge(submitted, on=CUSTOM_ID, how='left')
107
+
108
+
109
+
110
+
111
+
112
+
@@ -0,0 +1,138 @@
1
+ """
2
+ Generates texts quickly using wrapper functions to redirect to appropriate model functions.
3
+ """
4
+ import json
5
+ from dactyl_generation import openai_generation, anthropic_generation, mistral_generation
6
+ from dactyl_generation import google_generation, bedrock_generation, deepseek_generation
7
+ from dactyl_generation.constants import *
8
+ import numpy as np
9
+ import tempfile
10
+ import time
11
+ import pandas as pd
12
+ from tqdm import tqdm
13
+ from datetime import datetime, timezone
14
+ from typing import List
15
+
16
+
17
def generate_texts_using_batch(output_path: str, prompts_df: pd.DataFrame, api_provider: str, aws_args: dict = None) -> None:
    """
    Submits the prompts as a batch job to the selected provider.
    The metadata returned by the provider module is saved to output_path as JSON.

    Args:
        output_path: output path to save prompt metadata
        prompts_df: prompts for each generation
        api_provider: Batch API provider to route request to.
        aws_args: dictionary of keyword arguments forwarded to
            `bedrock_generation.create_batch_job`; required for the Bedrock provider.

    Returns:
        None

    Raises:
        TypeError: if the Bedrock provider is selected without ``aws_args``.
        Exception: if ``api_provider`` is not a supported batch provider.
    """
    if api_provider == ANTHROPIC:
        parameters = anthropic_generation.create_batch_job(prompts_df)
    elif api_provider == OPENAI:
        parameters = openai_generation.create_batch_job(prompts_df)
    elif api_provider == MISTRAL:
        # Mistral needs a name for the uploaded input file; any unique name works.
        # NOTE(review): _get_candidate_names is a private tempfile helper.
        file_name = f"{next(tempfile._get_candidate_names())}.jsonl"
        parameters = mistral_generation.create_batch_job(file_name, prompts_df)
    elif api_provider == BEDROCK:
        if aws_args is None:
            # Same exception type as unpacking **None, but with a usable message.
            raise TypeError("aws_args must be provided when api_provider is 'bedrock'")
        parameters = bedrock_generation.create_batch_job(prompts_df, **aws_args)
    else:
        raise Exception("Model type not supported for batch inference.")

    # Written once for every provider; the saved file is what get_batch_job_results reads.
    with open(output_path, 'w') as file:
        json.dump(parameters, file, indent=4)
55
+
56
+
57
def get_batch_job_results(file_path: str, output_path: str) -> None:
    """
    Fetches the results of a saved batch job and saves the generations as a JSON file.

    The provider is read from the API_CALL entry of the saved metadata and the
    matching module's `get_batch_job_output` is used to fetch the results.

    Args:
        file_path: File path containing batch data saved from `generate_texts_using_batch`.
        output_path: Output JSON path to save generations.

    Returns:
        None

    Raises:
        Exception: if the saved provider tag is not supported.
    """
    with open(file_path) as file:
        api_call = json.load(file)[API_CALL]

    fetchers = {
        ANTHROPIC: anthropic_generation.get_batch_job_output,
        MISTRAL: mistral_generation.get_batch_job_output,
        OPENAI: openai_generation.get_batch_job_output,
        BEDROCK: bedrock_generation.get_batch_job_output,
    }
    if api_call not in fetchers:
        raise Exception(f"API call {api_call} not supported")

    df = fetchers[api_call](file_path)
    # orient='records' never writes the index, and pandas releases before 2.1
    # reject index=False for this orient, so the argument is not passed.
    df.to_json(output_path, orient='records', indent=4)
82
+
83
+
84
def generate_texts_streaming(model: str, prompts_df: pd.DataFrame, output_path: str, max_completion_tokens: int =512, category: str ="", wait_after_every:int =20, sleep_time: int =30) -> None:
    """
    Generates one text per row of ``prompts_df`` with live (non-batch) API calls.

    The provider is chosen by searching the model name for the BEDROCK, DEEPSEEK and
    GEMINI tags, in that order. The rows generated so far are written to
    ``output_path`` as JSON after every request, so partial results survive a failure.

    Args:
        model: name of model
        prompts_df: dataframe containing the MESSAGES, TEMPERATURE and TOP_P columns
        output_path: output path to save JSON file
        max_completion_tokens: maximum number of tokens per generation
        category: value stored in the "category" field of every output row
        wait_after_every: Pauses generation after a certain amount of requests
        sleep_time: Sleeps for a certain amount of time in seconds

    Returns:
        None

    Raises:
        Exception: if the model name matches none of the supported providers.
    """
    rows = list()

    messages = prompts_df[MESSAGES].to_list()
    temperatures = prompts_df[TEMPERATURE].to_list()
    top_ps = prompts_df[TOP_P].to_list()
    calls = zip(messages, temperatures, top_ps)
    for index, (message_batch, temperature, top_p) in enumerate(tqdm(calls, total=len(prompts_df))):
        row = {
            PROMPT: message_batch,
            TEMPERATURE: temperature,
            TOP_P: top_p,
            MODEL: model,
            TARGET: 1,
            "category": category,
        }
        if BEDROCK in model:
            text = bedrock_generation.prompt(message_batch, model, temperature, top_p, max_completion_tokens=max_completion_tokens)
        elif DEEPSEEK in model:
            # The DeepSeek helper returns a list of responses; one is requested by default.
            text = deepseek_generation.prompt(message_batch, model, temperature, top_p, max_completion_tokens=max_completion_tokens)[0]
        elif GEMINI in model:
            text = google_generation.prompt(message_batch, model, temperature, top_p, max_completion_tokens)
        else:
            raise Exception("Model type not supported")
        row[TEXT] = text
        row[TIMESTAMP] = str(datetime.now(timezone.utc))
        rows.append(row)

        # Checkpoint after every request. orient="records" never writes the index,
        # and pandas releases before 2.1 reject index=False for this orient.
        pd.DataFrame(rows).to_json(output_path, orient="records", indent=4)
        if (index % wait_after_every == 0) and (index > 0):
            time.sleep(sleep_time)

    # Final write also covers the case of an empty prompts_df.
    pd.DataFrame(rows).to_json(output_path, orient="records", indent=4)
137
+
138
+
@@ -0,0 +1,47 @@
1
+ Metadata-Version: 2.4
2
+ Name: dactyl_generation
3
+ Version: 0.0.1
4
+ Summary: LLM helper package to generate AI-generated texts.
5
+ Author: Shantanu Thorat
6
+ License: MIT
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: anthropic
13
+ Requires-Dist: litellm
14
+ Requires-Dist: mistralai
15
+ Requires-Dist: numpy==1.26.4
16
+ Requires-Dist: openai
17
+ Requires-Dist: pandas
18
+ Requires-Dist: protobuf
19
+ Requires-Dist: python-dotenv
20
+ Requires-Dist: Requests
21
+ Requires-Dist: tqdm
22
+ Requires-Dist: typing_extensions
23
+ Requires-Dist: google-generativeai
24
+ Requires-Dist: boto3
25
+ Dynamic: license-file
26
+
27
+ # DACTYL-Generation
28
+
29
+ A Python package to generate LLM data from various APIs.
30
+
31
+ ## Installation
32
+ ```bash
33
+ pip install git+https://github.com/ShantanuT01/dactyl_generation.git
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ Load environment variables (your API keys) first, before importing the library.
39
+ ```python
40
+ # load environment variables first
41
+ from dotenv import load_dotenv
42
+ load_dotenv()
43
+
44
+ # now import library
45
+ from dactyl_generation.quick import *
46
+
47
+ ```
@@ -0,0 +1,17 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ dactyl_generation/__init__.py
5
+ dactyl_generation/anthropic_generation.py
6
+ dactyl_generation/bedrock_generation.py
7
+ dactyl_generation/constants.py
8
+ dactyl_generation/deepseek_generation.py
9
+ dactyl_generation/google_generation.py
10
+ dactyl_generation/mistral_generation.py
11
+ dactyl_generation/openai_generation.py
12
+ dactyl_generation/quick.py
13
+ dactyl_generation.egg-info/PKG-INFO
14
+ dactyl_generation.egg-info/SOURCES.txt
15
+ dactyl_generation.egg-info/dependency_links.txt
16
+ dactyl_generation.egg-info/requires.txt
17
+ dactyl_generation.egg-info/top_level.txt
@@ -0,0 +1,13 @@
1
+ anthropic
2
+ litellm
3
+ mistralai
4
+ numpy==1.26.4
5
+ openai
6
+ pandas
7
+ protobuf
8
+ python-dotenv
9
+ Requests
10
+ tqdm
11
+ typing_extensions
12
+ google-generativeai
13
+ boto3
@@ -0,0 +1 @@
1
+ dactyl_generation
@@ -0,0 +1,36 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dactyl_generation"
7
+ version = "0.0.1"
8
+ authors = [
9
+ { name="Shantanu Thorat"},
10
+ ]
11
+ description = "LLM helper package to generate AI-generated texts."
12
+ readme = "README.md"
13
+ requires-python = ">=3.8"
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "Operating System :: OS Independent",
17
+ ]
18
+ dependencies = [
19
+ "anthropic",
20
+ "litellm",
21
+ "mistralai",
22
+ "numpy==1.26.4",
23
+ "openai",
24
+ "pandas",
25
+ "protobuf",
26
+ "python-dotenv",
27
+ "Requests",
28
+ "tqdm",
29
+ "typing_extensions",
30
+ "google-generativeai",
31
+ "boto3"
32
+ ]
33
+ license = {text = "MIT"}
34
+
35
+
36
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+