parallex 0.2.0__tar.gz → 0.3.0__tar.gz
- {parallex-0.2.0 → parallex-0.3.0}/PKG-INFO +26 -1
- {parallex-0.2.0 → parallex-0.3.0}/README.md +25 -0
- parallex-0.3.0/parallex/ai/output_processor.py +57 -0
- parallex-0.3.0/parallex/ai/uploader.py +157 -0
- parallex-0.3.0/parallex/models/parallex_prompts_callable_output.py +13 -0
- parallex-0.3.0/parallex/models/prompt_response.py +6 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/parallex.py +102 -6
- {parallex-0.2.0 → parallex-0.3.0}/parallex/utils/constants.py +1 -1
- {parallex-0.2.0 → parallex-0.3.0}/pyproject.toml +1 -1
- parallex-0.2.0/parallex/ai/output_processor.py +0 -25
- parallex-0.2.0/parallex/ai/uploader.py +0 -91
- {parallex-0.2.0 → parallex-0.3.0}/LICENSE +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/__init__.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/ai/batch_processor.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/ai/open_ai_client.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/file_management/converter.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/file_management/file_finder.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/file_management/remote_file_handler.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/file_management/utils.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/batch_file.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/image_file.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/page_response.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/parallex_callable_output.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/raw_file.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/upload_batch.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/utils/logger.py +0 -0
{parallex-0.2.0 → parallex-0.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: parallex
-Version: 0.2.0
+Version: 0.3.0
 Summary: PDF to markdown using Azure OpenAI batch processing
 Home-page: https://github.com/Summed-AI/parallex
 Author: Jeff Hostetler
@@ -96,3 +96,28 @@ class PageResponse(BaseModel):
     """
 ```
 
+### Batch processing for list of prompts
+If you do not need to process images, but just want to process prompts using the Batch API,
+you can call:
+```python
+response_data: ParallexPromptsCallableOutput = await parallex_simple_prompts(
+    model=model,
+    prompts=["Some prompt", "Some other prompt"],
+    post_process_callable=example_post_process
+)
+responses = response_data.responses
+```
+This will create a batch that includes all the prompts in `prompts`, and responses can be tied back to the prompt by index.
+
+Responses have the following structure:
+```python
+class ParallexPromptsCallableOutput(BaseModel):
+    original_prompts: list[str] = Field(description="List of given prompts")
+    trace_id: UUID = Field(description="Unique trace for each file")
+    responses: list[PromptResponse] = Field(description="List of PromptResponse objects")
+
+class PromptResponse(BaseModel):
+    output_content: str = Field(description="Response from the model")
+    prompt_index: int = Field(description="Index corresponding to the given prompts")
+```
+
{parallex-0.2.0 → parallex-0.3.0}/README.md
@@ -75,3 +75,28 @@ class PageResponse(BaseModel):
     If unable to parse, return an empty string.
     """
 ```
+
+### Batch processing for list of prompts
+If you do not need to process images, but just want to process prompts using the Batch API,
+you can call:
+```python
+response_data: ParallexPromptsCallableOutput = await parallex_simple_prompts(
+    model=model,
+    prompts=["Some prompt", "Some other prompt"],
+    post_process_callable=example_post_process
+)
+responses = response_data.responses
+```
+This will create a batch that includes all the prompts in `prompts`, and responses can be tied back to the prompt by index.
+
+Responses have the following structure:
+```python
+class ParallexPromptsCallableOutput(BaseModel):
+    original_prompts: list[str] = Field(description="List of given prompts")
+    trace_id: UUID = Field(description="Unique trace for each file")
+    responses: list[PromptResponse] = Field(description="List of PromptResponse objects")
+
+class PromptResponse(BaseModel):
+    output_content: str = Field(description="Response from the model")
+    prompt_index: int = Field(description="Index corresponding to the given prompts")
+```
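
Note: the README snippet above references an `example_post_process` callable without defining it; the name is illustrative. Since `parallex_simple_prompts` invokes it as `post_process_callable(output=callable_output)` (see the parallex.py diff below), a minimal compatible sketch looks like:

```python
from parallex.models.parallex_prompts_callable_output import ParallexPromptsCallableOutput


def example_post_process(output: ParallexPromptsCallableOutput) -> None:
    # Pair each response with its originating prompt via prompt_index.
    for response in output.responses:
        prompt = output.original_prompts[response.prompt_index]
        print(f"{prompt!r} -> {response.output_content!r}")
```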
parallex-0.3.0/parallex/ai/output_processor.py
@@ -0,0 +1,57 @@
+import json
+from typing import TypeVar, Callable
+
+from parallex.ai.open_ai_client import OpenAIClient
+from parallex.models.page_response import PageResponse
+from parallex.models.prompt_response import PromptResponse
+from parallex.utils.constants import CUSTOM_ID_DELINEATOR
+
+
+async def process_images_output(
+    client: OpenAIClient, output_file_id: str
+) -> list[PageResponse]:
+    return await _process_output(
+        client,
+        output_file_id,
+        lambda content, identifier: PageResponse(
+            output_content=content, page_number=int(identifier)
+        ),
+    )
+
+
+async def process_prompts_output(
+    client: OpenAIClient, output_file_id: str
+) -> list[PromptResponse]:
+    """Gets content from completed Batch to create PromptResponse with LLM answers to given prompts"""
+    return await _process_output(
+        client,
+        output_file_id,
+        lambda content, identifier: PromptResponse(
+            output_content=content, prompt_index=int(identifier)
+        ),
+    )
+
+
+ResponseType = TypeVar("ResponseType")
+
+
+async def _process_output(
+    client: OpenAIClient,
+    output_file_id: str,
+    response_builder: Callable[[str, str], ResponseType],
+) -> list[ResponseType]:
+    file_response = await client.retrieve_file(output_file_id)
+    raw_responses = file_response.text.strip().split("\n")
+    responses = []
+
+    for raw_response in raw_responses:
+        json_response = json.loads(raw_response)
+        custom_id = json_response["custom_id"]
+        identifier = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
+        output_content = json_response["response"]["body"]["choices"][0]["message"][
+            "content"
+        ]
+        response = response_builder(output_content, identifier)
+        responses.append(response)
+
+    return responses
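For context on what `_process_output` consumes: each line of the Batch output file is a JSON object wrapping one chat completion, keyed by the `custom_id` written at upload time. A minimal sketch of the parsing path, assuming a hypothetical delineator value of `"--"` (the real value lives in `parallex.utils.constants.CUSTOM_ID_DELINEATOR` and is not shown in this diff):

```python
import json

CUSTOM_ID_DELINEATOR = "--"  # hypothetical value, for illustration only

# One trimmed-down line of an Azure OpenAI Batch output file.
raw_response = json.dumps({
    "custom_id": f"3f2a9c{CUSTOM_ID_DELINEATOR}4.jsonl",
    "response": {"body": {"choices": [{"message": {"content": "## Page 4"}}]}},
})

json_response = json.loads(raw_response)
identifier = json_response["custom_id"].split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
content = json_response["response"]["body"]["choices"][0]["message"]["content"]
assert identifier == "4" and content == "## Page 4"
```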
parallex-0.3.0/parallex/ai/uploader.py
@@ -0,0 +1,157 @@
+import base64
+import json
+import os
+from uuid import UUID
+
+from parallex.ai.open_ai_client import OpenAIClient
+from parallex.file_management.utils import file_in_temp_dir
+from parallex.models.batch_file import BatchFile
+from parallex.models.image_file import ImageFile
+from parallex.utils.constants import CUSTOM_ID_DELINEATOR
+
+MAX_FILE_SIZE = 180 * 1024 * 1024  # 180 MB in bytes. Limit for Azure is 200MB.
+
+
+async def upload_images_for_processing(
+    client: OpenAIClient,
+    image_files: list[ImageFile],
+    temp_directory: str,
+    prompt_text: str,
+) -> list[BatchFile]:
+    """Base64 encodes image, converts to expected jsonl format and uploads"""
+    trace_id = image_files[0].trace_id
+    current_index = 0
+    batch_files = []
+    upload_file_location = file_in_temp_dir(
+        directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
+    )
+
+    for image_file in image_files:
+        if await _approaching_file_size_limit(upload_file_location):
+            """When approaching upload file limit, upload and start new file"""
+            batch_file = await _create_batch_file(
+                client, trace_id, upload_file_location
+            )
+            batch_files.append(batch_file)
+            upload_file_location = await _increment_batch_file_index(
+                current_index, temp_directory, trace_id
+            )
+
+        with open(image_file.path, "rb") as image:
+            base64_encoded_image = base64.b64encode(image.read()).decode("utf-8")
+
+        prompt_custom_id = (
+            f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
+        )
+        jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
+        with open(upload_file_location, "a") as jsonl_file:
+            jsonl_file.write(json.dumps(jsonl) + "\n")
+    batch_file = await _create_batch_file(client, trace_id, upload_file_location)
+    batch_files.append(batch_file)
+    return batch_files
+
+
+async def upload_prompts_for_processing(
+    client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
+) -> list[BatchFile]:
+    """Creates jsonl file and uploads for processing"""
+    current_index = 0
+    batch_files = []
+
+    upload_file_location = await set_file_location(
+        current_index, temp_directory, trace_id
+    )
+    for index, prompt in enumerate(prompts):
+        if await _approaching_file_size_limit(upload_file_location):
+            """When approaching upload file limit, upload and start new file"""
+            batch_file = await _create_batch_file(
+                client, trace_id, upload_file_location
+            )
+            batch_files.append(batch_file)
+            upload_file_location = await _increment_batch_file_index(
+                current_index, temp_directory, trace_id
+            )
+
+        prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
+        jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
+        with open(upload_file_location, "a") as jsonl_file:
+            jsonl_file.write(json.dumps(jsonl) + "\n")
+    batch_file = await _create_batch_file(client, trace_id, upload_file_location)
+    batch_files.append(batch_file)
+    return batch_files
+
+
+async def set_file_location(
+    current_index: int, temp_directory: str, trace_id: UUID
+) -> str:
+    return file_in_temp_dir(
+        directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
+    )
+
+
+async def _approaching_file_size_limit(upload_file_location: str) -> bool:
+    return (
+        os.path.exists(upload_file_location)
+        and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
+    )
+
+
+async def _increment_batch_file_index(
+    current_index: int, temp_directory: str, trace_id: UUID
+) -> str:
+    current_index += 1
+    upload_file_location = await set_file_location(
+        current_index, temp_directory, trace_id
+    )
+    return upload_file_location
+
+
+async def _create_batch_file(
+    client: OpenAIClient, trace_id: UUID, upload_file_location: str
+) -> BatchFile:
+    file_response = await client.upload(upload_file_location)
+    return BatchFile(
+        id=file_response.id,
+        name=file_response.filename,
+        purpose=file_response.purpose,
+        status=file_response.status,
+        trace_id=trace_id,
+    )
+
+
+def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
+    return {
+        "custom_id": prompt_custom_id,
+        "method": "POST",
+        "url": "/chat/completions",
+        "body": {
+            "model": os.getenv("AZURE_API_DEPLOYMENT"),
+            "messages": [{"role": "user", "content": prompt_text}],
+        },
+    }
+
+
+def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
+    return {
+        "custom_id": prompt_custom_id,
+        "method": "POST",
+        "url": "/chat/completions",
+        "body": {
+            "model": os.getenv("AZURE_API_DEPLOYMENT"),
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt_text},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{encoded_image}"
+                            },
+                        },
+                    ],
+                }
+            ],
+            "max_tokens": 2000,
+        },
+    }
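Each line appended by `upload_prompts_for_processing` is a self-contained Batch API request. A sketch of one serialized line as `_simple_jsonl_format` would produce it, assuming an illustrative deployment name in `AZURE_API_DEPLOYMENT` and a hypothetical custom id:

```python
import json
import os

os.environ.setdefault("AZURE_API_DEPLOYMENT", "gpt-4o-batch")  # illustrative name

line = {
    "custom_id": "3f2a9c--0.jsonl",  # hypothetical trace id, delineator, and index
    "method": "POST",
    "url": "/chat/completions",
    "body": {
        "model": os.getenv("AZURE_API_DEPLOYMENT"),
        "messages": [{"role": "user", "content": "Some prompt"}],
    },
}
print(json.dumps(line))  # written as one line of the .jsonl upload file
```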
parallex-0.3.0/parallex/models/parallex_prompts_callable_output.py
@@ -0,0 +1,13 @@
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+from parallex.models.prompt_response import PromptResponse
+
+
+class ParallexPromptsCallableOutput(BaseModel):
+    original_prompts: list[str] = Field(description="List of given prompts")
+    trace_id: UUID = Field(description="Unique trace for each file")
+    responses: list[PromptResponse] = Field(
+        description="List of PromptResponse objects"
+    )
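A quick sketch of how this model composes with `PromptResponse` (values illustrative):

```python
import uuid

from parallex.models.parallex_prompts_callable_output import (
    ParallexPromptsCallableOutput,
)
from parallex.models.prompt_response import PromptResponse

output = ParallexPromptsCallableOutput(
    original_prompts=["Some prompt", "Some other prompt"],
    trace_id=uuid.uuid4(),
    responses=[
        PromptResponse(output_content="Answer one", prompt_index=0),
        PromptResponse(output_content="Answer two", prompt_index=1),
    ],
)
# prompt_index maps each response back to its position in original_prompts.
assert output.original_prompts[output.responses[1].prompt_index] == "Some other prompt"
```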
{parallex-0.2.0 → parallex-0.3.0}/parallex/parallex.py
@@ -1,17 +1,24 @@
 import asyncio
 import tempfile
+import uuid
 from typing import Callable, Optional
 from uuid import UUID
 
 from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
 from parallex.ai.open_ai_client import OpenAIClient
-from parallex.ai.output_processor import process_output
-from parallex.ai.uploader import upload_images_for_processing
+from parallex.ai.output_processor import process_images_output, process_prompts_output
+from parallex.ai.uploader import (
+    upload_images_for_processing,
+    upload_prompts_for_processing,
+)
 from parallex.file_management.converter import convert_pdf_to_images
 from parallex.file_management.file_finder import add_file_to_temp_directory
 from parallex.file_management.remote_file_handler import RemoteFileHandler
 from parallex.models.batch_file import BatchFile
 from parallex.models.parallex_callable_output import ParallexCallableOutput
+from parallex.models.parallex_prompts_callable_output import (
+    ParallexPromptsCallableOutput,
+)
 from parallex.models.upload_batch import UploadBatch
 from parallex.utils.constants import DEFAULT_PROMPT
 from parallex.utils.logger import logger, setup_logger
@@ -40,9 +47,92 @@ async def parallex(
     except Exception as e:
         logger.error(f"Error occurred: {e}")
     finally:
-        for file in remote_file_handler.created_files:
-            logger.info(f"deleting - {file}")
-            await open_ai_client.delete_file(file)
+        await _delete_associated_files(open_ai_client, remote_file_handler)
+
+
+async def parallex_simple_prompts(
+    model: str,
+    prompts: list[str],
+    post_process_callable: Optional[Callable[..., None]] = None,
+    log_level: Optional[str] = "ERROR",
+    concurrency: Optional[int] = 20,
+) -> ParallexPromptsCallableOutput:
+    setup_logger(log_level)
+    remote_file_handler = RemoteFileHandler()
+    open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
+    try:
+        return await _prompts_execute(
+            open_ai_client=open_ai_client,
+            prompts=prompts,
+            post_process_callable=post_process_callable,
+            concurrency=concurrency,
+        )
+    except Exception as e:
+        logger.error(f"Error occurred: {e}")
+    finally:
+        await _delete_associated_files(open_ai_client, remote_file_handler)
+
+
+async def _prompts_execute(
+    open_ai_client: OpenAIClient,
+    prompts: list[str],
+    post_process_callable: Optional[Callable[..., None]] = None,
+    concurrency: Optional[int] = 20,
+):
+    with tempfile.TemporaryDirectory() as temp_directory:
+        trace_id = uuid.uuid4()
+        batch_files = await upload_prompts_for_processing(
+            client=open_ai_client,
+            prompts=prompts,
+            temp_directory=temp_directory,
+            trace_id=trace_id,
+        )
+        start_batch_semaphore = asyncio.Semaphore(concurrency)
+        start_batch_tasks = []
+        for file in batch_files:
+            batch_task = asyncio.create_task(
+                _create_batch_jobs(
+                    batch_file=file,
+                    client=open_ai_client,
+                    trace_id=trace_id,
+                    semaphore=start_batch_semaphore,
+                )
+            )
+            start_batch_tasks.append(batch_task)
+        batch_jobs = await asyncio.gather(*start_batch_tasks)
+
+        prompt_tasks = []
+        for batch in batch_jobs:
+            logger.info(
+                f"waiting for batch to complete - {batch.id} - {batch.trace_id}"
+            )
+            page_task = asyncio.create_task(
+                wait_for_batch_completion(client=open_ai_client, batch=batch)
+            )
+            prompt_tasks.append(page_task)
+
+        output_file_ids = await asyncio.gather(*prompt_tasks)
+
+        prompts_output = []
+        for output_file_id in output_file_ids:
+            logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
+            prompts_output.append(
+                await process_prompts_output(
+                    client=open_ai_client, output_file_id=output_file_id
+                )
+            )
+
+        flat_prompts = [page for batch in prompts_output for page in batch]
+
+        sorted_responses = sorted(flat_prompts, key=lambda x: x.prompt_index)
+        callable_output = ParallexPromptsCallableOutput(
+            original_prompts=prompts,
+            trace_id=trace_id,
+            responses=sorted_responses,
+        )
+        if post_process_callable is not None:
+            post_process_callable(output=callable_output)
+        return callable_output
 
 
 async def _execute(
@@ -115,7 +205,7 @@ async def _wait_and_create_pages(
     logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
     output_file_id = await wait_for_batch_completion(client=client, batch=batch)
     logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
-    page_responses = await process_output(
+    page_responses = await process_images_output(
         client=client, output_file_id=output_file_id
     )
     return page_responses
@@ -132,3 +222,9 @@ async def _create_batch_jobs(
         client=client, file_id=batch_file.id, trace_id=trace_id
     )
     return upload_batch
+
+
+async def _delete_associated_files(open_ai_client, remote_file_handler):
+    for file in remote_file_handler.created_files:
+        logger.info(f"deleting - {file}")
+        await open_ai_client.delete_file(file)
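Putting the new prompt path together, a minimal driver might look like the following; the deployment/model name and prompts are illustrative, and Azure credentials are assumed to be configured in the environment as `OpenAIClient` expects:

```python
import asyncio

from parallex.parallex import parallex_simple_prompts


def example_post_process(output) -> None:
    # Invoked as post_process_callable(output=...) once all batches complete.
    for response in output.responses:
        print(response.prompt_index, response.output_content)


async def main() -> None:
    result = await parallex_simple_prompts(
        model="gpt-4o-batch",  # illustrative Azure deployment name
        prompts=["Some prompt", "Some other prompt"],
        post_process_callable=example_post_process,
    )
    print(result.trace_id)


asyncio.run(main())
```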
{parallex-0.2.0 → parallex-0.3.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "parallex"
-version = "0.2.0"
+version = "0.3.0"
 description = "PDF to markdown using Azure OpenAI batch processing"
 authors = ["Jeff Hostetler <jeff@summed.ai>", "Kevin Bao <kevin@summed.ai>"]
 repository = "https://github.com/Summed-AI/parallex"
parallex-0.2.0/parallex/ai/output_processor.py
@@ -1,25 +0,0 @@
-import json
-
-from parallex.ai.open_ai_client import OpenAIClient
-from parallex.models.page_response import PageResponse
-from parallex.utils.constants import CUSTOM_ID_DELINEATOR
-
-
-async def process_output(
-    client: OpenAIClient, output_file_id: str
-) -> list[PageResponse]:
-    """Gets content from completed Batch to create PageResponse with created markdown"""
-    file_response = await client.retrieve_file(output_file_id)
-    raw_responses = file_response.text.strip().split("\n")
-
-    pages = []
-    for raw_response in raw_responses:
-        json_response = json.loads(raw_response)
-        custom_id = json_response["custom_id"]
-        page_number = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
-        output_content = json_response["response"]["body"]["choices"][0]["message"][
-            "content"
-        ]
-        page = PageResponse(output_content=output_content, page_number=int(page_number))
-        pages.append(page)
-    return pages
parallex-0.2.0/parallex/ai/uploader.py
@@ -1,91 +0,0 @@
-import base64
-import json
-import os
-
-from parallex.ai.open_ai_client import OpenAIClient
-from parallex.file_management.utils import file_in_temp_dir
-from parallex.models.batch_file import BatchFile
-from parallex.models.image_file import ImageFile
-from parallex.utils.constants import CUSTOM_ID_DELINEATOR
-
-MAX_FILE_SIZE = 150 * 1024 * 1024  # 150 MB in bytes
-
-
-async def upload_images_for_processing(
-    client: OpenAIClient,
-    image_files: list[ImageFile],
-    temp_directory: str,
-    prompt_text: str,
-) -> list[BatchFile]:
-    """Base64 encodes image, converts to expected jsonl format and uploads"""
-    trace_id = image_files[0].trace_id
-    current_index = 0
-    batch_files = []
-    upload_file_location = file_in_temp_dir(
-        directory=temp_directory, file_name=f"image-{trace_id}-{current_index}.jsonl"
-    )
-
-    for image_file in image_files:
-        if (
-            os.path.exists(upload_file_location)
-            and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
-        ):
-            """When approaching upload file limit, upload and start new file"""
-            batch_file = await _create_batch_file(
-                client, trace_id, upload_file_location
-            )
-            batch_files.append(batch_file)
-            current_index += 1
-            upload_file_location = file_in_temp_dir(
-                directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
-            )
-
-        with open(image_file.path, "rb") as image:
-            base64_encoded_image = base64.b64encode(image.read()).decode("utf-8")
-
-        prompt_custom_id = (
-            f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
-        )
-        jsonl = _jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
-        with open(upload_file_location, "a") as jsonl_file:
-            jsonl_file.write(json.dumps(jsonl) + "\n")
-    batch_file = await _create_batch_file(client, trace_id, upload_file_location)
-    batch_files.append(batch_file)
-    return batch_files
-
-
-async def _create_batch_file(client, trace_id, upload_file_location):
-    file_response = await client.upload(upload_file_location)
-    return BatchFile(
-        id=file_response.id,
-        name=file_response.filename,
-        purpose=file_response.purpose,
-        status=file_response.status,
-        trace_id=trace_id,
-    )
-
-
-def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
-    return {
-        "custom_id": prompt_custom_id,
-        "method": "POST",
-        "url": "/chat/completions",
-        "body": {
-            "model": os.getenv("AZURE_API_DEPLOYMENT"),
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt_text},
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/png;base64,{encoded_image}"
-                            },
-                        },
-                    ],
-                }
-            ],
-            "max_tokens": 2000,
-        },
-    }