parallex 0.3.3.tar.gz → 0.4.0.tar.gz
- {parallex-0.3.3 → parallex-0.4.0}/PKG-INFO +1 -1
- {parallex-0.3.3 → parallex-0.4.0}/parallex/ai/open_ai_client.py +0 -1
- {parallex-0.3.3 → parallex-0.4.0}/parallex/ai/output_processor.py +12 -6
- {parallex-0.3.3 → parallex-0.4.0}/parallex/ai/uploader.py +34 -6
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/page_response.py +1 -1
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/prompt_response.py +1 -1
- {parallex-0.3.3 → parallex-0.4.0}/parallex/parallex.py +15 -6
- {parallex-0.3.3 → parallex-0.4.0}/pyproject.toml +1 -1
- {parallex-0.3.3 → parallex-0.4.0}/LICENSE +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/README.md +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/__init__.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/ai/batch_processor.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/file_management/converter.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/file_management/file_finder.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/file_management/remote_file_handler.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/file_management/utils.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/batch_file.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/image_file.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/parallex_callable_output.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/parallex_prompts_callable_output.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/raw_file.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/upload_batch.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/utils/constants.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/utils/logger.py +0 -0

{parallex-0.3.3 → parallex-0.4.0}/parallex/ai/output_processor.py

@@ -1,5 +1,7 @@
 import json
-from typing import TypeVar, Callable
+from typing import TypeVar, Callable, Optional
+
+from pydantic import BaseModel
 
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.models.page_response import PageResponse
@@ -8,11 +10,12 @@ from parallex.utils.constants import CUSTOM_ID_DELINEATOR
 
 
 async def process_images_output(
-    client: OpenAIClient, output_file_id: str
+    client: OpenAIClient, output_file_id: str, model: Optional[type[BaseModel]] = None
 ) -> list[PageResponse]:
     return await _process_output(
         client,
         output_file_id,
+        model,
         lambda content, identifier: PageResponse(
             output_content=content, page_number=int(identifier)
         ),
@@ -20,12 +23,13 @@ async def process_images_output(
 
 
 async def process_prompts_output(
-    client: OpenAIClient, output_file_id: str
+    client: OpenAIClient, output_file_id: str, model: Optional[type[BaseModel]] = None
 ) -> list[PromptResponse]:
     """Gets content from completed Batch to create PromptResponse with LLM answers to given prompts"""
     return await _process_output(
         client,
         output_file_id,
+        model,
         lambda content, identifier: PromptResponse(
             output_content=content, prompt_index=int(identifier)
         ),
@@ -38,6 +42,7 @@ ResponseType = TypeVar("ResponseType")
 async def _process_output(
     client: OpenAIClient,
     output_file_id: str,
+    model: Optional[type[BaseModel]],
     response_builder: Callable[[str, str], ResponseType],
 ) -> list[ResponseType]:
     file_response = await client.retrieve_file(output_file_id)
@@ -48,9 +53,10 @@ async def _process_output(
         json_response = json.loads(raw_response)
         custom_id = json_response["custom_id"]
         identifier = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
-        output_content = json_response["response"]["body"]["choices"][0]["message"][
-            "content"
-        ]
+        output_content = json_response["response"]["body"]["choices"][0]["message"]["content"]
+        if model:
+            json_data = json.loads(output_content)
+            output_content = model(**json_data)
         response = response_builder(output_content, identifier)
         responses.append(response)
 
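The practical effect of this change: when a caller supplies a Pydantic model, the message content of each batch output line is parsed as JSON and validated into that model instead of being returned as a raw string. A minimal sketch of the new per-line behavior, using a hypothetical `Invoice` model and an illustrative output line:

```python
import json

from pydantic import BaseModel


class Invoice(BaseModel):  # hypothetical response model
    vendor: str
    total: float


# One line of a completed batch's output file (shape is illustrative)
raw_response = json.dumps(
    {
        "custom_id": "trace-id--0.jsonl",
        "response": {
            "body": {
                "choices": [
                    {"message": {"content": '{"vendor": "Acme", "total": 12.5}'}}
                ]
            }
        },
    }
)

json_response = json.loads(raw_response)
output_content = json_response["response"]["body"]["choices"][0]["message"]["content"]

model = Invoice  # as threaded through process_prompts_output(..., model=...)
if model:
    # The string content is itself JSON, so it is parsed and validated
    output_content = model(**json.loads(output_content))

print(output_content)  # vendor='Acme' total=12.5
```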
{parallex-0.3.3 → parallex-0.4.0}/parallex/ai/uploader.py

@@ -1,8 +1,12 @@
 import base64
 import json
 import os
+from typing import Optional
 from uuid import UUID
 
+from openai.lib._pydantic import to_strict_json_schema
+from pydantic import BaseModel
+
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.file_management.utils import file_in_temp_dir
 from parallex.models.batch_file import BatchFile
@@ -17,6 +21,7 @@ async def upload_images_for_processing(
     image_files: list[ImageFile],
     temp_directory: str,
     prompt_text: str,
+    model: Optional[type[BaseModel]] = None,
 ) -> list[BatchFile]:
     """Base64 encodes image, converts to expected jsonl format and uploads"""
     trace_id = image_files[0].trace_id
@@ -43,7 +48,7 @@ async def upload_images_for_processing(
         prompt_custom_id = (
             f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
         )
-        jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
+        jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text, model)
         with open(upload_file_location, "a") as jsonl_file:
             jsonl_file.write(json.dumps(jsonl) + "\n")
     batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -52,7 +57,10 @@ async def upload_images_for_processing(
 
 
 async def upload_prompts_for_processing(
-    client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
+    client: OpenAIClient,
+    prompts: list[str], temp_directory: str,
+    trace_id: UUID,
+    model: Optional[type[BaseModel]] = None
 ) -> list[BatchFile]:
     """Creates jsonl file and uploads for processing"""
     current_index = 0
@@ -73,7 +81,7 @@ async def upload_prompts_for_processing(
         )
 
         prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
-        jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
+        jsonl = _simple_jsonl_format(prompt_custom_id, prompt, model)
         with open(upload_file_location, "a") as jsonl_file:
             jsonl_file.write(json.dumps(jsonl) + "\n")
     batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -119,20 +127,36 @@ async def _create_batch_file(
     )
 
 
-def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
+def _response_format(model: type[BaseModel]) -> dict:
+    schema = to_strict_json_schema(model)
     return {
+        "type": "json_schema",
+        "json_schema": {
+            "name": model.__name__,
+            "strict": True,
+            "schema": schema
+        }
+    }
+
+
+def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str, model: Optional[type[BaseModel]]) -> dict:
+    payload = {
         "custom_id": prompt_custom_id,
         "method": "POST",
         "url": "/chat/completions",
         "body": {
             "model": os.getenv("AZURE_API_DEPLOYMENT"),
             "messages": [{"role": "user", "content": prompt_text}],
+            "temperature": 0.0,  # TODO make configurable
         },
     }
+    if model is not None:
+        payload["body"]["response_format"] = _response_format(model)
+    return payload
 
 
-def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
-    return {
+def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str, model: Optional[type[BaseModel]] = None) -> dict:
+    payload = {
         "custom_id": prompt_custom_id,
         "method": "POST",
         "url": "/chat/completions",
@@ -153,5 +177,9 @@ def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text:
                 }
             ],
             "max_tokens": 2000,
+            "response_format": {"type": "json_object"}
         },
     }
+    if model is not None:
+        payload["body"]["response_format"] = _response_format(model)
+    return payload
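The new `_response_format` helper converts a Pydantic model into the strict `json_schema` response-format envelope attached to each batch request body. A sketch of the payload it builds for a hypothetical model; note that `to_strict_json_schema` is a private helper inside the `openai` SDK (`openai.lib._pydantic`), so its exact output may vary by SDK version:

```python
from openai.lib._pydantic import to_strict_json_schema  # private SDK helper, as imported above
from pydantic import BaseModel


class PageSummary(BaseModel):  # hypothetical structured output
    title: str
    bullet_points: list[str]


# Mirrors what _response_format(PageSummary) produces: a strict JSON schema
# envelope set as body["response_format"] when a response model is given.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": PageSummary.__name__,
        "strict": True,
        "schema": to_strict_json_schema(PageSummary),
    },
}
print(response_format["json_schema"]["name"])  # PageSummary
```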
{parallex-0.3.3 → parallex-0.4.0}/parallex/models/page_response.py

@@ -2,5 +2,5 @@ from pydantic import BaseModel, Field
 
 
 class PageResponse(BaseModel):
-    output_content: str = Field(description="Markdown generated for the page")
+    output_content: str | BaseModel = Field(description="Markdown generated for the page")
     page_number: int = Field(description="Page number of the associated PDF")
{parallex-0.3.3 → parallex-0.4.0}/parallex/models/prompt_response.py

@@ -2,5 +2,5 @@ from pydantic import BaseModel, Field
 
 
 class PromptResponse(BaseModel):
-    output_content: str = Field(description="Response from the model")
+    output_content: str | BaseModel = Field(description="Response from the model")
     prompt_index: int = Field(description="Index corresponding to the given prompts")
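Widening `output_content` to `str | BaseModel` means downstream consumers should now branch on the type: raw markdown/text when no response model was supplied, a validated model instance when one was. A small self-contained illustration (`PromptResponse` mirrors the definition above; the `Invoice` model is hypothetical):

```python
from pydantic import BaseModel, Field


class PromptResponse(BaseModel):  # mirrors parallex/models/prompt_response.py at 0.4.0
    output_content: str | BaseModel = Field(description="Response from the model")
    prompt_index: int = Field(description="Index corresponding to the given prompts")


class Invoice(BaseModel):  # hypothetical response model
    vendor: str
    total: float


plain = PromptResponse(output_content="a raw text answer", prompt_index=0)
typed = PromptResponse(output_content=Invoice(vendor="Acme", total=12.5), prompt_index=1)

for response in (plain, typed):
    if isinstance(response.output_content, BaseModel):
        print(response.output_content.model_dump())  # structured fields
    else:
        print(response.output_content)  # plain string
```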
{parallex-0.3.3 → parallex-0.4.0}/parallex/parallex.py

@@ -4,6 +4,8 @@ import uuid
 from typing import Callable, Optional
 from uuid import UUID
 
+from pydantic import BaseModel
+
 from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.ai.output_processor import process_images_output, process_prompts_output
@@ -32,6 +34,7 @@ async def parallex(
     concurrency: Optional[int] = 20,
     prompt_text: Optional[str] = DEFAULT_PROMPT,
     log_level: Optional[str] = "ERROR",
+    response_model: Optional[type[BaseModel]] = None,
 ) -> ParallexCallableOutput:
     setup_logger(log_level)
     remote_file_handler = RemoteFileHandler()
@@ -43,6 +46,7 @@ async def parallex(
             post_process_callable=post_process_callable,
             concurrency=concurrency,
             prompt_text=prompt_text,
+            model=response_model
         )
     except Exception as e:
         logger.error(f"Error occurred: {e}")
@@ -57,6 +61,7 @@ async def parallex_simple_prompts(
     post_process_callable: Optional[Callable[..., None]] = None,
     log_level: Optional[str] = "ERROR",
     concurrency: Optional[int] = 20,
+    response_model: Optional[type[BaseModel]] = None,
 ) -> ParallexPromptsCallableOutput:
     setup_logger(log_level)
     remote_file_handler = RemoteFileHandler()
@@ -67,6 +72,7 @@ async def parallex_simple_prompts(
             prompts=prompts,
             post_process_callable=post_process_callable,
             concurrency=concurrency,
+            model=response_model,
         )
     except Exception as e:
         logger.error(f"Error occurred: {e}")
@@ -80,6 +86,7 @@ async def _prompts_execute(
     prompts: list[str],
     post_process_callable: Optional[Callable[..., None]] = None,
     concurrency: Optional[int] = 20,
+    model: Optional[type[BaseModel]] = None,
 ):
     with tempfile.TemporaryDirectory() as temp_directory:
         trace_id = uuid.uuid4()
@@ -88,6 +95,7 @@ async def _prompts_execute(
             prompts=prompts,
             temp_directory=temp_directory,
             trace_id=trace_id,
+            model=model,
         )
         start_batch_semaphore = asyncio.Semaphore(concurrency)
         start_batch_tasks = []
@@ -110,7 +118,7 @@ async def _prompts_execute(
                 f"waiting for batch to complete - {batch.id} - {batch.trace_id}"
             )
             prompt_task = asyncio.create_task(
-                _wait_and_create_prompt_responses(batch=batch, client=open_ai_client, semaphore=process_semaphore)
+                _wait_and_create_prompt_responses(batch=batch, client=open_ai_client, semaphore=process_semaphore, model=model)
             )
             prompt_tasks.append(prompt_task)
         prompt_response_groups = await asyncio.gather(*prompt_tasks)
@@ -134,6 +142,7 @@ async def _execute(
     post_process_callable: Optional[Callable[..., None]] = None,
     concurrency: Optional[int] = 20,
     prompt_text: Optional[str] = DEFAULT_PROMPT,
+    model: Optional[type[BaseModel]] = None,
 ) -> ParallexCallableOutput:
     with tempfile.TemporaryDirectory() as temp_directory:
         raw_file = await add_file_to_temp_directory(
@@ -169,7 +178,7 @@ async def _execute(
         for batch in batch_jobs:
             page_task = asyncio.create_task(
                 _wait_and_create_pages(
-                    batch=batch, client=open_ai_client, semaphore=process_semaphore
+                    batch=batch, client=open_ai_client, semaphore=process_semaphore, model=model
                 )
             )
             pages_tasks.append(page_task)
@@ -192,27 +201,27 @@ async def _execute(
 
 
 async def _wait_and_create_pages(
-    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
+    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore, model: Optional[type[BaseModel]] = None
 ):
     async with semaphore:
         logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
         output_file_id = await wait_for_batch_completion(client=client, batch=batch)
         logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
         page_responses = await process_images_output(
-            client=client, output_file_id=output_file_id
+            client=client, output_file_id=output_file_id, model=model,
         )
         return page_responses
 
 
 async def _wait_and_create_prompt_responses(
-    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
+    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore, model: Optional[type[BaseModel]] = None
 ):
     async with semaphore:
         logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
         output_file_id = await wait_for_batch_completion(client=client, batch=batch)
         logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
         prompt_responses = await process_prompts_output(
-            client=client, output_file_id=output_file_id
+            client=client, output_file_id=output_file_id, model=model,
         )
         return prompt_responses
 
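End to end, `response_model` flows from the public entry points through upload (request `response_format`) and output processing (validation). A usage sketch against 0.4.0; the `Invoice` model and prompt are hypothetical, the leading `model`/`prompts` arguments follow the signature from earlier releases, and the Azure OpenAI environment variables (e.g. `AZURE_API_DEPLOYMENT`) must be configured:

```python
import asyncio

from pydantic import BaseModel

from parallex.parallex import parallex_simple_prompts


class Invoice(BaseModel):  # hypothetical structured output
    vendor: str
    total: float


async def main() -> None:
    output = await parallex_simple_prompts(
        model="gpt-4o",  # Azure deployment name; assumption based on prior releases
        prompts=['Extract vendor and total from: "Acme Corp, $12.50"'],
        response_model=Invoice,  # new in 0.4.0
    )
    for response in output.responses:
        # With response_model set, output_content is an Invoice instance
        print(response.prompt_index, response.output_content)


asyncio.run(main())
```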
{parallex-0.3.3 → parallex-0.4.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "parallex"
-version = "0.3.3"
+version = "0.4.0"
 description = "PDF to markdown using Azure OpenAI batch processing"
 authors = ["Jeff Hostetler <jeff@summed.ai>", "Kevin Bao <kevin@summed.ai>"]
 repository = "https://github.com/Summed-AI/parallex"