parallex 0.2.1__tar.gz → 0.3.1__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {parallex-0.2.1 → parallex-0.3.1}/PKG-INFO +1 -1
- {parallex-0.2.1 → parallex-0.3.1}/parallex/ai/uploader.py +52 -15
- {parallex-0.2.1 → parallex-0.3.1}/parallex/parallex.py +46 -14
- {parallex-0.2.1 → parallex-0.3.1}/pyproject.toml +1 -1
- {parallex-0.2.1 → parallex-0.3.1}/LICENSE +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/README.md +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/__init__.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/ai/batch_processor.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/ai/open_ai_client.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/ai/output_processor.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/file_management/converter.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/file_management/file_finder.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/file_management/remote_file_handler.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/file_management/utils.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/models/batch_file.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/models/image_file.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/models/page_response.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/models/parallex_callable_output.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/models/parallex_prompts_callable_output.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/models/prompt_response.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/models/raw_file.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/models/upload_batch.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/utils/constants.py +0 -0
- {parallex-0.2.1 → parallex-0.3.1}/parallex/utils/logger.py +0 -0
@@ -9,7 +9,7 @@ from parallex.models.batch_file import BatchFile
|
|
9
9
|
from parallex.models.image_file import ImageFile
|
10
10
|
from parallex.utils.constants import CUSTOM_ID_DELINEATOR
|
11
11
|
|
12
|
-
MAX_FILE_SIZE =
|
12
|
+
MAX_FILE_SIZE = 180 * 1024 * 1024 # 180 MB in bytes. Limit for Azure is 200MB.
|
13
13
|
|
14
14
|
|
15
15
|
async def upload_images_for_processing(
|
@@ -23,22 +23,18 @@ async def upload_images_for_processing(
|
|
23
23
|
current_index = 0
|
24
24
|
batch_files = []
|
25
25
|
upload_file_location = file_in_temp_dir(
|
26
|
-
directory=temp_directory, file_name=f"
|
26
|
+
directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
|
27
27
|
)
|
28
28
|
|
29
29
|
for image_file in image_files:
|
30
|
-
if (
|
31
|
-
os.path.exists(upload_file_location)
|
32
|
-
and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
|
33
|
-
):
|
30
|
+
if await _approaching_file_size_limit(upload_file_location):
|
34
31
|
"""When approaching upload file limit, upload and start new file"""
|
35
32
|
batch_file = await _create_batch_file(
|
36
33
|
client, trace_id, upload_file_location
|
37
34
|
)
|
38
35
|
batch_files.append(batch_file)
|
39
|
-
|
40
|
-
|
41
|
-
directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
|
36
|
+
upload_file_location = await _increment_batch_file_index(
|
37
|
+
current_index, temp_directory, trace_id
|
42
38
|
)
|
43
39
|
|
44
40
|
with open(image_file.path, "rb") as image:
|
@@ -57,21 +53,62 @@ async def upload_images_for_processing(
|
|
57
53
|
|
58
54
|
async def upload_prompts_for_processing(
|
59
55
|
client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
|
60
|
-
) -> BatchFile:
|
56
|
+
) -> list[BatchFile]:
|
61
57
|
"""Creates jsonl file and uploads for processing"""
|
62
|
-
|
63
|
-
|
58
|
+
current_index = 0
|
59
|
+
batch_files = []
|
60
|
+
|
61
|
+
upload_file_location = await set_file_location(
|
62
|
+
current_index, temp_directory, trace_id
|
64
63
|
)
|
65
64
|
for index, prompt in enumerate(prompts):
|
65
|
+
if await _approaching_file_size_limit(upload_file_location):
|
66
|
+
"""When approaching upload file limit, upload and start new file"""
|
67
|
+
batch_file = await _create_batch_file(
|
68
|
+
client, trace_id, upload_file_location
|
69
|
+
)
|
70
|
+
batch_files.append(batch_file)
|
71
|
+
upload_file_location = await _increment_batch_file_index(
|
72
|
+
current_index, temp_directory, trace_id
|
73
|
+
)
|
74
|
+
|
66
75
|
prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
|
67
76
|
jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
|
68
77
|
with open(upload_file_location, "a") as jsonl_file:
|
69
78
|
jsonl_file.write(json.dumps(jsonl) + "\n")
|
70
79
|
batch_file = await _create_batch_file(client, trace_id, upload_file_location)
|
71
|
-
|
80
|
+
batch_files.append(batch_file)
|
81
|
+
return batch_files
|
82
|
+
|
83
|
+
|
84
|
+
async def set_file_location(
|
85
|
+
current_index: int, temp_directory: str, trace_id: UUID
|
86
|
+
) -> str:
|
87
|
+
return file_in_temp_dir(
|
88
|
+
directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
|
89
|
+
)
|
90
|
+
|
91
|
+
|
92
|
+
async def _approaching_file_size_limit(upload_file_location: str) -> bool:
|
93
|
+
return (
|
94
|
+
os.path.exists(upload_file_location)
|
95
|
+
and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
|
96
|
+
)
|
97
|
+
|
98
|
+
|
99
|
+
async def _increment_batch_file_index(
|
100
|
+
current_index: int, temp_directory: str, trace_id: UUID
|
101
|
+
) -> str:
|
102
|
+
current_index += 1
|
103
|
+
upload_file_location = await set_file_location(
|
104
|
+
current_index, temp_directory, trace_id
|
105
|
+
)
|
106
|
+
return upload_file_location
|
72
107
|
|
73
108
|
|
74
|
-
async def _create_batch_file(
|
109
|
+
async def _create_batch_file(
|
110
|
+
client: OpenAIClient, trace_id: UUID, upload_file_location: str
|
111
|
+
) -> BatchFile:
|
75
112
|
file_response = await client.upload(upload_file_location)
|
76
113
|
return BatchFile(
|
77
114
|
id=file_response.id,
|
@@ -82,7 +119,7 @@ async def _create_batch_file(client, trace_id, upload_file_location):
|
|
82
119
|
)
|
83
120
|
|
84
121
|
|
85
|
-
def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str):
|
122
|
+
def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
|
86
123
|
return {
|
87
124
|
"custom_id": prompt_custom_id,
|
88
125
|
"method": "POST",
|
@@ -46,6 +46,7 @@ async def parallex(
|
|
46
46
|
)
|
47
47
|
except Exception as e:
|
48
48
|
logger.error(f"Error occurred: {e}")
|
49
|
+
raise e
|
49
50
|
finally:
|
50
51
|
await _delete_associated_files(open_ai_client, remote_file_handler)
|
51
52
|
|
@@ -55,6 +56,7 @@ async def parallex_simple_prompts(
|
|
55
56
|
prompts: list[str],
|
56
57
|
post_process_callable: Optional[Callable[..., None]] = None,
|
57
58
|
log_level: Optional[str] = "ERROR",
|
59
|
+
concurrency: Optional[int] = 20,
|
58
60
|
) -> ParallexPromptsCallableOutput:
|
59
61
|
setup_logger(log_level)
|
60
62
|
remote_file_handler = RemoteFileHandler()
|
@@ -64,9 +66,11 @@ async def parallex_simple_prompts(
|
|
64
66
|
open_ai_client=open_ai_client,
|
65
67
|
prompts=prompts,
|
66
68
|
post_process_callable=post_process_callable,
|
69
|
+
concurrency=concurrency,
|
67
70
|
)
|
68
71
|
except Exception as e:
|
69
72
|
logger.error(f"Error occurred: {e}")
|
73
|
+
raise e
|
70
74
|
finally:
|
71
75
|
await _delete_associated_files(open_ai_client, remote_file_handler)
|
72
76
|
|
@@ -75,27 +79,42 @@ async def _prompts_execute(
|
|
75
79
|
open_ai_client: OpenAIClient,
|
76
80
|
prompts: list[str],
|
77
81
|
post_process_callable: Optional[Callable[..., None]] = None,
|
82
|
+
concurrency: Optional[int] = 20,
|
78
83
|
):
|
79
84
|
with tempfile.TemporaryDirectory() as temp_directory:
|
80
85
|
trace_id = uuid.uuid4()
|
81
|
-
|
86
|
+
batch_files = await upload_prompts_for_processing(
|
82
87
|
client=open_ai_client,
|
83
88
|
prompts=prompts,
|
84
89
|
temp_directory=temp_directory,
|
85
90
|
trace_id=trace_id,
|
86
91
|
)
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
92
|
+
start_batch_semaphore = asyncio.Semaphore(concurrency)
|
93
|
+
start_batch_tasks = []
|
94
|
+
for file in batch_files:
|
95
|
+
batch_task = asyncio.create_task(
|
96
|
+
_create_batch_jobs(
|
97
|
+
batch_file=file,
|
98
|
+
client=open_ai_client,
|
99
|
+
trace_id=trace_id,
|
100
|
+
semaphore=start_batch_semaphore,
|
101
|
+
)
|
102
|
+
)
|
103
|
+
start_batch_tasks.append(batch_task)
|
104
|
+
batch_jobs = await asyncio.gather(*start_batch_tasks)
|
105
|
+
|
106
|
+
process_semaphore = asyncio.Semaphore(concurrency)
|
107
|
+
prompt_tasks = []
|
108
|
+
for batch in batch_jobs:
|
109
|
+
prompt_task = asyncio.create_task(
|
110
|
+
_wait_and_create_prompt_responses(batch=batch, client=open_ai_client, semaphore=process_semaphore)
|
111
|
+
)
|
112
|
+
prompt_tasks.append(prompt_task)
|
113
|
+
prompt_response_groups = await asyncio.gather(*prompt_tasks)
|
114
|
+
|
115
|
+
flat_responses = [response for batch in prompt_response_groups for response in batch]
|
116
|
+
|
117
|
+
sorted_responses = sorted(flat_responses, key=lambda x: x.prompt_index)
|
99
118
|
callable_output = ParallexPromptsCallableOutput(
|
100
119
|
original_prompts=prompts,
|
101
120
|
trace_id=trace_id,
|
@@ -155,7 +174,7 @@ async def _execute(
|
|
155
174
|
|
156
175
|
pages = [page for batch_pages in page_groups for page in batch_pages]
|
157
176
|
logger.info(f"pages done. total pages- {len(pages)} - {trace_id}")
|
158
|
-
sorted_pages = sorted(pages, key=lambda x: x.
|
177
|
+
sorted_pages = sorted(pages, key=lambda x: x.prompt_index)
|
159
178
|
|
160
179
|
# TODO add combined version of MD to output / save to file system
|
161
180
|
callable_output = ParallexCallableOutput(
|
@@ -182,6 +201,19 @@ async def _wait_and_create_pages(
|
|
182
201
|
return page_responses
|
183
202
|
|
184
203
|
|
204
|
+
async def _wait_and_create_prompt_responses(
|
205
|
+
batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
|
206
|
+
):
|
207
|
+
async with semaphore:
|
208
|
+
logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
|
209
|
+
output_file_id = await wait_for_batch_completion(client=client, batch=batch)
|
210
|
+
logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
|
211
|
+
prompt_responses = await process_prompts_output(
|
212
|
+
client=client, output_file_id=output_file_id
|
213
|
+
)
|
214
|
+
return prompt_responses
|
215
|
+
|
216
|
+
|
185
217
|
async def _create_batch_jobs(
|
186
218
|
batch_file: BatchFile,
|
187
219
|
client: OpenAIClient,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "parallex"
|
3
|
-
version = "0.2.1"
|
3
|
+
version = "0.3.1"
|
4
4
|
description = "PDF to markdown using Azure OpenAI batch processing"
|
5
5
|
authors = ["Jeff Hostetler <jeff@summed.ai>", "Kevin Bao <kevin@summed.ai>"]
|
6
6
|
repository = "https://github.com/Summed-AI/parallex"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|