parallex 0.2.1__tar.gz → 0.3.1__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (24)
  1. {parallex-0.2.1 → parallex-0.3.1}/PKG-INFO +1 -1
  2. {parallex-0.2.1 → parallex-0.3.1}/parallex/ai/uploader.py +52 -15
  3. {parallex-0.2.1 → parallex-0.3.1}/parallex/parallex.py +46 -14
  4. {parallex-0.2.1 → parallex-0.3.1}/pyproject.toml +1 -1
  5. {parallex-0.2.1 → parallex-0.3.1}/LICENSE +0 -0
  6. {parallex-0.2.1 → parallex-0.3.1}/README.md +0 -0
  7. {parallex-0.2.1 → parallex-0.3.1}/parallex/__init__.py +0 -0
  8. {parallex-0.2.1 → parallex-0.3.1}/parallex/ai/batch_processor.py +0 -0
  9. {parallex-0.2.1 → parallex-0.3.1}/parallex/ai/open_ai_client.py +0 -0
  10. {parallex-0.2.1 → parallex-0.3.1}/parallex/ai/output_processor.py +0 -0
  11. {parallex-0.2.1 → parallex-0.3.1}/parallex/file_management/converter.py +0 -0
  12. {parallex-0.2.1 → parallex-0.3.1}/parallex/file_management/file_finder.py +0 -0
  13. {parallex-0.2.1 → parallex-0.3.1}/parallex/file_management/remote_file_handler.py +0 -0
  14. {parallex-0.2.1 → parallex-0.3.1}/parallex/file_management/utils.py +0 -0
  15. {parallex-0.2.1 → parallex-0.3.1}/parallex/models/batch_file.py +0 -0
  16. {parallex-0.2.1 → parallex-0.3.1}/parallex/models/image_file.py +0 -0
  17. {parallex-0.2.1 → parallex-0.3.1}/parallex/models/page_response.py +0 -0
  18. {parallex-0.2.1 → parallex-0.3.1}/parallex/models/parallex_callable_output.py +0 -0
  19. {parallex-0.2.1 → parallex-0.3.1}/parallex/models/parallex_prompts_callable_output.py +0 -0
  20. {parallex-0.2.1 → parallex-0.3.1}/parallex/models/prompt_response.py +0 -0
  21. {parallex-0.2.1 → parallex-0.3.1}/parallex/models/raw_file.py +0 -0
  22. {parallex-0.2.1 → parallex-0.3.1}/parallex/models/upload_batch.py +0 -0
  23. {parallex-0.2.1 → parallex-0.3.1}/parallex/utils/constants.py +0 -0
  24. {parallex-0.2.1 → parallex-0.3.1}/parallex/utils/logger.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: parallex
3
- Version: 0.2.1
3
+ Version: 0.3.1
4
4
  Summary: PDF to markdown using Azure OpenAI batch processing
5
5
  Home-page: https://github.com/Summed-AI/parallex
6
6
  Author: Jeff Hostetler
@@ -9,7 +9,7 @@ from parallex.models.batch_file import BatchFile
9
9
  from parallex.models.image_file import ImageFile
10
10
  from parallex.utils.constants import CUSTOM_ID_DELINEATOR
11
11
 
12
- MAX_FILE_SIZE = 150 * 1024 * 1024 # 150 MB in bytes
12
+ MAX_FILE_SIZE = 180 * 1024 * 1024 # 180 MB in bytes. Limit for Azure is 200MB.
13
13
 
14
14
 
15
15
  async def upload_images_for_processing(
@@ -23,22 +23,18 @@ async def upload_images_for_processing(
23
23
  current_index = 0
24
24
  batch_files = []
25
25
  upload_file_location = file_in_temp_dir(
26
- directory=temp_directory, file_name=f"image-{trace_id}-{current_index}.jsonl"
26
+ directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
27
27
  )
28
28
 
29
29
  for image_file in image_files:
30
- if (
31
- os.path.exists(upload_file_location)
32
- and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
33
- ):
30
+ if await _approaching_file_size_limit(upload_file_location):
34
31
  """When approaching upload file limit, upload and start new file"""
35
32
  batch_file = await _create_batch_file(
36
33
  client, trace_id, upload_file_location
37
34
  )
38
35
  batch_files.append(batch_file)
39
- current_index += 1
40
- upload_file_location = file_in_temp_dir(
41
- directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
36
+ upload_file_location = await _increment_batch_file_index(
37
+ current_index, temp_directory, trace_id
42
38
  )
43
39
 
44
40
  with open(image_file.path, "rb") as image:
@@ -57,21 +53,62 @@ async def upload_images_for_processing(
57
53
 
58
54
  async def upload_prompts_for_processing(
59
55
  client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
60
- ) -> BatchFile:
56
+ ) -> list[BatchFile]:
61
57
  """Creates jsonl file and uploads for processing"""
62
- upload_file_location = file_in_temp_dir(
63
- directory=temp_directory, file_name=f"prompts-{trace_id}.jsonl"
58
+ current_index = 0
59
+ batch_files = []
60
+
61
+ upload_file_location = await set_file_location(
62
+ current_index, temp_directory, trace_id
64
63
  )
65
64
  for index, prompt in enumerate(prompts):
65
+ if await _approaching_file_size_limit(upload_file_location):
66
+ """When approaching upload file limit, upload and start new file"""
67
+ batch_file = await _create_batch_file(
68
+ client, trace_id, upload_file_location
69
+ )
70
+ batch_files.append(batch_file)
71
+ upload_file_location = await _increment_batch_file_index(
72
+ current_index, temp_directory, trace_id
73
+ )
74
+
66
75
  prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
67
76
  jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
68
77
  with open(upload_file_location, "a") as jsonl_file:
69
78
  jsonl_file.write(json.dumps(jsonl) + "\n")
70
79
  batch_file = await _create_batch_file(client, trace_id, upload_file_location)
71
- return batch_file
80
+ batch_files.append(batch_file)
81
+ return batch_files
82
+
83
+
84
+ async def set_file_location(
85
+ current_index: int, temp_directory: str, trace_id: UUID
86
+ ) -> str:
87
+ return file_in_temp_dir(
88
+ directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
89
+ )
90
+
91
+
92
+ async def _approaching_file_size_limit(upload_file_location: str) -> bool:
93
+ return (
94
+ os.path.exists(upload_file_location)
95
+ and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
96
+ )
97
+
98
+
99
+ async def _increment_batch_file_index(
100
+ current_index: int, temp_directory: str, trace_id: UUID
101
+ ) -> str:
102
+ current_index += 1
103
+ upload_file_location = await set_file_location(
104
+ current_index, temp_directory, trace_id
105
+ )
106
+ return upload_file_location
72
107
 
73
108
 
74
- async def _create_batch_file(client, trace_id, upload_file_location):
109
+ async def _create_batch_file(
110
+ client: OpenAIClient, trace_id: UUID, upload_file_location: str
111
+ ) -> BatchFile:
75
112
  file_response = await client.upload(upload_file_location)
76
113
  return BatchFile(
77
114
  id=file_response.id,
@@ -82,7 +119,7 @@ async def _create_batch_file(client, trace_id, upload_file_location):
82
119
  )
83
120
 
84
121
 
85
- def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str):
122
+ def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
86
123
  return {
87
124
  "custom_id": prompt_custom_id,
88
125
  "method": "POST",
@@ -46,6 +46,7 @@ async def parallex(
46
46
  )
47
47
  except Exception as e:
48
48
  logger.error(f"Error occurred: {e}")
49
+ raise e
49
50
  finally:
50
51
  await _delete_associated_files(open_ai_client, remote_file_handler)
51
52
 
@@ -55,6 +56,7 @@ async def parallex_simple_prompts(
55
56
  prompts: list[str],
56
57
  post_process_callable: Optional[Callable[..., None]] = None,
57
58
  log_level: Optional[str] = "ERROR",
59
+ concurrency: Optional[int] = 20,
58
60
  ) -> ParallexPromptsCallableOutput:
59
61
  setup_logger(log_level)
60
62
  remote_file_handler = RemoteFileHandler()
@@ -64,9 +66,11 @@ async def parallex_simple_prompts(
64
66
  open_ai_client=open_ai_client,
65
67
  prompts=prompts,
66
68
  post_process_callable=post_process_callable,
69
+ concurrency=concurrency,
67
70
  )
68
71
  except Exception as e:
69
72
  logger.error(f"Error occurred: {e}")
73
+ raise e
70
74
  finally:
71
75
  await _delete_associated_files(open_ai_client, remote_file_handler)
72
76
 
@@ -75,27 +79,42 @@ async def _prompts_execute(
75
79
  open_ai_client: OpenAIClient,
76
80
  prompts: list[str],
77
81
  post_process_callable: Optional[Callable[..., None]] = None,
82
+ concurrency: Optional[int] = 20,
78
83
  ):
79
84
  with tempfile.TemporaryDirectory() as temp_directory:
80
85
  trace_id = uuid.uuid4()
81
- batch_file = await upload_prompts_for_processing(
86
+ batch_files = await upload_prompts_for_processing(
82
87
  client=open_ai_client,
83
88
  prompts=prompts,
84
89
  temp_directory=temp_directory,
85
90
  trace_id=trace_id,
86
91
  )
87
- batch = await create_batch(
88
- client=open_ai_client, file_id=batch_file.id, trace_id=trace_id
89
- )
90
- logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
91
- output_file_id = await wait_for_batch_completion(
92
- client=open_ai_client, batch=batch
93
- )
94
- logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
95
- prompts_output = await process_prompts_output(
96
- client=open_ai_client, output_file_id=output_file_id
97
- )
98
- sorted_responses = sorted(prompts_output, key=lambda x: x.prompt_index)
92
+ start_batch_semaphore = asyncio.Semaphore(concurrency)
93
+ start_batch_tasks = []
94
+ for file in batch_files:
95
+ batch_task = asyncio.create_task(
96
+ _create_batch_jobs(
97
+ batch_file=file,
98
+ client=open_ai_client,
99
+ trace_id=trace_id,
100
+ semaphore=start_batch_semaphore,
101
+ )
102
+ )
103
+ start_batch_tasks.append(batch_task)
104
+ batch_jobs = await asyncio.gather(*start_batch_tasks)
105
+
106
+ process_semaphore = asyncio.Semaphore(concurrency)
107
+ prompt_tasks = []
108
+ for batch in batch_jobs:
109
+ prompt_task = asyncio.create_task(
110
+ _wait_and_create_prompt_responses(batch=batch, client=open_ai_client, semaphore=process_semaphore)
111
+ )
112
+ prompt_tasks.append(prompt_task)
113
+ prompt_response_groups = await asyncio.gather(*prompt_tasks)
114
+
115
+ flat_responses = [response for batch in prompt_response_groups for response in batch]
116
+
117
+ sorted_responses = sorted(flat_responses, key=lambda x: x.prompt_index)
99
118
  callable_output = ParallexPromptsCallableOutput(
100
119
  original_prompts=prompts,
101
120
  trace_id=trace_id,
@@ -155,7 +174,7 @@ async def _execute(
155
174
 
156
175
  pages = [page for batch_pages in page_groups for page in batch_pages]
157
176
  logger.info(f"pages done. total pages- {len(pages)} - {trace_id}")
158
- sorted_pages = sorted(pages, key=lambda x: x.page_number)
177
+ sorted_pages = sorted(pages, key=lambda x: x.prompt_index)
159
178
 
160
179
  # TODO add combined version of MD to output / save to file system
161
180
  callable_output = ParallexCallableOutput(
@@ -182,6 +201,19 @@ async def _wait_and_create_pages(
182
201
  return page_responses
183
202
 
184
203
 
204
+ async def _wait_and_create_prompt_responses(
205
+ batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
206
+ ):
207
+ async with semaphore:
208
+ logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
209
+ output_file_id = await wait_for_batch_completion(client=client, batch=batch)
210
+ logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
211
+ prompt_responses = await process_prompts_output(
212
+ client=client, output_file_id=output_file_id
213
+ )
214
+ return prompt_responses
215
+
216
+
185
217
  async def _create_batch_jobs(
186
218
  batch_file: BatchFile,
187
219
  client: OpenAIClient,
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "parallex"
3
- version = "0.2.1"
3
+ version = "0.3.1"
4
4
  description = "PDF to markdown using Azure OpenAI batch processing"
5
5
  authors = ["Jeff Hostetler <jeff@summed.ai>", "Kevin Bao <kevin@summed.ai>"]
6
6
  repository = "https://github.com/Summed-AI/parallex"
File without changes
File without changes
File without changes