parallex 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
parallex/ai/uploader.py CHANGED
@@ -9,7 +9,7 @@ from parallex.models.batch_file import BatchFile
9
9
  from parallex.models.image_file import ImageFile
10
10
  from parallex.utils.constants import CUSTOM_ID_DELINEATOR
11
11
 
12
- MAX_FILE_SIZE = 150 * 1024 * 1024 # 150 MB in bytes
12
+ MAX_FILE_SIZE = 180 * 1024 * 1024 # 180 MB in bytes. Limit for Azure is 200MB.
13
13
 
14
14
 
15
15
  async def upload_images_for_processing(
@@ -23,22 +23,18 @@ async def upload_images_for_processing(
23
23
  current_index = 0
24
24
  batch_files = []
25
25
  upload_file_location = file_in_temp_dir(
26
- directory=temp_directory, file_name=f"image-{trace_id}-{current_index}.jsonl"
26
+ directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
27
27
  )
28
28
 
29
29
  for image_file in image_files:
30
- if (
31
- os.path.exists(upload_file_location)
32
- and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
33
- ):
30
+ if await _approaching_file_size_limit(upload_file_location):
34
31
  """When approaching upload file limit, upload and start new file"""
35
32
  batch_file = await _create_batch_file(
36
33
  client, trace_id, upload_file_location
37
34
  )
38
35
  batch_files.append(batch_file)
39
- current_index += 1
40
- upload_file_location = file_in_temp_dir(
41
- directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
36
+ upload_file_location = await _increment_batch_file_index(
37
+ current_index, temp_directory, trace_id
42
38
  )
43
39
 
44
40
  with open(image_file.path, "rb") as image:
@@ -57,21 +53,62 @@ async def upload_images_for_processing(
57
53
 
58
54
  async def upload_prompts_for_processing(
59
55
  client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
60
- ) -> BatchFile:
56
+ ) -> list[BatchFile]:
61
57
  """Creates jsonl file and uploads for processing"""
62
- upload_file_location = file_in_temp_dir(
63
- directory=temp_directory, file_name=f"prompts-{trace_id}.jsonl"
58
+ current_index = 0
59
+ batch_files = []
60
+
61
+ upload_file_location = await set_file_location(
62
+ current_index, temp_directory, trace_id
64
63
  )
65
64
  for index, prompt in enumerate(prompts):
65
+ if await _approaching_file_size_limit(upload_file_location):
66
+ """When approaching upload file limit, upload and start new file"""
67
+ batch_file = await _create_batch_file(
68
+ client, trace_id, upload_file_location
69
+ )
70
+ batch_files.append(batch_file)
71
+ upload_file_location = await _increment_batch_file_index(
72
+ current_index, temp_directory, trace_id
73
+ )
74
+
66
75
  prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
67
76
  jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
68
77
  with open(upload_file_location, "a") as jsonl_file:
69
78
  jsonl_file.write(json.dumps(jsonl) + "\n")
70
- batch_file = await _create_batch_file(client, trace_id, upload_file_location)
71
- return batch_file
79
+ batch_file = await _create_batch_file(client, trace_id, upload_file_location)
80
+ batch_files.append(batch_file)
81
+ return batch_files
82
+
83
+
84
+ async def set_file_location(
85
+ current_index: int, temp_directory: str, trace_id: UUID
86
+ ) -> str:
87
+ return file_in_temp_dir(
88
+ directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
89
+ )
90
+
91
+
92
+ async def _approaching_file_size_limit(upload_file_location: str) -> bool:
93
+ return (
94
+ os.path.exists(upload_file_location)
95
+ and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
96
+ )
97
+
98
+
99
+ async def _increment_batch_file_index(
100
+ current_index: int, temp_directory: str, trace_id: UUID
101
+ ) -> str:
102
+ current_index += 1
103
+ upload_file_location = await set_file_location(
104
+ current_index, temp_directory, trace_id
105
+ )
106
+ return upload_file_location
72
107
 
73
108
 
74
- async def _create_batch_file(client, trace_id, upload_file_location):
109
+ async def _create_batch_file(
110
+ client: OpenAIClient, trace_id: UUID, upload_file_location: str
111
+ ) -> BatchFile:
75
112
  file_response = await client.upload(upload_file_location)
76
113
  return BatchFile(
77
114
  id=file_response.id,
@@ -82,7 +119,7 @@ async def _create_batch_file(client, trace_id, upload_file_location):
82
119
  )
83
120
 
84
121
 
85
- def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str):
122
+ def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
86
123
  return {
87
124
  "custom_id": prompt_custom_id,
88
125
  "method": "POST",
parallex/parallex.py CHANGED
@@ -55,6 +55,7 @@ async def parallex_simple_prompts(
55
55
  prompts: list[str],
56
56
  post_process_callable: Optional[Callable[..., None]] = None,
57
57
  log_level: Optional[str] = "ERROR",
58
+ concurrency: Optional[int] = 20,
58
59
  ) -> ParallexPromptsCallableOutput:
59
60
  setup_logger(log_level)
60
61
  remote_file_handler = RemoteFileHandler()
@@ -64,6 +65,7 @@ async def parallex_simple_prompts(
64
65
  open_ai_client=open_ai_client,
65
66
  prompts=prompts,
66
67
  post_process_callable=post_process_callable,
68
+ concurrency=concurrency,
67
69
  )
68
70
  except Exception as e:
69
71
  logger.error(f"Error occurred: {e}")
@@ -75,27 +77,54 @@ async def _prompts_execute(
75
77
  open_ai_client: OpenAIClient,
76
78
  prompts: list[str],
77
79
  post_process_callable: Optional[Callable[..., None]] = None,
80
+ concurrency: Optional[int] = 20,
78
81
  ):
79
82
  with tempfile.TemporaryDirectory() as temp_directory:
80
83
  trace_id = uuid.uuid4()
81
- batch_file = await upload_prompts_for_processing(
84
+ batch_files = await upload_prompts_for_processing(
82
85
  client=open_ai_client,
83
86
  prompts=prompts,
84
87
  temp_directory=temp_directory,
85
88
  trace_id=trace_id,
86
89
  )
87
- batch = await create_batch(
88
- client=open_ai_client, file_id=batch_file.id, trace_id=trace_id
89
- )
90
- logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
91
- output_file_id = await wait_for_batch_completion(
92
- client=open_ai_client, batch=batch
93
- )
94
- logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
95
- prompts_output = await process_prompts_output(
96
- client=open_ai_client, output_file_id=output_file_id
97
- )
98
- sorted_responses = sorted(prompts_output, key=lambda x: x.prompt_index)
90
+ start_batch_semaphore = asyncio.Semaphore(concurrency)
91
+ start_batch_tasks = []
92
+ for file in batch_files:
93
+ batch_task = asyncio.create_task(
94
+ _create_batch_jobs(
95
+ batch_file=file,
96
+ client=open_ai_client,
97
+ trace_id=trace_id,
98
+ semaphore=start_batch_semaphore,
99
+ )
100
+ )
101
+ start_batch_tasks.append(batch_task)
102
+ batch_jobs = await asyncio.gather(*start_batch_tasks)
103
+
104
+ prompt_tasks = []
105
+ for batch in batch_jobs:
106
+ logger.info(
107
+ f"waiting for batch to complete - {batch.id} - {batch.trace_id}"
108
+ )
109
+ page_task = asyncio.create_task(
110
+ await wait_for_batch_completion(client=open_ai_client, batch=batch)
111
+ )
112
+ prompt_tasks.append(page_task)
113
+
114
+ output_file_ids = await asyncio.gather(*prompt_tasks)
115
+
116
+ prompts_output = []
117
+ for output_file_id in output_file_ids:
118
+ logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
119
+ prompts_output.append(
120
+ await process_prompts_output(
121
+ client=open_ai_client, output_file_id=output_file_id
122
+ )
123
+ )
124
+
125
+ flat_prompts = [page for batch in prompts_output for page in batch]
126
+
127
+ sorted_responses = sorted(flat_prompts, key=lambda x: x.prompt_index)
99
128
  callable_output = ParallexPromptsCallableOutput(
100
129
  original_prompts=prompts,
101
130
  trace_id=trace_id,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: parallex
3
- Version: 0.2.1
3
+ Version: 0.3.0
4
4
  Summary: PDF to markdown using Azure OpenAI batch processing
5
5
  Home-page: https://github.com/Summed-AI/parallex
6
6
  Author: Jeff Hostetler
@@ -2,7 +2,7 @@ parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
3
3
  parallex/ai/open_ai_client.py,sha256=TRH78oYod_EWpp3hjEh097OT7hwsQmtv44_j3X9Frxo,2047
4
4
  parallex/ai/output_processor.py,sha256=Rwp8dkLo4xsqooeBh3Xv-uGVbJMG1JQkwyxdUoOs2tQ,1800
5
- parallex/ai/uploader.py,sha256=_Z6-XBg_OmgJkXY55y-BxzQt0BG4iLByDaNWYwCDX1c,4273
5
+ parallex/ai/uploader.py,sha256=9GvrzuaQAxqRiYN5dUHWjFeIFXezH0Y7ARnzBkEHbL0,5451
6
6
  parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
7
7
  parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
8
8
  parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
@@ -15,10 +15,10 @@ parallex/models/parallex_prompts_callable_output.py,sha256=IlNX9627_E8aXWQ-vDBuv
15
15
  parallex/models/prompt_response.py,sha256=LcctuyqwiTHWrZHSahwauMaSBsin5Ws6fQRAzGXTsAA,230
16
16
  parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
17
17
  parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
18
- parallex/parallex.py,sha256=12N0-r3OMR6XtYXvGm7D3L6i7sQOeKyIAmBfJzpoetY,7546
18
+ parallex/parallex.py,sha256=7YFKnKOkFHoTC7CCHhrXG1JTxprbvw0QkNGOEPYJbvQ,8500
19
19
  parallex/utils/constants.py,sha256=508ieZLZ5kse0T4_QyNJp57Aq0DMNFjjyFlsKa0xtek,366
20
20
  parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
21
- parallex-0.2.1.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
22
- parallex-0.2.1.dist-info/METADATA,sha256=Dt5XmxUHWonlr54qebQ66rfVYSxC_3dJXL2V2EeWnAA,4461
23
- parallex-0.2.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
24
- parallex-0.2.1.dist-info/RECORD,,
21
+ parallex-0.3.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
22
+ parallex-0.3.0.dist-info/METADATA,sha256=hIIhGrV5PE-E-lkWf-kBE3QBPevKSVRHkw0hUx_iqik,4461
23
+ parallex-0.3.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
24
+ parallex-0.3.0.dist-info/RECORD,,