parallex 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,25 +1,57 @@
1
1
  import json
2
+ from typing import TypeVar, Callable
2
3
 
3
4
  from parallex.ai.open_ai_client import OpenAIClient
4
5
  from parallex.models.page_response import PageResponse
6
+ from parallex.models.prompt_response import PromptResponse
5
7
  from parallex.utils.constants import CUSTOM_ID_DELINEATOR
6
8
 
7
9
 
8
async def process_images_output(
    client: OpenAIClient, output_file_id: str
) -> list[PageResponse]:
    """Gets content from completed Batch to create PageResponse with created markdown.

    Args:
        client: OpenAI client used to retrieve the batch output file.
        output_file_id: Id of the completed batch's output file.

    Returns:
        One PageResponse per JSONL line, carrying the markdown and page number.
    """
    # The identifier embedded in each custom_id is the page number.
    return await _process_output(
        client,
        output_file_id,
        lambda content, identifier: PageResponse(
            output_content=content, page_number=int(identifier)
        ),
    )
20
+
21
+
22
async def process_prompts_output(
    client: OpenAIClient, output_file_id: str
) -> list[PromptResponse]:
    """Gets content from completed Batch to create PromptResponse with LLM answers to given prompts"""

    def build_prompt_response(content: str, identifier: str) -> PromptResponse:
        # The identifier embedded in each custom_id is the prompt's index
        # within the originally submitted list.
        return PromptResponse(output_content=content, prompt_index=int(identifier))

    return await _process_output(client, output_file_id, build_prompt_response)
33
+
34
+
35
ResponseType = TypeVar("ResponseType")


async def _process_output(
    client: OpenAIClient,
    output_file_id: str,
    response_builder: Callable[[str, str], ResponseType],
) -> list[ResponseType]:
    """Shared parser for completed batch output files.

    Retrieves the output file, reads it as JSONL, and builds one response
    object per line via *response_builder*, which receives
    (output_content, identifier) — identifier being the custom_id segment
    between the delineator and the file extension.
    """
    file_response = await client.retrieve_file(output_file_id)
    lines = file_response.text.strip().split("\n")

    results: list[ResponseType] = []
    for line in lines:
        record = json.loads(line)
        identifier = record["custom_id"].split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
        content = record["response"]["body"]["choices"][0]["message"]["content"]
        results.append(response_builder(content, identifier))

    return results
parallex/ai/uploader.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import base64
2
2
  import json
3
3
  import os
4
+ from uuid import UUID
4
5
 
5
6
  from parallex.ai.open_ai_client import OpenAIClient
6
7
  from parallex.file_management.utils import file_in_temp_dir
@@ -8,7 +9,7 @@ from parallex.models.batch_file import BatchFile
8
9
  from parallex.models.image_file import ImageFile
9
10
  from parallex.utils.constants import CUSTOM_ID_DELINEATOR
10
11
 
11
- MAX_FILE_SIZE = 150 * 1024 * 1024 # 150 MB in bytes
12
+ MAX_FILE_SIZE = 180 * 1024 * 1024 # 180 MB in bytes. Limit for Azure is 200MB.
12
13
 
13
14
 
14
15
  async def upload_images_for_processing(
@@ -22,22 +23,18 @@ async def upload_images_for_processing(
22
23
  current_index = 0
23
24
  batch_files = []
24
25
  upload_file_location = file_in_temp_dir(
25
- directory=temp_directory, file_name=f"image-{trace_id}-{current_index}.jsonl"
26
+ directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
26
27
  )
27
28
 
28
29
  for image_file in image_files:
29
- if (
30
- os.path.exists(upload_file_location)
31
- and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
32
- ):
30
+ if await _approaching_file_size_limit(upload_file_location):
33
31
  """When approaching upload file limit, upload and start new file"""
34
32
  batch_file = await _create_batch_file(
35
33
  client, trace_id, upload_file_location
36
34
  )
37
35
  batch_files.append(batch_file)
38
- current_index += 1
39
- upload_file_location = file_in_temp_dir(
40
- directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
36
+ upload_file_location = await _increment_batch_file_index(
37
+ current_index, temp_directory, trace_id
41
38
  )
42
39
 
43
40
  with open(image_file.path, "rb") as image:
@@ -46,7 +43,7 @@ async def upload_images_for_processing(
46
43
  prompt_custom_id = (
47
44
  f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
48
45
  )
49
- jsonl = _jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
46
+ jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
50
47
  with open(upload_file_location, "a") as jsonl_file:
51
48
  jsonl_file.write(json.dumps(jsonl) + "\n")
52
49
  batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -54,7 +51,64 @@ async def upload_images_for_processing(
54
51
  return batch_files
55
52
 
56
53
 
57
async def upload_prompts_for_processing(
    client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
) -> list[BatchFile]:
    """Creates jsonl file(s) from the prompts and uploads them for batch processing.

    Prompts are appended to a local JSONL file; when that file approaches the
    upload size limit it is uploaded and a fresh file is started. The final
    (possibly partial) file is always uploaded at the end.

    Args:
        client: OpenAI client used for uploads.
        prompts: Plain text prompts; each line's custom_id encodes its index.
        temp_directory: Directory for the intermediate JSONL files.
        trace_id: Unique id tying all files of this run together.

    Returns:
        A BatchFile per uploaded JSONL file.
    """
    current_index = 0
    batch_files = []

    upload_file_location = await set_file_location(
        current_index, temp_directory, trace_id
    )
    for index, prompt in enumerate(prompts):
        if await _approaching_file_size_limit(upload_file_location):
            # When approaching upload file limit, upload and start new file.
            batch_file = await _create_batch_file(
                client, trace_id, upload_file_location
            )
            batch_files.append(batch_file)
            # BUG FIX: advance the counter here. _increment_batch_file_index
            # only incremented its own local copy, so every rollover after the
            # first produced the same file name and appended to (and
            # re-uploaded) the previous file's contents.
            current_index += 1
            upload_file_location = await set_file_location(
                current_index, temp_directory, trace_id
            )

        prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
        jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
        with open(upload_file_location, "a") as jsonl_file:
            jsonl_file.write(json.dumps(jsonl) + "\n")
    # NOTE(review): with an empty `prompts` list this still attempts to upload
    # a file that was never created — confirm upstream guarantees non-empty.
    batch_file = await _create_batch_file(client, trace_id, upload_file_location)
    batch_files.append(batch_file)
    return batch_files
82
+
83
+
84
async def set_file_location(
    current_index: int, temp_directory: str, trace_id: UUID
) -> str:
    """Builds the temp-dir path for the upload file at *current_index*.

    Declared async for call-site symmetry with the other uploader helpers;
    it does no awaiting itself.
    """
    file_name = f"{trace_id}-{current_index}.jsonl"
    return file_in_temp_dir(directory=temp_directory, file_name=file_name)
90
+
91
+
92
+ async def _approaching_file_size_limit(upload_file_location: str) -> bool:
93
+ return (
94
+ os.path.exists(upload_file_location)
95
+ and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
96
+ )
97
+
98
+
99
async def _increment_batch_file_index(
    current_index: int, temp_directory: str, trace_id: UUID
) -> str:
    """Returns the upload-file path for the index after *current_index*.

    NOTE(review): the increment is local to this function; the caller's own
    counter is untouched, so repeated calls with the same argument yield the
    same path — callers must track the advanced index themselves.
    """
    next_index = current_index + 1
    return await set_file_location(next_index, temp_directory, trace_id)
107
+
108
+
109
+ async def _create_batch_file(
110
+ client: OpenAIClient, trace_id: UUID, upload_file_location: str
111
+ ) -> BatchFile:
58
112
  file_response = await client.upload(upload_file_location)
59
113
  return BatchFile(
60
114
  id=file_response.id,
@@ -65,7 +119,19 @@ async def _create_batch_file(client, trace_id, upload_file_location):
65
119
  )
66
120
 
67
121
 
68
- def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
122
+ def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
123
+ return {
124
+ "custom_id": prompt_custom_id,
125
+ "method": "POST",
126
+ "url": "/chat/completions",
127
+ "body": {
128
+ "model": os.getenv("AZURE_API_DEPLOYMENT"),
129
+ "messages": [{"role": "user", "content": prompt_text}],
130
+ },
131
+ }
132
+
133
+
134
+ def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
69
135
  return {
70
136
  "custom_id": prompt_custom_id,
71
137
  "method": "POST",
@@ -0,0 +1,13 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from parallex.models.prompt_response import PromptResponse
6
+
7
+
8
class ParallexPromptsCallableOutput(BaseModel):
    """Result bundle returned by parallex_simple_prompts.

    Pairs the caller's original prompts with the batch responses; each entry
    in `responses` carries a prompt_index that links it back to its position
    in `original_prompts`.
    """

    original_prompts: list[str] = Field(description="List of given prompts")
    trace_id: UUID = Field(description="Unique trace for each file")
    responses: list[PromptResponse] = Field(
        description="List of PromptResponse objects"
    )
@@ -0,0 +1,6 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
class PromptResponse(BaseModel):
    """A single model answer produced by a simple-prompts batch run."""

    output_content: str = Field(description="Response from the model")
    prompt_index: int = Field(description="Index corresponding to the given prompts")
parallex/parallex.py CHANGED
@@ -1,17 +1,24 @@
1
1
  import asyncio
2
2
  import tempfile
3
+ import uuid
3
4
  from typing import Callable, Optional
4
5
  from uuid import UUID
5
6
 
6
7
  from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
7
8
  from parallex.ai.open_ai_client import OpenAIClient
8
- from parallex.ai.output_processor import process_output
9
- from parallex.ai.uploader import upload_images_for_processing
9
+ from parallex.ai.output_processor import process_images_output, process_prompts_output
10
+ from parallex.ai.uploader import (
11
+ upload_images_for_processing,
12
+ upload_prompts_for_processing,
13
+ )
10
14
  from parallex.file_management.converter import convert_pdf_to_images
11
15
  from parallex.file_management.file_finder import add_file_to_temp_directory
12
16
  from parallex.file_management.remote_file_handler import RemoteFileHandler
13
17
  from parallex.models.batch_file import BatchFile
14
18
  from parallex.models.parallex_callable_output import ParallexCallableOutput
19
+ from parallex.models.parallex_prompts_callable_output import (
20
+ ParallexPromptsCallableOutput,
21
+ )
15
22
  from parallex.models.upload_batch import UploadBatch
16
23
  from parallex.utils.constants import DEFAULT_PROMPT
17
24
  from parallex.utils.logger import logger, setup_logger
@@ -40,9 +47,92 @@ async def parallex(
40
47
  except Exception as e:
41
48
  logger.error(f"Error occurred: {e}")
42
49
  finally:
43
- for file in remote_file_handler.created_files:
44
- logger.info(f"deleting - {file}")
45
- await open_ai_client.delete_file(file)
50
+ await _delete_associated_files(open_ai_client, remote_file_handler)
51
+
52
+
53
async def parallex_simple_prompts(
    model: str,
    prompts: list[str],
    post_process_callable: Optional[Callable[..., None]] = None,
    log_level: Optional[str] = "ERROR",
    concurrency: Optional[int] = 20,
) -> ParallexPromptsCallableOutput:
    """Public entry point: runs a list of plain prompts through the Batch API.

    Args:
        model: Azure OpenAI deployment/model name.
        prompts: Plain text prompts; responses are index-matched to them.
        post_process_callable: Optional hook invoked with the final output.
        log_level: Logging level for this run.
        concurrency: Max number of batch jobs started simultaneously.

    NOTE(review): mirrors parallex() — any exception is logged and swallowed,
    so on failure this implicitly returns None despite the annotation;
    confirm callers handle that.
    """
    setup_logger(log_level)
    remote_file_handler = RemoteFileHandler()
    open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
    try:
        return await _prompts_execute(
            open_ai_client=open_ai_client,
            prompts=prompts,
            post_process_callable=post_process_callable,
            concurrency=concurrency,
        )
    except Exception as e:
        logger.error(f"Error occurred: {e}")
    finally:
        # Remote files are cleaned up whether the run succeeded or not.
        await _delete_associated_files(open_ai_client, remote_file_handler)
+
75
+
76
async def _prompts_execute(
    open_ai_client: OpenAIClient,
    prompts: list[str],
    post_process_callable: Optional[Callable[..., None]] = None,
    concurrency: Optional[int] = 20,
):
    """Uploads prompts, runs the batch jobs concurrently, and returns the
    collected responses sorted by prompt index.

    Args:
        open_ai_client: Client used for uploads, batch creation and retrieval.
        prompts: Plain text prompts submitted as one or more batch files.
        post_process_callable: Optional hook invoked with the final output.
        concurrency: Semaphore bound on simultaneously started batch jobs.

    Returns:
        ParallexPromptsCallableOutput with responses ordered by prompt_index.
    """
    with tempfile.TemporaryDirectory() as temp_directory:
        trace_id = uuid.uuid4()
        batch_files = await upload_prompts_for_processing(
            client=open_ai_client,
            prompts=prompts,
            temp_directory=temp_directory,
            trace_id=trace_id,
        )
        start_batch_semaphore = asyncio.Semaphore(concurrency)
        start_batch_tasks = []
        for file in batch_files:
            batch_task = asyncio.create_task(
                _create_batch_jobs(
                    batch_file=file,
                    client=open_ai_client,
                    trace_id=trace_id,
                    semaphore=start_batch_semaphore,
                )
            )
            start_batch_tasks.append(batch_task)
        batch_jobs = await asyncio.gather(*start_batch_tasks)

        prompt_tasks = []
        for batch in batch_jobs:
            logger.info(
                f"waiting for batch to complete - {batch.id} - {batch.trace_id}"
            )
            # BUG FIX: pass the coroutine itself to create_task. The previous
            # `create_task(await wait_for_batch_completion(...))` awaited each
            # batch sequentially and then handed create_task the awaited
            # result (an output file id), which raises TypeError.
            prompt_task = asyncio.create_task(
                wait_for_batch_completion(client=open_ai_client, batch=batch)
            )
            prompt_tasks.append(prompt_task)

        output_file_ids = await asyncio.gather(*prompt_tasks)

        prompts_output = []
        for output_file_id in output_file_ids:
            # BUG FIX: log the completed output file id; the old message read
            # the stale `batch` variable left over from the previous loop.
            logger.info(f"batch completed - output file {output_file_id}")
            prompts_output.append(
                await process_prompts_output(
                    client=open_ai_client, output_file_id=output_file_id
                )
            )

        # Flatten per-file response lists (renamed loop vars to stop shadowing
        # the `batch` name used above).
        flat_prompts = [
            response for batch_output in prompts_output for response in batch_output
        ]

        sorted_responses = sorted(flat_prompts, key=lambda x: x.prompt_index)
        callable_output = ParallexPromptsCallableOutput(
            original_prompts=prompts,
            trace_id=trace_id,
            responses=sorted_responses,
        )
        if post_process_callable is not None:
            post_process_callable(output=callable_output)
        return callable_output
46
136
 
47
137
 
48
138
  async def _execute(
@@ -115,7 +205,7 @@ async def _wait_and_create_pages(
115
205
  logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
116
206
  output_file_id = await wait_for_batch_completion(client=client, batch=batch)
117
207
  logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
118
- page_responses = await process_output(
208
+ page_responses = await process_images_output(
119
209
  client=client, output_file_id=output_file_id
120
210
  )
121
211
  return page_responses
@@ -132,3 +222,9 @@ async def _create_batch_jobs(
132
222
  client=client, file_id=batch_file.id, trace_id=trace_id
133
223
  )
134
224
  return upload_batch
225
+
226
+
227
async def _delete_associated_files(open_ai_client, remote_file_handler):
    """Deletes every remote file recorded for this run."""
    for file_id in remote_file_handler.created_files:
        logger.info(f"deleting - {file_id}")
        await open_ai_client.delete_file(file_id)
@@ -6,4 +6,4 @@ DEFAULT_PROMPT = """
6
6
  If unable to parse, return an empty string.
7
7
  """
8
8
 
9
- CUSTOM_ID_DELINEATOR = "--page--"
9
+ CUSTOM_ID_DELINEATOR = "--parallex--"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: parallex
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: PDF to markdown using Azure OpenAI batch processing
5
5
  Home-page: https://github.com/Summed-AI/parallex
6
6
  Author: Jeff Hostetler
@@ -96,3 +96,28 @@ class PageResponse(BaseModel):
96
96
  """
97
97
  ```
98
98
 
99
+ ### Batch processing for list of prompts
100
+ If you do not need to process images, but just want to process prompts using the Batch API,
101
+ you can call:
102
+ ```python
103
+ response_data: ParallexPromptsCallableOutput = await parallex_simple_prompts(
104
+ model=model,
105
+ prompts=["Some prompt", "Some other prompt"],
106
+ post_process_callable=example_post_process
107
+ )
108
+ responses = response_data.responses
109
+ ```
110
+ This will create a batch that includes all the prompts in `prompts` and responses can be tied back to the prompt by index.
111
+
112
+ Responses have the following structure;
113
+ ```python
114
+ class ParallexPromptsCallableOutput(BaseModel):
115
+ original_prompts: list[str] = Field(description="List of given prompts")
116
+ trace_id: UUID = Field(description="Unique trace for each file")
117
+ responses: list[PromptResponse] = Field(description="List of PromptResponse objects")
118
+
119
+ class PromptResponse(BaseModel):
120
+ output_content: str = Field(description="Response from the model")
121
+ prompt_index: int = Field(description="Index corresponding to the given prompts")
122
+ ```
123
+
@@ -1,8 +1,8 @@
1
1
  parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
3
3
  parallex/ai/open_ai_client.py,sha256=TRH78oYod_EWpp3hjEh097OT7hwsQmtv44_j3X9Frxo,2047
4
- parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
5
- parallex/ai/uploader.py,sha256=92P0LxLuRgtjtD4kLtM0n8WUww_8-GImLxb3pbl-kkg,3174
4
+ parallex/ai/output_processor.py,sha256=Rwp8dkLo4xsqooeBh3Xv-uGVbJMG1JQkwyxdUoOs2tQ,1800
5
+ parallex/ai/uploader.py,sha256=9GvrzuaQAxqRiYN5dUHWjFeIFXezH0Y7ARnzBkEHbL0,5451
6
6
  parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
7
7
  parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
8
8
  parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
@@ -11,12 +11,14 @@ parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M
11
11
  parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
12
12
  parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaBbt0,228
13
13
  parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
14
+ parallex/models/parallex_prompts_callable_output.py,sha256=IlNX9627_E8aXWQ-vDBuv2-9jMFXqn4LFBbShPzxoc4,421
15
+ parallex/models/prompt_response.py,sha256=LcctuyqwiTHWrZHSahwauMaSBsin5Ws6fQRAzGXTsAA,230
14
16
  parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
15
17
  parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
16
- parallex/parallex.py,sha256=EkD_kZevDu0UBpRet3nsvIr826f7uBHiT0JA5hR3E8c,5117
17
- parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
18
+ parallex/parallex.py,sha256=7YFKnKOkFHoTC7CCHhrXG1JTxprbvw0QkNGOEPYJbvQ,8500
19
+ parallex/utils/constants.py,sha256=508ieZLZ5kse0T4_QyNJp57Aq0DMNFjjyFlsKa0xtek,366
18
20
  parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
19
- parallex-0.2.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
20
- parallex-0.2.0.dist-info/METADATA,sha256=Aq2RRlLwkXcLZ_wNXLZAsydFYJqTSe47eVhAq78oja8,3416
21
- parallex-0.2.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
22
- parallex-0.2.0.dist-info/RECORD,,
21
+ parallex-0.3.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
22
+ parallex-0.3.0.dist-info/METADATA,sha256=hIIhGrV5PE-E-lkWf-kBE3QBPevKSVRHkw0hUx_iqik,4461
23
+ parallex-0.3.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
24
+ parallex-0.3.0.dist-info/RECORD,,