parallex 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

parallex/ai/output_processor.py CHANGED
@@ -1,5 +1,7 @@
 import json
-from typing import TypeVar, Callable
+from typing import TypeVar, Callable, Optional
+
+from pydantic import BaseModel
 
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.models.page_response import PageResponse
@@ -8,11 +10,12 @@ from parallex.utils.constants import CUSTOM_ID_DELINEATOR
 
 
 async def process_images_output(
-    client: OpenAIClient, output_file_id: str
+    client: OpenAIClient, output_file_id: str, model: Optional[type[BaseModel]] = None
 ) -> list[PageResponse]:
     return await _process_output(
         client,
         output_file_id,
+        model,
         lambda content, identifier: PageResponse(
             output_content=content, page_number=int(identifier)
         ),
@@ -20,12 +23,13 @@ async def process_images_output(
 
 
 async def process_prompts_output(
-    client: OpenAIClient, output_file_id: str
+    client: OpenAIClient, output_file_id: str, model: Optional[type[BaseModel]] = None
 ) -> list[PromptResponse]:
     """Gets content from completed Batch to create PromptResponse with LLM answers to given prompts"""
     return await _process_output(
         client,
         output_file_id,
+        model,
         lambda content, identifier: PromptResponse(
             output_content=content, prompt_index=int(identifier)
         ),
@@ -38,6 +42,7 @@ ResponseType = TypeVar("ResponseType")
 async def _process_output(
     client: OpenAIClient,
     output_file_id: str,
+    model: Optional[type[BaseModel]],
     response_builder: Callable[[str, str], ResponseType],
 ) -> list[ResponseType]:
     file_response = await client.retrieve_file(output_file_id)
@@ -48,9 +53,10 @@ async def _process_output(
         json_response = json.loads(raw_response)
         custom_id = json_response["custom_id"]
         identifier = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
-        output_content = json_response["response"]["body"]["choices"][0]["message"][
-            "content"
-        ]
+        output_content = json_response["response"]["body"]["choices"][0]["message"]["content"]
+        if model:
+            json_data = json.loads(output_content)
+            output_content = model(**json_data)
         response = response_builder(output_content, identifier)
         responses.append(response)
 
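The net effect of the output_processor.py changes: `_process_output` gains an optional Pydantic `model`, and when one is supplied the message content of each batch output line is validated into a model instance instead of being passed through as a plain string. A minimal standalone sketch of that parsing step; the `Invoice` model and sample line are illustrative, not part of the package:

```python
import json
from typing import Optional

from pydantic import BaseModel


class Invoice(BaseModel):  # hypothetical example model, not part of parallex
    vendor: str
    total: float


def parse_output_content(raw_line: str, model: Optional[type[BaseModel]] = None):
    """Mirrors the parsing step added to _process_output in 0.4.0."""
    json_response = json.loads(raw_line)
    content = json_response["response"]["body"]["choices"][0]["message"]["content"]
    if model:
        # New in 0.4.0: validate the JSON string into the given model
        content = model(**json.loads(content))
    return content


# A fabricated batch output line in the Batch API's jsonl shape
line = json.dumps(
    {
        "custom_id": "trace--0.jsonl",
        "response": {
            "body": {
                "choices": [
                    {"message": {"content": '{"vendor": "Acme", "total": 12.5}'}}
                ]
            }
        },
    }
)
print(parse_output_content(line))           # '{"vendor": "Acme", "total": 12.5}'
print(parse_output_content(line, Invoice))  # vendor='Acme' total=12.5
```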
parallex/ai/uploader.py CHANGED
@@ -1,8 +1,12 @@
 import base64
 import json
 import os
+from typing import Optional
 from uuid import UUID
 
+from openai.lib._pydantic import to_strict_json_schema
+from pydantic import BaseModel
+
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.file_management.utils import file_in_temp_dir
 from parallex.models.batch_file import BatchFile
@@ -17,6 +21,7 @@ async def upload_images_for_processing(
     image_files: list[ImageFile],
     temp_directory: str,
     prompt_text: str,
+    model: Optional[type[BaseModel]] = None,
 ) -> list[BatchFile]:
     """Base64 encodes image, converts to expected jsonl format and uploads"""
     trace_id = image_files[0].trace_id
@@ -43,7 +48,7 @@ async def upload_images_for_processing(
         prompt_custom_id = (
             f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
         )
-        jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
+        jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text, model)
         with open(upload_file_location, "a") as jsonl_file:
             jsonl_file.write(json.dumps(jsonl) + "\n")
     batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -52,7 +57,10 @@
 
 
 async def upload_prompts_for_processing(
-    client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
+    client: OpenAIClient,
+    prompts: list[str], temp_directory: str,
+    trace_id: UUID,
+    model: Optional[type[BaseModel]] = None
 ) -> list[BatchFile]:
     """Creates jsonl file and uploads for processing"""
     current_index = 0
@@ -73,7 +81,7 @@ async def upload_prompts_for_processing(
         )
 
         prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
-        jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
+        jsonl = _simple_jsonl_format(prompt_custom_id, prompt, model)
         with open(upload_file_location, "a") as jsonl_file:
             jsonl_file.write(json.dumps(jsonl) + "\n")
     batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -119,8 +127,20 @@ async def _create_batch_file(
     )
 
 
-def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
+def _response_format(model: type[BaseModel]) -> dict:
+    schema = to_strict_json_schema(model)
     return {
+        "type": "json_schema",
+        "json_schema": {
+            "name": model.__name__,
+            "strict": True,
+            "schema": schema
+        }
+    }
+
+
+def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str, model: Optional[type[BaseModel]]) -> dict:
+    payload = {
         "custom_id": prompt_custom_id,
         "method": "POST",
         "url": "/chat/completions",
@@ -130,10 +150,13 @@ def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
            "temperature": 0.0,  # TODO make configurable
         },
     }
+    if model is not None:
+        payload["body"]["response_format"] = _response_format(model)
+    return payload
 
 
-def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
-    return {
+def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str, model: Optional[type[BaseModel]] = None) -> dict:
+    payload = {
         "custom_id": prompt_custom_id,
         "method": "POST",
         "url": "/chat/completions",
@@ -154,5 +177,9 @@ def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text:
                 }
             ],
             "max_tokens": 2000,
+            "response_format": {"type": "json_object"}
         },
     }
+    if model is not None:
+        payload["body"]["response_format"] = _response_format(model)
+    return payload
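The new `_response_format` helper is what converts an optional Pydantic model into OpenAI's strict structured-output payload. Note that `to_strict_json_schema` is a private helper inside the `openai` SDK (`openai.lib._pydantic`), so its exact output may shift between SDK versions. A sketch of the dict the helper builds, using a hypothetical `PageSummary` model:

```python
from openai.lib._pydantic import to_strict_json_schema  # private SDK helper
from pydantic import BaseModel


class PageSummary(BaseModel):  # hypothetical example model
    title: str
    bullet_points: list[str]


# Equivalent to _response_format(PageSummary) in the diff above
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": PageSummary.__name__,  # "PageSummary"
        "strict": True,
        "schema": to_strict_json_schema(PageSummary),
    },
}
```

Each jsonl request carries this dict under `body.response_format`. Without a model, image requests now default to `{"type": "json_object"}`, while plain prompt requests keep returning free-form text.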
parallex/models/page_response.py CHANGED
@@ -2,5 +2,5 @@ from pydantic import BaseModel, Field
 
 
 class PageResponse(BaseModel):
-    output_content: str = Field(description="Markdown generated for the page")
+    output_content: str | BaseModel = Field(description="Markdown generated for the page")
     page_number: int = Field(description="Page number of the associated PDF")
parallex/models/prompt_response.py CHANGED
@@ -2,5 +2,5 @@ from pydantic import BaseModel, Field
 
 
 class PromptResponse(BaseModel):
-    output_content: str = Field(description="Response from the model")
+    output_content: str | BaseModel = Field(description="Response from the model")
     prompt_index: int = Field(description="Index corresponding to the given prompts")
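Both response models widen `output_content` from `str` to `str | BaseModel`, so a response can hold either raw markdown or a parsed model instance. A quick check, assuming Pydantic v2's default behavior of accepting existing model instances for a `BaseModel`-typed field without revalidation:

```python
from pydantic import BaseModel

from parallex.models.page_response import PageResponse


class Heading(BaseModel):  # hypothetical structured payload
    text: str


# Plain markdown string, as in 0.3.x
print(PageResponse(output_content="# Title", page_number=1))
# Parsed model instance, new in 0.4.0
print(PageResponse(output_content=Heading(text="Title"), page_number=1))
```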
parallex/parallex.py CHANGED
@@ -4,6 +4,8 @@ import uuid
 from typing import Callable, Optional
 from uuid import UUID
 
+from pydantic import BaseModel
+
 from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.ai.output_processor import process_images_output, process_prompts_output
@@ -32,6 +34,7 @@ async def parallex(
     concurrency: Optional[int] = 20,
     prompt_text: Optional[str] = DEFAULT_PROMPT,
     log_level: Optional[str] = "ERROR",
+    response_model: Optional[type[BaseModel]] = None,
 ) -> ParallexCallableOutput:
     setup_logger(log_level)
     remote_file_handler = RemoteFileHandler()
@@ -43,6 +46,7 @@
             post_process_callable=post_process_callable,
             concurrency=concurrency,
             prompt_text=prompt_text,
+            model=response_model
         )
     except Exception as e:
         logger.error(f"Error occurred: {e}")
@@ -57,6 +61,7 @@ async def parallex_simple_prompts(
     post_process_callable: Optional[Callable[..., None]] = None,
     log_level: Optional[str] = "ERROR",
     concurrency: Optional[int] = 20,
+    response_model: Optional[type[BaseModel]] = None,
 ) -> ParallexPromptsCallableOutput:
     setup_logger(log_level)
     remote_file_handler = RemoteFileHandler()
@@ -67,6 +72,7 @@
             prompts=prompts,
             post_process_callable=post_process_callable,
             concurrency=concurrency,
+            model=response_model,
         )
     except Exception as e:
         logger.error(f"Error occurred: {e}")
@@ -80,6 +86,7 @@ async def _prompts_execute(
     prompts: list[str],
     post_process_callable: Optional[Callable[..., None]] = None,
     concurrency: Optional[int] = 20,
+    model: Optional[type[BaseModel]] = None,
 ):
     with tempfile.TemporaryDirectory() as temp_directory:
         trace_id = uuid.uuid4()
@@ -88,6 +95,7 @@
             prompts=prompts,
             temp_directory=temp_directory,
             trace_id=trace_id,
+            model=model,
         )
         start_batch_semaphore = asyncio.Semaphore(concurrency)
         start_batch_tasks = []
@@ -110,7 +118,7 @@ async def _prompts_execute(
                 f"waiting for batch to complete - {batch.id} - {batch.trace_id}"
             )
             prompt_task = asyncio.create_task(
-                _wait_and_create_prompt_responses(batch=batch, client=open_ai_client, semaphore=process_semaphore)
+                _wait_and_create_prompt_responses(batch=batch, client=open_ai_client, semaphore=process_semaphore, model=model)
             )
             prompt_tasks.append(prompt_task)
         prompt_response_groups = await asyncio.gather(*prompt_tasks)
@@ -134,6 +142,7 @@ async def _execute(
     post_process_callable: Optional[Callable[..., None]] = None,
     concurrency: Optional[int] = 20,
     prompt_text: Optional[str] = DEFAULT_PROMPT,
+    model: Optional[type[BaseModel]] = None,
 ) -> ParallexCallableOutput:
     with tempfile.TemporaryDirectory() as temp_directory:
         raw_file = await add_file_to_temp_directory(
@@ -169,7 +178,7 @@
         for batch in batch_jobs:
             page_task = asyncio.create_task(
                 _wait_and_create_pages(
-                    batch=batch, client=open_ai_client, semaphore=process_semaphore
+                    batch=batch, client=open_ai_client, semaphore=process_semaphore, model=model
                 )
             )
             pages_tasks.append(page_task)
@@ -192,27 +201,27 @@
 
 
 async def _wait_and_create_pages(
-    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
+    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore, model: Optional[type[BaseModel]] = None
 ):
     async with semaphore:
         logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
         output_file_id = await wait_for_batch_completion(client=client, batch=batch)
         logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
         page_responses = await process_images_output(
-            client=client, output_file_id=output_file_id
+            client=client, output_file_id=output_file_id, model=model,
         )
         return page_responses
 
 
 async def _wait_and_create_prompt_responses(
-    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
+    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore, model: Optional[type[BaseModel]] = None
 ):
     async with semaphore:
         logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
         output_file_id = await wait_for_batch_completion(client=client, batch=batch)
         logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
         prompt_responses = await process_prompts_output(
-            client=client, output_file_id=output_file_id
+            client=client, output_file_id=output_file_id, model=model,
         )
         return prompt_responses
 
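Putting the parallex.py changes together: both public entry points accept an optional `response_model` and thread it through upload and output processing. A hedged usage sketch; only `response_model` appears in this diff, so the leading `model` (deployment name) and `prompts` parameters, the `.responses` attribute, and the `CityFact` model are assumptions based on the existing 0.3.x API:

```python
import asyncio

from pydantic import BaseModel

from parallex.parallex import parallex_simple_prompts


class CityFact(BaseModel):  # hypothetical response model
    city: str
    population: int


async def main() -> None:
    output = await parallex_simple_prompts(
        model="gpt-4o-mini",  # assumed existing deployment-name parameter
        prompts=["Return the city and population of Paris as JSON."],
        response_model=CityFact,  # new in 0.4.0
    )
    for response in output.responses:  # .responses assumed from 0.3.x models
        print(response.output_content)  # a CityFact instance rather than a str


asyncio.run(main())
```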
{parallex-0.3.4.dist-info → parallex-0.4.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: parallex
-Version: 0.3.4
+Version: 0.4.0
 Summary: PDF to markdown using Azure OpenAI batch processing
 Home-page: https://github.com/Summed-AI/parallex
 Author: Jeff Hostetler
{parallex-0.3.4.dist-info → parallex-0.4.0.dist-info}/RECORD RENAMED
@@ -1,24 +1,24 @@
 parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
 parallex/ai/open_ai_client.py,sha256=TRH78oYod_EWpp3hjEh097OT7hwsQmtv44_j3X9Frxo,2047
-parallex/ai/output_processor.py,sha256=Rwp8dkLo4xsqooeBh3Xv-uGVbJMG1JQkwyxdUoOs2tQ,1800
-parallex/ai/uploader.py,sha256=Il4dllaPn6NGoU1YWi56ZJkzaOQzKg9lUngfc3ANOKg,5500
+parallex/ai/output_processor.py,sha256=kd50DwB2txhzz4_MPYl97bPOtLMl0KV2UP_eFmUtq34,2087
+parallex/ai/uploader.py,sha256=FKleSK8GWextqpUUAthvTtxGHSwN-aYF127t1YmGOcw,6375
 parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
 parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
 parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
 parallex/file_management/utils.py,sha256=WMdXd9UOFbJDHnL2IWfDXyyD2jhwnGtpCVI_npiSlIk,98
 parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M,405
 parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
-parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaBbt0,228
+parallex/models/page_response.py,sha256=uqVdHXoEWX3NVvr0Y2_izSA1cpw3EXFZRe1HmI4ypLk,240
 parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
 parallex/models/parallex_prompts_callable_output.py,sha256=IlNX9627_E8aXWQ-vDBuv2-9jMFXqn4LFBbShPzxoc4,421
-parallex/models/prompt_response.py,sha256=LcctuyqwiTHWrZHSahwauMaSBsin5Ws6fQRAzGXTsAA,230
+parallex/models/prompt_response.py,sha256=2Zmnwlj8Ou2VgEHmi1VZrlnv5XRzw5VLMEkpQ1VelQQ,242
 parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
 parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
-parallex/parallex.py,sha256=0nOfEXeiuTKi0gQSnqdNyPxIYvuE7Wfp4HtmSbVsEs4,8864
+parallex/parallex.py,sha256=JogDmjB-HdsauCis6hyfSBF_tQi2IdmXfltK72roi28,9322
 parallex/utils/constants.py,sha256=508ieZLZ5kse0T4_QyNJp57Aq0DMNFjjyFlsKa0xtek,366
 parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
-parallex-0.3.4.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
-parallex-0.3.4.dist-info/METADATA,sha256=gIXiPBgPJVnqZbfa8xsxMN0cTDJjalZmplnOUHfI9-0,4461
-parallex-0.3.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-parallex-0.3.4.dist-info/RECORD,,
+parallex-0.4.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
+parallex-0.4.0.dist-info/METADATA,sha256=Hdq1xbDWVVPhR-61O88E9Glv7rn3LzKfz72--rzJovo,4461
+parallex-0.4.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+parallex-0.4.0.dist-info/RECORD,,