parallex 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,9 +15,9 @@ class OpenAIClient:
15
15
  self.file_handler = remote_file_handler
16
16
 
17
17
  self._client = AsyncAzureOpenAI(
18
- azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
19
- api_key=os.getenv("AZURE_OPENAI_API_KEY"),
20
- api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
18
+ azure_endpoint=os.getenv("AZURE_API_BASE"),
19
+ api_key=os.getenv("AZURE_API_KEY"),
20
+ api_version=os.getenv("AZURE_API_VERSION"),
21
21
  )
22
22
 
23
23
  async def upload(self, file_path: str) -> FileObject:
@@ -1,25 +1,57 @@
1
1
  import json
2
+ from typing import TypeVar, Callable
2
3
 
3
4
  from parallex.ai.open_ai_client import OpenAIClient
4
5
  from parallex.models.page_response import PageResponse
6
+ from parallex.models.prompt_response import PromptResponse
5
7
  from parallex.utils.constants import CUSTOM_ID_DELINEATOR
6
8
 
7
9
 
8
- async def process_output(
10
+ async def process_images_output(
9
11
  client: OpenAIClient, output_file_id: str
10
12
  ) -> list[PageResponse]:
11
- """Gets content from completed Batch to create PageResponse with created markdown"""
13
+ return await _process_output(
14
+ client,
15
+ output_file_id,
16
+ lambda content, identifier: PageResponse(
17
+ output_content=content, page_number=int(identifier)
18
+ ),
19
+ )
20
+
21
+
22
+ async def process_prompts_output(
23
+ client: OpenAIClient, output_file_id: str
24
+ ) -> list[PromptResponse]:
25
+ """Gets content from completed Batch to create PromptResponse with LLM answers to given prompts"""
26
+ return await _process_output(
27
+ client,
28
+ output_file_id,
29
+ lambda content, identifier: PromptResponse(
30
+ output_content=content, prompt_index=int(identifier)
31
+ ),
32
+ )
33
+
34
+
35
+ ResponseType = TypeVar("ResponseType")
36
+
37
+
38
+ async def _process_output(
39
+ client: OpenAIClient,
40
+ output_file_id: str,
41
+ response_builder: Callable[[str, str], ResponseType],
42
+ ) -> list[ResponseType]:
12
43
  file_response = await client.retrieve_file(output_file_id)
13
44
  raw_responses = file_response.text.strip().split("\n")
45
+ responses = []
14
46
 
15
- pages = []
16
47
  for raw_response in raw_responses:
17
48
  json_response = json.loads(raw_response)
18
49
  custom_id = json_response["custom_id"]
19
- page_number = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
50
+ identifier = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
20
51
  output_content = json_response["response"]["body"]["choices"][0]["message"][
21
52
  "content"
22
53
  ]
23
- page = PageResponse(output_content=output_content, page_number=int(page_number))
24
- pages.append(page)
25
- return pages
54
+ response = response_builder(output_content, identifier)
55
+ responses.append(response)
56
+
57
+ return responses
parallex/ai/uploader.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import base64
2
2
  import json
3
3
  import os
4
+ from uuid import UUID
4
5
 
5
6
  from parallex.ai.open_ai_client import OpenAIClient
6
7
  from parallex.file_management.utils import file_in_temp_dir
@@ -46,7 +47,7 @@ async def upload_images_for_processing(
46
47
  prompt_custom_id = (
47
48
  f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
48
49
  )
49
- jsonl = _jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
50
+ jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
50
51
  with open(upload_file_location, "a") as jsonl_file:
51
52
  jsonl_file.write(json.dumps(jsonl) + "\n")
52
53
  batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -54,6 +55,22 @@ async def upload_images_for_processing(
54
55
  return batch_files
55
56
 
56
57
 
58
+ async def upload_prompts_for_processing(
59
+ client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
60
+ ) -> BatchFile:
61
+ """Creates jsonl file and uploads for processing"""
62
+ upload_file_location = file_in_temp_dir(
63
+ directory=temp_directory, file_name=f"prompts-{trace_id}.jsonl"
64
+ )
65
+ for index, prompt in enumerate(prompts):
66
+ prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
67
+ jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
68
+ with open(upload_file_location, "a") as jsonl_file:
69
+ jsonl_file.write(json.dumps(jsonl) + "\n")
70
+ batch_file = await _create_batch_file(client, trace_id, upload_file_location)
71
+ return batch_file
72
+
73
+
57
74
  async def _create_batch_file(client, trace_id, upload_file_location):
58
75
  file_response = await client.upload(upload_file_location)
59
76
  return BatchFile(
@@ -65,13 +82,25 @@ async def _create_batch_file(client, trace_id, upload_file_location):
65
82
  )
66
83
 
67
84
 
68
- def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
85
+ def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str):
86
+ return {
87
+ "custom_id": prompt_custom_id,
88
+ "method": "POST",
89
+ "url": "/chat/completions",
90
+ "body": {
91
+ "model": os.getenv("AZURE_API_DEPLOYMENT"),
92
+ "messages": [{"role": "user", "content": prompt_text}],
93
+ },
94
+ }
95
+
96
+
97
+ def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
69
98
  return {
70
99
  "custom_id": prompt_custom_id,
71
100
  "method": "POST",
72
101
  "url": "/chat/completions",
73
102
  "body": {
74
- "model": os.getenv("AZURE_OPENAI_API_DEPLOYMENT"),
103
+ "model": os.getenv("AZURE_API_DEPLOYMENT"),
75
104
  "messages": [
76
105
  {
77
106
  "role": "user",
@@ -0,0 +1,13 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from parallex.models.prompt_response import PromptResponse
6
+
7
+
8
+ class ParallexPromptsCallableOutput(BaseModel):
9
+ original_prompts: list[str] = Field(description="List of given prompts")
10
+ trace_id: UUID = Field(description="Unique trace for each file")
11
+ responses: list[PromptResponse] = Field(
12
+ description="List of PromptResponse objects"
13
+ )
@@ -0,0 +1,6 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
+ class PromptResponse(BaseModel):
5
+ output_content: str = Field(description="Response from the model")
6
+ prompt_index: int = Field(description="Index corresponding to the given prompts")
parallex/parallex.py CHANGED
@@ -1,17 +1,24 @@
1
1
  import asyncio
2
2
  import tempfile
3
+ import uuid
3
4
  from typing import Callable, Optional
4
5
  from uuid import UUID
5
6
 
6
7
  from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
7
8
  from parallex.ai.open_ai_client import OpenAIClient
8
- from parallex.ai.output_processor import process_output
9
- from parallex.ai.uploader import upload_images_for_processing
9
+ from parallex.ai.output_processor import process_images_output, process_prompts_output
10
+ from parallex.ai.uploader import (
11
+ upload_images_for_processing,
12
+ upload_prompts_for_processing,
13
+ )
10
14
  from parallex.file_management.converter import convert_pdf_to_images
11
15
  from parallex.file_management.file_finder import add_file_to_temp_directory
12
16
  from parallex.file_management.remote_file_handler import RemoteFileHandler
13
17
  from parallex.models.batch_file import BatchFile
14
18
  from parallex.models.parallex_callable_output import ParallexCallableOutput
19
+ from parallex.models.parallex_prompts_callable_output import (
20
+ ParallexPromptsCallableOutput,
21
+ )
15
22
  from parallex.models.upload_batch import UploadBatch
16
23
  from parallex.utils.constants import DEFAULT_PROMPT
17
24
  from parallex.utils.logger import logger, setup_logger
@@ -40,9 +47,63 @@ async def parallex(
40
47
  except Exception as e:
41
48
  logger.error(f"Error occurred: {e}")
42
49
  finally:
43
- for file in remote_file_handler.created_files:
44
- logger.info(f"deleting - {file}")
45
- await open_ai_client.delete_file(file)
50
+ await _delete_associated_files(open_ai_client, remote_file_handler)
51
+
52
+
53
+ async def parallex_simple_prompts(
54
+ model: str,
55
+ prompts: list[str],
56
+ post_process_callable: Optional[Callable[..., None]] = None,
57
+ log_level: Optional[str] = "ERROR",
58
+ ) -> ParallexPromptsCallableOutput:
59
+ setup_logger(log_level)
60
+ remote_file_handler = RemoteFileHandler()
61
+ open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
62
+ try:
63
+ return await _prompts_execute(
64
+ open_ai_client=open_ai_client,
65
+ prompts=prompts,
66
+ post_process_callable=post_process_callable,
67
+ )
68
+ except Exception as e:
69
+ logger.error(f"Error occurred: {e}")
70
+ finally:
71
+ await _delete_associated_files(open_ai_client, remote_file_handler)
72
+
73
+
74
+ async def _prompts_execute(
75
+ open_ai_client: OpenAIClient,
76
+ prompts: list[str],
77
+ post_process_callable: Optional[Callable[..., None]] = None,
78
+ ):
79
+ with tempfile.TemporaryDirectory() as temp_directory:
80
+ trace_id = uuid.uuid4()
81
+ batch_file = await upload_prompts_for_processing(
82
+ client=open_ai_client,
83
+ prompts=prompts,
84
+ temp_directory=temp_directory,
85
+ trace_id=trace_id,
86
+ )
87
+ batch = await create_batch(
88
+ client=open_ai_client, file_id=batch_file.id, trace_id=trace_id
89
+ )
90
+ logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
91
+ output_file_id = await wait_for_batch_completion(
92
+ client=open_ai_client, batch=batch
93
+ )
94
+ logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
95
+ prompts_output = await process_prompts_output(
96
+ client=open_ai_client, output_file_id=output_file_id
97
+ )
98
+ sorted_responses = sorted(prompts_output, key=lambda x: x.prompt_index)
99
+ callable_output = ParallexPromptsCallableOutput(
100
+ original_prompts=prompts,
101
+ trace_id=trace_id,
102
+ responses=sorted_responses,
103
+ )
104
+ if post_process_callable is not None:
105
+ post_process_callable(output=callable_output)
106
+ return callable_output
46
107
 
47
108
 
48
109
  async def _execute(
@@ -115,7 +176,7 @@ async def _wait_and_create_pages(
115
176
  logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
116
177
  output_file_id = await wait_for_batch_completion(client=client, batch=batch)
117
178
  logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
118
- page_responses = await process_output(
179
+ page_responses = await process_images_output(
119
180
  client=client, output_file_id=output_file_id
120
181
  )
121
182
  return page_responses
@@ -132,3 +193,9 @@ async def _create_batch_jobs(
132
193
  client=client, file_id=batch_file.id, trace_id=trace_id
133
194
  )
134
195
  return upload_batch
196
+
197
+
198
+ async def _delete_associated_files(open_ai_client, remote_file_handler):
199
+ for file in remote_file_handler.created_files:
200
+ logger.info(f"deleting - {file}")
201
+ await open_ai_client.delete_file(file)
@@ -6,4 +6,4 @@ DEFAULT_PROMPT = """
6
6
  If unable to parse, return an empty string.
7
7
  """
8
8
 
9
- CUSTOM_ID_DELINEATOR = "--page--"
9
+ CUSTOM_ID_DELINEATOR = "--parallex--"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: parallex
3
- Version: 0.1.4
3
+ Version: 0.2.1
4
4
  Summary: PDF to markdown using Azure OpenAI batch processing
5
5
  Home-page: https://github.com/Summed-AI/parallex
6
6
  Author: Jeff Hostetler
@@ -45,10 +45,10 @@ import os
45
45
  from parallex.models.parallex_callable_output import ParallexCallableOutput
46
46
  from parallex.parallex import parallex
47
47
 
48
- os.environ["AZURE_OPENAI_API_KEY"] = "key"
49
- os.environ["AZURE_OPENAI_ENDPOINT"] = "your-endpoint.com"
50
- os.environ["AZURE_OPENAI_API_VERSION"] = "deployment_version"
51
- os.environ["AZURE_OPENAI_API_DEPLOYMENT"] = "deployment_name"
48
+ os.environ["AZURE_API_KEY"] = "key"
49
+ os.environ["AZURE_API_BASE"] = "your-endpoint.com"
50
+ os.environ["AZURE_API_VERSION"] = "deployment_version"
51
+ os.environ["AZURE_API_DEPLOYMENT"] = "deployment_name"
52
52
 
53
53
  model = "gpt-4o"
54
54
 
@@ -96,3 +96,28 @@ class PageResponse(BaseModel):
96
96
  """
97
97
  ```
98
98
 
99
+ ### Batch processing for list of prompts
100
+ If you do not need to process images, but just want to process prompts using the Batch API,
101
+ you can call:
102
+ ```python
103
+ response_data: ParallexPromptsCallableOutput = await parallex_simple_prompts(
104
+ model=model,
105
+ prompts=["Some prompt", "Some other prompt"],
106
+ post_process_callable=example_post_process
107
+ )
108
+ responses = response_data.responses
109
+ ```
110
+ This will create a single batch that includes every prompt in `prompts`; each response can be tied back to its prompt through its `prompt_index`.
111
+
112
+ Responses have the following structure:
113
+ ```python
114
+ class ParallexPromptsCallableOutput(BaseModel):
115
+ original_prompts: list[str] = Field(description="List of given prompts")
116
+ trace_id: UUID = Field(description="Unique trace for each file")
117
+ responses: list[PromptResponse] = Field(description="List of PromptResponse objects")
118
+
119
+ class PromptResponse(BaseModel):
120
+ output_content: str = Field(description="Response from the model")
121
+ prompt_index: int = Field(description="Index corresponding to the given prompts")
122
+ ```
123
+
@@ -1,8 +1,8 @@
1
1
  parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
3
- parallex/ai/open_ai_client.py,sha256=UOb7tOCGCTVlA5Yj8eqOmuZKpOO4ioUw6GKbPY9zZwQ,2068
4
- parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
5
- parallex/ai/uploader.py,sha256=Vt7hF7dm-meimH6hVXbc3IZ2B7WPjWDMwoGiQQSB31Q,3181
3
+ parallex/ai/open_ai_client.py,sha256=TRH78oYod_EWpp3hjEh097OT7hwsQmtv44_j3X9Frxo,2047
4
+ parallex/ai/output_processor.py,sha256=Rwp8dkLo4xsqooeBh3Xv-uGVbJMG1JQkwyxdUoOs2tQ,1800
5
+ parallex/ai/uploader.py,sha256=_Z6-XBg_OmgJkXY55y-BxzQt0BG4iLByDaNWYwCDX1c,4273
6
6
  parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
7
7
  parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
8
8
  parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
@@ -11,12 +11,14 @@ parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M
11
11
  parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
12
12
  parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaBbt0,228
13
13
  parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
14
+ parallex/models/parallex_prompts_callable_output.py,sha256=IlNX9627_E8aXWQ-vDBuv2-9jMFXqn4LFBbShPzxoc4,421
15
+ parallex/models/prompt_response.py,sha256=LcctuyqwiTHWrZHSahwauMaSBsin5Ws6fQRAzGXTsAA,230
14
16
  parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
15
17
  parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
16
- parallex/parallex.py,sha256=EkD_kZevDu0UBpRet3nsvIr826f7uBHiT0JA5hR3E8c,5117
17
- parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
18
+ parallex/parallex.py,sha256=12N0-r3OMR6XtYXvGm7D3L6i7sQOeKyIAmBfJzpoetY,7546
19
+ parallex/utils/constants.py,sha256=508ieZLZ5kse0T4_QyNJp57Aq0DMNFjjyFlsKa0xtek,366
18
20
  parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
19
- parallex-0.1.4.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
20
- parallex-0.1.4.dist-info/METADATA,sha256=x3dPVyDeYjMN0254jputI8-Tr7ruesZXSRc2_Yc--ik,3444
21
- parallex-0.1.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
22
- parallex-0.1.4.dist-info/RECORD,,
21
+ parallex-0.2.1.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
22
+ parallex-0.2.1.dist-info/METADATA,sha256=Dt5XmxUHWonlr54qebQ66rfVYSxC_3dJXL2V2EeWnAA,4461
23
+ parallex-0.2.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
24
+ parallex-0.2.1.dist-info/RECORD,,