parallex 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -15,9 +15,9 @@ class OpenAIClient:
15
15
  self.file_handler = remote_file_handler
16
16
 
17
17
  self._client = AsyncAzureOpenAI(
18
- azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
19
- api_key=os.getenv("AZURE_OPENAI_API_KEY"),
20
- api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
18
+ azure_endpoint=os.getenv("AZURE_API_BASE"),
19
+ api_key=os.getenv("AZURE_API_KEY"),
20
+ api_version=os.getenv("AZURE_API_VERSION"),
21
21
  )
22
22
 
23
23
  async def upload(self, file_path: str) -> FileObject:
@@ -1,25 +1,57 @@
1
1
  import json
2
+ from typing import TypeVar, Callable
2
3
 
3
4
  from parallex.ai.open_ai_client import OpenAIClient
4
5
  from parallex.models.page_response import PageResponse
6
+ from parallex.models.prompt_response import PromptResponse
5
7
  from parallex.utils.constants import CUSTOM_ID_DELINEATOR
6
8
 
7
9
 
8
- async def process_output(
10
+ async def process_images_output(
9
11
  client: OpenAIClient, output_file_id: str
10
12
  ) -> list[PageResponse]:
11
- """Gets content from completed Batch to create PageResponse with created markdown"""
13
+ return await _process_output(
14
+ client,
15
+ output_file_id,
16
+ lambda content, identifier: PageResponse(
17
+ output_content=content, page_number=int(identifier)
18
+ ),
19
+ )
20
+
21
+
22
+ async def process_prompts_output(
23
+ client: OpenAIClient, output_file_id: str
24
+ ) -> list[PromptResponse]:
25
+ """Gets content from completed Batch to create PromptResponse with LLM answers to given prompts"""
26
+ return await _process_output(
27
+ client,
28
+ output_file_id,
29
+ lambda content, identifier: PromptResponse(
30
+ output_content=content, prompt_index=int(identifier)
31
+ ),
32
+ )
33
+
34
+
35
+ ResponseType = TypeVar("ResponseType")
36
+
37
+
38
+ async def _process_output(
39
+ client: OpenAIClient,
40
+ output_file_id: str,
41
+ response_builder: Callable[[str, str], ResponseType],
42
+ ) -> list[ResponseType]:
12
43
  file_response = await client.retrieve_file(output_file_id)
13
44
  raw_responses = file_response.text.strip().split("\n")
45
+ responses = []
14
46
 
15
- pages = []
16
47
  for raw_response in raw_responses:
17
48
  json_response = json.loads(raw_response)
18
49
  custom_id = json_response["custom_id"]
19
- page_number = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
50
+ identifier = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
20
51
  output_content = json_response["response"]["body"]["choices"][0]["message"][
21
52
  "content"
22
53
  ]
23
- page = PageResponse(output_content=output_content, page_number=int(page_number))
24
- pages.append(page)
25
- return pages
54
+ response = response_builder(output_content, identifier)
55
+ responses.append(response)
56
+
57
+ return responses
parallex/ai/uploader.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import base64
2
2
  import json
3
3
  import os
4
+ from uuid import UUID
4
5
 
5
6
  from parallex.ai.open_ai_client import OpenAIClient
6
7
  from parallex.file_management.utils import file_in_temp_dir
@@ -46,7 +47,7 @@ async def upload_images_for_processing(
46
47
  prompt_custom_id = (
47
48
  f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
48
49
  )
49
- jsonl = _jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
50
+ jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
50
51
  with open(upload_file_location, "a") as jsonl_file:
51
52
  jsonl_file.write(json.dumps(jsonl) + "\n")
52
53
  batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -54,6 +55,22 @@ async def upload_images_for_processing(
54
55
  return batch_files
55
56
 
56
57
 
58
+ async def upload_prompts_for_processing(
59
+ client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
60
+ ) -> BatchFile:
61
+ """Creates jsonl file and uploads for processing"""
62
+ upload_file_location = file_in_temp_dir(
63
+ directory=temp_directory, file_name=f"prompts-{trace_id}.jsonl"
64
+ )
65
+ for index, prompt in enumerate(prompts):
66
+ prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
67
+ jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
68
+ with open(upload_file_location, "a") as jsonl_file:
69
+ jsonl_file.write(json.dumps(jsonl) + "\n")
70
+ batch_file = await _create_batch_file(client, trace_id, upload_file_location)
71
+ return batch_file
72
+
73
+
57
74
  async def _create_batch_file(client, trace_id, upload_file_location):
58
75
  file_response = await client.upload(upload_file_location)
59
76
  return BatchFile(
@@ -65,13 +82,25 @@ async def _create_batch_file(client, trace_id, upload_file_location):
65
82
  )
66
83
 
67
84
 
68
- def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
85
+ def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str):
86
+ return {
87
+ "custom_id": prompt_custom_id,
88
+ "method": "POST",
89
+ "url": "/chat/completions",
90
+ "body": {
91
+ "model": os.getenv("AZURE_API_DEPLOYMENT"),
92
+ "messages": [{"role": "user", "content": prompt_text}],
93
+ },
94
+ }
95
+
96
+
97
+ def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
69
98
  return {
70
99
  "custom_id": prompt_custom_id,
71
100
  "method": "POST",
72
101
  "url": "/chat/completions",
73
102
  "body": {
74
- "model": os.getenv("AZURE_OPENAI_API_DEPLOYMENT"),
103
+ "model": os.getenv("AZURE_API_DEPLOYMENT"),
75
104
  "messages": [
76
105
  {
77
106
  "role": "user",
@@ -0,0 +1,13 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from parallex.models.prompt_response import PromptResponse
6
+
7
+
8
+ class ParallexPromptsCallableOutput(BaseModel):
9
+ original_prompts: list[str] = Field(description="List of given prompts")
10
+ trace_id: UUID = Field(description="Unique trace for each file")
11
+ responses: list[PromptResponse] = Field(
12
+ description="List of PromptResponse objects"
13
+ )
@@ -0,0 +1,6 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
+ class PromptResponse(BaseModel):
5
+ output_content: str = Field(description="Response from the model")
6
+ prompt_index: int = Field(description="Index corresponding to the given prompts")
parallex/parallex.py CHANGED
@@ -1,17 +1,24 @@
1
1
  import asyncio
2
2
  import tempfile
3
+ import uuid
3
4
  from typing import Callable, Optional
4
5
  from uuid import UUID
5
6
 
6
7
  from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
7
8
  from parallex.ai.open_ai_client import OpenAIClient
8
- from parallex.ai.output_processor import process_output
9
- from parallex.ai.uploader import upload_images_for_processing
9
+ from parallex.ai.output_processor import process_images_output, process_prompts_output
10
+ from parallex.ai.uploader import (
11
+ upload_images_for_processing,
12
+ upload_prompts_for_processing,
13
+ )
10
14
  from parallex.file_management.converter import convert_pdf_to_images
11
15
  from parallex.file_management.file_finder import add_file_to_temp_directory
12
16
  from parallex.file_management.remote_file_handler import RemoteFileHandler
13
17
  from parallex.models.batch_file import BatchFile
14
18
  from parallex.models.parallex_callable_output import ParallexCallableOutput
19
+ from parallex.models.parallex_prompts_callable_output import (
20
+ ParallexPromptsCallableOutput,
21
+ )
15
22
  from parallex.models.upload_batch import UploadBatch
16
23
  from parallex.utils.constants import DEFAULT_PROMPT
17
24
  from parallex.utils.logger import logger, setup_logger
@@ -40,9 +47,63 @@ async def parallex(
40
47
  except Exception as e:
41
48
  logger.error(f"Error occurred: {e}")
42
49
  finally:
43
- for file in remote_file_handler.created_files:
44
- logger.info(f"deleting - {file}")
45
- await open_ai_client.delete_file(file)
50
+ await _delete_associated_files(open_ai_client, remote_file_handler)
51
+
52
+
53
+ async def parallex_simple_prompts(
54
+ model: str,
55
+ prompts: list[str],
56
+ post_process_callable: Optional[Callable[..., None]] = None,
57
+ log_level: Optional[str] = "ERROR",
58
+ ) -> ParallexPromptsCallableOutput:
59
+ setup_logger(log_level)
60
+ remote_file_handler = RemoteFileHandler()
61
+ open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
62
+ try:
63
+ return await _prompts_execute(
64
+ open_ai_client=open_ai_client,
65
+ prompts=prompts,
66
+ post_process_callable=post_process_callable,
67
+ )
68
+ except Exception as e:
69
+ logger.error(f"Error occurred: {e}")
70
+ finally:
71
+ await _delete_associated_files(open_ai_client, remote_file_handler)
72
+
73
+
74
+ async def _prompts_execute(
75
+ open_ai_client: OpenAIClient,
76
+ prompts: list[str],
77
+ post_process_callable: Optional[Callable[..., None]] = None,
78
+ ):
79
+ with tempfile.TemporaryDirectory() as temp_directory:
80
+ trace_id = uuid.uuid4()
81
+ batch_file = await upload_prompts_for_processing(
82
+ client=open_ai_client,
83
+ prompts=prompts,
84
+ temp_directory=temp_directory,
85
+ trace_id=trace_id,
86
+ )
87
+ batch = await create_batch(
88
+ client=open_ai_client, file_id=batch_file.id, trace_id=trace_id
89
+ )
90
+ logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
91
+ output_file_id = await wait_for_batch_completion(
92
+ client=open_ai_client, batch=batch
93
+ )
94
+ logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
95
+ prompts_output = await process_prompts_output(
96
+ client=open_ai_client, output_file_id=output_file_id
97
+ )
98
+ sorted_responses = sorted(prompts_output, key=lambda x: x.prompt_index)
99
+ callable_output = ParallexPromptsCallableOutput(
100
+ original_prompts=prompts,
101
+ trace_id=trace_id,
102
+ responses=sorted_responses,
103
+ )
104
+ if post_process_callable is not None:
105
+ post_process_callable(output=callable_output)
106
+ return callable_output
46
107
 
47
108
 
48
109
  async def _execute(
@@ -115,7 +176,7 @@ async def _wait_and_create_pages(
115
176
  logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
116
177
  output_file_id = await wait_for_batch_completion(client=client, batch=batch)
117
178
  logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
118
- page_responses = await process_output(
179
+ page_responses = await process_images_output(
119
180
  client=client, output_file_id=output_file_id
120
181
  )
121
182
  return page_responses
@@ -132,3 +193,9 @@ async def _create_batch_jobs(
132
193
  client=client, file_id=batch_file.id, trace_id=trace_id
133
194
  )
134
195
  return upload_batch
196
+
197
+
198
+ async def _delete_associated_files(open_ai_client, remote_file_handler):
199
+ for file in remote_file_handler.created_files:
200
+ logger.info(f"deleting - {file}")
201
+ await open_ai_client.delete_file(file)
@@ -6,4 +6,4 @@ DEFAULT_PROMPT = """
6
6
  If unable to parse, return an empty string.
7
7
  """
8
8
 
9
- CUSTOM_ID_DELINEATOR = "--page--"
9
+ CUSTOM_ID_DELINEATOR = "--parallex--"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: parallex
3
- Version: 0.1.4
3
+ Version: 0.2.1
4
4
  Summary: PDF to markdown using Azure OpenAI batch processing
5
5
  Home-page: https://github.com/Summed-AI/parallex
6
6
  Author: Jeff Hostetler
@@ -45,10 +45,10 @@ import os
45
45
  from parallex.models.parallex_callable_output import ParallexCallableOutput
46
46
  from parallex.parallex import parallex
47
47
 
48
- os.environ["AZURE_OPENAI_API_KEY"] = "key"
49
- os.environ["AZURE_OPENAI_ENDPOINT"] = "your-endpoint.com"
50
- os.environ["AZURE_OPENAI_API_VERSION"] = "deployment_version"
51
- os.environ["AZURE_OPENAI_API_DEPLOYMENT"] = "deployment_name"
48
+ os.environ["AZURE_API_KEY"] = "key"
49
+ os.environ["AZURE_API_BASE"] = "your-endpoint.com"
50
+ os.environ["AZURE_API_VERSION"] = "deployment_version"
51
+ os.environ["AZURE_API_DEPLOYMENT"] = "deployment_name"
52
52
 
53
53
  model = "gpt-4o"
54
54
 
@@ -96,3 +96,28 @@ class PageResponse(BaseModel):
96
96
  """
97
97
  ```
98
98
 
99
+ ### Batch processing for list of prompts
100
+ If you do not need to process images, but just want to process prompts using the Batch API,
101
+ you can call:
102
+ ```python
103
+ response_data: ParallexPromptsCallableOutput = await parallex_simple_prompts(
104
+ model=model,
105
+ prompts=["Some prompt", "Some other prompt"],
106
+ post_process_callable=example_post_process
107
+ )
108
+ responses = response_data.responses
109
+ ```
110
+ This creates a single batch containing every prompt in `prompts`; each response can be matched back to its prompt through `prompt_index`.
111
+
112
+ Responses have the following structure:
113
+ ```python
114
+ class ParallexPromptsCallableOutput(BaseModel):
115
+ original_prompts: list[str] = Field(description="List of given prompts")
116
+ trace_id: UUID = Field(description="Unique trace for each file")
117
+ responses: list[PromptResponse] = Field(description="List of PromptResponse objects")
118
+
119
+ class PromptResponse(BaseModel):
120
+ output_content: str = Field(description="Response from the model")
121
+ prompt_index: int = Field(description="Index corresponding to the given prompts")
122
+ ```
123
+
@@ -1,8 +1,8 @@
1
1
  parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
3
- parallex/ai/open_ai_client.py,sha256=UOb7tOCGCTVlA5Yj8eqOmuZKpOO4ioUw6GKbPY9zZwQ,2068
4
- parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
5
- parallex/ai/uploader.py,sha256=Vt7hF7dm-meimH6hVXbc3IZ2B7WPjWDMwoGiQQSB31Q,3181
3
+ parallex/ai/open_ai_client.py,sha256=TRH78oYod_EWpp3hjEh097OT7hwsQmtv44_j3X9Frxo,2047
4
+ parallex/ai/output_processor.py,sha256=Rwp8dkLo4xsqooeBh3Xv-uGVbJMG1JQkwyxdUoOs2tQ,1800
5
+ parallex/ai/uploader.py,sha256=_Z6-XBg_OmgJkXY55y-BxzQt0BG4iLByDaNWYwCDX1c,4273
6
6
  parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
7
7
  parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
8
8
  parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
@@ -11,12 +11,14 @@ parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M
11
11
  parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
12
12
  parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaBbt0,228
13
13
  parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
14
+ parallex/models/parallex_prompts_callable_output.py,sha256=IlNX9627_E8aXWQ-vDBuv2-9jMFXqn4LFBbShPzxoc4,421
15
+ parallex/models/prompt_response.py,sha256=LcctuyqwiTHWrZHSahwauMaSBsin5Ws6fQRAzGXTsAA,230
14
16
  parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
15
17
  parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
16
- parallex/parallex.py,sha256=EkD_kZevDu0UBpRet3nsvIr826f7uBHiT0JA5hR3E8c,5117
17
- parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
18
+ parallex/parallex.py,sha256=12N0-r3OMR6XtYXvGm7D3L6i7sQOeKyIAmBfJzpoetY,7546
19
+ parallex/utils/constants.py,sha256=508ieZLZ5kse0T4_QyNJp57Aq0DMNFjjyFlsKa0xtek,366
18
20
  parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
19
- parallex-0.1.4.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
20
- parallex-0.1.4.dist-info/METADATA,sha256=x3dPVyDeYjMN0254jputI8-Tr7ruesZXSRc2_Yc--ik,3444
21
- parallex-0.1.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
22
- parallex-0.1.4.dist-info/RECORD,,
21
+ parallex-0.2.1.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
22
+ parallex-0.2.1.dist-info/METADATA,sha256=Dt5XmxUHWonlr54qebQ66rfVYSxC_3dJXL2V2EeWnAA,4461
23
+ parallex-0.2.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
24
+ parallex-0.2.1.dist-info/RECORD,,