parallex 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

parallex/ai/output_processor.py CHANGED
@@ -1,5 +1,7 @@
 import json
-from typing import TypeVar, Callable
+from typing import TypeVar, Callable, Optional
+
+from pydantic import BaseModel
 
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.models.page_response import PageResponse
@@ -8,11 +10,12 @@ from parallex.utils.constants import CUSTOM_ID_DELINEATOR
 
 
 async def process_images_output(
-    client: OpenAIClient, output_file_id: str
+    client: OpenAIClient, output_file_id: str, model: Optional[type[BaseModel]] = None
 ) -> list[PageResponse]:
     return await _process_output(
         client,
         output_file_id,
+        model,
         lambda content, identifier: PageResponse(
             output_content=content, page_number=int(identifier)
         ),
@@ -20,12 +23,13 @@ async def process_images_output(
 
 
 async def process_prompts_output(
-    client: OpenAIClient, output_file_id: str
+    client: OpenAIClient, output_file_id: str, model: Optional[type[BaseModel]] = None
 ) -> list[PromptResponse]:
     """Gets content from completed Batch to create PromptResponse with LLM answers to given prompts"""
     return await _process_output(
         client,
         output_file_id,
+        model,
         lambda content, identifier: PromptResponse(
             output_content=content, prompt_index=int(identifier)
         ),
@@ -38,6 +42,7 @@ ResponseType = TypeVar("ResponseType")
 async def _process_output(
     client: OpenAIClient,
     output_file_id: str,
+    model: Optional[type[BaseModel]],
     response_builder: Callable[[str, str], ResponseType],
 ) -> list[ResponseType]:
     file_response = await client.retrieve_file(output_file_id)
@@ -48,9 +53,10 @@ async def _process_output(
         json_response = json.loads(raw_response)
         custom_id = json_response["custom_id"]
         identifier = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
-        output_content = json_response["response"]["body"]["choices"][0]["message"][
-            "content"
-        ]
+        output_content = json_response["response"]["body"]["choices"][0]["message"]["content"]
+        if model:
+            json_data = json.loads(output_content)
+            output_content = model(**json_data)
         response = response_builder(output_content, identifier)
         responses.append(response)
 
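The net effect of the output_processor.py changes: `_process_output` gains an optional Pydantic `model`, and when one is supplied the message content of each batch output line is validated into a model instance instead of being passed through as a plain string. A minimal standalone sketch of that parsing step; the `Invoice` model and sample line are illustrative, not part of the package:

```python
import json
from typing import Optional

from pydantic import BaseModel


class Invoice(BaseModel):  # hypothetical example model, not part of parallex
    vendor: str
    total: float


def parse_output_content(raw_line: str, model: Optional[type[BaseModel]] = None):
    """Mirrors the parsing step added to _process_output in 0.4.0."""
    json_response = json.loads(raw_line)
    content = json_response["response"]["body"]["choices"][0]["message"]["content"]
    if model:
        # New in 0.4.0: validate the JSON string into the given model
        content = model(**json.loads(content))
    return content


# A fabricated batch output line in the Batch API's jsonl shape
line = json.dumps(
    {
        "custom_id": "trace--0.jsonl",
        "response": {
            "body": {
                "choices": [
                    {"message": {"content": '{"vendor": "Acme", "total": 12.5}'}}
                ]
            }
        },
    }
)
print(parse_output_content(line))           # '{"vendor": "Acme", "total": 12.5}'
print(parse_output_content(line, Invoice))  # vendor='Acme' total=12.5
```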
parallex/ai/uploader.py CHANGED
@@ -1,8 +1,12 @@
 import base64
 import json
 import os
+from typing import Optional
 from uuid import UUID
 
+from openai.lib._pydantic import to_strict_json_schema
+from pydantic import BaseModel
+
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.file_management.utils import file_in_temp_dir
 from parallex.models.batch_file import BatchFile
@@ -17,6 +21,7 @@ async def upload_images_for_processing(
     image_files: list[ImageFile],
     temp_directory: str,
     prompt_text: str,
+    model: Optional[type[BaseModel]] = None,
 ) -> list[BatchFile]:
     """Base64 encodes image, converts to expected jsonl format and uploads"""
     trace_id = image_files[0].trace_id
@@ -43,7 +48,7 @@ async def upload_images_for_processing(
         prompt_custom_id = (
             f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
         )
-        jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
+        jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text, model)
         with open(upload_file_location, "a") as jsonl_file:
             jsonl_file.write(json.dumps(jsonl) + "\n")
     batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -52,7 +57,10 @@
 
 
 async def upload_prompts_for_processing(
-    client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
+    client: OpenAIClient,
+    prompts: list[str], temp_directory: str,
+    trace_id: UUID,
+    model: Optional[type[BaseModel]] = None
 ) -> list[BatchFile]:
     """Creates jsonl file and uploads for processing"""
     current_index = 0
@@ -73,7 +81,7 @@ async def upload_prompts_for_processing(
         )
 
         prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
-        jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
+        jsonl = _simple_jsonl_format(prompt_custom_id, prompt, model)
         with open(upload_file_location, "a") as jsonl_file:
             jsonl_file.write(json.dumps(jsonl) + "\n")
     batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -119,8 +127,20 @@ async def _create_batch_file(
     )
 
 
-def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
+def _response_format(model: type[BaseModel]) -> dict:
+    schema = to_strict_json_schema(model)
     return {
+        "type": "json_schema",
+        "json_schema": {
+            "name": model.__name__,
+            "strict": True,
+            "schema": schema
+        }
+    }
+
+
+def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str, model: Optional[type[BaseModel]]) -> dict:
+    payload = {
         "custom_id": prompt_custom_id,
         "method": "POST",
         "url": "/chat/completions",
@@ -130,10 +150,13 @@ def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
            "temperature": 0.0,  # TODO make configurable
         },
     }
+    if model is not None:
+        payload["body"]["response_format"] = _response_format(model)
+    return payload
 
 
-def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
-    return {
+def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str, model: Optional[type[BaseModel]] = None) -> dict:
+    payload = {
         "custom_id": prompt_custom_id,
         "method": "POST",
         "url": "/chat/completions",
@@ -154,5 +177,9 @@ def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text:
                 }
             ],
             "max_tokens": 2000,
+            "response_format": {"type": "json_object"}
         },
     }
+    if model is not None:
+        payload["body"]["response_format"] = _response_format(model)
+    return payload
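The new `_response_format` helper is what converts an optional Pydantic model into OpenAI's strict structured-output payload. Note that `to_strict_json_schema` is a private helper inside the `openai` SDK (`openai.lib._pydantic`), so its exact output may shift between SDK versions. A sketch of the dict the helper builds, using a hypothetical `PageSummary` model:

```python
from openai.lib._pydantic import to_strict_json_schema  # private SDK helper
from pydantic import BaseModel


class PageSummary(BaseModel):  # hypothetical example model
    title: str
    bullet_points: list[str]


# Equivalent to _response_format(PageSummary) in the diff above
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": PageSummary.__name__,  # "PageSummary"
        "strict": True,
        "schema": to_strict_json_schema(PageSummary),
    },
}
```

Each jsonl request carries this dict under `body.response_format`. Without a model, image requests now default to `{"type": "json_object"}`, while plain prompt requests keep returning free-form text.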
parallex/models/page_response.py CHANGED
@@ -2,5 +2,5 @@ from pydantic import BaseModel, Field
 
 
 class PageResponse(BaseModel):
-    output_content: str = Field(description="Markdown generated for the page")
+    output_content: str | BaseModel = Field(description="Markdown generated for the page")
     page_number: int = Field(description="Page number of the associated PDF")
parallex/models/prompt_response.py CHANGED
@@ -2,5 +2,5 @@ from pydantic import BaseModel, Field
 
 
 class PromptResponse(BaseModel):
-    output_content: str = Field(description="Response from the model")
+    output_content: str | BaseModel = Field(description="Response from the model")
     prompt_index: int = Field(description="Index corresponding to the given prompts")
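Both response models widen `output_content` from `str` to `str | BaseModel`, so a response can hold either raw markdown or a parsed model instance. A quick check, assuming Pydantic v2's default behavior of accepting existing model instances for a `BaseModel`-typed field without revalidation:

```python
from pydantic import BaseModel

from parallex.models.page_response import PageResponse


class Heading(BaseModel):  # hypothetical structured payload
    text: str


# Plain markdown string, as in 0.3.x
print(PageResponse(output_content="# Title", page_number=1))
# Parsed model instance, new in 0.4.0
print(PageResponse(output_content=Heading(text="Title"), page_number=1))
```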
parallex/parallex.py CHANGED
@@ -4,6 +4,8 @@ import uuid
 from typing import Callable, Optional
 from uuid import UUID
 
+from pydantic import BaseModel
+
 from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.ai.output_processor import process_images_output, process_prompts_output
@@ -32,6 +34,7 @@ async def parallex(
     concurrency: Optional[int] = 20,
     prompt_text: Optional[str] = DEFAULT_PROMPT,
     log_level: Optional[str] = "ERROR",
+    response_model: Optional[type[BaseModel]] = None,
 ) -> ParallexCallableOutput:
     setup_logger(log_level)
     remote_file_handler = RemoteFileHandler()
@@ -43,6 +46,7 @@
             post_process_callable=post_process_callable,
             concurrency=concurrency,
             prompt_text=prompt_text,
+            model=response_model
         )
     except Exception as e:
         logger.error(f"Error occurred: {e}")
@@ -57,6 +61,7 @@ async def parallex_simple_prompts(
     post_process_callable: Optional[Callable[..., None]] = None,
     log_level: Optional[str] = "ERROR",
     concurrency: Optional[int] = 20,
+    response_model: Optional[type[BaseModel]] = None,
 ) -> ParallexPromptsCallableOutput:
     setup_logger(log_level)
     remote_file_handler = RemoteFileHandler()
@@ -67,6 +72,7 @@
             prompts=prompts,
             post_process_callable=post_process_callable,
             concurrency=concurrency,
+            model=response_model,
         )
     except Exception as e:
         logger.error(f"Error occurred: {e}")
@@ -80,6 +86,7 @@ async def _prompts_execute(
     prompts: list[str],
     post_process_callable: Optional[Callable[..., None]] = None,
     concurrency: Optional[int] = 20,
+    model: Optional[type[BaseModel]] = None,
 ):
     with tempfile.TemporaryDirectory() as temp_directory:
         trace_id = uuid.uuid4()
@@ -88,6 +95,7 @@
             prompts=prompts,
             temp_directory=temp_directory,
             trace_id=trace_id,
+            model=model,
         )
         start_batch_semaphore = asyncio.Semaphore(concurrency)
         start_batch_tasks = []
@@ -110,7 +118,7 @@ async def _prompts_execute(
                 f"waiting for batch to complete - {batch.id} - {batch.trace_id}"
             )
             prompt_task = asyncio.create_task(
-                _wait_and_create_prompt_responses(batch=batch, client=open_ai_client, semaphore=process_semaphore)
+                _wait_and_create_prompt_responses(batch=batch, client=open_ai_client, semaphore=process_semaphore, model=model)
             )
             prompt_tasks.append(prompt_task)
         prompt_response_groups = await asyncio.gather(*prompt_tasks)
@@ -134,6 +142,7 @@ async def _execute(
     post_process_callable: Optional[Callable[..., None]] = None,
     concurrency: Optional[int] = 20,
     prompt_text: Optional[str] = DEFAULT_PROMPT,
+    model: Optional[type[BaseModel]] = None,
 ) -> ParallexCallableOutput:
     with tempfile.TemporaryDirectory() as temp_directory:
         raw_file = await add_file_to_temp_directory(
@@ -169,7 +178,7 @@
         for batch in batch_jobs:
             page_task = asyncio.create_task(
                 _wait_and_create_pages(
-                    batch=batch, client=open_ai_client, semaphore=process_semaphore
+                    batch=batch, client=open_ai_client, semaphore=process_semaphore, model=model
                 )
             )
             pages_tasks.append(page_task)
@@ -192,27 +201,27 @@
 
 
 async def _wait_and_create_pages(
-    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
+    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore, model: Optional[type[BaseModel]] = None
 ):
     async with semaphore:
         logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
         output_file_id = await wait_for_batch_completion(client=client, batch=batch)
         logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
         page_responses = await process_images_output(
-            client=client, output_file_id=output_file_id
+            client=client, output_file_id=output_file_id, model=model,
         )
         return page_responses
 
 
 async def _wait_and_create_prompt_responses(
-    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
+    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore, model: Optional[type[BaseModel]] = None
 ):
     async with semaphore:
         logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
         output_file_id = await wait_for_batch_completion(client=client, batch=batch)
         logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
         prompt_responses = await process_prompts_output(
-            client=client, output_file_id=output_file_id
+            client=client, output_file_id=output_file_id, model=model,
         )
         return prompt_responses
 
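Putting the parallex.py changes together: both public entry points accept an optional `response_model` and thread it through upload and output processing. A hedged usage sketch; only `response_model` appears in this diff, so the leading `model` (deployment name) and `prompts` parameters, the `.responses` attribute, and the `CityFact` model are assumptions based on the existing 0.3.x API:

```python
import asyncio

from pydantic import BaseModel

from parallex.parallex import parallex_simple_prompts


class CityFact(BaseModel):  # hypothetical response model
    city: str
    population: int


async def main() -> None:
    output = await parallex_simple_prompts(
        model="gpt-4o-mini",  # assumed existing deployment-name parameter
        prompts=["Return the city and population of Paris as JSON."],
        response_model=CityFact,  # new in 0.4.0
    )
    for response in output.responses:  # .responses assumed from 0.3.x models
        print(response.output_content)  # a CityFact instance rather than a str


asyncio.run(main())
```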
{parallex-0.3.4.dist-info → parallex-0.4.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: parallex
-Version: 0.3.4
+Version: 0.4.0
 Summary: PDF to markdown using Azure OpenAI batch processing
 Home-page: https://github.com/Summed-AI/parallex
 Author: Jeff Hostetler
{parallex-0.3.4.dist-info → parallex-0.4.0.dist-info}/RECORD RENAMED
@@ -1,24 +1,24 @@
 parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
 parallex/ai/open_ai_client.py,sha256=TRH78oYod_EWpp3hjEh097OT7hwsQmtv44_j3X9Frxo,2047
-parallex/ai/output_processor.py,sha256=Rwp8dkLo4xsqooeBh3Xv-uGVbJMG1JQkwyxdUoOs2tQ,1800
-parallex/ai/uploader.py,sha256=Il4dllaPn6NGoU1YWi56ZJkzaOQzKg9lUngfc3ANOKg,5500
+parallex/ai/output_processor.py,sha256=kd50DwB2txhzz4_MPYl97bPOtLMl0KV2UP_eFmUtq34,2087
+parallex/ai/uploader.py,sha256=FKleSK8GWextqpUUAthvTtxGHSwN-aYF127t1YmGOcw,6375
 parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
 parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
 parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
 parallex/file_management/utils.py,sha256=WMdXd9UOFbJDHnL2IWfDXyyD2jhwnGtpCVI_npiSlIk,98
 parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M,405
 parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
-parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaBbt0,228
+parallex/models/page_response.py,sha256=uqVdHXoEWX3NVvr0Y2_izSA1cpw3EXFZRe1HmI4ypLk,240
 parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
 parallex/models/parallex_prompts_callable_output.py,sha256=IlNX9627_E8aXWQ-vDBuv2-9jMFXqn4LFBbShPzxoc4,421
-parallex/models/prompt_response.py,sha256=LcctuyqwiTHWrZHSahwauMaSBsin5Ws6fQRAzGXTsAA,230
+parallex/models/prompt_response.py,sha256=2Zmnwlj8Ou2VgEHmi1VZrlnv5XRzw5VLMEkpQ1VelQQ,242
 parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
 parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
-parallex/parallex.py,sha256=0nOfEXeiuTKi0gQSnqdNyPxIYvuE7Wfp4HtmSbVsEs4,8864
+parallex/parallex.py,sha256=JogDmjB-HdsauCis6hyfSBF_tQi2IdmXfltK72roi28,9322
 parallex/utils/constants.py,sha256=508ieZLZ5kse0T4_QyNJp57Aq0DMNFjjyFlsKa0xtek,366
 parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
-parallex-0.3.4.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
-parallex-0.3.4.dist-info/METADATA,sha256=gIXiPBgPJVnqZbfa8xsxMN0cTDJjalZmplnOUHfI9-0,4461
-parallex-0.3.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-parallex-0.3.4.dist-info/RECORD,,
+parallex-0.4.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
+parallex-0.4.0.dist-info/METADATA,sha256=Hdq1xbDWVVPhR-61O88E9Glv7rn3LzKfz72--rzJovo,4461
+parallex-0.4.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+parallex-0.4.0.dist-info/RECORD,,