parallex 0.3.3.tar.gz → 0.4.0.tar.gz
- {parallex-0.3.3 → parallex-0.4.0}/PKG-INFO +1 -1
- {parallex-0.3.3 → parallex-0.4.0}/parallex/ai/open_ai_client.py +0 -1
- {parallex-0.3.3 → parallex-0.4.0}/parallex/ai/output_processor.py +12 -6
- {parallex-0.3.3 → parallex-0.4.0}/parallex/ai/uploader.py +34 -6
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/page_response.py +1 -1
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/prompt_response.py +1 -1
- {parallex-0.3.3 → parallex-0.4.0}/parallex/parallex.py +15 -6
- {parallex-0.3.3 → parallex-0.4.0}/pyproject.toml +1 -1
- {parallex-0.3.3 → parallex-0.4.0}/LICENSE +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/README.md +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/__init__.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/ai/batch_processor.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/file_management/converter.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/file_management/file_finder.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/file_management/remote_file_handler.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/file_management/utils.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/batch_file.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/image_file.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/parallex_callable_output.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/parallex_prompts_callable_output.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/raw_file.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/models/upload_batch.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/utils/constants.py +0 -0
- {parallex-0.3.3 → parallex-0.4.0}/parallex/utils/logger.py +0 -0

{parallex-0.3.3 → parallex-0.4.0}/parallex/ai/output_processor.py

@@ -1,5 +1,7 @@
 import json
-from typing import TypeVar, Callable
+from typing import TypeVar, Callable, Optional
+
+from pydantic import BaseModel
 
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.models.page_response import PageResponse
@@ -8,11 +10,12 @@ from parallex.utils.constants import CUSTOM_ID_DELINEATOR
 
 
 async def process_images_output(
-    client: OpenAIClient, output_file_id: str
+    client: OpenAIClient, output_file_id: str, model: Optional[type[BaseModel]] = None
 ) -> list[PageResponse]:
     return await _process_output(
         client,
         output_file_id,
+        model,
         lambda content, identifier: PageResponse(
             output_content=content, page_number=int(identifier)
         ),
@@ -20,12 +23,13 @@ async def process_images_output(
 
 
 async def process_prompts_output(
-    client: OpenAIClient, output_file_id: str
+    client: OpenAIClient, output_file_id: str, model: Optional[type[BaseModel]] = None
 ) -> list[PromptResponse]:
     """Gets content from completed Batch to create PromptResponse with LLM answers to given prompts"""
     return await _process_output(
         client,
         output_file_id,
+        model,
         lambda content, identifier: PromptResponse(
             output_content=content, prompt_index=int(identifier)
         ),
@@ -38,6 +42,7 @@ ResponseType = TypeVar("ResponseType")
 async def _process_output(
     client: OpenAIClient,
     output_file_id: str,
+    model: Optional[type[BaseModel]],
     response_builder: Callable[[str, str], ResponseType],
 ) -> list[ResponseType]:
     file_response = await client.retrieve_file(output_file_id)
@@ -48,9 +53,10 @@ async def _process_output(
         json_response = json.loads(raw_response)
         custom_id = json_response["custom_id"]
         identifier = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
-        output_content = json_response["response"]["body"]["choices"][0]["message"][
-            "content"
-        ]
+        output_content = json_response["response"]["body"]["choices"][0]["message"]["content"]
+        if model:
+            json_data = json.loads(output_content)
+            output_content = model(**json_data)
         response = response_builder(output_content, identifier)
         responses.append(response)
 
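The practical effect of this change: when a caller supplies a Pydantic model, the message content of each batch output line is parsed as JSON and validated into that model instead of being returned as a raw string. A minimal sketch of the new per-line behavior, using a hypothetical `Invoice` model and an illustrative output line:

```python
import json

from pydantic import BaseModel


class Invoice(BaseModel):  # hypothetical response model
    vendor: str
    total: float


# One line of a completed batch's output file (shape is illustrative)
raw_response = json.dumps(
    {
        "custom_id": "trace-id--0.jsonl",
        "response": {
            "body": {
                "choices": [
                    {"message": {"content": '{"vendor": "Acme", "total": 12.5}'}}
                ]
            }
        },
    }
)

json_response = json.loads(raw_response)
output_content = json_response["response"]["body"]["choices"][0]["message"]["content"]

model = Invoice  # as threaded through process_prompts_output(..., model=...)
if model:
    # The string content is itself JSON, so it is parsed and validated
    output_content = model(**json.loads(output_content))

print(output_content)  # vendor='Acme' total=12.5
```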
{parallex-0.3.3 → parallex-0.4.0}/parallex/ai/uploader.py

@@ -1,8 +1,12 @@
 import base64
 import json
 import os
+from typing import Optional
 from uuid import UUID
 
+from openai.lib._pydantic import to_strict_json_schema
+from pydantic import BaseModel
+
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.file_management.utils import file_in_temp_dir
 from parallex.models.batch_file import BatchFile
@@ -17,6 +21,7 @@ async def upload_images_for_processing(
     image_files: list[ImageFile],
     temp_directory: str,
     prompt_text: str,
+    model: Optional[type[BaseModel]] = None,
 ) -> list[BatchFile]:
     """Base64 encodes image, converts to expected jsonl format and uploads"""
     trace_id = image_files[0].trace_id
@@ -43,7 +48,7 @@ async def upload_images_for_processing(
         prompt_custom_id = (
             f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
         )
-        jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
+        jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text, model)
         with open(upload_file_location, "a") as jsonl_file:
             jsonl_file.write(json.dumps(jsonl) + "\n")
     batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -52,7 +57,10 @@ async def upload_images_for_processing(
 
 
 async def upload_prompts_for_processing(
-    client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
+    client: OpenAIClient,
+    prompts: list[str], temp_directory: str,
+    trace_id: UUID,
+    model: Optional[type[BaseModel]] = None
 ) -> list[BatchFile]:
     """Creates jsonl file and uploads for processing"""
     current_index = 0
@@ -73,7 +81,7 @@ async def upload_prompts_for_processing(
         )
 
         prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
-        jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
+        jsonl = _simple_jsonl_format(prompt_custom_id, prompt, model)
         with open(upload_file_location, "a") as jsonl_file:
             jsonl_file.write(json.dumps(jsonl) + "\n")
     batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -119,20 +127,36 @@ async def _create_batch_file(
     )
 
 
-def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
+def _response_format(model: type[BaseModel]) -> dict:
+    schema = to_strict_json_schema(model)
     return {
+        "type": "json_schema",
+        "json_schema": {
+            "name": model.__name__,
+            "strict": True,
+            "schema": schema
+        }
+    }
+
+
+def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str, model: Optional[type[BaseModel]]) -> dict:
+    payload = {
         "custom_id": prompt_custom_id,
         "method": "POST",
         "url": "/chat/completions",
         "body": {
             "model": os.getenv("AZURE_API_DEPLOYMENT"),
             "messages": [{"role": "user", "content": prompt_text}],
+            "temperature": 0.0,  # TODO make configurable
         },
     }
+    if model is not None:
+        payload["body"]["response_format"] = _response_format(model)
+    return payload
 
 
-def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
-    return {
+def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str, model: Optional[type[BaseModel]] = None) -> dict:
+    payload = {
         "custom_id": prompt_custom_id,
         "method": "POST",
         "url": "/chat/completions",
@@ -153,5 +177,9 @@ def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text:
                 }
             ],
             "max_tokens": 2000,
+            "response_format": {"type": "json_object"}
         },
     }
+    if model is not None:
+        payload["body"]["response_format"] = _response_format(model)
+    return payload
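The new `_response_format` helper converts a Pydantic model into the strict `json_schema` response-format envelope attached to each batch request body. A sketch of the payload it builds for a hypothetical model; note that `to_strict_json_schema` is a private helper inside the `openai` SDK (`openai.lib._pydantic`), so its exact output may vary by SDK version:

```python
from openai.lib._pydantic import to_strict_json_schema  # private SDK helper, as imported above
from pydantic import BaseModel


class PageSummary(BaseModel):  # hypothetical structured output
    title: str
    bullet_points: list[str]


# Mirrors what _response_format(PageSummary) produces: a strict JSON schema
# envelope set as body["response_format"] when a response model is given.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": PageSummary.__name__,
        "strict": True,
        "schema": to_strict_json_schema(PageSummary),
    },
}
print(response_format["json_schema"]["name"])  # PageSummary
```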
{parallex-0.3.3 → parallex-0.4.0}/parallex/models/page_response.py

@@ -2,5 +2,5 @@ from pydantic import BaseModel, Field
 
 
 class PageResponse(BaseModel):
-    output_content: str = Field(description="Markdown generated for the page")
+    output_content: str | BaseModel = Field(description="Markdown generated for the page")
     page_number: int = Field(description="Page number of the associated PDF")
{parallex-0.3.3 → parallex-0.4.0}/parallex/models/prompt_response.py

@@ -2,5 +2,5 @@ from pydantic import BaseModel, Field
 
 
 class PromptResponse(BaseModel):
-    output_content: str = Field(description="Response from the model")
+    output_content: str | BaseModel = Field(description="Response from the model")
     prompt_index: int = Field(description="Index corresponding to the given prompts")
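Widening `output_content` to `str | BaseModel` means downstream consumers should now branch on the type: raw markdown/text when no response model was supplied, a validated model instance when one was. A small self-contained illustration (`PromptResponse` mirrors the definition above; the `Invoice` model is hypothetical):

```python
from pydantic import BaseModel, Field


class PromptResponse(BaseModel):  # mirrors parallex/models/prompt_response.py at 0.4.0
    output_content: str | BaseModel = Field(description="Response from the model")
    prompt_index: int = Field(description="Index corresponding to the given prompts")


class Invoice(BaseModel):  # hypothetical response model
    vendor: str
    total: float


plain = PromptResponse(output_content="a raw text answer", prompt_index=0)
typed = PromptResponse(output_content=Invoice(vendor="Acme", total=12.5), prompt_index=1)

for response in (plain, typed):
    if isinstance(response.output_content, BaseModel):
        print(response.output_content.model_dump())  # structured fields
    else:
        print(response.output_content)  # plain string
```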
{parallex-0.3.3 → parallex-0.4.0}/parallex/parallex.py

@@ -4,6 +4,8 @@ import uuid
 from typing import Callable, Optional
 from uuid import UUID
 
+from pydantic import BaseModel
+
 from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
 from parallex.ai.open_ai_client import OpenAIClient
 from parallex.ai.output_processor import process_images_output, process_prompts_output
@@ -32,6 +34,7 @@ async def parallex(
     concurrency: Optional[int] = 20,
     prompt_text: Optional[str] = DEFAULT_PROMPT,
     log_level: Optional[str] = "ERROR",
+    response_model: Optional[type[BaseModel]] = None,
 ) -> ParallexCallableOutput:
     setup_logger(log_level)
     remote_file_handler = RemoteFileHandler()
@@ -43,6 +46,7 @@ async def parallex(
             post_process_callable=post_process_callable,
             concurrency=concurrency,
             prompt_text=prompt_text,
+            model=response_model
         )
     except Exception as e:
         logger.error(f"Error occurred: {e}")
@@ -57,6 +61,7 @@ async def parallex_simple_prompts(
     post_process_callable: Optional[Callable[..., None]] = None,
     log_level: Optional[str] = "ERROR",
     concurrency: Optional[int] = 20,
+    response_model: Optional[type[BaseModel]] = None,
 ) -> ParallexPromptsCallableOutput:
     setup_logger(log_level)
     remote_file_handler = RemoteFileHandler()
@@ -67,6 +72,7 @@ async def parallex_simple_prompts(
             prompts=prompts,
             post_process_callable=post_process_callable,
             concurrency=concurrency,
+            model=response_model,
         )
     except Exception as e:
         logger.error(f"Error occurred: {e}")
@@ -80,6 +86,7 @@ async def _prompts_execute(
     prompts: list[str],
     post_process_callable: Optional[Callable[..., None]] = None,
     concurrency: Optional[int] = 20,
+    model: Optional[type[BaseModel]] = None,
 ):
     with tempfile.TemporaryDirectory() as temp_directory:
         trace_id = uuid.uuid4()
@@ -88,6 +95,7 @@ async def _prompts_execute(
             prompts=prompts,
             temp_directory=temp_directory,
             trace_id=trace_id,
+            model=model,
         )
         start_batch_semaphore = asyncio.Semaphore(concurrency)
         start_batch_tasks = []
@@ -110,7 +118,7 @@ async def _prompts_execute(
                 f"waiting for batch to complete - {batch.id} - {batch.trace_id}"
             )
             prompt_task = asyncio.create_task(
-                _wait_and_create_prompt_responses(batch=batch, client=open_ai_client, semaphore=process_semaphore)
+                _wait_and_create_prompt_responses(batch=batch, client=open_ai_client, semaphore=process_semaphore, model=model)
             )
             prompt_tasks.append(prompt_task)
         prompt_response_groups = await asyncio.gather(*prompt_tasks)
@@ -134,6 +142,7 @@ async def _execute(
     post_process_callable: Optional[Callable[..., None]] = None,
     concurrency: Optional[int] = 20,
     prompt_text: Optional[str] = DEFAULT_PROMPT,
+    model: Optional[type[BaseModel]] = None,
 ) -> ParallexCallableOutput:
     with tempfile.TemporaryDirectory() as temp_directory:
         raw_file = await add_file_to_temp_directory(
@@ -169,7 +178,7 @@ async def _execute(
         for batch in batch_jobs:
             page_task = asyncio.create_task(
                 _wait_and_create_pages(
-                    batch=batch, client=open_ai_client, semaphore=process_semaphore
+                    batch=batch, client=open_ai_client, semaphore=process_semaphore, model=model
                 )
             )
             pages_tasks.append(page_task)
@@ -192,27 +201,27 @@ async def _execute(
 
 
 async def _wait_and_create_pages(
-    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
+    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore, model: Optional[type[BaseModel]] = None
 ):
     async with semaphore:
         logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
         output_file_id = await wait_for_batch_completion(client=client, batch=batch)
         logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
         page_responses = await process_images_output(
-            client=client, output_file_id=output_file_id
+            client=client, output_file_id=output_file_id, model=model,
         )
         return page_responses
 
 
 async def _wait_and_create_prompt_responses(
-    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
+    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore, model: Optional[type[BaseModel]] = None
 ):
     async with semaphore:
         logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
         output_file_id = await wait_for_batch_completion(client=client, batch=batch)
         logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
         prompt_responses = await process_prompts_output(
-            client=client, output_file_id=output_file_id
+            client=client, output_file_id=output_file_id, model=model,
         )
         return prompt_responses
 
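End to end, `response_model` flows from the public entry points through upload (request `response_format`) and output processing (validation). A usage sketch against 0.4.0; the `Invoice` model and prompt are hypothetical, the leading `model`/`prompts` arguments follow the signature from earlier releases, and the Azure OpenAI environment variables (e.g. `AZURE_API_DEPLOYMENT`) must be configured:

```python
import asyncio

from pydantic import BaseModel

from parallex.parallex import parallex_simple_prompts


class Invoice(BaseModel):  # hypothetical structured output
    vendor: str
    total: float


async def main() -> None:
    output = await parallex_simple_prompts(
        model="gpt-4o",  # Azure deployment name; assumption based on prior releases
        prompts=['Extract vendor and total from: "Acme Corp, $12.50"'],
        response_model=Invoice,  # new in 0.4.0
    )
    for response in output.responses:
        # With response_model set, output_content is an Invoice instance
        print(response.prompt_index, response.output_content)


asyncio.run(main())
```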
{parallex-0.3.3 → parallex-0.4.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "parallex"
-version = "0.3.3"
+version = "0.4.0"
 description = "PDF to markdown using Azure OpenAI batch processing"
 authors = ["Jeff Hostetler <jeff@summed.ai>", "Kevin Bao <kevin@summed.ai>"]
 repository = "https://github.com/Summed-AI/parallex"