parallex 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

parallex/ai/output_processor.py CHANGED
@@ -1,25 +1,57 @@
  import json
+ from typing import TypeVar, Callable
 
  from parallex.ai.open_ai_client import OpenAIClient
  from parallex.models.page_response import PageResponse
+ from parallex.models.prompt_response import PromptResponse
  from parallex.utils.constants import CUSTOM_ID_DELINEATOR
 
 
- async def process_output(
+ async def process_images_output(
      client: OpenAIClient, output_file_id: str
  ) -> list[PageResponse]:
-     """Gets content from completed Batch to create PageResponse with created markdown"""
+     return await _process_output(
+         client,
+         output_file_id,
+         lambda content, identifier: PageResponse(
+             output_content=content, page_number=int(identifier)
+         ),
+     )
+
+
+ async def process_prompts_output(
+     client: OpenAIClient, output_file_id: str
+ ) -> list[PromptResponse]:
+     """Gets content from completed Batch to create PromptResponse with LLM answers to given prompts"""
+     return await _process_output(
+         client,
+         output_file_id,
+         lambda content, identifier: PromptResponse(
+             output_content=content, prompt_index=int(identifier)
+         ),
+     )
+
+
+ ResponseType = TypeVar("ResponseType")
+
+
+ async def _process_output(
+     client: OpenAIClient,
+     output_file_id: str,
+     response_builder: Callable[[str, str], ResponseType],
+ ) -> list[ResponseType]:
      file_response = await client.retrieve_file(output_file_id)
      raw_responses = file_response.text.strip().split("\n")
+     responses = []
 
-     pages = []
      for raw_response in raw_responses:
          json_response = json.loads(raw_response)
          custom_id = json_response["custom_id"]
-         page_number = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
+         identifier = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
          output_content = json_response["response"]["body"]["choices"][0]["message"][
              "content"
          ]
-         page = PageResponse(output_content=output_content, page_number=int(page_number))
-         pages.append(page)
-     return pages
+         response = response_builder(output_content, identifier)
+         responses.append(response)
+
+     return responses
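Aside: the refactor above funnels both the image path and the new prompts path through `_process_output`, which differ only in the `response_builder` lambda they pass. A minimal sketch of how one Batch output line is reduced to a response object, assuming a payload shaped like the keys the code indexes (the sample JSON is illustrative, not captured output):

```python
# Sketch, not library code: parsing one Batch API output line the way
# _process_output does. The JSON shape below is assumed from the keys used above.
import json

CUSTOM_ID_DELINEATOR = "--parallex--"  # value from parallex/utils/constants.py

raw_response = json.dumps({
    "custom_id": f"some-trace-id{CUSTOM_ID_DELINEATOR}4.jsonl",
    "response": {"body": {"choices": [{"message": {"content": "# Page 4 markdown"}}]}},
})

json_response = json.loads(raw_response)
identifier = json_response["custom_id"].split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
output_content = json_response["response"]["body"]["choices"][0]["message"]["content"]
print(identifier, output_content)  # -> 4 # Page 4 markdown
```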
parallex/ai/uploader.py CHANGED
@@ -1,6 +1,7 @@
  import base64
  import json
  import os
+ from uuid import UUID
 
  from parallex.ai.open_ai_client import OpenAIClient
  from parallex.file_management.utils import file_in_temp_dir
@@ -8,7 +9,7 @@ from parallex.models.batch_file import BatchFile
  from parallex.models.image_file import ImageFile
  from parallex.utils.constants import CUSTOM_ID_DELINEATOR
 
- MAX_FILE_SIZE = 150 * 1024 * 1024  # 150 MB in bytes
+ MAX_FILE_SIZE = 180 * 1024 * 1024  # 180 MB in bytes. Limit for Azure is 200MB.
 
 
  async def upload_images_for_processing(
@@ -22,22 +23,18 @@ async def upload_images_for_processing(
      current_index = 0
      batch_files = []
      upload_file_location = file_in_temp_dir(
-         directory=temp_directory, file_name=f"image-{trace_id}-{current_index}.jsonl"
+         directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
      )
 
      for image_file in image_files:
-         if (
-             os.path.exists(upload_file_location)
-             and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
-         ):
+         if await _approaching_file_size_limit(upload_file_location):
              """When approaching upload file limit, upload and start new file"""
              batch_file = await _create_batch_file(
                  client, trace_id, upload_file_location
              )
              batch_files.append(batch_file)
-             current_index += 1
-             upload_file_location = file_in_temp_dir(
-                 directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
+             upload_file_location = await _increment_batch_file_index(
+                 current_index, temp_directory, trace_id
              )
 
          with open(image_file.path, "rb") as image:
@@ -46,7 +43,7 @@ async def upload_images_for_processing(
          prompt_custom_id = (
              f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
          )
-         jsonl = _jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
+         jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
          with open(upload_file_location, "a") as jsonl_file:
              jsonl_file.write(json.dumps(jsonl) + "\n")
      batch_file = await _create_batch_file(client, trace_id, upload_file_location)
@@ -54,7 +51,64 @@ async def upload_images_for_processing(
      return batch_files
 
 
- async def _create_batch_file(client, trace_id, upload_file_location):
+ async def upload_prompts_for_processing(
+     client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
+ ) -> list[BatchFile]:
+     """Creates jsonl file and uploads for processing"""
+     current_index = 0
+     batch_files = []
+
+     upload_file_location = await set_file_location(
+         current_index, temp_directory, trace_id
+     )
+     for index, prompt in enumerate(prompts):
+         if await _approaching_file_size_limit(upload_file_location):
+             """When approaching upload file limit, upload and start new file"""
+             batch_file = await _create_batch_file(
+                 client, trace_id, upload_file_location
+             )
+             batch_files.append(batch_file)
+             upload_file_location = await _increment_batch_file_index(
+                 current_index, temp_directory, trace_id
+             )
+
+         prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
+         jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
+         with open(upload_file_location, "a") as jsonl_file:
+             jsonl_file.write(json.dumps(jsonl) + "\n")
+     batch_file = await _create_batch_file(client, trace_id, upload_file_location)
+     batch_files.append(batch_file)
+     return batch_files
+
+
+ async def set_file_location(
+     current_index: int, temp_directory: str, trace_id: UUID
+ ) -> str:
+     return file_in_temp_dir(
+         directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
+     )
+
+
+ async def _approaching_file_size_limit(upload_file_location: str) -> bool:
+     return (
+         os.path.exists(upload_file_location)
+         and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
+     )
+
+
+ async def _increment_batch_file_index(
+     current_index: int, temp_directory: str, trace_id: UUID
+ ) -> str:
+     current_index += 1
+     upload_file_location = await set_file_location(
+         current_index, temp_directory, trace_id
+     )
+     return upload_file_location
+
+
+ async def _create_batch_file(
+     client: OpenAIClient, trace_id: UUID, upload_file_location: str
+ ) -> BatchFile:
      file_response = await client.upload(upload_file_location)
      return BatchFile(
          id=file_response.id,
@@ -65,7 +119,19 @@ async def _create_batch_file(client, trace_id, upload_file_location):
      )
 
 
- def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
+ def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
+     return {
+         "custom_id": prompt_custom_id,
+         "method": "POST",
+         "url": "/chat/completions",
+         "body": {
+             "model": os.getenv("AZURE_API_DEPLOYMENT"),
+             "messages": [{"role": "user", "content": prompt_text}],
+         },
+     }
+
+
+ def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
      return {
          "custom_id": prompt_custom_id,
          "method": "POST",
parallex/models/parallex_prompts_callable_output.py ADDED
@@ -0,0 +1,13 @@
+ from uuid import UUID
+
+ from pydantic import BaseModel, Field
+
+ from parallex.models.prompt_response import PromptResponse
+
+
+ class ParallexPromptsCallableOutput(BaseModel):
+     original_prompts: list[str] = Field(description="List of given prompts")
+     trace_id: UUID = Field(description="Unique trace for each file")
+     responses: list[PromptResponse] = Field(
+         description="List of PromptResponse objects"
+     )
parallex/models/prompt_response.py ADDED
@@ -0,0 +1,6 @@
+ from pydantic import BaseModel, Field
+
+
+ class PromptResponse(BaseModel):
+     output_content: str = Field(description="Response from the model")
+     prompt_index: int = Field(description="Index corresponding to the given prompts")
parallex/parallex.py CHANGED
@@ -1,17 +1,24 @@
  import asyncio
  import tempfile
+ import uuid
  from typing import Callable, Optional
  from uuid import UUID
 
  from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
  from parallex.ai.open_ai_client import OpenAIClient
- from parallex.ai.output_processor import process_output
- from parallex.ai.uploader import upload_images_for_processing
+ from parallex.ai.output_processor import process_images_output, process_prompts_output
+ from parallex.ai.uploader import (
+     upload_images_for_processing,
+     upload_prompts_for_processing,
+ )
  from parallex.file_management.converter import convert_pdf_to_images
  from parallex.file_management.file_finder import add_file_to_temp_directory
  from parallex.file_management.remote_file_handler import RemoteFileHandler
  from parallex.models.batch_file import BatchFile
  from parallex.models.parallex_callable_output import ParallexCallableOutput
+ from parallex.models.parallex_prompts_callable_output import (
+     ParallexPromptsCallableOutput,
+ )
  from parallex.models.upload_batch import UploadBatch
  from parallex.utils.constants import DEFAULT_PROMPT
  from parallex.utils.logger import logger, setup_logger
@@ -40,9 +47,92 @@ async def parallex(
      except Exception as e:
          logger.error(f"Error occurred: {e}")
      finally:
-         for file in remote_file_handler.created_files:
-             logger.info(f"deleting - {file}")
-             await open_ai_client.delete_file(file)
+         await _delete_associated_files(open_ai_client, remote_file_handler)
+
+
+ async def parallex_simple_prompts(
+     model: str,
+     prompts: list[str],
+     post_process_callable: Optional[Callable[..., None]] = None,
+     log_level: Optional[str] = "ERROR",
+     concurrency: Optional[int] = 20,
+ ) -> ParallexPromptsCallableOutput:
+     setup_logger(log_level)
+     remote_file_handler = RemoteFileHandler()
+     open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
+     try:
+         return await _prompts_execute(
+             open_ai_client=open_ai_client,
+             prompts=prompts,
+             post_process_callable=post_process_callable,
+             concurrency=concurrency,
+         )
+     except Exception as e:
+         logger.error(f"Error occurred: {e}")
+     finally:
+         await _delete_associated_files(open_ai_client, remote_file_handler)
+
+
+ async def _prompts_execute(
+     open_ai_client: OpenAIClient,
+     prompts: list[str],
+     post_process_callable: Optional[Callable[..., None]] = None,
+     concurrency: Optional[int] = 20,
+ ):
+     with tempfile.TemporaryDirectory() as temp_directory:
+         trace_id = uuid.uuid4()
+         batch_files = await upload_prompts_for_processing(
+             client=open_ai_client,
+             prompts=prompts,
+             temp_directory=temp_directory,
+             trace_id=trace_id,
+         )
+         start_batch_semaphore = asyncio.Semaphore(concurrency)
+         start_batch_tasks = []
+         for file in batch_files:
+             batch_task = asyncio.create_task(
+                 _create_batch_jobs(
+                     batch_file=file,
+                     client=open_ai_client,
+                     trace_id=trace_id,
+                     semaphore=start_batch_semaphore,
+                 )
+             )
+             start_batch_tasks.append(batch_task)
+         batch_jobs = await asyncio.gather(*start_batch_tasks)
+
+         prompt_tasks = []
+         for batch in batch_jobs:
+             logger.info(
+                 f"waiting for batch to complete - {batch.id} - {batch.trace_id}"
+             )
+             page_task = asyncio.create_task(
+                 await wait_for_batch_completion(client=open_ai_client, batch=batch)
+             )
+             prompt_tasks.append(page_task)
+
+         output_file_ids = await asyncio.gather(*prompt_tasks)
+
+         prompts_output = []
+         for output_file_id in output_file_ids:
+             logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
+             prompts_output.append(
+                 await process_prompts_output(
+                     client=open_ai_client, output_file_id=output_file_id
+                 )
+             )
+
+         flat_prompts = [page for batch in prompts_output for page in batch]
+
+         sorted_responses = sorted(flat_prompts, key=lambda x: x.prompt_index)
+         callable_output = ParallexPromptsCallableOutput(
+             original_prompts=prompts,
+             trace_id=trace_id,
+             responses=sorted_responses,
+         )
+         if post_process_callable is not None:
+             post_process_callable(output=callable_output)
+         return callable_output
 
 
  async def _execute(
@@ -115,7 +205,7 @@ async def _wait_and_create_pages(
      logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
      output_file_id = await wait_for_batch_completion(client=client, batch=batch)
      logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
-     page_responses = await process_output(
+     page_responses = await process_images_output(
          client=client, output_file_id=output_file_id
      )
      return page_responses
@@ -132,3 +222,9 @@ async def _create_batch_jobs(
          client=client, file_id=batch_file.id, trace_id=trace_id
      )
      return upload_batch
+
+
+ async def _delete_associated_files(open_ai_client, remote_file_handler):
+     for file in remote_file_handler.created_files:
+         logger.info(f"deleting - {file}")
+         await open_ai_client.delete_file(file)
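The concurrency control in `_prompts_execute` mirrors the existing image path: a shared `asyncio.Semaphore` caps how many batch jobs start at once, while `asyncio.gather` keeps results in submission order. A standalone sketch of that pattern under those assumptions (the sleep stands in for starting a batch job; none of this is library code):

```python
# Sketch of the semaphore-gated fan-out used by _prompts_execute.
import asyncio

async def start_job(semaphore: asyncio.Semaphore, i: int) -> int:
    async with semaphore:          # at most `concurrency` jobs run at once
        await asyncio.sleep(0.01)  # stand-in for create_batch / upload work
        return i

async def main() -> None:
    semaphore = asyncio.Semaphore(20)  # mirrors the default concurrency=20
    tasks = [asyncio.create_task(start_job(semaphore, i)) for i in range(100)]
    results = await asyncio.gather(*tasks)  # results keep submission order
    assert results == list(range(100))

asyncio.run(main())
```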
parallex/utils/constants.py CHANGED
@@ -6,4 +6,4 @@ DEFAULT_PROMPT = """
  If unable to parse, return an empty string.
  """
 
- CUSTOM_ID_DELINEATOR = "--page--"
+ CUSTOM_ID_DELINEATOR = "--parallex--"
{parallex-0.2.0.dist-info → parallex-0.3.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: parallex
- Version: 0.2.0
+ Version: 0.3.0
  Summary: PDF to markdown using Azure OpenAI batch processing
  Home-page: https://github.com/Summed-AI/parallex
  Author: Jeff Hostetler
@@ -96,3 +96,28 @@ class PageResponse(BaseModel):
  """
  ```
 
+ ### Batch processing for list of prompts
+ If you do not need to process images, but just want to process prompts using the Batch API,
+ you can call:
+ ```python
+ response_data: ParallexPromptsCallableOutput = await parallex_simple_prompts(
+     model=model,
+     prompts=["Some prompt", "Some other prompt"],
+     post_process_callable=example_post_process
+ )
+ responses = response_data.responses
+ ```
+ This will create a batch that includes all the prompts in `prompts`, and each response can be tied back to its prompt by index.
+
+ Responses have the following structure:
+ ```python
+ class ParallexPromptsCallableOutput(BaseModel):
+     original_prompts: list[str] = Field(description="List of given prompts")
+     trace_id: UUID = Field(description="Unique trace for each file")
+     responses: list[PromptResponse] = Field(description="List of PromptResponse objects")
+
+ class PromptResponse(BaseModel):
+     output_content: str = Field(description="Response from the model")
+     prompt_index: int = Field(description="Index corresponding to the given prompts")
+ ```
+
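The README snippet above passes `example_post_process` without defining it. Any callable accepting an `output` keyword works, since `parallex.py` invokes `post_process_callable(output=callable_output)`; a minimal sketch of a compatible callback (the name and body are illustrative, not part of the package):

```python
# Hypothetical callback compatible with post_process_callable.
from parallex.models.parallex_prompts_callable_output import (
    ParallexPromptsCallableOutput,
)

def example_post_process(output: ParallexPromptsCallableOutput) -> None:
    # Responses arrive sorted by prompt_index, so they line up with original_prompts.
    for prompt, response in zip(output.original_prompts, output.responses):
        print(f"[{response.prompt_index}] {prompt!r} -> {response.output_content[:60]!r}")
```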
{parallex-0.2.0.dist-info → parallex-0.3.0.dist-info}/RECORD RENAMED
@@ -1,8 +1,8 @@
  parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
  parallex/ai/open_ai_client.py,sha256=TRH78oYod_EWpp3hjEh097OT7hwsQmtv44_j3X9Frxo,2047
- parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
- parallex/ai/uploader.py,sha256=92P0LxLuRgtjtD4kLtM0n8WUww_8-GImLxb3pbl-kkg,3174
+ parallex/ai/output_processor.py,sha256=Rwp8dkLo4xsqooeBh3Xv-uGVbJMG1JQkwyxdUoOs2tQ,1800
+ parallex/ai/uploader.py,sha256=9GvrzuaQAxqRiYN5dUHWjFeIFXezH0Y7ARnzBkEHbL0,5451
  parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
  parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
  parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
@@ -11,12 +11,14 @@ parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M
  parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
  parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaBbt0,228
  parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
+ parallex/models/parallex_prompts_callable_output.py,sha256=IlNX9627_E8aXWQ-vDBuv2-9jMFXqn4LFBbShPzxoc4,421
+ parallex/models/prompt_response.py,sha256=LcctuyqwiTHWrZHSahwauMaSBsin5Ws6fQRAzGXTsAA,230
  parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
  parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
- parallex/parallex.py,sha256=EkD_kZevDu0UBpRet3nsvIr826f7uBHiT0JA5hR3E8c,5117
- parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
+ parallex/parallex.py,sha256=7YFKnKOkFHoTC7CCHhrXG1JTxprbvw0QkNGOEPYJbvQ,8500
+ parallex/utils/constants.py,sha256=508ieZLZ5kse0T4_QyNJp57Aq0DMNFjjyFlsKa0xtek,366
  parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
- parallex-0.2.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
- parallex-0.2.0.dist-info/METADATA,sha256=Aq2RRlLwkXcLZ_wNXLZAsydFYJqTSe47eVhAq78oja8,3416
- parallex-0.2.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
- parallex-0.2.0.dist-info/RECORD,,
+ parallex-0.3.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
+ parallex-0.3.0.dist-info/METADATA,sha256=hIIhGrV5PE-E-lkWf-kBE3QBPevKSVRHkw0hUx_iqik,4461
+ parallex-0.3.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ parallex-0.3.0.dist-info/RECORD,,