parallex 0.2.0__tar.gz → 0.3.0__tar.gz
- {parallex-0.2.0 → parallex-0.3.0}/PKG-INFO +26 -1
- {parallex-0.2.0 → parallex-0.3.0}/README.md +25 -0
- parallex-0.3.0/parallex/ai/output_processor.py +57 -0
- parallex-0.3.0/parallex/ai/uploader.py +157 -0
- parallex-0.3.0/parallex/models/parallex_prompts_callable_output.py +13 -0
- parallex-0.3.0/parallex/models/prompt_response.py +6 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/parallex.py +102 -6
- {parallex-0.2.0 → parallex-0.3.0}/parallex/utils/constants.py +1 -1
- {parallex-0.2.0 → parallex-0.3.0}/pyproject.toml +1 -1
- parallex-0.2.0/parallex/ai/output_processor.py +0 -25
- parallex-0.2.0/parallex/ai/uploader.py +0 -91
- {parallex-0.2.0 → parallex-0.3.0}/LICENSE +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/__init__.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/ai/batch_processor.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/ai/open_ai_client.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/file_management/converter.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/file_management/file_finder.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/file_management/remote_file_handler.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/file_management/utils.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/batch_file.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/image_file.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/page_response.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/parallex_callable_output.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/raw_file.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/models/upload_batch.py +0 -0
- {parallex-0.2.0 → parallex-0.3.0}/parallex/utils/logger.py +0 -0
{parallex-0.2.0 → parallex-0.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: parallex
-Version: 0.2.0
+Version: 0.3.0
 Summary: PDF to markdown using Azure OpenAI batch processing
 Home-page: https://github.com/Summed-AI/parallex
 Author: Jeff Hostetler
@@ -96,3 +96,28 @@ class PageResponse(BaseModel):
     """
 ```
 
+### Batch processing for list of prompts
+If you do not need to process images, but just want to process prompts using the Batch API,
+you can call:
+```python
+response_data: ParallexPromptsCallableOutput = await parallex_simple_prompts(
+    model=model,
+    prompts=["Some prompt", "Some other prompt"],
+    post_process_callable=example_post_process
+)
+responses = response_data.responses
+```
+This will create a batch that includes all the prompts in `prompts`, and responses can be tied back to the prompt by index.
+
+Responses have the following structure:
+```python
+class ParallexPromptsCallableOutput(BaseModel):
+    original_prompts: list[str] = Field(description="List of given prompts")
+    trace_id: UUID = Field(description="Unique trace for each file")
+    responses: list[PromptResponse] = Field(description="List of PromptResponse objects")
+
+class PromptResponse(BaseModel):
+    output_content: str = Field(description="Response from the model")
+    prompt_index: int = Field(description="Index corresponding to the given prompts")
+```
+
{parallex-0.2.0 → parallex-0.3.0}/README.md
@@ -75,3 +75,28 @@ class PageResponse(BaseModel):
     If unable to parse, return an empty string.
     """
 ```
+
+### Batch processing for list of prompts
+If you do not need to process images, but just want to process prompts using the Batch API,
+you can call:
+```python
+response_data: ParallexPromptsCallableOutput = await parallex_simple_prompts(
+    model=model,
+    prompts=["Some prompt", "Some other prompt"],
+    post_process_callable=example_post_process
+)
+responses = response_data.responses
+```
+This will create a batch that includes all the prompts in `prompts`, and responses can be tied back to the prompt by index.
+
+Responses have the following structure:
+```python
+class ParallexPromptsCallableOutput(BaseModel):
+    original_prompts: list[str] = Field(description="List of given prompts")
+    trace_id: UUID = Field(description="Unique trace for each file")
+    responses: list[PromptResponse] = Field(description="List of PromptResponse objects")
+
+class PromptResponse(BaseModel):
+    output_content: str = Field(description="Response from the model")
+    prompt_index: int = Field(description="Index corresponding to the given prompts")
+```
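
Note: the README snippet above references an `example_post_process` callable without defining it; the name is illustrative. Since `parallex_simple_prompts` invokes it as `post_process_callable(output=callable_output)` (see the parallex.py diff below), a minimal compatible sketch looks like:

```python
from parallex.models.parallex_prompts_callable_output import ParallexPromptsCallableOutput


def example_post_process(output: ParallexPromptsCallableOutput) -> None:
    # Pair each response with its originating prompt via prompt_index.
    for response in output.responses:
        prompt = output.original_prompts[response.prompt_index]
        print(f"{prompt!r} -> {response.output_content!r}")
```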
parallex-0.3.0/parallex/ai/output_processor.py
@@ -0,0 +1,57 @@
+import json
+from typing import TypeVar, Callable
+
+from parallex.ai.open_ai_client import OpenAIClient
+from parallex.models.page_response import PageResponse
+from parallex.models.prompt_response import PromptResponse
+from parallex.utils.constants import CUSTOM_ID_DELINEATOR
+
+
+async def process_images_output(
+    client: OpenAIClient, output_file_id: str
+) -> list[PageResponse]:
+    return await _process_output(
+        client,
+        output_file_id,
+        lambda content, identifier: PageResponse(
+            output_content=content, page_number=int(identifier)
+        ),
+    )
+
+
+async def process_prompts_output(
+    client: OpenAIClient, output_file_id: str
+) -> list[PromptResponse]:
+    """Gets content from completed Batch to create PromptResponse with LLM answers to given prompts"""
+    return await _process_output(
+        client,
+        output_file_id,
+        lambda content, identifier: PromptResponse(
+            output_content=content, prompt_index=int(identifier)
+        ),
+    )
+
+
+ResponseType = TypeVar("ResponseType")
+
+
+async def _process_output(
+    client: OpenAIClient,
+    output_file_id: str,
+    response_builder: Callable[[str, str], ResponseType],
+) -> list[ResponseType]:
+    file_response = await client.retrieve_file(output_file_id)
+    raw_responses = file_response.text.strip().split("\n")
+    responses = []
+
+    for raw_response in raw_responses:
+        json_response = json.loads(raw_response)
+        custom_id = json_response["custom_id"]
+        identifier = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
+        output_content = json_response["response"]["body"]["choices"][0]["message"][
+            "content"
+        ]
+        response = response_builder(output_content, identifier)
+        responses.append(response)
+
+    return responses
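For context on what `_process_output` consumes: each line of the Batch output file is a JSON object wrapping one chat completion, keyed by the `custom_id` written at upload time. A minimal sketch of the parsing path, assuming a hypothetical delineator value of `"--"` (the real value lives in `parallex.utils.constants.CUSTOM_ID_DELINEATOR` and is not shown in this diff):

```python
import json

CUSTOM_ID_DELINEATOR = "--"  # hypothetical value, for illustration only

# One trimmed-down line of an Azure OpenAI Batch output file.
raw_response = json.dumps({
    "custom_id": f"3f2a9c{CUSTOM_ID_DELINEATOR}4.jsonl",
    "response": {"body": {"choices": [{"message": {"content": "## Page 4"}}]}},
})

json_response = json.loads(raw_response)
identifier = json_response["custom_id"].split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
content = json_response["response"]["body"]["choices"][0]["message"]["content"]
assert identifier == "4" and content == "## Page 4"
```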
parallex-0.3.0/parallex/ai/uploader.py
@@ -0,0 +1,157 @@
+import base64
+import json
+import os
+from uuid import UUID
+
+from parallex.ai.open_ai_client import OpenAIClient
+from parallex.file_management.utils import file_in_temp_dir
+from parallex.models.batch_file import BatchFile
+from parallex.models.image_file import ImageFile
+from parallex.utils.constants import CUSTOM_ID_DELINEATOR
+
+MAX_FILE_SIZE = 180 * 1024 * 1024  # 180 MB in bytes. Limit for Azure is 200MB.
+
+
+async def upload_images_for_processing(
+    client: OpenAIClient,
+    image_files: list[ImageFile],
+    temp_directory: str,
+    prompt_text: str,
+) -> list[BatchFile]:
+    """Base64 encodes image, converts to expected jsonl format and uploads"""
+    trace_id = image_files[0].trace_id
+    current_index = 0
+    batch_files = []
+    upload_file_location = file_in_temp_dir(
+        directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
+    )
+
+    for image_file in image_files:
+        if await _approaching_file_size_limit(upload_file_location):
+            """When approaching upload file limit, upload and start new file"""
+            batch_file = await _create_batch_file(
+                client, trace_id, upload_file_location
+            )
+            batch_files.append(batch_file)
+            upload_file_location = await _increment_batch_file_index(
+                current_index, temp_directory, trace_id
+            )
+
+        with open(image_file.path, "rb") as image:
+            base64_encoded_image = base64.b64encode(image.read()).decode("utf-8")
+
+        prompt_custom_id = (
+            f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
+        )
+        jsonl = _image_jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
+        with open(upload_file_location, "a") as jsonl_file:
+            jsonl_file.write(json.dumps(jsonl) + "\n")
+    batch_file = await _create_batch_file(client, trace_id, upload_file_location)
+    batch_files.append(batch_file)
+    return batch_files
+
+
+async def upload_prompts_for_processing(
+    client: OpenAIClient, prompts: list[str], temp_directory: str, trace_id: UUID
+) -> list[BatchFile]:
+    """Creates jsonl file and uploads for processing"""
+    current_index = 0
+    batch_files = []
+
+    upload_file_location = await set_file_location(
+        current_index, temp_directory, trace_id
+    )
+    for index, prompt in enumerate(prompts):
+        if await _approaching_file_size_limit(upload_file_location):
+            """When approaching upload file limit, upload and start new file"""
+            batch_file = await _create_batch_file(
+                client, trace_id, upload_file_location
+            )
+            batch_files.append(batch_file)
+            upload_file_location = await _increment_batch_file_index(
+                current_index, temp_directory, trace_id
+            )
+
+        prompt_custom_id = f"{trace_id}{CUSTOM_ID_DELINEATOR}{index}.jsonl"
+        jsonl = _simple_jsonl_format(prompt_custom_id, prompt)
+        with open(upload_file_location, "a") as jsonl_file:
+            jsonl_file.write(json.dumps(jsonl) + "\n")
+    batch_file = await _create_batch_file(client, trace_id, upload_file_location)
+    batch_files.append(batch_file)
+    return batch_files
+
+
+async def set_file_location(
+    current_index: int, temp_directory: str, trace_id: UUID
+) -> str:
+    return file_in_temp_dir(
+        directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
+    )
+
+
+async def _approaching_file_size_limit(upload_file_location: str) -> bool:
+    return (
+        os.path.exists(upload_file_location)
+        and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
+    )
+
+
+async def _increment_batch_file_index(
+    current_index: int, temp_directory: str, trace_id: UUID
+) -> str:
+    current_index += 1
+    upload_file_location = await set_file_location(
+        current_index, temp_directory, trace_id
+    )
+    return upload_file_location
+
+
+async def _create_batch_file(
+    client: OpenAIClient, trace_id: UUID, upload_file_location: str
+) -> BatchFile:
+    file_response = await client.upload(upload_file_location)
+    return BatchFile(
+        id=file_response.id,
+        name=file_response.filename,
+        purpose=file_response.purpose,
+        status=file_response.status,
+        trace_id=trace_id,
+    )
+
+
+def _simple_jsonl_format(prompt_custom_id: str, prompt_text: str) -> dict:
+    return {
+        "custom_id": prompt_custom_id,
+        "method": "POST",
+        "url": "/chat/completions",
+        "body": {
+            "model": os.getenv("AZURE_API_DEPLOYMENT"),
+            "messages": [{"role": "user", "content": prompt_text}],
+        },
+    }
+
+
+def _image_jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
+    return {
+        "custom_id": prompt_custom_id,
+        "method": "POST",
+        "url": "/chat/completions",
+        "body": {
+            "model": os.getenv("AZURE_API_DEPLOYMENT"),
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt_text},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{encoded_image}"
+                            },
+                        },
+                    ],
+                }
+            ],
+            "max_tokens": 2000,
+        },
+    }
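Each line appended by `upload_prompts_for_processing` is a self-contained Batch API request. A sketch of one serialized line as `_simple_jsonl_format` would produce it, assuming an illustrative deployment name in `AZURE_API_DEPLOYMENT` and a hypothetical custom id:

```python
import json
import os

os.environ.setdefault("AZURE_API_DEPLOYMENT", "gpt-4o-batch")  # illustrative name

line = {
    "custom_id": "3f2a9c--0.jsonl",  # hypothetical trace id, delineator, and index
    "method": "POST",
    "url": "/chat/completions",
    "body": {
        "model": os.getenv("AZURE_API_DEPLOYMENT"),
        "messages": [{"role": "user", "content": "Some prompt"}],
    },
}
print(json.dumps(line))  # written as one line of the .jsonl upload file
```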
parallex-0.3.0/parallex/models/parallex_prompts_callable_output.py
@@ -0,0 +1,13 @@
+from uuid import UUID
+
+from pydantic import BaseModel, Field
+
+from parallex.models.prompt_response import PromptResponse
+
+
+class ParallexPromptsCallableOutput(BaseModel):
+    original_prompts: list[str] = Field(description="List of given prompts")
+    trace_id: UUID = Field(description="Unique trace for each file")
+    responses: list[PromptResponse] = Field(
+        description="List of PromptResponse objects"
+    )
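A quick sketch of how this model composes with `PromptResponse` (values illustrative):

```python
import uuid

from parallex.models.parallex_prompts_callable_output import (
    ParallexPromptsCallableOutput,
)
from parallex.models.prompt_response import PromptResponse

output = ParallexPromptsCallableOutput(
    original_prompts=["Some prompt", "Some other prompt"],
    trace_id=uuid.uuid4(),
    responses=[
        PromptResponse(output_content="Answer one", prompt_index=0),
        PromptResponse(output_content="Answer two", prompt_index=1),
    ],
)
# prompt_index maps each response back to its position in original_prompts.
assert output.original_prompts[output.responses[1].prompt_index] == "Some other prompt"
```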
{parallex-0.2.0 → parallex-0.3.0}/parallex/parallex.py
@@ -1,17 +1,24 @@
 import asyncio
 import tempfile
+import uuid
 from typing import Callable, Optional
 from uuid import UUID
 
 from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
 from parallex.ai.open_ai_client import OpenAIClient
-from parallex.ai.output_processor import process_output
-from parallex.ai.uploader import upload_images_for_processing
+from parallex.ai.output_processor import process_images_output, process_prompts_output
+from parallex.ai.uploader import (
+    upload_images_for_processing,
+    upload_prompts_for_processing,
+)
 from parallex.file_management.converter import convert_pdf_to_images
 from parallex.file_management.file_finder import add_file_to_temp_directory
 from parallex.file_management.remote_file_handler import RemoteFileHandler
 from parallex.models.batch_file import BatchFile
 from parallex.models.parallex_callable_output import ParallexCallableOutput
+from parallex.models.parallex_prompts_callable_output import (
+    ParallexPromptsCallableOutput,
+)
 from parallex.models.upload_batch import UploadBatch
 from parallex.utils.constants import DEFAULT_PROMPT
 from parallex.utils.logger import logger, setup_logger
@@ -40,9 +47,92 @@ async def parallex(
     except Exception as e:
         logger.error(f"Error occurred: {e}")
     finally:
-        for file in remote_file_handler.created_files:
-            logger.info(f"deleting - {file}")
-            await open_ai_client.delete_file(file)
+        await _delete_associated_files(open_ai_client, remote_file_handler)
+
+
+async def parallex_simple_prompts(
+    model: str,
+    prompts: list[str],
+    post_process_callable: Optional[Callable[..., None]] = None,
+    log_level: Optional[str] = "ERROR",
+    concurrency: Optional[int] = 20,
+) -> ParallexPromptsCallableOutput:
+    setup_logger(log_level)
+    remote_file_handler = RemoteFileHandler()
+    open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
+    try:
+        return await _prompts_execute(
+            open_ai_client=open_ai_client,
+            prompts=prompts,
+            post_process_callable=post_process_callable,
+            concurrency=concurrency,
+        )
+    except Exception as e:
+        logger.error(f"Error occurred: {e}")
+    finally:
+        await _delete_associated_files(open_ai_client, remote_file_handler)
+
+
+async def _prompts_execute(
+    open_ai_client: OpenAIClient,
+    prompts: list[str],
+    post_process_callable: Optional[Callable[..., None]] = None,
+    concurrency: Optional[int] = 20,
+):
+    with tempfile.TemporaryDirectory() as temp_directory:
+        trace_id = uuid.uuid4()
+        batch_files = await upload_prompts_for_processing(
+            client=open_ai_client,
+            prompts=prompts,
+            temp_directory=temp_directory,
+            trace_id=trace_id,
+        )
+        start_batch_semaphore = asyncio.Semaphore(concurrency)
+        start_batch_tasks = []
+        for file in batch_files:
+            batch_task = asyncio.create_task(
+                _create_batch_jobs(
+                    batch_file=file,
+                    client=open_ai_client,
+                    trace_id=trace_id,
+                    semaphore=start_batch_semaphore,
+                )
+            )
+            start_batch_tasks.append(batch_task)
+        batch_jobs = await asyncio.gather(*start_batch_tasks)
+
+        prompt_tasks = []
+        for batch in batch_jobs:
+            logger.info(
+                f"waiting for batch to complete - {batch.id} - {batch.trace_id}"
+            )
+            page_task = asyncio.create_task(
+                wait_for_batch_completion(client=open_ai_client, batch=batch)
+            )
+            prompt_tasks.append(page_task)
+
+        output_file_ids = await asyncio.gather(*prompt_tasks)
+
+        prompts_output = []
+        for output_file_id in output_file_ids:
+            logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
+            prompts_output.append(
+                await process_prompts_output(
+                    client=open_ai_client, output_file_id=output_file_id
+                )
+            )
+
+        flat_prompts = [page for batch in prompts_output for page in batch]
+
+        sorted_responses = sorted(flat_prompts, key=lambda x: x.prompt_index)
+        callable_output = ParallexPromptsCallableOutput(
+            original_prompts=prompts,
+            trace_id=trace_id,
+            responses=sorted_responses,
+        )
+        if post_process_callable is not None:
+            post_process_callable(output=callable_output)
+        return callable_output
 
 
 async def _execute(
@@ -115,7 +205,7 @@ async def _wait_and_create_pages(
     logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
     output_file_id = await wait_for_batch_completion(client=client, batch=batch)
     logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
-    page_responses = await process_output(
+    page_responses = await process_images_output(
         client=client, output_file_id=output_file_id
     )
     return page_responses
@@ -132,3 +222,9 @@ async def _create_batch_jobs(
         client=client, file_id=batch_file.id, trace_id=trace_id
     )
     return upload_batch
+
+
+async def _delete_associated_files(open_ai_client, remote_file_handler):
+    for file in remote_file_handler.created_files:
+        logger.info(f"deleting - {file}")
+        await open_ai_client.delete_file(file)
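Putting the new prompt path together, a minimal driver might look like the following; the deployment/model name and prompts are illustrative, and Azure credentials are assumed to be configured in the environment as `OpenAIClient` expects:

```python
import asyncio

from parallex.parallex import parallex_simple_prompts


def example_post_process(output) -> None:
    # Invoked as post_process_callable(output=...) once all batches complete.
    for response in output.responses:
        print(response.prompt_index, response.output_content)


async def main() -> None:
    result = await parallex_simple_prompts(
        model="gpt-4o-batch",  # illustrative Azure deployment name
        prompts=["Some prompt", "Some other prompt"],
        post_process_callable=example_post_process,
    )
    print(result.trace_id)


asyncio.run(main())
```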
{parallex-0.2.0 → parallex-0.3.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "parallex"
-version = "0.2.0"
+version = "0.3.0"
 description = "PDF to markdown using Azure OpenAI batch processing"
 authors = ["Jeff Hostetler <jeff@summed.ai>", "Kevin Bao <kevin@summed.ai>"]
 repository = "https://github.com/Summed-AI/parallex"
parallex-0.2.0/parallex/ai/output_processor.py
@@ -1,25 +0,0 @@
-import json
-
-from parallex.ai.open_ai_client import OpenAIClient
-from parallex.models.page_response import PageResponse
-from parallex.utils.constants import CUSTOM_ID_DELINEATOR
-
-
-async def process_output(
-    client: OpenAIClient, output_file_id: str
-) -> list[PageResponse]:
-    """Gets content from completed Batch to create PageResponse with created markdown"""
-    file_response = await client.retrieve_file(output_file_id)
-    raw_responses = file_response.text.strip().split("\n")
-
-    pages = []
-    for raw_response in raw_responses:
-        json_response = json.loads(raw_response)
-        custom_id = json_response["custom_id"]
-        page_number = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
-        output_content = json_response["response"]["body"]["choices"][0]["message"][
-            "content"
-        ]
-        page = PageResponse(output_content=output_content, page_number=int(page_number))
-        pages.append(page)
-    return pages
parallex-0.2.0/parallex/ai/uploader.py
@@ -1,91 +0,0 @@
-import base64
-import json
-import os
-
-from parallex.ai.open_ai_client import OpenAIClient
-from parallex.file_management.utils import file_in_temp_dir
-from parallex.models.batch_file import BatchFile
-from parallex.models.image_file import ImageFile
-from parallex.utils.constants import CUSTOM_ID_DELINEATOR
-
-MAX_FILE_SIZE = 150 * 1024 * 1024  # 150 MB in bytes
-
-
-async def upload_images_for_processing(
-    client: OpenAIClient,
-    image_files: list[ImageFile],
-    temp_directory: str,
-    prompt_text: str,
-) -> list[BatchFile]:
-    """Base64 encodes image, converts to expected jsonl format and uploads"""
-    trace_id = image_files[0].trace_id
-    current_index = 0
-    batch_files = []
-    upload_file_location = file_in_temp_dir(
-        directory=temp_directory, file_name=f"image-{trace_id}-{current_index}.jsonl"
-    )
-
-    for image_file in image_files:
-        if (
-            os.path.exists(upload_file_location)
-            and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
-        ):
-            """When approaching upload file limit, upload and start new file"""
-            batch_file = await _create_batch_file(
-                client, trace_id, upload_file_location
-            )
-            batch_files.append(batch_file)
-            current_index += 1
-            upload_file_location = file_in_temp_dir(
-                directory=temp_directory, file_name=f"{trace_id}-{current_index}.jsonl"
-            )
-
-        with open(image_file.path, "rb") as image:
-            base64_encoded_image = base64.b64encode(image.read()).decode("utf-8")
-
-        prompt_custom_id = (
-            f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
-        )
-        jsonl = _jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
-        with open(upload_file_location, "a") as jsonl_file:
-            jsonl_file.write(json.dumps(jsonl) + "\n")
-    batch_file = await _create_batch_file(client, trace_id, upload_file_location)
-    batch_files.append(batch_file)
-    return batch_files
-
-
-async def _create_batch_file(client, trace_id, upload_file_location):
-    file_response = await client.upload(upload_file_location)
-    return BatchFile(
-        id=file_response.id,
-        name=file_response.filename,
-        purpose=file_response.purpose,
-        status=file_response.status,
-        trace_id=trace_id,
-    )
-
-
-def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
-    return {
-        "custom_id": prompt_custom_id,
-        "method": "POST",
-        "url": "/chat/completions",
-        "body": {
-            "model": os.getenv("AZURE_API_DEPLOYMENT"),
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt_text},
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/png;base64,{encoded_image}"
-                            },
-                        },
-                    ],
-                }
-            ],
-            "max_tokens": 2000,
-        },
-    }