parallex 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
parallex/__init__.py ADDED
File without changes
@@ -0,0 +1,42 @@
1
+ import asyncio
2
+ from uuid import UUID
3
+
4
+ from openai import BadRequestError
5
+
6
+ from parallex.ai.open_ai_client import OpenAIClient
7
+ from parallex.models.upload_batch import build_batch, UploadBatch
8
+
9
+
10
async def create_batch(
    client: OpenAIClient,
    file_id: str,
    trace_id: UUID,
    max_retries: int = 10,
    backoff_delay: float = 5,
) -> UploadBatch:
    """Create a Batch for an uploaded file, retrying on ``BadRequestError``.

    Args:
        client: Client used to issue the batch-creation request.
        file_id: ID of the previously uploaded input file.
        trace_id: Trace identifier stored on the returned ``UploadBatch``.
        max_retries: Total number of attempts before giving up (at least 1).
        backoff_delay: Seconds to wait after the first failed attempt; the
            wait doubles after every further failure.

    Returns:
        The ``UploadBatch`` built from the API response.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        BadRequestError: If the final attempt is still rejected.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    delay = backoff_delay
    for attempt in range(1, max_retries + 1):
        try:
            batch_response = await client.create_batch(upload_file_id=file_id)
        except BadRequestError:
            # NOTE(review): presumably the service rejects the request while
            # the uploaded file is still being processed - confirm.
            if attempt == max_retries:
                raise  # bare raise keeps the original traceback
            await asyncio.sleep(delay)
            delay *= 2
        else:
            return build_batch(open_ai_batch=batch_response, trace_id=trace_id)
27
+
28
+
29
+ # TODO handle errors
30
async def wait_for_batch_completion(
    client: OpenAIClient, batch: UploadBatch
) -> str | None:
    """Poll a Batch until it reaches a terminal status.

    The first poll happens after 5 seconds and every later one after 30
    seconds. On each poll ``batch.output_file_id`` and ``batch.error_file_id``
    are refreshed from the API response.

    Args:
        client: Client used to retrieve the batch.
        batch: The batch to wait for; mutated in place as described above.

    Returns:
        The output file id when the batch completed, otherwise ``None``
        (the batch failed, expired or was cancelled).
    """
    # The Batch API spells the status "cancelled" and also reports "expired";
    # without them the loop would never terminate for such batches.
    # "canceled" is kept so the previously accepted spelling still works.
    terminal_statuses = ("completed", "failed", "expired", "cancelled", "canceled")

    status = "validating"
    delay = 5
    while status not in terminal_statuses:
        await asyncio.sleep(delay)
        batch_response = await client.retrieve_batch(batch.id)
        status = batch_response.status
        batch.output_file_id = batch_response.output_file_id
        batch.error_file_id = batch_response.error_file_id
        delay = 30

    if status == "completed":
        return batch_response.output_file_id
    return None
@@ -0,0 +1,43 @@
1
+ import os
2
+
3
+ from openai import AsyncAzureOpenAI
4
+ from openai._legacy_response import HttpxBinaryResponseContent
5
+ from openai.types import FileObject, Batch, FileDeleted
6
+
7
+ from parallex.utils.logger import logger
8
+
9
+
10
+ # Exceptions for missing keys, etc
11
class OpenAIClient:
    """Async wrapper around ``AsyncAzureOpenAI`` for file and batch calls.

    The endpoint, API key and API version are read from the
    ``AZURE_OPENAI_ENDPOINT``, ``AZURE_OPENAI_API_KEY`` and
    ``AZURE_OPENAI_API_VERSION`` environment variables.
    """

    def __init__(self, model: str):
        """Store the model name and build the underlying Azure client."""
        self.model = model

        self._client = AsyncAzureOpenAI(
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        )

    async def upload(self, file_path: str) -> FileObject:
        """Upload the file at ``file_path`` with purpose ``"batch"``."""
        # Context manager so the handle is closed once the request finished,
        # including when the upload raises.
        with open(file_path, "rb") as upload_file:
            return await self._client.files.create(
                file=upload_file, purpose="batch"
            )

    async def create_batch(self, upload_file_id: str) -> Batch:
        """Create a ``/chat/completions`` batch with a 24h completion window."""
        return await self._client.batches.create(
            input_file_id=upload_file_id,
            endpoint="/chat/completions",
            completion_window="24h",
        )

    async def retrieve_batch(self, batch_id: str) -> Batch:
        """Fetch the current state of the batch with the given id."""
        return await self._client.batches.retrieve(batch_id)

    async def retrieve_file(self, file_id: str) -> HttpxBinaryResponseContent:
        """Fetch the content of the file with the given id."""
        return await self._client.files.content(file_id)

    async def delete_file(self, file_id: str) -> FileDeleted | None:
        """Delete a file on a best-effort basis.

        Returns the deletion result, or ``None`` when the deletion failed;
        failures are logged and never raised.
        """
        try:
            return await self._client.files.delete(file_id)
        except Exception as e:
            logger.info(f"Did not delete file: {e}")
            return None
@@ -0,0 +1,25 @@
1
+ import json
2
+
3
+ from parallex.ai.open_ai_client import OpenAIClient
4
+ from parallex.models.page_response import PageResponse
5
+ from parallex.utils.constants import CUSTOM_ID_DELINEATOR
6
+
7
+
8
async def process_output(
    client: OpenAIClient, output_file_id: str
) -> list[PageResponse]:
    """Turn the output file of a completed Batch into ``PageResponse`` objects.

    Every non-blank line of the file is parsed as one JSON response. The page
    number is taken from the part of ``custom_id`` between
    ``CUSTOM_ID_DELINEATOR`` and the first following ``"."``; the content is
    the message content of the first choice.

    Args:
        client: Client used to download the output file.
        output_file_id: ID of the batch output file.

    Returns:
        One ``PageResponse`` per response line, in file order.
    """
    file_response = await client.retrieve_file(output_file_id)

    pages = []
    # Split on "\n" only: JSON string values may legally contain other
    # Unicode line separators that ``str.splitlines`` would break on.
    for raw_response in file_response.text.split("\n"):
        if not raw_response.strip():
            # An empty file or stray blank line is not a response; feeding it
            # to json.loads would raise.
            continue
        json_response = json.loads(raw_response)
        custom_id = json_response["custom_id"]
        page_number = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
        output_content = json_response["response"]["body"]["choices"][0]["message"][
            "content"
        ]
        pages.append(
            PageResponse(output_content=output_content, page_number=int(page_number))
        )
    return pages
@@ -0,0 +1,91 @@
1
+ import base64
2
+ import json
3
+ import os
4
+
5
+ from parallex.ai.open_ai_client import OpenAIClient
6
+ from parallex.file_management.utils import file_in_temp_dir
7
+ from parallex.models.batch_file import BatchFile
8
+ from parallex.models.image_file import ImageFile
9
+ from parallex.utils.constants import CUSTOM_ID_DELINEATOR
10
+
11
+ MAX_FILE_SIZE = 150 * 1024 * 1024 # 150 MB in bytes
12
+
13
+
14
async def upload_images_for_processing(
    client: OpenAIClient,
    image_files: list[ImageFile],
    temp_directory: str,
    prompt_text: str,
) -> list[BatchFile]:
    """Base64-encode page images into JSONL request files and upload them.

    One JSONL line is written per image. Once the current JSONL file has grown
    beyond ``MAX_FILE_SIZE`` it is uploaded and a new file is started, so a
    large document yields several uploads.

    Args:
        client: Client used to upload the JSONL files.
        image_files: Page images to encode; all are uploaded under the trace
            id of the first image.
        temp_directory: Directory the JSONL files are written to.
        prompt_text: Prompt sent along with every image.

    Returns:
        One ``BatchFile`` per uploaded JSONL file; an empty list when there
        are no images.
    """
    if not image_files:
        # Nothing to encode: avoid indexing an empty list and uploading a
        # file that was never written.
        return []

    trace_id = image_files[0].trace_id

    def _upload_path(index: int) -> str:
        # Single naming scheme for the first and every rollover file.
        return file_in_temp_dir(
            directory=temp_directory, file_name=f"image-{trace_id}-{index}.jsonl"
        )

    current_index = 0
    batch_files = []
    upload_file_location = _upload_path(current_index)

    for image_file in image_files:
        if (
            os.path.exists(upload_file_location)
            and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
        ):
            # When approaching the upload file limit, upload and start a new file.
            batch_file = await _create_batch_file(
                client, trace_id, upload_file_location
            )
            batch_files.append(batch_file)
            current_index += 1
            upload_file_location = _upload_path(current_index)

        with open(image_file.path, "rb") as image:
            base64_encoded_image = base64.b64encode(image.read()).decode("utf-8")

        prompt_custom_id = (
            f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
        )
        jsonl = _jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
        with open(upload_file_location, "a") as jsonl_file:
            jsonl_file.write(json.dumps(jsonl) + "\n")

    # The last (or only) file always holds at least one request at this point.
    batch_file = await _create_batch_file(client, trace_id, upload_file_location)
    batch_files.append(batch_file)
    return batch_files
55
+
56
+
57
async def _create_batch_file(client, trace_id, upload_file_location):
    """Upload one JSONL file and describe the uploaded file as a ``BatchFile``.

    The id, filename, purpose and status are copied from the upload response;
    ``trace_id`` is attached unchanged.
    """
    uploaded = await client.upload(upload_file_location)
    batch_file = BatchFile(
        trace_id=trace_id,
        id=uploaded.id,
        name=uploaded.filename,
        purpose=uploaded.purpose,
        status=uploaded.status,
    )
    return batch_file
66
+
67
+
68
def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
    """Build the request dict for one image that becomes one JSONL line.

    The model is read from the ``AZURE_OPENAI_API_DEPLOYMENT`` environment
    variable. The single user message carries the prompt text followed by the
    image as a base64 PNG data URL.
    """
    text_part = {"type": "text", "text": prompt_text}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
    }
    user_message = {"role": "user", "content": [text_part, image_part]}

    request_body = {
        "model": os.getenv("AZURE_OPENAI_API_DEPLOYMENT"),
        "messages": [user_message],
        "max_tokens": 2000,
    }
    return {
        "custom_id": prompt_custom_id,
        "method": "POST",
        "url": "/chat/completions",
        "body": request_body,
    }
@@ -0,0 +1,37 @@
1
+ import asyncio
2
+
3
+ from pdf2image import convert_from_path
4
+
5
+ from parallex.models.image_file import ImageFile
6
+ from parallex.models.raw_file import RawFile
7
+ from parallex.utils.logger import logger
8
+
9
+
10
async def convert_pdf_to_images(
    raw_file: RawFile, temp_directory: str
) -> list[ImageFile]:
    """Convert a PDF into PNG images written to ``temp_directory``.

    The conversion runs ``pdf2image.convert_from_path`` in a worker thread so
    the event loop is not blocked.

    Args:
        raw_file: The downloaded PDF; its path, trace id and given name are used.
        temp_directory: Folder the generated images are written to.

    Returns:
        One ``ImageFile`` per generated image path, numbered from 1 in the
        order the paths are returned.

    Raises:
        Exception: Any conversion error is logged and then re-raised.
    """
    options = {
        "pdf_path": raw_file.path,
        "output_folder": temp_directory,
        "dpi": 300,
        "fmt": "png",
        "size": (None, 1056),
        "thread_count": 4,
        "use_pdftocairo": True,
        "paths_only": True,  # return file paths instead of loaded images
    }

    try:
        image_paths = await asyncio.to_thread(convert_from_path, **options)
        return [
            ImageFile(
                path=path,
                trace_id=raw_file.trace_id,
                given_file_name=raw_file.given_name,
                page_number=page_number,
            )
            for page_number, path in enumerate(image_paths, start=1)
        ]
    except Exception as err:
        logger.error(f"Error converting PDF to images: {err}")
        # Re-raise instead of implicitly returning None, which violated the
        # declared return type and made callers fail later with an unrelated
        # error.
        raise
@@ -0,0 +1,44 @@
1
+ import uuid
2
+
3
+ import httpx
4
+
5
+ from parallex.file_management.utils import file_in_temp_dir
6
+ from parallex.models.raw_file import RawFile
7
+
8
+
9
+ # TODO get from URL or from file system
10
async def add_file_to_temp_directory(
    pdf_source_url: str, temp_directory: str
) -> RawFile:
    """Download the file at ``pdf_source_url`` into ``temp_directory``.

    The response body is streamed to disk chunk by chunk. The stored file is
    named after a freshly generated trace id (see ``_determine_file_name``).

    Args:
        pdf_source_url: URL of the file to download.
        temp_directory: Directory the download is written to.

    Returns:
        A ``RawFile`` describing the downloaded file.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        ValueError: If the response is not served as ``application/pdf``.
    """
    from urllib.parse import urlsplit

    # Use only the URL path so a query string or fragment (e.g. on signed
    # URLs) does not end up in the file name and extension.
    given_file_name = urlsplit(pdf_source_url).path.split("/")[-1]
    file_trace_id = uuid.uuid4()
    async with httpx.AsyncClient() as client:
        async with client.stream("GET", pdf_source_url) as response:
            response.raise_for_status()  # Check for HTTP errors
            content_type = response.headers.get("Content-Type")
            file_name = _determine_file_name(
                given_file_name, file_trace_id, content_type
            )
            path = file_in_temp_dir(temp_directory, file_name)
            with open(path, "wb") as file:
                async for chunk in response.aiter_bytes():
                    file.write(chunk)

    return RawFile(
        name=file_name,
        path=path,
        content_type=content_type,
        given_name=given_file_name,
        pdf_source_url=pdf_source_url,
        trace_id=file_trace_id,
    )
36
+
37
+
38
def _determine_file_name(given_file_name: str, file_trace_id, content_type: str):
    """Return the name the download is stored under: ``<trace id>.<extension>``.

    Args:
        given_file_name: Original file name; only its extension is reused.
        file_trace_id: Trace id that becomes the stem of the stored name.
        content_type: ``Content-Type`` header of the download; may be missing.

    Returns:
        The trace id followed by the extension of ``given_file_name``, or by
        ``pdf`` when the given name has no extension.

    Raises:
        ValueError: If the content type is missing or not ``application/pdf``.
    """
    # TODO custom errors
    # TODO other types besides pdf
    if not content_type or "application/pdf" not in content_type:
        raise ValueError("Content-Type must be application/pdf")

    # rpartition copes with names containing several dots ("a.b.pdf") or none,
    # where unpacking the result of split(".") raised.
    _, dot, extension = given_file_name.rpartition(".")
    if not dot or not extension:
        # The content type was verified above, so "pdf" is a safe default.
        extension = "pdf"
    return f"{file_trace_id}.{extension}"
@@ -0,0 +1,2 @@
1
def file_in_temp_dir(directory: str, file_name: str):
    """Return ``directory`` and ``file_name`` joined by a single ``/``."""
    separator = "/"
    parts = (directory, file_name)
    return separator.join(parts)
@@ -0,0 +1,11 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class BatchFile(BaseModel):
    """Metadata of a JSONL request file that was uploaded for batch processing.

    Instances are built from the upload response of the file (id, filename,
    purpose, status), not from a batch.
    """

    id: str = Field(description="ID of the uploaded OpenAI file")
    name: str = Field(description="Name of file batch was created with")
    purpose: str = Field(description="Purpose of the file ('batch')")
    status: str = Field(description="Status of the uploaded file")
    trace_id: UUID = Field(description="Unique trace for each file")
@@ -0,0 +1,11 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel
4
+ from pydantic.fields import Field
5
+
6
+
7
class ImageFile(BaseModel):
    """One page of a source PDF rendered as an image in the temp directory."""

    # Location of the rendered image on disk.
    path: str = Field(
        description="Path to the image in temp directory",
    )
    # Page of the PDF this image was rendered from.
    page_number: int = Field(
        description="Associated page of the PDF",
    )
    # File name as originally given for the source file.
    given_file_name: str = Field(
        description="Name of the given file",
    )
    # Trace id shared by everything derived from the same source file.
    trace_id: UUID = Field(
        description="Unique trace for each file",
    )
@@ -0,0 +1,6 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
class PageResponse(BaseModel):
    """Generated markdown for a single page of the source PDF."""

    # Text returned by the model for the page.
    output_content: str = Field(
        description="Markdown generated for the page",
    )
    # Page the content belongs to.
    page_number: int = Field(
        description="Page number of the associated PDF",
    )
@@ -0,0 +1,12 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from parallex.models.page_response import PageResponse
6
+
7
+
8
class ParallexCallableOutput(BaseModel):
    """Result of processing one source file: its identity plus all page responses."""

    # Name of the processed file.
    file_name: str = Field(
        description="Name of file that is processed",
    )
    # URL the file was fetched from.
    pdf_source_url: str = Field(
        description="Given URL of the source of output",
    )
    # Trace id of the processed file.
    trace_id: UUID = Field(
        description="Unique trace for each file",
    )
    # Page responses collected for the file.
    pages: list[PageResponse] = Field(
        description="List of PageResponse objects",
    )
@@ -0,0 +1,12 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class RawFile(BaseModel):
    """A source file that has been placed in the temp directory."""

    # Name the file is stored under.
    name: str = Field(
        description="Name of the file given by Parallex",
    )
    # Where the file lives on disk.
    path: str = Field(
        description="Path to file in temp directory",
    )
    # Content type reported for the file.
    content_type: str = Field(
        description="Given file type",
    )
    # Name the file had at its source.
    given_name: str = Field(
        description="Name of file given",
    )
    # Where the file came from.
    pdf_source_url: str = Field(
        description="Source of file",
    )
    # Trace id generated for this file.
    trace_id: UUID = Field(
        description="Unique trace for each file",
    )
@@ -0,0 +1,45 @@
1
+ from typing import Optional
2
+ from uuid import UUID
3
+
4
+ from openai.types.batch import Errors, Batch
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class UploadBatch(BaseModel):
    """Snapshot of an OpenAI Batch together with the trace id of its source file.

    Apart from ``trace_id``, the field names match attributes of
    ``openai.types.batch.Batch`` so that ``build_batch`` can copy them by name.
    """

    trace_id: UUID = Field(description="Unique trace for each file")
    id: str = Field(description="ID of the OpenAI Batch")
    completion_window: str = Field(description="When batch can complete (24hrs)")
    created_at: int = Field(description="When batch was created")
    endpoint: str = Field(description="Endpoint used for retrieval")
    input_file_id: str = Field(description="File that is input to batch")
    output_file_id: Optional[str] = Field(
        None, description="File that is output when batch completes"
    )
    status: str = Field(description="Current status of the batch")
    cancelled_at: Optional[int] = Field(None, description="When batch cancelled")
    cancelling_at: Optional[int] = Field(
        None, description="When batch started cancelling"
    )
    completed_at: Optional[int] = Field(None, description="When batch completed")
    expired_at: Optional[int] = Field(None, description="When batch expired")
    expires_at: Optional[int] = Field(None, description="When batch expires")
    failed_at: Optional[int] = Field(None, description="When batch failed")
    finalizing_at: Optional[int] = Field(
        None, description="When batch started finalizing"
    )
    in_progress_at: Optional[int] = Field(
        None, description="When batch started processing"
    )
    error_file_id: Optional[str] = Field(
        None, description="File that is created during error of batch"
    )
    errors: Optional[Errors] = Field(None, description="List of errors")
38
+
39
+
40
def build_batch(open_ai_batch: Batch, trace_id: UUID) -> UploadBatch:
    """Copy an OpenAI ``Batch`` into an ``UploadBatch`` tagged with ``trace_id``.

    Every field declared on ``UploadBatch`` is read from the attribute of the
    same name on ``open_ai_batch`` (``None`` when the attribute is absent);
    ``trace_id`` is then set from the argument.
    """
    values = {}
    for field_name in UploadBatch.model_fields:
        values[field_name] = getattr(open_ai_batch, field_name, None)
    values["trace_id"] = trace_id
    return UploadBatch(**values)
parallex/parallex.py ADDED
@@ -0,0 +1,117 @@
1
+ import asyncio
2
+ import tempfile
3
+ from typing import Callable, Optional
4
+ from uuid import UUID
5
+
6
+ from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
7
+ from parallex.ai.open_ai_client import OpenAIClient
8
+ from parallex.ai.output_processor import process_output
9
+ from parallex.ai.uploader import upload_images_for_processing
10
+ from parallex.file_management.converter import convert_pdf_to_images
11
+ from parallex.file_management.file_finder import add_file_to_temp_directory
12
+ from parallex.models.image_file import ImageFile
13
+ from parallex.models.parallex_callable_output import ParallexCallableOutput
14
+ from parallex.models.upload_batch import UploadBatch
15
+ from parallex.utils.constants import DEFAULT_PROMPT
16
+ from parallex.utils.logger import logger, setup_logger
17
+
18
+
19
+ # TODO pdf_source_url: str change to be URL or path
20
async def parallex(
    model: str,
    pdf_source_url: str,
    post_process_callable: Optional[Callable[..., None]] = None,
    concurrency: int = 20,
    prompt_text: str = DEFAULT_PROMPT,
    log_level: str = "ERROR",
) -> ParallexCallableOutput:
    """Run the full pipeline for one PDF and return the per-page output.

    Steps: download the file into a temporary directory, convert it to images,
    upload the images as batch request files, create one batch per uploaded
    file, wait for every batch and collect its pages, then sort the pages by
    page number.

    Args:
        model: Passed to ``OpenAIClient``.
        pdf_source_url: URL the PDF is downloaded from.
        post_process_callable: Optional callable invoked with the result as
            the keyword argument ``output`` before it is returned.
        concurrency: Size of each semaphore limiting batch creation and batch
            waiting/processing.
        prompt_text: Prompt sent with every page image.
        log_level: Passed to ``setup_logger``.

    Returns:
        The ``ParallexCallableOutput`` holding the sorted pages.
    """
    setup_logger(log_level)
    with tempfile.TemporaryDirectory() as temp_directory:
        open_ai_client = OpenAIClient(model=model)

        # Download and rasterize the source document.
        raw_file = await add_file_to_temp_directory(
            pdf_source_url=pdf_source_url, temp_directory=temp_directory
        )
        trace_id = raw_file.trace_id
        image_files = await convert_pdf_to_images(
            raw_file=raw_file, temp_directory=temp_directory
        )

        batch_files = await upload_images_for_processing(
            client=open_ai_client,
            image_files=image_files,
            temp_directory=temp_directory,
            prompt_text=prompt_text,
        )

        # Start one batch per uploaded request file.
        start_batch_semaphore = asyncio.Semaphore(concurrency)
        start_batch_tasks = [
            asyncio.create_task(
                _create_images_and_batch_jobs(
                    batch_file=file,
                    client=open_ai_client,
                    trace_id=trace_id,
                    semaphore=start_batch_semaphore,
                )
            )
            for file in batch_files
        ]
        batches = await asyncio.gather(*start_batch_tasks)

        # Wait for each batch and turn its output into pages.
        process_semaphore = asyncio.Semaphore(concurrency)
        pages_tasks = [
            asyncio.create_task(
                _wait_and_create_pages(
                    batch=batch, client=open_ai_client, semaphore=process_semaphore
                )
            )
            for batch in batches
        ]
        page_groups = await asyncio.gather(*pages_tasks)

        pages = []
        for batch_pages in page_groups:
            pages.extend(batch_pages)
        logger.debug(f"pages done. total pages- {len(pages)} - {trace_id}")
        sorted_pages = sorted(pages, key=lambda x: x.page_number)

        # TODO add combined version of MD to output / save to file system
        callable_output = ParallexCallableOutput(
            file_name=raw_file.given_name,
            pdf_source_url=raw_file.pdf_source_url,
            trace_id=trace_id,
            pages=sorted_pages,
        )
        if post_process_callable is not None:
            post_process_callable(output=callable_output)
        return callable_output
85
+
86
+
87
async def _wait_and_create_pages(
    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
):
    """Wait for one batch, convert its output to pages, then delete its files.

    All of the work happens while holding ``semaphore``. Returns the page
    responses produced from the batch output file.
    """
    async with semaphore:
        logger.debug(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
        finished_file_id = await wait_for_batch_completion(client=client, batch=batch)
        logger.debug(f"batch completed - {batch.id} - {batch.trace_id}")

        pages = await process_output(client=client, output_file_id=finished_file_id)

        # Remove the input/output/error files once the pages are extracted.
        await _remove_global_batch_files(client=client, batch=batch)
    return pages
99
+
100
+
101
async def _remove_global_batch_files(client: OpenAIClient, batch: UploadBatch):
    """Delete the input, output and error files that belong to ``batch``.

    Args:
        client: Client used to delete the files.
        batch: Batch whose ``input_file_id``, ``output_file_id`` and
            ``error_file_id`` are deleted.
    """
    file_ids = (batch.input_file_id, batch.output_file_id, batch.error_file_id)
    for file_id in file_ids:
        if not file_id:
            # output_file_id / error_file_id are optional on UploadBatch;
            # there is nothing to delete when they are unset.
            continue
        await client.delete_file(file_id)
105
+
106
+
107
async def _create_images_and_batch_jobs(
    batch_file: ImageFile,
    client: OpenAIClient,
    trace_id: UUID,
    semaphore: asyncio.Semaphore,
):
    """Create the batch for one uploaded request file while holding ``semaphore``.

    NOTE(review): ``batch_file`` is annotated as ``ImageFile``, but only its
    ``id`` attribute is read and the caller passes the items returned by
    ``upload_images_for_processing`` - the annotation looks wrong; confirm.

    Returns the batch produced by ``create_batch``.
    """
    async with semaphore:
        return await create_batch(
            client=client, file_id=batch_file.id, trace_id=trace_id
        )
@@ -0,0 +1,9 @@
1
+ DEFAULT_PROMPT = """
2
+ Convert the following PDF page to markdown.
3
+ Return only the markdown with no explanation text.
4
+ Leave out any page numbers and redundant headers or footers.
5
+ Do not include any code blocks (e.g. "```markdown" or "```") in the response.
6
+ If unable to parse, return an empty string.
7
+ """
8
+
9
# Separator placed between the trace id and the page number in a request's
# custom_id; the output processor splits on it to recover the page number.
CUSTOM_ID_DELINEATOR = "--page--"
@@ -0,0 +1,20 @@
1
+ import logging
2
+
3
+ from aiologger import Logger
4
+
5
# Package-wide aiologger logger named "parallex", created with aiologger's
# default handlers.
logger = Logger.with_default_handlers(name="parallex")
6
+
7
+
8
def setup_logger(level: str = "ERROR"):
    """Configure standard-library logging via ``logging.basicConfig``.

    ``level`` is matched by exact name against the standard level names;
    any other value falls back to ``logging.INFO``.
    """
    known_levels = {
        "CRITICAL": logging.CRITICAL,
        "ERROR": logging.ERROR,
        "WARNING": logging.WARNING,
        "INFO": logging.INFO,
        "DEBUG": logging.DEBUG,
        "NOTSET": logging.NOTSET,
    }
    resolved_level = known_levels.get(level, logging.INFO)

    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=resolved_level, format=log_format)
@@ -0,0 +1,19 @@
1
+ The MIT License (MIT)
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,42 @@
1
+ Metadata-Version: 2.1
2
+ Name: parallex
3
+ Version: 0.1.0
4
+ Summary:
5
+ Author: Jeff Hostetler
6
+ Author-email: jeff@summed.ai
7
+ Requires-Python: >=3.12,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: aiologger (>=0.7.0,<0.8.0)
12
+ Requires-Dist: asyncio (>=3.4.3,<4.0.0)
13
+ Requires-Dist: httpx (>=0.27.2,<0.28.0)
14
+ Requires-Dist: openai (>=1.54.4,<2.0.0)
15
+ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
16
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
17
+ Description-Content-Type: text/markdown
18
+
19
+ # Parallex
20
+
21
+ ### What it does
22
+ - Converts file into images
23
+ - Makes requests to OpenAI to convert the images to markdown
24
+ - [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
25
+ - [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
26
+ - Post batch processing to do what you wish with the resulting markdown
27
+
28
+
29
+ # Notes for us as we build
30
+ ### Poetry
31
+ - Using [poetry](https://python-poetry.org/docs/) for dependency management
32
+ - add dependency `poetry add pydantic`
33
+ - add dev dependency `poetry add --group dev black`
34
+ - run main script `poetry run python main.py`
35
+ - run dev commands `poetry run black parallex`
36
+
37
+
38
+ # General behavior
39
+ - parallex takes args to do things with file
40
+ - parallex takes args to specify llm model
41
+ - parallex takes a callable to execute once batch process is "ready"
42
+
@@ -0,0 +1,21 @@
1
+ parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ parallex/ai/batch_processor.py,sha256=wftq7-EKWbWO_tzz2PqVZa8XRVf7MoJlr7EcllX4-5I,1481
3
+ parallex/ai/open_ai_client.py,sha256=Yvnvg5MGEyQrmN3HF5k8fEWse9Slthy3J-oumO6ZKkQ,1459
4
+ parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
5
+ parallex/ai/uploader.py,sha256=M8g8dC_bwiGNDI_S5qxcRqJljDu6KSan_eIcQWA-ERA,3162
6
+ parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
7
+ parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
8
+ parallex/file_management/utils.py,sha256=WMdXd9UOFbJDHnL2IWfDXyyD2jhwnGtpCVI_npiSlIk,98
9
+ parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M,405
10
+ parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
11
+ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaBbt0,228
12
+ parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
13
+ parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
14
+ parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
15
+ parallex/parallex.py,sha256=2XXmG54eXtXnw2ElC12zjbWDnwDIHPgzKY1ktP8V93M,4472
16
+ parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
17
+ parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
18
+ parallex-0.1.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
19
+ parallex-0.1.0.dist-info/METADATA,sha256=ICDxk_FnofGhJgeGXv_g-awaZm_g4zU23PkfuXOndNs,1485
20
+ parallex-0.1.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
21
+ parallex-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 1.9.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any