parallex 0.1.0__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
parallex-0.1.0/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ The MIT License (MIT)
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,42 @@
1
+ Metadata-Version: 2.1
2
+ Name: parallex
3
+ Version: 0.1.0
4
+ Summary:
5
+ Author: Jeff Hostetler
6
+ Author-email: jeff@summed.ai
7
+ Requires-Python: >=3.12,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: aiologger (>=0.7.0,<0.8.0)
12
+ Requires-Dist: asyncio (>=3.4.3,<4.0.0)
13
+ Requires-Dist: httpx (>=0.27.2,<0.28.0)
14
+ Requires-Dist: openai (>=1.54.4,<2.0.0)
15
+ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
16
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
17
+ Description-Content-Type: text/markdown
18
+
19
+ # Parallex
20
+
21
+ ### What it does
22
+ - Converts file into images
23
+ - Makes requests to OpenAI to convert the images to markdown
24
+ - [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
25
+ - [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
26
+ - Post batch processing to do what you wish with the resulting markdown
27
+
28
+
29
+ # Notes for us as we build
30
+ ### Poetry
31
+ - Using [poetry](https://python-poetry.org/docs/) for dependency management
32
+ - add dependency `poetry add pydantic`
33
+ - add dev dependency `poetry add --group dev black`
34
+ - run main script `poetry run python main.py`
35
+ - run dev commands `poetry run black parallex`
36
+
37
+
38
+ # General behavior
39
+ - parallex takes args to do things with file
40
+ - parallex takes args to specify llm model
41
+ - parallex takes a callable to execute once batch process is "ready"
42
+
@@ -0,0 +1,23 @@
1
+ # Parallex
2
+
3
+ ### What it does
4
+ - Converts file into images
5
+ - Makes requests to OpenAI to convert the images to markdown
6
+ - [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
7
+ - [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
8
+ - Post batch processing to do what you wish with the resulting markdown
9
+
10
+
11
+ # Notes for us as we build
12
+ ### Poetry
13
+ - Using [poetry](https://python-poetry.org/docs/) for dependency management
14
+ - add dependency `poetry add pydantic`
15
+ - add dev dependency `poetry add --group dev black`
16
+ - run main script `poetry run python main.py`
17
+ - run dev commands `poetry run black parallex`
18
+
19
+
20
+ # General behavior
21
+ - parallex takes args to do things with file
22
+ - parallex takes args to specify llm model
23
+ - parallex takes a callable to execute once batch process is "ready"
File without changes
@@ -0,0 +1,42 @@
1
+ import asyncio
2
+ from uuid import UUID
3
+
4
+ from openai import BadRequestError
5
+
6
+ from parallex.ai.open_ai_client import OpenAIClient
7
+ from parallex.models.upload_batch import build_batch, UploadBatch
8
+
9
+
10
+ async def create_batch(
11
+ client: OpenAIClient, file_id: str, trace_id: UUID
12
+ ) -> UploadBatch:
13
+ """Creates a Batch for the given file_id"""
14
+ max_retries = 10
15
+ backoff_delay = 5
16
+
17
+ for attempt in range(max_retries):
18
+ try:
19
+ batch_response = await client.create_batch(upload_file_id=file_id)
20
+ batch = build_batch(open_ai_batch=batch_response, trace_id=trace_id)
21
+ return batch # Return batch if successful
22
+ except BadRequestError as e:
23
+ if attempt == max_retries - 1:
24
+ raise e
25
+ await asyncio.sleep(backoff_delay)
26
+ backoff_delay *= 2
27
+
28
+
29
async def wait_for_batch_completion(
    client: OpenAIClient,
    batch: UploadBatch,
    initial_delay: float = 5,
    poll_delay: float = 30,
) -> str:
    """Poll a Batch until it reaches a terminal state and return its output file id.

    ``batch.output_file_id`` and ``batch.error_file_id`` are refreshed from
    every poll so the caller can clean those files up afterwards.

    Args:
        client: Client used to retrieve the batch.
        batch: The batch to wait for; mutated in place as described above.
        initial_delay: Seconds to wait before the first poll.
        poll_delay: Seconds to wait between subsequent polls.

    Returns:
        The output file id of the completed batch.

    Raises:
        RuntimeError: If the batch ends in any terminal state other than
            "completed" (failed, expired or cancelled).
    """
    # The Batch API spells the status "cancelled"; "canceled" is kept so any
    # caller/service using the American spelling still terminates the loop.
    terminal_statuses = ("completed", "failed", "expired", "cancelled", "canceled")

    delay = initial_delay
    while True:
        await asyncio.sleep(delay)
        batch_response = await client.retrieve_batch(batch.id)
        status = batch_response.status
        batch.output_file_id = batch_response.output_file_id
        batch.error_file_id = batch_response.error_file_id
        if status in terminal_statuses:
            break
        delay = poll_delay

    if status != "completed":
        raise RuntimeError(
            f"Batch {batch.id} ended with status '{status}' "
            f"(error_file_id={batch_response.error_file_id})"
        )
    return batch_response.output_file_id
@@ -0,0 +1,43 @@
1
+ import os
2
+
3
+ from openai import AsyncAzureOpenAI
4
+ from openai._legacy_response import HttpxBinaryResponseContent
5
+ from openai.types import FileObject, Batch, FileDeleted
6
+
7
+ from parallex.utils.logger import logger
8
+
9
+
10
+ # Exceptions for missing keys, etc
11
+ class OpenAIClient:
12
+ def __init__(self, model: str):
13
+ self.model = model
14
+
15
+ self._client = AsyncAzureOpenAI(
16
+ azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
17
+ api_key=os.getenv("AZURE_OPENAI_API_KEY"),
18
+ api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
19
+ )
20
+
21
+ async def upload(self, file_path: str) -> FileObject:
22
+ return await self._client.files.create(
23
+ file=open(file_path, "rb"), purpose="batch"
24
+ )
25
+
26
+ async def create_batch(self, upload_file_id: str) -> Batch:
27
+ return await self._client.batches.create(
28
+ input_file_id=upload_file_id,
29
+ endpoint="/chat/completions",
30
+ completion_window="24h",
31
+ )
32
+
33
+ async def retrieve_batch(self, batch_id: str) -> Batch:
34
+ return await self._client.batches.retrieve(batch_id)
35
+
36
+ async def retrieve_file(self, file_id: str) -> HttpxBinaryResponseContent:
37
+ return await self._client.files.content(file_id)
38
+
39
+ async def delete_file(self, file_id: str) -> FileDeleted:
40
+ try:
41
+ return await self._client.files.delete(file_id)
42
+ except Exception as e:
43
+ logger.info(f"Did not delete file: {e}")
@@ -0,0 +1,25 @@
1
+ import json
2
+
3
+ from parallex.ai.open_ai_client import OpenAIClient
4
+ from parallex.models.page_response import PageResponse
5
+ from parallex.utils.constants import CUSTOM_ID_DELINEATOR
6
+
7
+
8
+ async def process_output(
9
+ client: OpenAIClient, output_file_id: str
10
+ ) -> list[PageResponse]:
11
+ """Gets content from completed Batch to create PageResponse with created markdown"""
12
+ file_response = await client.retrieve_file(output_file_id)
13
+ raw_responses = file_response.text.strip().split("\n")
14
+
15
+ pages = []
16
+ for raw_response in raw_responses:
17
+ json_response = json.loads(raw_response)
18
+ custom_id = json_response["custom_id"]
19
+ page_number = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
20
+ output_content = json_response["response"]["body"]["choices"][0]["message"][
21
+ "content"
22
+ ]
23
+ page = PageResponse(output_content=output_content, page_number=int(page_number))
24
+ pages.append(page)
25
+ return pages
@@ -0,0 +1,91 @@
1
+ import base64
2
+ import json
3
+ import os
4
+
5
+ from parallex.ai.open_ai_client import OpenAIClient
6
+ from parallex.file_management.utils import file_in_temp_dir
7
+ from parallex.models.batch_file import BatchFile
8
+ from parallex.models.image_file import ImageFile
9
+ from parallex.utils.constants import CUSTOM_ID_DELINEATOR
10
+
11
MAX_FILE_SIZE = 150 * 1024 * 1024  # 150 MB in bytes


async def upload_images_for_processing(
    client: OpenAIClient,
    image_files: list[ImageFile],
    temp_directory: str,
    prompt_text: str,
) -> list[BatchFile]:
    """Base64-encode page images into JSONL request files and upload them.

    One JSONL line is written per image. Before each image is appended, the
    current file is checked; once it has grown past MAX_FILE_SIZE it is
    uploaded and a new file is started, so a single upload can exceed the
    limit by at most one request line.

    Args:
        client: Client used to upload the JSONL files.
        image_files: Page images to encode; all are expected to share the
            trace id of the first entry.
        temp_directory: Directory in which the JSONL files are written.
        prompt_text: Prompt sent alongside every image.

    Returns:
        One BatchFile per uploaded JSONL file; an empty list when there are
        no images to process.
    """
    if not image_files:
        # Nothing to encode; avoid indexing into an empty/None list.
        return []

    trace_id = image_files[0].trace_id

    def jsonl_path(index: int) -> str:
        # Every part uses the same naming scheme so parts are easy to relate.
        return file_in_temp_dir(
            directory=temp_directory, file_name=f"image-{trace_id}-{index}.jsonl"
        )

    current_index = 0
    batch_files = []
    upload_file_location = jsonl_path(current_index)

    for image_file in image_files:
        if (
            os.path.exists(upload_file_location)
            and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
        ):
            # Approaching the upload size limit: upload and start a new file.
            batch_files.append(
                await _create_batch_file(client, trace_id, upload_file_location)
            )
            current_index += 1
            upload_file_location = jsonl_path(current_index)

        with open(image_file.path, "rb") as image:
            base64_encoded_image = base64.b64encode(image.read()).decode("utf-8")

        prompt_custom_id = (
            f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
        )
        jsonl = _jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
        with open(upload_file_location, "a", encoding="utf-8") as jsonl_file:
            jsonl_file.write(json.dumps(jsonl) + "\n")

    # Upload whatever remains in the last (possibly only) file.
    batch_files.append(await _create_batch_file(client, trace_id, upload_file_location))
    return batch_files
55
+
56
+
57
async def _create_batch_file(client, trace_id, upload_file_location):
    """Upload one JSONL file and describe the stored file as a BatchFile."""
    uploaded = await client.upload(upload_file_location)
    batch_file_fields = {
        "id": uploaded.id,
        "name": uploaded.filename,
        "purpose": uploaded.purpose,
        "status": uploaded.status,
        "trace_id": trace_id,
    }
    return BatchFile(**batch_file_fields)
66
+
67
+
68
def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
    """Build one batch request (a single JSONL line) for an encoded page image.

    The request posts to "/chat/completions" with a single user message that
    carries the prompt text followed by the image as a PNG data URL. The
    model is read from the AZURE_OPENAI_API_DEPLOYMENT environment variable.
    """
    text_part = {"type": "text", "text": prompt_text}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
    }
    user_message = {"role": "user", "content": [text_part, image_part]}
    request_body = {
        "model": os.getenv("AZURE_OPENAI_API_DEPLOYMENT"),
        "messages": [user_message],
        "max_tokens": 2000,
    }
    return {
        "custom_id": prompt_custom_id,
        "method": "POST",
        "url": "/chat/completions",
        "body": request_body,
    }
@@ -0,0 +1,37 @@
1
+ import asyncio
2
+
3
+ from pdf2image import convert_from_path
4
+
5
+ from parallex.models.image_file import ImageFile
6
+ from parallex.models.raw_file import RawFile
7
+ from parallex.utils.logger import logger
8
+
9
+
10
+ async def convert_pdf_to_images(
11
+ raw_file: RawFile, temp_directory: str
12
+ ) -> list[ImageFile]:
13
+ """Converts a PDF file to a series of images in the temp_directory. Returns a list ImageFile objects."""
14
+ options = {
15
+ "pdf_path": raw_file.path,
16
+ "output_folder": temp_directory,
17
+ "dpi": 300,
18
+ "fmt": "png",
19
+ "size": (None, 1056),
20
+ "thread_count": 4,
21
+ "use_pdftocairo": True,
22
+ "paths_only": True,
23
+ }
24
+
25
+ try:
26
+ image_paths = await asyncio.to_thread(convert_from_path, **options)
27
+ return [
28
+ ImageFile(
29
+ path=path,
30
+ trace_id=raw_file.trace_id,
31
+ given_file_name=raw_file.given_name,
32
+ page_number=(i + 1),
33
+ )
34
+ for i, path in enumerate(image_paths)
35
+ ]
36
+ except Exception as err:
37
+ logger.error(f"Error converting PDF to images: {err}")
@@ -0,0 +1,44 @@
1
+ import uuid
2
+
3
+ import httpx
4
+
5
+ from parallex.file_management.utils import file_in_temp_dir
6
+ from parallex.models.raw_file import RawFile
7
+
8
+
9
+ # TODO get from URL or from file system
10
+ async def add_file_to_temp_directory(
11
+ pdf_source_url: str, temp_directory: str
12
+ ) -> RawFile:
13
+ """Downloads file and adds to temp directory"""
14
+ given_file_name = pdf_source_url.split("/")[-1]
15
+ file_trace_id = uuid.uuid4()
16
+ async with httpx.AsyncClient() as client:
17
+ async with client.stream("GET", pdf_source_url) as response:
18
+ response.raise_for_status() # Check for HTTP errors
19
+ content_type = response.headers.get("Content-Type")
20
+ file_name = _determine_file_name(
21
+ given_file_name, file_trace_id, content_type
22
+ )
23
+ path = file_in_temp_dir(temp_directory, file_name)
24
+ with open(path, "wb") as file:
25
+ async for chunk in response.aiter_bytes():
26
+ file.write(chunk)
27
+
28
+ return RawFile(
29
+ name=file_name,
30
+ path=path,
31
+ content_type=content_type,
32
+ given_name=given_file_name,
33
+ pdf_source_url=pdf_source_url,
34
+ trace_id=file_trace_id,
35
+ )
36
+
37
+
38
def _determine_file_name(given_file_name: str, file_trace_id, content_type: str):
    """Return the name under which a downloaded PDF is stored.

    The name is ``<file_trace_id>.<extension>`` where the extension is the
    text after the last "." of ``given_file_name``; when the given name has
    no extension, "pdf" is used because only PDFs are accepted.

    Raises:
        ValueError: If ``content_type`` is missing or is not application/pdf.
    """
    # TODO custom errors
    # TODO other types besides pdf
    if not content_type or "application/pdf" not in content_type:
        raise ValueError("Content-Type must be application/pdf")

    # rpartition copes with names containing several dots ("a.b.pdf") or
    # none at all, both of which broke the two-value unpacking of split(".").
    _, dot, extension = given_file_name.rpartition(".")
    if not dot or not extension:
        extension = "pdf"
    return f"{file_trace_id}.{extension}"
@@ -0,0 +1,2 @@
1
def file_in_temp_dir(directory: str, file_name: str):
    """Return ``directory`` and ``file_name`` joined by a single "/"."""
    path_parts = (directory, file_name)
    return "/".join(path_parts)
@@ -0,0 +1,11 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
# Describes a JSONL request file after it has been uploaded; instances are
# built in _create_batch_file from the upload response.
class BatchFile(BaseModel):
    # Populated from the upload response's id, i.e. the stored file's id; it
    # is later passed as the input file id when the batch is created.
    id: str = Field(description="ID of the uploaded OpenAI file")
    name: str = Field(description="Name of file batch was created with")
    purpose: str = Field(description="Purpose 'batch'")
    status: str = Field(description="Status of the uploaded file")
    trace_id: UUID = Field(description="Unique trace for each file")
@@ -0,0 +1,11 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel
4
+ from pydantic.fields import Field
5
+
6
+
7
# Describes one page image rendered from a source PDF; instances are built
# in convert_pdf_to_images.
class ImageFile(BaseModel):
    path: str = Field(description="Path to the image in temp directory")
    # convert_pdf_to_images numbers pages starting at 1.
    page_number: int = Field(description="Associated page of the PDF")
    given_file_name: str = Field(description="Name of the given file")
    trace_id: UUID = Field(description="Unique trace for each file")
@@ -0,0 +1,6 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
# Markdown produced for a single page; instances are built in process_output
# from one line of a batch output file.
class PageResponse(BaseModel):
    output_content: str = Field(description="Markdown generated for the page")
    # Parsed from the request's custom_id in process_output.
    page_number: int = Field(description="Page number of the associated PDF")
@@ -0,0 +1,12 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from parallex.models.page_response import PageResponse
6
+
7
+
8
# Final result of a parallex() run; it is returned to the caller and also
# handed to the optional post-processing callable as ``output``.
class ParallexCallableOutput(BaseModel):
    file_name: str = Field(description="Name of file that is processed")
    pdf_source_url: str = Field(description="Given URL of the source of output")
    trace_id: UUID = Field(description="Unique trace for each file")
    # parallex() stores the pages sorted by page_number.
    pages: list[PageResponse] = Field(description="List of PageResponse objects")
@@ -0,0 +1,12 @@
1
+ from uuid import UUID
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
# Describes a downloaded source file; instances are built in
# add_file_to_temp_directory.
class RawFile(BaseModel):
    # Generated name ("<trace_id>.<extension>") used inside the temp directory.
    name: str = Field(description="Name of the file given by Parallex")
    path: str = Field(description="Path to file in temp directory")
    # Value of the download response's Content-Type header.
    content_type: str = Field(description="Given file type")
    # Last segment of the source URL.
    given_name: str = Field(description="Name of file given")
    pdf_source_url: str = Field(description="Source of file")
    trace_id: UUID = Field(description="Unique trace for each file")
@@ -0,0 +1,45 @@
1
+ from typing import Optional
2
+ from uuid import UUID
3
+
4
+ from openai.types.batch import Errors, Batch
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
# Local copy of an OpenAI Batch plus the trace id of the file it belongs to.
# build_batch fills every field except trace_id by reading the attribute of
# the same name from the OpenAI Batch object (None when it is absent).
class UploadBatch(BaseModel):
    # page_number: int = Field(description="Page number of associated file")
    trace_id: UUID = Field(description="Unique trace for each file")
    id: str = Field(description="ID of the OpenAI Batch")
    completion_window: str = Field(description="When batch can complete (24hrs)")
    created_at: int = Field(description="When batch was created")
    endpoint: str = Field(description="Endpoint used for retreival")
    input_file_id: str = Field(description="File that is input to batch")
    # Refreshed by wait_for_batch_completion on every poll.
    output_file_id: Optional[str] = Field(
        None, description="File that is output when batch completes"
    )
    status: str = Field(description="Current status of the batch")
    cancelled_at: Optional[int] = Field(None, description="When batch cancelled")
    cancelling_at: Optional[int] = Field(
        None, description="When batch started cancelling"
    )
    completed_at: Optional[int] = Field(None, description="When batch completed")
    expired_at: Optional[int] = Field(None, description="When batch expired")
    expires_at: Optional[int] = Field(None, description="When batch expires")
    failed_at: Optional[int] = Field(None, description="When batch failed")
    finalizing_at: Optional[int] = Field(
        None, description="When batch started finalizing"
    )
    in_progress_at: Optional[int] = Field(
        None, description="When batch started processing"
    )
    # Refreshed by wait_for_batch_completion on every poll.
    error_file_id: Optional[str] = Field(
        None, description="File that is created during error of batch"
    )
    errors: Optional[Errors] = Field(None, description="List of errors")
38
+
39
+
40
def build_batch(open_ai_batch: Batch, trace_id: UUID) -> UploadBatch:
    """Copy an OpenAI Batch into an UploadBatch tagged with ``trace_id``.

    Every UploadBatch field is read from the attribute of the same name on
    ``open_ai_batch`` (``None`` when the attribute does not exist);
    ``trace_id`` is then set from the argument.
    """
    input_fields = {}
    for field_name in UploadBatch.model_fields:
        input_fields[field_name] = getattr(open_ai_batch, field_name, None)
    input_fields["trace_id"] = trace_id
    return UploadBatch(**input_fields)
@@ -0,0 +1,117 @@
1
+ import asyncio
2
+ import tempfile
3
+ from typing import Callable, Optional
4
+ from uuid import UUID
5
+
6
+ from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
7
+ from parallex.ai.open_ai_client import OpenAIClient
8
+ from parallex.ai.output_processor import process_output
9
+ from parallex.ai.uploader import upload_images_for_processing
10
+ from parallex.file_management.converter import convert_pdf_to_images
11
+ from parallex.file_management.file_finder import add_file_to_temp_directory
12
+ from parallex.models.image_file import ImageFile
13
+ from parallex.models.parallex_callable_output import ParallexCallableOutput
14
+ from parallex.models.upload_batch import UploadBatch
15
+ from parallex.utils.constants import DEFAULT_PROMPT
16
+ from parallex.utils.logger import logger, setup_logger
17
+
18
+
19
# TODO pdf_source_url: str change to be URL or path
async def parallex(
    model: str,
    pdf_source_url: str,
    post_process_callable: Optional[Callable[..., None]] = None,
    concurrency: int = 20,
    prompt_text: str = DEFAULT_PROMPT,
    log_level: str = "ERROR",
) -> ParallexCallableOutput:
    """Convert the PDF at ``pdf_source_url`` to per-page markdown via batch jobs.

    Steps: download the PDF into a temporary directory, render its pages to
    images, upload them as JSONL request files, start one batch per uploaded
    file, wait for the batches and collect the pages sorted by page number.

    Args:
        model: Passed to OpenAIClient.
        pdf_source_url: URL of the PDF to process.
        post_process_callable: Optional callable invoked with the result as
            the keyword argument ``output`` before it is returned.
        concurrency: Size of the semaphores limiting how many batches are
            started, and how many are awaited, at the same time.
        prompt_text: Prompt sent with every page image.
        log_level: Level name forwarded to ``setup_logger``.

    Returns:
        The ParallexCallableOutput holding the sorted pages.
    """
    setup_logger(log_level)
    with tempfile.TemporaryDirectory() as temp_directory:
        open_ai_client = OpenAIClient(model=model)

        raw_file = await add_file_to_temp_directory(
            pdf_source_url=pdf_source_url, temp_directory=temp_directory
        )
        trace_id = raw_file.trace_id
        image_files = await convert_pdf_to_images(
            raw_file=raw_file, temp_directory=temp_directory
        )

        batch_files = await upload_images_for_processing(
            client=open_ai_client,
            image_files=image_files,
            temp_directory=temp_directory,
            prompt_text=prompt_text,
        )

        # Start one batch job per uploaded request file.
        start_batch_semaphore = asyncio.Semaphore(concurrency)
        start_batch_tasks = [
            asyncio.create_task(
                _create_images_and_batch_jobs(
                    batch_file=uploaded_file,
                    client=open_ai_client,
                    trace_id=trace_id,
                    semaphore=start_batch_semaphore,
                )
            )
            for uploaded_file in batch_files
        ]
        batches = await asyncio.gather(*start_batch_tasks)

        # Wait for every batch and turn its output into page responses.
        process_semaphore = asyncio.Semaphore(concurrency)
        pages_tasks = [
            asyncio.create_task(
                _wait_and_create_pages(
                    batch=started_batch,
                    client=open_ai_client,
                    semaphore=process_semaphore,
                )
            )
            for started_batch in batches
        ]
        page_groups = await asyncio.gather(*pages_tasks)

        pages = []
        for batch_pages in page_groups:
            pages.extend(batch_pages)
        logger.debug(f"pages done. total pages- {len(pages)} - {trace_id}")
        sorted_pages = sorted(pages, key=lambda page: page.page_number)

        # TODO add combined version of MD to output / save to file system
        callable_output = ParallexCallableOutput(
            file_name=raw_file.given_name,
            pdf_source_url=raw_file.pdf_source_url,
            trace_id=trace_id,
            pages=sorted_pages,
        )
        if post_process_callable is not None:
            post_process_callable(output=callable_output)
        return callable_output
85
+
86
+
87
async def _wait_and_create_pages(
    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
):
    """Wait for one batch, build its page responses, then delete its files.

    The whole sequence runs while holding ``semaphore``.
    """
    async with semaphore:
        logger.debug(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
        finished_file_id = await wait_for_batch_completion(client=client, batch=batch)
        logger.debug(f"batch completed - {batch.id} - {batch.trace_id}")

        pages_for_batch = await process_output(
            client=client, output_file_id=finished_file_id
        )
        # Remote files are removed only after the output was processed.
        await _remove_global_batch_files(client=client, batch=batch)
        return pages_for_batch
99
+
100
+
101
async def _remove_global_batch_files(client: OpenAIClient, batch: UploadBatch):
    """Delete the remote input, output and error files of a batch.

    ``output_file_id`` and ``error_file_id`` are optional on UploadBatch, so
    ids that are not set are skipped instead of being sent to the API.
    """
    file_ids = (batch.input_file_id, batch.output_file_id, batch.error_file_id)
    for file_id in file_ids:
        if file_id is None:
            continue
        await client.delete_file(file_id)
105
+
106
+
107
async def _create_images_and_batch_jobs(
    batch_file: ImageFile,
    client: OpenAIClient,
    trace_id: UUID,
    semaphore: asyncio.Semaphore,
):
    """Start a batch for one uploaded request file while holding ``semaphore``.

    NOTE(review): ``batch_file`` is annotated as ImageFile, but only its
    ``id`` is read here and parallex() passes the objects returned by
    upload_images_for_processing - confirm the intended type.
    """
    async with semaphore:
        return await create_batch(
            client=client, file_id=batch_file.id, trace_id=trace_id
        )
@@ -0,0 +1,9 @@
1
+ DEFAULT_PROMPT = """
2
+ Convert the following PDF page to markdown.
3
+ Return only the markdown with no explanation text.
4
+ Leave out any page numbers and redundant headers or footers.
5
+ Do not include any code blocks (e.g. "```markdown" or "```") in the response.
6
+ If unable to parse, return an empty string.
7
+ """
8
+
9
# Separates the trace id from the page number inside a request's custom_id;
# written by the uploader and split on again when batch output is processed.
CUSTOM_ID_DELINEATOR = "--page--"
@@ -0,0 +1,20 @@
1
+ import logging
2
+
3
+ from aiologger import Logger
4
+
5
+ logger = Logger.with_default_handlers(name="parallex")
6
+
7
+
8
def setup_logger(level: str = "ERROR"):
    """Apply ``level`` to stdlib logging and to the package's aiologger logger.

    Args:
        level: Level name (case-insensitive): CRITICAL, ERROR, WARNING, INFO,
            DEBUG or NOTSET. Unknown names fall back to INFO.
    """
    levels_by_name = {
        "CRITICAL": logging.CRITICAL,
        "ERROR": logging.ERROR,
        "WARNING": logging.WARNING,
        "INFO": logging.INFO,
        "DEBUG": logging.DEBUG,
        "NOTSET": logging.NOTSET,
    }
    resolved_level = levels_by_name.get(str(level).upper(), logging.INFO)

    logging.basicConfig(
        level=resolved_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    # The module-level ``logger`` is an aiologger Logger, which does not read
    # the stdlib logging configuration; set its level explicitly so the
    # requested level actually filters the package's own messages.
    logger.level = resolved_level
@@ -0,0 +1,23 @@
1
+ [tool.poetry]
2
+ name = "parallex"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Jeff Hostetler <jeff@summed.ai>", "Kevin Bao <kevin@summed.ai>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = "^3.12"
10
+ pydantic = "^2.9.2"
11
+ httpx = "^0.27.2"
12
+ asyncio = "^3.4.3"
13
+ openai = "^1.54.4"
14
+ pdf2image = "^1.17.0"
15
+ aiologger = "^0.7.0"
16
+
17
+
18
+ [tool.poetry.group.dev.dependencies]
19
+ black = "^24.10.0"
20
+
21
+ [build-system]
22
+ requires = ["poetry-core"]
23
+ build-backend = "poetry.core.masonry.api"