parallex 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parallex/__init__.py +0 -0
- parallex/ai/batch_processor.py +42 -0
- parallex/ai/open_ai_client.py +43 -0
- parallex/ai/output_processor.py +25 -0
- parallex/ai/uploader.py +91 -0
- parallex/file_management/converter.py +37 -0
- parallex/file_management/file_finder.py +44 -0
- parallex/file_management/utils.py +2 -0
- parallex/models/batch_file.py +11 -0
- parallex/models/image_file.py +11 -0
- parallex/models/page_response.py +6 -0
- parallex/models/parallex_callable_output.py +12 -0
- parallex/models/raw_file.py +12 -0
- parallex/models/upload_batch.py +45 -0
- parallex/parallex.py +117 -0
- parallex/utils/constants.py +9 -0
- parallex/utils/logger.py +20 -0
- parallex-0.1.0.dist-info/LICENSE +19 -0
- parallex-0.1.0.dist-info/METADATA +42 -0
- parallex-0.1.0.dist-info/RECORD +21 -0
- parallex-0.1.0.dist-info/WHEEL +4 -0
parallex/__init__.py
ADDED
File without changes
|
@@ -0,0 +1,42 @@
|
|
1
|
+
import asyncio
|
2
|
+
from uuid import UUID
|
3
|
+
|
4
|
+
from openai import BadRequestError
|
5
|
+
|
6
|
+
from parallex.ai.open_ai_client import OpenAIClient
|
7
|
+
from parallex.models.upload_batch import build_batch, UploadBatch
|
8
|
+
|
9
|
+
|
10
|
+
async def create_batch(
    client: OpenAIClient, file_id: str, trace_id: UUID
) -> UploadBatch:
    """Create a batch job for an uploaded file, retrying on ``BadRequestError``.

    Up to 10 attempts are made. After each failed attempt except the last, the
    coroutine sleeps (5 seconds at first, doubling every time) before trying
    again. The error from the final attempt is re-raised to the caller.

    Args:
        client: Client used to create the batch.
        file_id: ID of the previously uploaded input file.
        trace_id: Trace identifier stored on the returned ``UploadBatch``.

    Returns:
        The ``UploadBatch`` built from the service response.

    Raises:
        BadRequestError: If the last attempt is still rejected.
    """
    max_retries = 10
    final_attempt = max_retries - 1
    pause_seconds = 5

    attempt = 0
    while attempt < max_retries:
        try:
            created = await client.create_batch(upload_file_id=file_id)
            return build_batch(open_ai_batch=created, trace_id=trace_id)
        except BadRequestError as error:
            # Out of retries: surface the last error unchanged.
            if attempt == final_attempt:
                raise error
        await asyncio.sleep(pause_seconds)
        pause_seconds *= 2
        attempt += 1
|
27
|
+
|
28
|
+
|
29
|
+
# TODO handle errors
|
30
|
+
async def wait_for_batch_completion(
    client: OpenAIClient, batch: UploadBatch
) -> str | None:
    """Poll a batch until it reaches a terminal status.

    The first poll happens after 5 seconds and every later poll after 30
    seconds. On each poll, ``batch.output_file_id`` and ``batch.error_file_id``
    are refreshed from the service response.

    Args:
        client: Client used to retrieve the batch.
        batch: Batch to wait for; mutated in place as described above.

    Returns:
        The output file ID when the batch completed, otherwise ``None`` (the
        batch failed, expired or was cancelled).
    """
    # The Batch API spells the status "cancelled" and also reports "expired";
    # without them the loop would poll forever. The original "canceled"
    # spelling is kept so nothing that previously terminated stops doing so.
    terminal_statuses = ("completed", "failed", "expired", "cancelled", "canceled")
    status = "validating"
    delay = 5
    while status not in terminal_statuses:
        await asyncio.sleep(delay)
        batch_response = await client.retrieve_batch(batch.id)
        status = batch_response.status
        batch.output_file_id = batch_response.output_file_id
        batch.error_file_id = batch_response.error_file_id
        delay = 30
        if status == "completed":
            return batch_response.output_file_id
    # Terminal but unsuccessful: there is no output file to hand back.
    return None
|
@@ -0,0 +1,43 @@
|
|
1
|
+
import os
|
2
|
+
|
3
|
+
from openai import AsyncAzureOpenAI
|
4
|
+
from openai._legacy_response import HttpxBinaryResponseContent
|
5
|
+
from openai.types import FileObject, Batch, FileDeleted
|
6
|
+
|
7
|
+
from parallex.utils.logger import logger
|
8
|
+
|
9
|
+
|
10
|
+
# Exceptions for missing keys, etc
|
11
|
+
class OpenAIClient:
    """Async wrapper around ``AsyncAzureOpenAI`` for file and batch operations.

    The endpoint, API key and API version are read from the
    ``AZURE_OPENAI_ENDPOINT``, ``AZURE_OPENAI_API_KEY`` and
    ``AZURE_OPENAI_API_VERSION`` environment variables when the client is
    constructed.
    """

    def __init__(self, model: str):
        """Store ``model`` and build the underlying Azure OpenAI client."""
        # Stored for callers; no method of this class reads it.
        self.model = model

        self._client = AsyncAzureOpenAI(
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        )

    async def upload(self, file_path: str) -> FileObject:
        """Upload the file at ``file_path`` with purpose ``"batch"``."""
        # Keep the handle open only for the duration of the request so it is
        # closed even when the upload raises.
        with open(file_path, "rb") as upload_file:
            return await self._client.files.create(
                file=upload_file, purpose="batch"
            )

    async def create_batch(self, upload_file_id: str) -> Batch:
        """Create a ``/chat/completions`` batch with a 24h completion window."""
        return await self._client.batches.create(
            input_file_id=upload_file_id,
            endpoint="/chat/completions",
            completion_window="24h",
        )

    async def retrieve_batch(self, batch_id: str) -> Batch:
        """Fetch the current state of the batch identified by ``batch_id``."""
        return await self._client.batches.retrieve(batch_id)

    async def retrieve_file(self, file_id: str) -> HttpxBinaryResponseContent:
        """Fetch the content of the file identified by ``file_id``."""
        return await self._client.files.content(file_id)

    async def delete_file(self, file_id: str) -> FileDeleted | None:
        """Delete a file on a best-effort basis.

        Returns:
            The deletion response, or ``None`` when deletion raised; the error
            is logged at info level and not propagated.
        """
        try:
            return await self._client.files.delete(file_id)
        except Exception as e:
            # Deliberately best-effort: cleanup must not fail the pipeline.
            logger.info(f"Did not delete file: {e}")
            return None
|
@@ -0,0 +1,25 @@
|
|
1
|
+
import json
|
2
|
+
|
3
|
+
from parallex.ai.open_ai_client import OpenAIClient
|
4
|
+
from parallex.models.page_response import PageResponse
|
5
|
+
from parallex.utils.constants import CUSTOM_ID_DELINEATOR
|
6
|
+
|
7
|
+
|
8
|
+
async def process_output(
    client: OpenAIClient, output_file_id: str
) -> list[PageResponse]:
    """Download a completed batch's output and build one PageResponse per line.

    Each non-blank line of the output file is parsed as JSON. The page number
    is taken from ``custom_id`` (the text between ``CUSTOM_ID_DELINEATOR`` and
    the first following ``"."``) and the content from the first choice's
    message.

    Args:
        client: Client used to retrieve the output file.
        output_file_id: ID of the batch output file.

    Returns:
        Pages in the order they appear in the output file; empty when the file
        contains no entries.
    """
    file_response = await client.retrieve_file(output_file_id)
    # Split on "\n" only: str.splitlines() would also break on characters that
    # may legally appear unescaped inside a JSON string.
    raw_responses = file_response.text.strip().split("\n")

    pages = []
    for raw_response in raw_responses:
        # An empty output file yields a single empty string; skip blanks
        # instead of failing in json.loads.
        if not raw_response.strip():
            continue
        json_response = json.loads(raw_response)
        custom_id = json_response["custom_id"]
        page_number = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
        output_content = json_response["response"]["body"]["choices"][0]["message"][
            "content"
        ]
        page = PageResponse(output_content=output_content, page_number=int(page_number))
        pages.append(page)
    return pages
|
parallex/ai/uploader.py
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
import base64
|
2
|
+
import json
|
3
|
+
import os
|
4
|
+
|
5
|
+
from parallex.ai.open_ai_client import OpenAIClient
|
6
|
+
from parallex.file_management.utils import file_in_temp_dir
|
7
|
+
from parallex.models.batch_file import BatchFile
|
8
|
+
from parallex.models.image_file import ImageFile
|
9
|
+
from parallex.utils.constants import CUSTOM_ID_DELINEATOR
|
10
|
+
|
11
|
+
MAX_FILE_SIZE = 150 * 1024 * 1024 # 150 MB in bytes
|
12
|
+
|
13
|
+
|
14
|
+
async def upload_images_for_processing(
    client: OpenAIClient,
    image_files: list[ImageFile],
    temp_directory: str,
    prompt_text: str,
) -> list[BatchFile]:
    """Write page images into JSONL request files and upload them.

    Every image is base64-encoded and appended as one request line to a JSONL
    file in ``temp_directory``. Before an image is appended, if the current
    file already exceeds ``MAX_FILE_SIZE`` it is uploaded and a new file is
    started. The last file is uploaded after the loop.

    Args:
        client: Client used for the uploads.
        image_files: Page images to submit; must not be empty, because the
            trace ID is read from the first element.
        temp_directory: Directory in which the JSONL files are written.
        prompt_text: Prompt sent alongside every image.

    Returns:
        One ``BatchFile`` per uploaded JSONL file, in upload order.

    Raises:
        IndexError: If ``image_files`` is empty.
    """
    trace_id = image_files[0].trace_id
    current_index = 0
    batch_files = []

    def _part_location(index: int) -> str:
        # Single naming scheme for every part; previously only the first part
        # carried the "image-" prefix.
        return file_in_temp_dir(
            directory=temp_directory, file_name=f"image-{trace_id}-{index}.jsonl"
        )

    upload_file_location = _part_location(current_index)

    for image_file in image_files:
        if (
            os.path.exists(upload_file_location)
            and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
        ):
            # When approaching upload file limit, upload and start new file
            batch_file = await _create_batch_file(
                client, trace_id, upload_file_location
            )
            batch_files.append(batch_file)
            current_index += 1
            upload_file_location = _part_location(current_index)

        with open(image_file.path, "rb") as image:
            base64_encoded_image = base64.b64encode(image.read()).decode("utf-8")

        prompt_custom_id = (
            f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
        )
        jsonl = _jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
        with open(upload_file_location, "a") as jsonl_file:
            jsonl_file.write(json.dumps(jsonl) + "\n")

    # Upload whatever was accumulated in the final (possibly only) part.
    batch_file = await _create_batch_file(client, trace_id, upload_file_location)
    batch_files.append(batch_file)
    return batch_files
|
55
|
+
|
56
|
+
|
57
|
+
async def _create_batch_file(client, trace_id, upload_file_location):
    """Upload the file at ``upload_file_location`` and describe it as a BatchFile.

    The ``id``, ``filename``, ``purpose`` and ``status`` of the upload response
    are copied onto the model together with ``trace_id``.
    """
    uploaded = await client.upload(upload_file_location)
    batch_file_fields = {
        "id": uploaded.id,
        "name": uploaded.filename,
        "purpose": uploaded.purpose,
        "status": uploaded.status,
        "trace_id": trace_id,
    }
    return BatchFile(**batch_file_fields)
|
66
|
+
|
67
|
+
|
68
|
+
def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
    """Build the request dict for one image, serialised later as a JSONL line.

    Args:
        prompt_custom_id: Value stored under ``custom_id``.
        encoded_image: Base64 text embedded in a ``data:image/png`` URL.
        prompt_text: Text part of the user message.

    Returns:
        A dict with ``custom_id``, ``method``, ``url`` and ``body`` keys. The
        model name is read from the ``AZURE_OPENAI_API_DEPLOYMENT`` environment
        variable at call time.
    """
    text_part = {"type": "text", "text": prompt_text}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
    }
    user_message = {"role": "user", "content": [text_part, image_part]}
    request_body = {
        "model": os.getenv("AZURE_OPENAI_API_DEPLOYMENT"),
        "messages": [user_message],
        "max_tokens": 2000,
    }
    return {
        "custom_id": prompt_custom_id,
        "method": "POST",
        "url": "/chat/completions",
        "body": request_body,
    }
|
@@ -0,0 +1,37 @@
|
|
1
|
+
import asyncio
|
2
|
+
|
3
|
+
from pdf2image import convert_from_path
|
4
|
+
|
5
|
+
from parallex.models.image_file import ImageFile
|
6
|
+
from parallex.models.raw_file import RawFile
|
7
|
+
from parallex.utils.logger import logger
|
8
|
+
|
9
|
+
|
10
|
+
async def convert_pdf_to_images(
    raw_file: RawFile, temp_directory: str
) -> list[ImageFile]:
    """Render each page of a PDF to a PNG file in ``temp_directory``.

    The conversion runs in a worker thread via ``asyncio.to_thread`` so the
    event loop is not blocked.

    Args:
        raw_file: Source PDF; its ``path``, ``trace_id`` and ``given_name`` are
            used.
        temp_directory: Output folder for the generated images.

    Returns:
        One ``ImageFile`` per returned image path, with page numbers starting
        at 1 in the order the paths are returned.

    Raises:
        Exception: Any error raised during conversion or model construction is
            logged and then re-raised.
    """
    options = {
        "pdf_path": raw_file.path,
        "output_folder": temp_directory,
        "dpi": 300,
        "fmt": "png",
        "size": (None, 1056),
        "thread_count": 4,
        "use_pdftocairo": True,
        # Ask for file paths rather than loaded image objects.
        "paths_only": True,
    }

    try:
        image_paths = await asyncio.to_thread(convert_from_path, **options)
        return [
            ImageFile(
                path=path,
                trace_id=raw_file.trace_id,
                given_file_name=raw_file.given_name,
                page_number=page_number,
            )
            for page_number, path in enumerate(image_paths, start=1)
        ]
    except Exception as err:
        logger.error(f"Error converting PDF to images: {err}")
        # Swallowing the error would return None, contradicting the declared
        # return type and failing later with an unrelated error.
        raise
|
@@ -0,0 +1,44 @@
|
|
1
|
+
import uuid
|
2
|
+
|
3
|
+
import httpx
|
4
|
+
|
5
|
+
from parallex.file_management.utils import file_in_temp_dir
|
6
|
+
from parallex.models.raw_file import RawFile
|
7
|
+
|
8
|
+
|
9
|
+
# TODO get from URL or from file system
|
10
|
+
async def add_file_to_temp_directory(
    pdf_source_url: str, temp_directory: str
) -> RawFile:
    """Stream the file at ``pdf_source_url`` into ``temp_directory``.

    The stored file is named from a freshly generated UUID, which also serves
    as the trace ID of the returned ``RawFile``.

    Args:
        pdf_source_url: URL to download with an HTTP GET.
        temp_directory: Directory in which the download is saved.

    Returns:
        A ``RawFile`` describing the saved file and where it came from.

    Raises:
        httpx.HTTPStatusError: If the response status indicates an error.
        ValueError: If ``_determine_file_name`` rejects the download.
    """
    trace_id = uuid.uuid4()
    # Last path segment of the URL is treated as the original file name.
    original_name = pdf_source_url.rsplit("/", 1)[-1]

    async with httpx.AsyncClient() as http_client, http_client.stream(
        "GET", pdf_source_url
    ) as response:
        response.raise_for_status()  # Check for HTTP errors
        content_type = response.headers.get("Content-Type")
        stored_name = _determine_file_name(original_name, trace_id, content_type)
        destination = file_in_temp_dir(temp_directory, stored_name)
        with open(destination, "wb") as out_file:
            async for piece in response.aiter_bytes():
                out_file.write(piece)

    raw_file_fields = {
        "name": stored_name,
        "path": destination,
        "content_type": content_type,
        "given_name": original_name,
        "pdf_source_url": pdf_source_url,
        "trace_id": trace_id,
    }
    return RawFile(**raw_file_fields)
|
36
|
+
|
37
|
+
|
38
|
+
def _determine_file_name(given_file_name: str, file_trace_id, content_type: str):
    """Return the name to store a downloaded PDF under: ``<trace_id>.<extension>``.

    Args:
        given_file_name: Original file name; only its extension is used.
        file_trace_id: Identifier used as the stem of the stored name.
        content_type: Value of the response's Content-Type header (may be
            ``None`` when the header is absent).

    Returns:
        ``f"{file_trace_id}.{extension}"``, where ``extension`` is the text
        after the last ``"."`` of ``given_file_name``, or ``"pdf"`` when the
        name has no extension.

    Raises:
        ValueError: If ``content_type`` is missing or is not a PDF type.
    """
    # TODO custom errors
    # TODO other types besides pdf
    if not content_type or "application/pdf" not in content_type:
        raise ValueError("Content-Type must be application/pdf")
    # rpartition copes with names containing several dots ("a.b.pdf") or none
    # at all, both of which broke the two-value unpacking of split(".").
    _, dot, extension = given_file_name.rpartition(".")
    if not dot or not extension:
        # Only PDFs get this far, so default to that extension.
        extension = "pdf"
    return f"{file_trace_id}.{extension}"
|
@@ -0,0 +1,11 @@
|
|
1
|
+
from uuid import UUID
|
2
|
+
|
3
|
+
from pydantic import BaseModel, Field
|
4
|
+
|
5
|
+
|
6
|
+
class BatchFile(BaseModel):
    """Description of a file that was uploaded for batch processing.

    Within this package, instances are built by ``_create_batch_file`` from the
    response returned by ``OpenAIClient.upload``.
    """

    # NOTE(review): populated from the upload response's `id`, so this looks
    # like a file ID rather than a batch ID despite the description — confirm.
    id: str = Field(description="ID of the OpenAI Batch")
    # Populated from the upload response's `filename`.
    name: str = Field(description="Name of file batch was created with")
    purpose: str = Field(description="Purpose 'batch")
    # NOTE(review): populated from the upload response's `status`, i.e. the
    # status of the uploaded file — confirm the description's wording.
    status: str = Field(description="Status of the batch")
    trace_id: UUID = Field(description="Unique trace for each file")
|
@@ -0,0 +1,11 @@
|
|
1
|
+
from uuid import UUID
|
2
|
+
|
3
|
+
from pydantic import BaseModel
|
4
|
+
from pydantic.fields import Field
|
5
|
+
|
6
|
+
|
7
|
+
class ImageFile(BaseModel):
    """One page image produced from a source PDF.

    Within this package, instances are created by ``convert_pdf_to_images``,
    one per image path returned by the conversion.
    """

    path: str = Field(description="Path to the image in temp directory")
    # Assigned starting at 1 by `convert_pdf_to_images`.
    page_number: int = Field(description="Associated page of the PDF")
    given_file_name: str = Field(description="Name of the given file")
    # Copied from the RawFile the image was generated from.
    trace_id: UUID = Field(description="Unique trace for each file")
|
@@ -0,0 +1,12 @@
|
|
1
|
+
from uuid import UUID
|
2
|
+
|
3
|
+
from pydantic import BaseModel, Field
|
4
|
+
|
5
|
+
from parallex.models.page_response import PageResponse
|
6
|
+
|
7
|
+
|
8
|
+
class ParallexCallableOutput(BaseModel):
    """Final result of a ``parallex`` run.

    The ``parallex`` coroutine returns this object and, when a
    ``post_process_callable`` is supplied, passes it to that callable as the
    ``output`` keyword argument.
    """

    file_name: str = Field(description="Name of file that is processed")
    pdf_source_url: str = Field(description="Given URL of the source of output")
    trace_id: UUID = Field(description="Unique trace for each file")
    # `parallex` supplies these sorted by ascending page_number.
    pages: list[PageResponse] = Field(description="List of PageResponse objects")
|
@@ -0,0 +1,12 @@
|
|
1
|
+
from uuid import UUID
|
2
|
+
|
3
|
+
from pydantic import BaseModel, Field
|
4
|
+
|
5
|
+
|
6
|
+
class RawFile(BaseModel):
    """A source file that has been downloaded into the temp directory.

    Within this package, instances are created by
    ``add_file_to_temp_directory``.
    """

    # Built by `_determine_file_name` as "<trace_id>.<extension>".
    name: str = Field(description="Name of the file given by Parallex")
    path: str = Field(description="Path to file in temp directory")
    # Taken from the download response's Content-Type header.
    content_type: str = Field(description="Given file type")
    # Last "/"-separated segment of the source URL.
    given_name: str = Field(description="Name of file given")
    pdf_source_url: str = Field(description="Source of file")
    trace_id: UUID = Field(description="Unique trace for each file")
|
@@ -0,0 +1,45 @@
|
|
1
|
+
from typing import Optional
|
2
|
+
from uuid import UUID
|
3
|
+
|
4
|
+
from openai.types.batch import Errors, Batch
|
5
|
+
from pydantic import BaseModel, Field
|
6
|
+
|
7
|
+
|
8
|
+
class UploadBatch(BaseModel):
    """Local copy of a batch job's state, tagged with a trace ID.

    ``build_batch`` fills every field except ``trace_id`` from the same-named
    attribute of an ``openai.types.batch.Batch``. ``wait_for_batch_completion``
    later overwrites ``output_file_id`` and ``error_file_id`` on each poll.
    """

    # page_number: int = Field(description="Page number of associated file")
    trace_id: UUID = Field(description="Unique trace for each file")
    id: str = Field(description="ID of the OpenAI Batch")
    completion_window: str = Field(description="When batch can complete (24hrs)")
    created_at: int = Field(description="When batch was created")
    endpoint: str = Field(description="Endpoint used for retreival")
    input_file_id: str = Field(description="File that is input to batch")
    output_file_id: Optional[str] = Field(
        None, description="File that is output when batch completes"
    )
    status: str = Field(description="Current status of the batch")
    # The *_at fields below are ints; presumably Unix timestamps — TODO confirm.
    cancelled_at: Optional[int] = Field(None, description="When batch cancelled")
    cancelling_at: Optional[int] = Field(
        None, description="When batch started cancelling"
    )
    completed_at: Optional[int] = Field(None, description="When batch completed")
    expired_at: Optional[int] = Field(None, description="When batch expired")
    expires_at: Optional[int] = Field(None, description="When batch expires")
    failed_at: Optional[int] = Field(None, description="When batch failed")
    finalizing_at: Optional[int] = Field(
        None, description="When batch started finalizing"
    )
    in_progress_at: Optional[int] = Field(
        None, description="When batch started processing"
    )
    error_file_id: Optional[str] = Field(
        None, description="File that is created during error of batch"
    )
    errors: Optional[Errors] = Field(None, description="List of errors")
|
38
|
+
|
39
|
+
|
40
|
+
def build_batch(open_ai_batch: Batch, trace_id: UUID) -> UploadBatch:
    """Create an ``UploadBatch`` from an OpenAI ``Batch`` plus a trace ID.

    Every field declared on ``UploadBatch`` is read from the same-named
    attribute of ``open_ai_batch`` (``None`` when the attribute is missing);
    ``trace_id`` is then set from the argument.
    """
    batch_values = {}
    for field_name in UploadBatch.model_fields:
        batch_values[field_name] = getattr(open_ai_batch, field_name, None)
    # The trace ID comes from the caller, not from the service response.
    batch_values["trace_id"] = trace_id
    return UploadBatch(**batch_values)
|
parallex/parallex.py
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
import asyncio
|
2
|
+
import tempfile
|
3
|
+
from typing import Callable, Optional
|
4
|
+
from uuid import UUID
|
5
|
+
|
6
|
+
from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
|
7
|
+
from parallex.ai.open_ai_client import OpenAIClient
|
8
|
+
from parallex.ai.output_processor import process_output
|
9
|
+
from parallex.ai.uploader import upload_images_for_processing
|
10
|
+
from parallex.file_management.converter import convert_pdf_to_images
|
11
|
+
from parallex.file_management.file_finder import add_file_to_temp_directory
|
12
|
+
from parallex.models.image_file import ImageFile
|
13
|
+
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
14
|
+
from parallex.models.upload_batch import UploadBatch
|
15
|
+
from parallex.utils.constants import DEFAULT_PROMPT
|
16
|
+
from parallex.utils.logger import logger, setup_logger
|
17
|
+
|
18
|
+
|
19
|
+
# TODO pdf_source_url: str change to be URL or path
|
20
|
+
async def parallex(
    model: str,
    pdf_source_url: str,
    post_process_callable: Optional[Callable[..., None]] = None,
    concurrency: int = 20,
    prompt_text: str = DEFAULT_PROMPT,
    log_level: str = "ERROR",
) -> ParallexCallableOutput:
    """Run the full pipeline: download, render, upload, batch and collect pages.

    Steps, all performed inside one temporary directory:
      1. Download the file at ``pdf_source_url``.
      2. Convert it to page images.
      3. Upload the images as batch input files.
      4. Create one batch per uploaded file.
      5. Wait for every batch and parse its output into pages.

    Args:
        model: Passed to ``OpenAIClient``.
        pdf_source_url: URL of the PDF to process.
        post_process_callable: Optional callable invoked with the result as the
            ``output`` keyword argument before returning.
        concurrency: Size of the semaphore used for batch creation and of the
            separate semaphore used while waiting on batches.
        prompt_text: Prompt sent with every page image.
        log_level: Level name handed to ``setup_logger``.

    Returns:
        The processed pages sorted by page number, with file metadata.
    """
    setup_logger(log_level)
    with tempfile.TemporaryDirectory() as temp_directory:
        open_ai_client = OpenAIClient(model=model)

        raw_file = await add_file_to_temp_directory(
            pdf_source_url=pdf_source_url, temp_directory=temp_directory
        )
        trace_id = raw_file.trace_id
        image_files = await convert_pdf_to_images(
            raw_file=raw_file, temp_directory=temp_directory
        )

        batch_files = await upload_images_for_processing(
            client=open_ai_client,
            image_files=image_files,
            temp_directory=temp_directory,
            prompt_text=prompt_text,
        )

        # Stage 1: start a batch job for every uploaded file.
        start_batch_semaphore = asyncio.Semaphore(concurrency)
        start_batch_tasks = [
            asyncio.create_task(
                _create_images_and_batch_jobs(
                    batch_file=file,
                    client=open_ai_client,
                    trace_id=trace_id,
                    semaphore=start_batch_semaphore,
                )
            )
            for file in batch_files
        ]
        batches = await asyncio.gather(*start_batch_tasks)

        # Stage 2: wait for each batch and turn its output into pages.
        process_semaphore = asyncio.Semaphore(concurrency)
        pages_tasks = [
            asyncio.create_task(
                _wait_and_create_pages(
                    batch=batch, client=open_ai_client, semaphore=process_semaphore
                )
            )
            for batch in batches
        ]
        page_groups = await asyncio.gather(*pages_tasks)

        pages = []
        for batch_pages in page_groups:
            pages.extend(batch_pages)
        logger.debug(f"pages done. total pages- {len(pages)} - {trace_id}")
        pages.sort(key=lambda page: page.page_number)

        # TODO add combined version of MD to output / save to file system
        callable_output = ParallexCallableOutput(
            file_name=raw_file.given_name,
            pdf_source_url=raw_file.pdf_source_url,
            trace_id=trace_id,
            pages=pages,
        )
        if post_process_callable is not None:
            post_process_callable(output=callable_output)
        return callable_output
|
85
|
+
|
86
|
+
|
87
|
+
async def _wait_and_create_pages(
    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
):
    """Wait for one batch, parse its output into pages, then delete its files.

    The whole sequence runs while holding ``semaphore``.

    Returns:
        The page responses produced by ``process_output`` for this batch.
    """
    await semaphore.acquire()
    try:
        logger.debug(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
        finished_file_id = await wait_for_batch_completion(client=client, batch=batch)
        logger.debug(f"batch completed - {batch.id} - {batch.trace_id}")
        parsed_pages = await process_output(
            client=client, output_file_id=finished_file_id
        )
        # Remote cleanup happens only after the output has been parsed.
        await _remove_global_batch_files(client=client, batch=batch)
        return parsed_pages
    finally:
        semaphore.release()
|
99
|
+
|
100
|
+
|
101
|
+
async def _remove_global_batch_files(client: OpenAIClient, batch: UploadBatch):
    """Delete the input, output and error files that belong to ``batch``.

    File IDs that are not set are skipped, so no delete request is issued for
    them. Deletion is delegated to ``client.delete_file``.
    """
    file_ids = (batch.input_file_id, batch.output_file_id, batch.error_file_id)
    for file_id in file_ids:
        # output_file_id / error_file_id are optional on UploadBatch; there is
        # nothing to delete when they were never populated.
        if not file_id:
            continue
        await client.delete_file(file_id)
|
105
|
+
|
106
|
+
|
107
|
+
async def _create_images_and_batch_jobs(
    batch_file: ImageFile,
    client: OpenAIClient,
    trace_id: UUID,
    semaphore: asyncio.Semaphore,
):
    """Create a batch for one uploaded file while holding ``semaphore``.

    NOTE(review): ``batch_file`` is annotated as ``ImageFile``, but only its
    ``id`` attribute is read and ``parallex`` passes the objects returned by
    ``upload_images_for_processing`` — the annotation probably should be
    ``BatchFile``; confirm.

    Returns:
        The ``UploadBatch`` produced by ``create_batch``.
    """
    async with semaphore:
        return await create_batch(
            client=client, file_id=batch_file.id, trace_id=trace_id
        )
|
@@ -0,0 +1,9 @@
|
|
1
|
+
DEFAULT_PROMPT = """
|
2
|
+
Convert the following PDF page to markdown.
|
3
|
+
Return only the markdown with no explanation text.
|
4
|
+
Leave out any page numbers and redundant headers or footers.
|
5
|
+
Do not include any code blocks (e.g. "```markdown" or "```") in the response.
|
6
|
+
If unable to parse, return an empty string.
|
7
|
+
"""
|
8
|
+
|
9
|
+
CUSTOM_ID_DELINEATOR = "--page--"
|
parallex/utils/logger.py
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
from aiologger import Logger
|
4
|
+
|
5
|
+
logger = Logger.with_default_handlers(name="parallex")
|
6
|
+
|
7
|
+
|
8
|
+
def setup_logger(level: str = "ERROR"):
    """Configure standard-library logging from a level name.

    Recognised names are CRITICAL, ERROR, WARNING, INFO, DEBUG and NOTSET; any
    other value falls back to INFO.

    NOTE(review): this calls ``logging.basicConfig`` only; whether it also
    affects the module-level aiologger ``logger`` is not evident here — confirm.
    """
    known_names = ("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET")
    levels_by_name = {name: getattr(logging, name) for name in known_names}
    resolved_level = levels_by_name.get(level, logging.INFO)

    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=resolved_level, format=log_format)
|
@@ -0,0 +1,19 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
SOFTWARE.
|
@@ -0,0 +1,42 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: parallex
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary:
|
5
|
+
Author: Jeff Hostetler
|
6
|
+
Author-email: jeff@summed.ai
|
7
|
+
Requires-Python: >=3.12,<4.0
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
10
|
+
Classifier: Programming Language :: Python :: 3.13
|
11
|
+
Requires-Dist: aiologger (>=0.7.0,<0.8.0)
|
12
|
+
Requires-Dist: asyncio (>=3.4.3,<4.0.0)
|
13
|
+
Requires-Dist: httpx (>=0.27.2,<0.28.0)
|
14
|
+
Requires-Dist: openai (>=1.54.4,<2.0.0)
|
15
|
+
Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
16
|
+
Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
17
|
+
Description-Content-Type: text/markdown
|
18
|
+
|
19
|
+
# Parallex
|
20
|
+
|
21
|
+
### What it does
|
22
|
+
- Converts file into images
|
23
|
+
- Makes requests to OpenAI to convert the images to markdown
|
24
|
+
- [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
|
25
|
+
- [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
|
26
|
+
- Post batch processing to do what you wish with the resulting markdown
|
27
|
+
|
28
|
+
|
29
|
+
# Notes for us as we build
|
30
|
+
### Poetry
|
31
|
+
- Using [poetry](https://python-poetry.org/docs/) for dependency management
|
32
|
+
- add dependency `poetry add pydantic`
|
33
|
+
- add dev dependency `poetry add --group dev black`
|
34
|
+
- run main script `poetry run python main.py`
|
35
|
+
- run dev commands `poetry run black parallex`
|
36
|
+
|
37
|
+
|
38
|
+
# General behavior
|
39
|
+
- parallex takes args to do things with file
|
40
|
+
- parallex takes args to specify llm model
|
41
|
+
- parallex takes a callable to execute once batch process is "ready"
|
42
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
parallex/ai/batch_processor.py,sha256=wftq7-EKWbWO_tzz2PqVZa8XRVf7MoJlr7EcllX4-5I,1481
|
3
|
+
parallex/ai/open_ai_client.py,sha256=Yvnvg5MGEyQrmN3HF5k8fEWse9Slthy3J-oumO6ZKkQ,1459
|
4
|
+
parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
|
5
|
+
parallex/ai/uploader.py,sha256=M8g8dC_bwiGNDI_S5qxcRqJljDu6KSan_eIcQWA-ERA,3162
|
6
|
+
parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
|
7
|
+
parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
|
8
|
+
parallex/file_management/utils.py,sha256=WMdXd9UOFbJDHnL2IWfDXyyD2jhwnGtpCVI_npiSlIk,98
|
9
|
+
parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M,405
|
10
|
+
parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
|
11
|
+
parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaBbt0,228
|
12
|
+
parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
|
13
|
+
parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
|
14
|
+
parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
|
15
|
+
parallex/parallex.py,sha256=2XXmG54eXtXnw2ElC12zjbWDnwDIHPgzKY1ktP8V93M,4472
|
16
|
+
parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
|
17
|
+
parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
|
18
|
+
parallex-0.1.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
|
19
|
+
parallex-0.1.0.dist-info/METADATA,sha256=ICDxk_FnofGhJgeGXv_g-awaZm_g4zU23PkfuXOndNs,1485
|
20
|
+
parallex-0.1.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
21
|
+
parallex-0.1.0.dist-info/RECORD,,
|