parallex 0.1.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- parallex/__init__.py +0 -0
- parallex/ai/batch_processor.py +42 -0
- parallex/ai/open_ai_client.py +43 -0
- parallex/ai/output_processor.py +25 -0
- parallex/ai/uploader.py +91 -0
- parallex/file_management/converter.py +37 -0
- parallex/file_management/file_finder.py +44 -0
- parallex/file_management/utils.py +2 -0
- parallex/models/batch_file.py +11 -0
- parallex/models/image_file.py +11 -0
- parallex/models/page_response.py +6 -0
- parallex/models/parallex_callable_output.py +12 -0
- parallex/models/raw_file.py +12 -0
- parallex/models/upload_batch.py +45 -0
- parallex/parallex.py +117 -0
- parallex/utils/constants.py +9 -0
- parallex/utils/logger.py +20 -0
- parallex-0.1.0.dist-info/LICENSE +19 -0
- parallex-0.1.0.dist-info/METADATA +42 -0
- parallex-0.1.0.dist-info/RECORD +21 -0
- parallex-0.1.0.dist-info/WHEEL +4 -0
parallex/__init__.py
ADDED
File without changes
|
@@ -0,0 +1,42 @@
|
|
1
|
+
import asyncio
|
2
|
+
from uuid import UUID
|
3
|
+
|
4
|
+
from openai import BadRequestError
|
5
|
+
|
6
|
+
from parallex.ai.open_ai_client import OpenAIClient
|
7
|
+
from parallex.models.upload_batch import build_batch, UploadBatch
|
8
|
+
|
9
|
+
|
10
|
+
async def create_batch(
    client: OpenAIClient,
    file_id: str,
    trace_id: UUID,
    max_retries: int = 10,
    backoff_delay: float = 5,
) -> UploadBatch:
    """Create a batch job for an uploaded file, retrying on ``BadRequestError``.

    Args:
        client: Client used to issue the batch-creation request.
        file_id: ID of the previously uploaded input file.
        trace_id: Trace identifier stored on the returned ``UploadBatch``.
        max_retries: Total number of attempts before giving up (at least 1).
        backoff_delay: Seconds to wait after the first failed attempt; the
            wait doubles after every subsequent failure.

    Returns:
        The ``UploadBatch`` built from the service response.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        BadRequestError: If the final attempt is still rejected.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    delay = backoff_delay
    for attempt in range(1, max_retries + 1):
        try:
            batch_response = await client.create_batch(upload_file_id=file_id)
        except BadRequestError:
            # NOTE(review): presumably the service rejects creation transiently
            # (e.g. quota or file not yet processed) - confirm which cases
            # are actually worth retrying.
            if attempt == max_retries:
                raise
            await asyncio.sleep(delay)
            delay *= 2
        else:
            return build_batch(open_ai_batch=batch_response, trace_id=trace_id)
|
27
|
+
|
28
|
+
|
29
|
+
# TODO handle errors
|
30
|
+
async def wait_for_batch_completion(
    client: OpenAIClient,
    batch: UploadBatch,
    initial_delay: float = 5,
    poll_delay: float = 30,
) -> str:
    """Poll a batch until it reaches a terminal status.

    On every poll the batch's ``output_file_id`` and ``error_file_id`` are
    refreshed from the service response.

    Args:
        client: Client used to retrieve the batch.
        batch: Batch to wait for; mutated in place with the latest file ids.
        initial_delay: Seconds to wait before the first poll.
        poll_delay: Seconds to wait between subsequent polls.

    Returns:
        The ``output_file_id`` reported by the completed batch.

    Raises:
        RuntimeError: If the batch ends in any terminal status other than
            ``completed``.
    """
    # "cancelled" and "expired" are terminal too; without them a cancelled or
    # expired batch would be polled forever. "canceled" is kept for
    # backward compatibility with the previous spelling.
    terminal_statuses = ("completed", "failed", "expired", "cancelled", "canceled")

    delay = initial_delay
    while True:
        await asyncio.sleep(delay)
        batch_response = await client.retrieve_batch(batch.id)
        batch.output_file_id = batch_response.output_file_id
        batch.error_file_id = batch_response.error_file_id
        if batch_response.status in terminal_statuses:
            break
        delay = poll_delay

    if batch_response.status != "completed":
        # Fail loudly here instead of returning None and letting the caller
        # try to download a non-existent output file.
        raise RuntimeError(
            f"Batch {batch.id} ended with status '{batch_response.status}'"
        )
    return batch_response.output_file_id
|
@@ -0,0 +1,43 @@
|
|
1
|
+
import os
|
2
|
+
|
3
|
+
from openai import AsyncAzureOpenAI
|
4
|
+
from openai._legacy_response import HttpxBinaryResponseContent
|
5
|
+
from openai.types import FileObject, Batch, FileDeleted
|
6
|
+
|
7
|
+
from parallex.utils.logger import logger
|
8
|
+
|
9
|
+
|
10
|
+
# Exceptions for missing keys, etc
|
11
|
+
class OpenAIClient:
    """Thin async wrapper around ``AsyncAzureOpenAI`` for file and batch calls."""

    def __init__(self, model: str):
        """Store ``model`` and build the Azure client from environment variables.

        Reads ``AZURE_OPENAI_ENDPOINT``, ``AZURE_OPENAI_API_KEY`` and
        ``AZURE_OPENAI_API_VERSION``.
        """
        self.model = model

        self._client = AsyncAzureOpenAI(
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        )

    async def upload(self, file_path: str) -> FileObject:
        """Upload the file at ``file_path`` with purpose ``"batch"``."""
        # Use a context manager so the handle is closed even if the upload
        # raises; previously the file object was never closed.
        with open(file_path, "rb") as upload_handle:
            return await self._client.files.create(
                file=upload_handle, purpose="batch"
            )

    async def create_batch(self, upload_file_id: str) -> Batch:
        """Create a ``/chat/completions`` batch over the uploaded file."""
        return await self._client.batches.create(
            input_file_id=upload_file_id,
            endpoint="/chat/completions",
            completion_window="24h",
        )

    async def retrieve_batch(self, batch_id: str) -> Batch:
        """Fetch the current state of the batch with ``batch_id``."""
        return await self._client.batches.retrieve(batch_id)

    async def retrieve_file(self, file_id: str) -> HttpxBinaryResponseContent:
        """Download the content of the file with ``file_id``."""
        return await self._client.files.content(file_id)

    async def delete_file(self, file_id: str) -> FileDeleted | None:
        """Delete a remote file on a best-effort basis.

        Returns the deletion result, or ``None`` when the call failed; the
        failure is logged at info level rather than raised.
        """
        try:
            return await self._client.files.delete(file_id)
        except Exception as e:
            logger.info(f"Did not delete file: {e}")
            return None
|
@@ -0,0 +1,25 @@
|
|
1
|
+
import json
|
2
|
+
|
3
|
+
from parallex.ai.open_ai_client import OpenAIClient
|
4
|
+
from parallex.models.page_response import PageResponse
|
5
|
+
from parallex.utils.constants import CUSTOM_ID_DELINEATOR
|
6
|
+
|
7
|
+
|
8
|
+
async def process_output(
    client: OpenAIClient, output_file_id: str
) -> list[PageResponse]:
    """Download a completed batch's output and turn each line into a PageResponse.

    Args:
        client: Client used to download the output file.
        output_file_id: ID of the batch output file (one JSON document per line).

    Returns:
        One ``PageResponse`` per non-blank output line, in file order.
    """
    file_response = await client.retrieve_file(output_file_id)

    pages = []
    # Split on "\n" only (not splitlines) so unusual line-separator characters
    # inside a JSON string cannot break a record apart.
    for raw_response in file_response.text.split("\n"):
        if not raw_response.strip():
            # Empty file or trailing newline: nothing to parse.
            continue
        json_response = json.loads(raw_response)
        # custom_id has the form "<trace_id><delineator><page_number>.jsonl".
        custom_id = json_response["custom_id"]
        page_number = custom_id.split(CUSTOM_ID_DELINEATOR)[1].split(".")[0]
        message = json_response["response"]["body"]["choices"][0]["message"]
        pages.append(
            PageResponse(
                output_content=message["content"], page_number=int(page_number)
            )
        )
    return pages
|
parallex/ai/uploader.py
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
import base64
|
2
|
+
import json
|
3
|
+
import os
|
4
|
+
|
5
|
+
from parallex.ai.open_ai_client import OpenAIClient
|
6
|
+
from parallex.file_management.utils import file_in_temp_dir
|
7
|
+
from parallex.models.batch_file import BatchFile
|
8
|
+
from parallex.models.image_file import ImageFile
|
9
|
+
from parallex.utils.constants import CUSTOM_ID_DELINEATOR
|
10
|
+
|
11
|
+
MAX_FILE_SIZE = 150 * 1024 * 1024 # 150 MB in bytes
|
12
|
+
|
13
|
+
|
14
|
+
async def upload_images_for_processing(
    client: OpenAIClient,
    image_files: list[ImageFile],
    temp_directory: str,
    prompt_text: str,
) -> list[BatchFile]:
    """Base64-encode page images into JSONL request files and upload them.

    Requests are appended to a JSONL file in ``temp_directory``. Once that
    file has grown past ``MAX_FILE_SIZE`` it is uploaded and a new file is
    started; the last file is uploaded after all images are written.

    Args:
        client: Client used to upload the JSONL files.
        image_files: Page images to send; all are tagged with the trace id
            of the first entry.
        temp_directory: Directory in which the JSONL files are written.
        prompt_text: Prompt sent along with every image.

    Returns:
        One ``BatchFile`` per uploaded JSONL file; empty when there are no
        images.
    """
    if not image_files:
        # Nothing to encode; avoid indexing an empty list and uploading a
        # file that was never written.
        return []

    trace_id = image_files[0].trace_id

    def _chunk_location(index: int) -> str:
        # Single place for the naming scheme so every chunk is named alike.
        return file_in_temp_dir(
            directory=temp_directory, file_name=f"image-{trace_id}-{index}.jsonl"
        )

    current_index = 0
    batch_files = []
    upload_file_location = _chunk_location(current_index)

    for image_file in image_files:
        if (
            os.path.exists(upload_file_location)
            and os.path.getsize(upload_file_location) > MAX_FILE_SIZE
        ):
            # Current file passed the size threshold: upload it and start a
            # new one for the remaining images.
            batch_files.append(
                await _create_batch_file(client, trace_id, upload_file_location)
            )
            current_index += 1
            upload_file_location = _chunk_location(current_index)

        with open(image_file.path, "rb") as image:
            base64_encoded_image = base64.b64encode(image.read()).decode("utf-8")

        prompt_custom_id = (
            f"{image_file.trace_id}{CUSTOM_ID_DELINEATOR}{image_file.page_number}.jsonl"
        )
        jsonl = _jsonl_format(prompt_custom_id, base64_encoded_image, prompt_text)
        with open(upload_file_location, "a", encoding="utf-8") as jsonl_file:
            jsonl_file.write(json.dumps(jsonl) + "\n")

    batch_files.append(
        await _create_batch_file(client, trace_id, upload_file_location)
    )
    return batch_files
|
55
|
+
|
56
|
+
|
57
|
+
async def _create_batch_file(client, trace_id, upload_file_location):
    """Upload the file at ``upload_file_location`` and wrap the response.

    Returns a ``BatchFile`` carrying the uploaded file's id, filename,
    purpose and status together with ``trace_id``.
    """
    uploaded = await client.upload(upload_file_location)
    batch_file_fields = {
        "id": uploaded.id,
        "name": uploaded.filename,
        "purpose": uploaded.purpose,
        "status": uploaded.status,
        "trace_id": trace_id,
    }
    return BatchFile(**batch_file_fields)
|
66
|
+
|
67
|
+
|
68
|
+
def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
|
69
|
+
return {
|
70
|
+
"custom_id": prompt_custom_id,
|
71
|
+
"method": "POST",
|
72
|
+
"url": "/chat/completions",
|
73
|
+
"body": {
|
74
|
+
"model": os.getenv("AZURE_OPENAI_API_DEPLOYMENT"),
|
75
|
+
"messages": [
|
76
|
+
{
|
77
|
+
"role": "user",
|
78
|
+
"content": [
|
79
|
+
{"type": "text", "text": prompt_text},
|
80
|
+
{
|
81
|
+
"type": "image_url",
|
82
|
+
"image_url": {
|
83
|
+
"url": f"data:image/png;base64,{encoded_image}"
|
84
|
+
},
|
85
|
+
},
|
86
|
+
],
|
87
|
+
}
|
88
|
+
],
|
89
|
+
"max_tokens": 2000,
|
90
|
+
},
|
91
|
+
}
|
@@ -0,0 +1,37 @@
|
|
1
|
+
import asyncio
|
2
|
+
|
3
|
+
from pdf2image import convert_from_path
|
4
|
+
|
5
|
+
from parallex.models.image_file import ImageFile
|
6
|
+
from parallex.models.raw_file import RawFile
|
7
|
+
from parallex.utils.logger import logger
|
8
|
+
|
9
|
+
|
10
|
+
async def convert_pdf_to_images(
    raw_file: RawFile, temp_directory: str
) -> list[ImageFile]:
    """Render a PDF into PNG page images inside ``temp_directory``.

    The conversion runs in a worker thread via ``asyncio.to_thread`` so the
    event loop is not blocked.

    Args:
        raw_file: The downloaded PDF to convert.
        temp_directory: Folder that receives the generated images.

    Returns:
        One ``ImageFile`` per returned image path, numbered from 1.

    Raises:
        Exception: Any conversion error is logged and then re-raised.
    """
    options = {
        "pdf_path": raw_file.path,
        "output_folder": temp_directory,
        "dpi": 300,
        "fmt": "png",
        "size": (None, 1056),
        "thread_count": 4,
        "use_pdftocairo": True,
        "paths_only": True,
    }

    try:
        image_paths = await asyncio.to_thread(convert_from_path, **options)
    except Exception as err:
        logger.error(f"Error converting PDF to images: {err}")
        # Re-raise: returning None here would only surface later as an
        # unrelated error in the caller, which expects a list.
        raise

    # NOTE(review): assumes convert_from_path returns paths in page order.
    return [
        ImageFile(
            path=path,
            trace_id=raw_file.trace_id,
            given_file_name=raw_file.given_name,
            page_number=page_number,
        )
        for page_number, path in enumerate(image_paths, start=1)
    ]
|
@@ -0,0 +1,44 @@
|
|
1
|
+
import uuid
|
2
|
+
|
3
|
+
import httpx
|
4
|
+
|
5
|
+
from parallex.file_management.utils import file_in_temp_dir
|
6
|
+
from parallex.models.raw_file import RawFile
|
7
|
+
|
8
|
+
|
9
|
+
# TODO get from URL or from file system
|
10
|
+
async def add_file_to_temp_directory(
    pdf_source_url: str, temp_directory: str
) -> RawFile:
    """Download the file at ``pdf_source_url`` into ``temp_directory``.

    Args:
        pdf_source_url: URL the file is streamed from.
        temp_directory: Directory the downloaded file is written to.

    Returns:
        A ``RawFile`` describing the stored file, tagged with a new trace id.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        ValueError: If the response is not ``application/pdf``.
    """
    from urllib.parse import urlsplit

    # Take the name from the URL path only, so a query string or fragment
    # (e.g. a signed-URL token) does not end up in the file name/extension.
    given_file_name = urlsplit(pdf_source_url).path.split("/")[-1]
    file_trace_id = uuid.uuid4()
    async with httpx.AsyncClient() as client:
        async with client.stream("GET", pdf_source_url) as response:
            response.raise_for_status()  # Check for HTTP errors
            # Default to "" so a missing header is reported as a wrong
            # content type instead of failing on None.
            content_type = response.headers.get("Content-Type", "")
            file_name = _determine_file_name(
                given_file_name, file_trace_id, content_type
            )
            path = file_in_temp_dir(temp_directory, file_name)
            with open(path, "wb") as file:
                async for chunk in response.aiter_bytes():
                    file.write(chunk)

    return RawFile(
        name=file_name,
        path=path,
        content_type=content_type,
        given_name=given_file_name,
        pdf_source_url=pdf_source_url,
        trace_id=file_trace_id,
    )
|
36
|
+
|
37
|
+
|
38
|
+
def _determine_file_name(given_file_name: str, file_trace_id, content_type: str):
|
39
|
+
# TODO custom errors
|
40
|
+
# TODO other types besides pdf
|
41
|
+
name, extension = given_file_name.split(".")
|
42
|
+
if "application/pdf" not in content_type:
|
43
|
+
raise ValueError("Content-Type must be application/pdf")
|
44
|
+
return f"{file_trace_id}.{extension}"
|
@@ -0,0 +1,11 @@
|
|
1
|
+
from uuid import UUID
|
2
|
+
|
3
|
+
from pydantic import BaseModel, Field
|
4
|
+
|
5
|
+
|
6
|
+
class BatchFile(BaseModel):
    # Describes a JSONL request file after it has been uploaded; the fields
    # mirror the upload response (id, filename, purpose, status).
    id: str = Field(description="ID of the uploaded file")
    name: str = Field(description="Name of file batch was created with")
    purpose: str = Field(description="Purpose of the uploaded file ('batch')")
    status: str = Field(description="Status of the uploaded file")
    trace_id: UUID = Field(description="Unique trace for each file")
|
@@ -0,0 +1,11 @@
|
|
1
|
+
from uuid import UUID
|
2
|
+
|
3
|
+
from pydantic import BaseModel
|
4
|
+
from pydantic.fields import Field
|
5
|
+
|
6
|
+
|
7
|
+
class ImageFile(BaseModel):
    # One rendered page of a source PDF, stored as an image on disk.

    # Where the image lives and which page of the PDF it shows.
    path: str = Field(description="Path to the image in temp directory")
    page_number: int = Field(description="Associated page of the PDF")

    # Provenance of the page: original file name and per-file trace id.
    given_file_name: str = Field(description="Name of the given file")
    trace_id: UUID = Field(description="Unique trace for each file")
|
@@ -0,0 +1,12 @@
|
|
1
|
+
from uuid import UUID
|
2
|
+
|
3
|
+
from pydantic import BaseModel, Field
|
4
|
+
|
5
|
+
from parallex.models.page_response import PageResponse
|
6
|
+
|
7
|
+
|
8
|
+
class ParallexCallableOutput(BaseModel):
    # Final result of a run: identifies the processed file and carries the
    # per-page responses. Passed to the post-process callable and returned.

    # Identification of the processed source file.
    file_name: str = Field(description="Name of file that is processed")
    pdf_source_url: str = Field(description="Given URL of the source of output")
    trace_id: UUID = Field(description="Unique trace for each file")

    # Generated content, one entry per page.
    pages: list[PageResponse] = Field(description="List of PageResponse objects")
|
@@ -0,0 +1,12 @@
|
|
1
|
+
from uuid import UUID
|
2
|
+
|
3
|
+
from pydantic import BaseModel, Field
|
4
|
+
|
5
|
+
|
6
|
+
class RawFile(BaseModel):
    # A downloaded source file as stored in the temp directory.

    # Local storage: generated file name and its full path.
    name: str = Field(description="Name of the file given by Parallex")
    path: str = Field(description="Path to file in temp directory")

    # What was downloaded and where it came from.
    content_type: str = Field(description="Given file type")
    given_name: str = Field(description="Name of file given")
    pdf_source_url: str = Field(description="Source of file")

    # Identifier shared by everything derived from this file.
    trace_id: UUID = Field(description="Unique trace for each file")
|
@@ -0,0 +1,45 @@
|
|
1
|
+
from typing import Optional
|
2
|
+
from uuid import UUID
|
3
|
+
|
4
|
+
from openai.types.batch import Errors, Batch
|
5
|
+
from pydantic import BaseModel, Field
|
6
|
+
|
7
|
+
|
8
|
+
class UploadBatch(BaseModel):
    # Local record of a batch job. Apart from trace_id, the field names match
    # attributes read off the service's Batch object by build_batch.
    trace_id: UUID = Field(description="Unique trace for each file")
    id: str = Field(description="ID of the OpenAI Batch")
    completion_window: str = Field(description="When batch can complete (24hrs)")
    created_at: int = Field(description="When batch was created")
    endpoint: str = Field(description="Endpoint the batch requests are sent to")
    input_file_id: str = Field(description="File that is input to batch")
    output_file_id: Optional[str] = Field(
        None, description="File that is output when batch completes"
    )
    status: str = Field(description="Current status of the batch")

    # Lifecycle timestamps; each stays None until the service reports it.
    cancelled_at: Optional[int] = Field(None, description="When batch cancelled")
    cancelling_at: Optional[int] = Field(
        None, description="When batch started cancelling"
    )
    completed_at: Optional[int] = Field(None, description="When batch completed")
    expired_at: Optional[int] = Field(None, description="When batch expired")
    expires_at: Optional[int] = Field(None, description="When batch expires")
    failed_at: Optional[int] = Field(None, description="When batch failed")
    finalizing_at: Optional[int] = Field(
        None, description="When batch started finalizing"
    )
    in_progress_at: Optional[int] = Field(
        None, description="When batch started processing"
    )

    # Error reporting.
    error_file_id: Optional[str] = Field(
        None, description="File that is created during error of batch"
    )
    errors: Optional[Errors] = Field(None, description="List of errors")
|
38
|
+
|
39
|
+
|
40
|
+
def build_batch(open_ai_batch: Batch, trace_id: UUID) -> UploadBatch:
    """Copy the service's batch object into an ``UploadBatch``.

    Every field declared on ``UploadBatch`` is read from ``open_ai_batch``
    (``None`` when the attribute is absent); ``trace_id`` is then set from
    the argument.
    """
    batch_values = {}
    for field_name in UploadBatch.model_fields:
        batch_values[field_name] = getattr(open_ai_batch, field_name, None)
    # The service object carries no trace id; always take the caller's.
    batch_values["trace_id"] = trace_id
    return UploadBatch(**batch_values)
|
parallex/parallex.py
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
import asyncio
|
2
|
+
import tempfile
|
3
|
+
from typing import Callable, Optional
|
4
|
+
from uuid import UUID
|
5
|
+
|
6
|
+
from parallex.ai.batch_processor import wait_for_batch_completion, create_batch
|
7
|
+
from parallex.ai.open_ai_client import OpenAIClient
|
8
|
+
from parallex.ai.output_processor import process_output
|
9
|
+
from parallex.ai.uploader import upload_images_for_processing
|
10
|
+
from parallex.file_management.converter import convert_pdf_to_images
|
11
|
+
from parallex.file_management.file_finder import add_file_to_temp_directory
|
12
|
+
from parallex.models.image_file import ImageFile
|
13
|
+
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
14
|
+
from parallex.models.upload_batch import UploadBatch
|
15
|
+
from parallex.utils.constants import DEFAULT_PROMPT
|
16
|
+
from parallex.utils.logger import logger, setup_logger
|
17
|
+
|
18
|
+
|
19
|
+
# TODO pdf_source_url: str change to be URL or path
|
20
|
+
async def parallex(
    model: str,
    pdf_source_url: str,
    post_process_callable: Optional[Callable[..., None]] = None,
    concurrency: int = 20,
    prompt_text: str = DEFAULT_PROMPT,
    log_level: str = "ERROR",
) -> ParallexCallableOutput:
    """Run the whole pipeline for one PDF and return the per-page output.

    Steps: download the PDF into a temporary directory, render it to page
    images, upload the images as batch request files, start one batch per
    uploaded file, wait for the batches, and collect the pages sorted by
    page number.

    Args:
        model: Model name handed to ``OpenAIClient``.
        pdf_source_url: URL of the PDF to process.
        post_process_callable: Optional callable invoked as
            ``post_process_callable(output=<result>)`` before returning.
        concurrency: Size of the semaphores limiting concurrent batch
            creation and concurrent batch waiting.
        prompt_text: Prompt sent with every page image.
        log_level: Level name passed to ``setup_logger``.

    Returns:
        The ``ParallexCallableOutput`` describing the file and its pages.
    """
    setup_logger(log_level)
    with tempfile.TemporaryDirectory() as temp_directory:
        open_ai_client = OpenAIClient(model=model)

        # Download and render the source document.
        raw_file = await add_file_to_temp_directory(
            pdf_source_url=pdf_source_url, temp_directory=temp_directory
        )
        trace_id = raw_file.trace_id
        image_files = await convert_pdf_to_images(
            raw_file=raw_file, temp_directory=temp_directory
        )

        # Upload request files, then start one batch per uploaded file.
        batch_files = await upload_images_for_processing(
            client=open_ai_client,
            image_files=image_files,
            temp_directory=temp_directory,
            prompt_text=prompt_text,
        )
        start_batch_semaphore = asyncio.Semaphore(concurrency)
        start_batch_tasks = [
            asyncio.create_task(
                _create_images_and_batch_jobs(
                    batch_file=uploaded_file,
                    client=open_ai_client,
                    trace_id=trace_id,
                    semaphore=start_batch_semaphore,
                )
            )
            for uploaded_file in batch_files
        ]
        batches = await asyncio.gather(*start_batch_tasks)

        # Wait for every batch and parse its output into pages.
        process_semaphore = asyncio.Semaphore(concurrency)
        pages_tasks = [
            asyncio.create_task(
                _wait_and_create_pages(
                    batch=started_batch,
                    client=open_ai_client,
                    semaphore=process_semaphore,
                )
            )
            for started_batch in batches
        ]
        page_groups = await asyncio.gather(*pages_tasks)

        # Flatten the per-batch page lists and order them by page number.
        pages = []
        for batch_pages in page_groups:
            pages.extend(batch_pages)
        logger.debug(f"pages done. total pages- {len(pages)} - {trace_id}")
        pages.sort(key=lambda page: page.page_number)

        # TODO add combined version of MD to output / save to file system
        callable_output = ParallexCallableOutput(
            file_name=raw_file.given_name,
            pdf_source_url=raw_file.pdf_source_url,
            trace_id=trace_id,
            pages=pages,
        )
        if post_process_callable is not None:
            post_process_callable(output=callable_output)
        return callable_output
|
85
|
+
|
86
|
+
|
87
|
+
async def _wait_and_create_pages(
    batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
):
    """Wait for ``batch``, parse its output into pages, then delete its files.

    All work happens while holding ``semaphore``. Returns the list produced
    by ``process_output``.
    """
    async with semaphore:
        logger.debug(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
        completed_output_id = await wait_for_batch_completion(
            client=client, batch=batch
        )
        logger.debug(f"batch completed - {batch.id} - {batch.trace_id}")

        pages = await process_output(
            client=client, output_file_id=completed_output_id
        )
        # Remote files are removed only after their content has been parsed.
        await _remove_global_batch_files(client=client, batch=batch)
    return pages
|
99
|
+
|
100
|
+
|
101
|
+
async def _remove_global_batch_files(client: OpenAIClient, batch: UploadBatch):
|
102
|
+
file_ids = [batch.input_file_id, batch.output_file_id, batch.error_file_id]
|
103
|
+
for file_id in file_ids:
|
104
|
+
await client.delete_file(file_id)
|
105
|
+
|
106
|
+
|
107
|
+
async def _create_images_and_batch_jobs(
    batch_file: ImageFile,
    client: OpenAIClient,
    trace_id: UUID,
    semaphore: asyncio.Semaphore,
):
    """Create the batch job for one uploaded file while holding ``semaphore``.

    Returns the value produced by ``create_batch``.
    """
    # NOTE(review): the parameter is annotated ImageFile, but only ``.id`` is
    # read and the caller passes the uploaded-file objects returned by
    # upload_images_for_processing - the annotation looks wrong; confirm.
    async with semaphore:
        return await create_batch(
            client=client, file_id=batch_file.id, trace_id=trace_id
        )
|
@@ -0,0 +1,9 @@
|
|
1
|
+
DEFAULT_PROMPT = """
|
2
|
+
Convert the following PDF page to markdown.
|
3
|
+
Return only the markdown with no explanation text.
|
4
|
+
Leave out any page numbers and redundant headers or footers.
|
5
|
+
Do not include any code blocks (e.g. "```markdown" or "```") in the response.
|
6
|
+
If unable to parse, return an empty string.
|
7
|
+
"""
|
8
|
+
|
9
|
+
CUSTOM_ID_DELINEATOR = "--page--"
|
parallex/utils/logger.py
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
from aiologger import Logger
|
4
|
+
|
5
|
+
logger = Logger.with_default_handlers(name="parallex")
|
6
|
+
|
7
|
+
|
8
|
+
def setup_logger(level: str = "ERROR"):
    """Configure stdlib logging from a level name and return the level used.

    Args:
        level: Level name such as ``"DEBUG"`` or ``"error"``; matching is
            case-insensitive. Unknown names fall back to ``logging.INFO``.

    Returns:
        The numeric logging level passed to ``logging.basicConfig``.
    """
    levels = {
        "CRITICAL": logging.CRITICAL,
        "ERROR": logging.ERROR,
        "WARNING": logging.WARNING,
        "INFO": logging.INFO,
        "DEBUG": logging.DEBUG,
        "NOTSET": logging.NOTSET,
    }
    # Normalise case so "debug" is not silently treated as an unknown name.
    resolved_level = levels.get(str(level).upper(), logging.INFO)

    # NOTE(review): basicConfig configures the stdlib root logger and does
    # nothing once that logger has handlers; confirm it has the intended
    # effect on this package's own logger object.
    logging.basicConfig(
        level=resolved_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    return resolved_level
|
@@ -0,0 +1,19 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
SOFTWARE.
|
@@ -0,0 +1,42 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: parallex
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary:
|
5
|
+
Author: Jeff Hostetler
|
6
|
+
Author-email: jeff@summed.ai
|
7
|
+
Requires-Python: >=3.12,<4.0
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
10
|
+
Classifier: Programming Language :: Python :: 3.13
|
11
|
+
Requires-Dist: aiologger (>=0.7.0,<0.8.0)
|
12
|
+
Requires-Dist: asyncio (>=3.4.3,<4.0.0)
|
13
|
+
Requires-Dist: httpx (>=0.27.2,<0.28.0)
|
14
|
+
Requires-Dist: openai (>=1.54.4,<2.0.0)
|
15
|
+
Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
16
|
+
Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
17
|
+
Description-Content-Type: text/markdown
|
18
|
+
|
19
|
+
# Parallex
|
20
|
+
|
21
|
+
### What it does
|
22
|
+
- Converts file into images
|
23
|
+
- Makes requests to OpenAI to convert the images to markdown
|
24
|
+
- [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
|
25
|
+
- [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
|
26
|
+
- Post batch processing to do what you wish with the resulting markdown
|
27
|
+
|
28
|
+
|
29
|
+
# Notes for us as we build
|
30
|
+
### Poetry
|
31
|
+
- Using [poetry](https://python-poetry.org/docs/) for dependency management
|
32
|
+
- add dependency `poetry add pydantic`
|
33
|
+
- add dev dependency `poetry add --group dev black`
|
34
|
+
- run main script `poetry run python main.py`
|
35
|
+
- run dev commands `poetry run black parallex`
|
36
|
+
|
37
|
+
|
38
|
+
# General behavior
|
39
|
+
- parallex takes args to do things with file
|
40
|
+
- parallex takes args to specify llm model
|
41
|
+
- parallex takes a callable to execute once batch process is "ready"
|
42
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
parallex/ai/batch_processor.py,sha256=wftq7-EKWbWO_tzz2PqVZa8XRVf7MoJlr7EcllX4-5I,1481
|
3
|
+
parallex/ai/open_ai_client.py,sha256=Yvnvg5MGEyQrmN3HF5k8fEWse9Slthy3J-oumO6ZKkQ,1459
|
4
|
+
parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
|
5
|
+
parallex/ai/uploader.py,sha256=M8g8dC_bwiGNDI_S5qxcRqJljDu6KSan_eIcQWA-ERA,3162
|
6
|
+
parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
|
7
|
+
parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
|
8
|
+
parallex/file_management/utils.py,sha256=WMdXd9UOFbJDHnL2IWfDXyyD2jhwnGtpCVI_npiSlIk,98
|
9
|
+
parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M,405
|
10
|
+
parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
|
11
|
+
parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaBbt0,228
|
12
|
+
parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
|
13
|
+
parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
|
14
|
+
parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
|
15
|
+
parallex/parallex.py,sha256=2XXmG54eXtXnw2ElC12zjbWDnwDIHPgzKY1ktP8V93M,4472
|
16
|
+
parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
|
17
|
+
parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
|
18
|
+
parallex-0.1.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
|
19
|
+
parallex-0.1.0.dist-info/METADATA,sha256=ICDxk_FnofGhJgeGXv_g-awaZm_g4zU23PkfuXOndNs,1485
|
20
|
+
parallex-0.1.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
21
|
+
parallex-0.1.0.dist-info/RECORD,,
|