parallex 0.1.3__tar.gz → 0.2.0__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {parallex-0.1.3 → parallex-0.2.0}/PKG-INFO +5 -5
- {parallex-0.1.3 → parallex-0.2.0}/README.md +4 -4
- {parallex-0.1.3 → parallex-0.2.0}/parallex/ai/batch_processor.py +0 -2
- {parallex-0.1.3 → parallex-0.2.0}/parallex/ai/open_ai_client.py +19 -7
- {parallex-0.1.3 → parallex-0.2.0}/parallex/ai/uploader.py +2 -2
- parallex-0.2.0/parallex/file_management/remote_file_handler.py +7 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/parallex.py +37 -20
- {parallex-0.1.3 → parallex-0.2.0}/parallex/utils/logger.py +3 -5
- {parallex-0.1.3 → parallex-0.2.0}/pyproject.toml +1 -1
- {parallex-0.1.3 → parallex-0.2.0}/LICENSE +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/__init__.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/ai/output_processor.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/file_management/converter.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/file_management/file_finder.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/file_management/utils.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/models/batch_file.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/models/image_file.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/models/page_response.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/models/parallex_callable_output.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/models/raw_file.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/models/upload_batch.py +0 -0
- {parallex-0.1.3 → parallex-0.2.0}/parallex/utils/constants.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: parallex
|
3
|
-
Version: 0.1.3
|
3
|
+
Version: 0.2.0
|
4
4
|
Summary: PDF to markdown using Azure OpenAI batch processing
|
5
5
|
Home-page: https://github.com/Summed-AI/parallex
|
6
6
|
Author: Jeff Hostetler
|
@@ -45,10 +45,10 @@ import os
|
|
45
45
|
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
46
46
|
from parallex.parallex import parallex
|
47
47
|
|
48
|
-
os.environ["
|
49
|
-
os.environ["
|
50
|
-
os.environ["
|
51
|
-
os.environ["
|
48
|
+
os.environ["AZURE_API_KEY"] = "key"
|
49
|
+
os.environ["AZURE_API_BASE"] = "your-endpoint.com"
|
50
|
+
os.environ["AZURE_API_VERSION"] = "deployment_version"
|
51
|
+
os.environ["AZURE_API_DEPLOYMENT"] = "deployment_name"
|
52
52
|
|
53
53
|
model = "gpt-4o"
|
54
54
|
|
@@ -25,10 +25,10 @@ import os
|
|
25
25
|
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
26
26
|
from parallex.parallex import parallex
|
27
27
|
|
28
|
-
os.environ["
|
29
|
-
os.environ["
|
30
|
-
os.environ["
|
31
|
-
os.environ["
|
28
|
+
os.environ["AZURE_API_KEY"] = "key"
|
29
|
+
os.environ["AZURE_API_BASE"] = "your-endpoint.com"
|
30
|
+
os.environ["AZURE_API_VERSION"] = "deployment_version"
|
31
|
+
os.environ["AZURE_API_DEPLOYMENT"] = "deployment_name"
|
32
32
|
|
33
33
|
model = "gpt-4o"
|
34
34
|
|
@@ -35,8 +35,6 @@ async def wait_for_batch_completion(client: OpenAIClient, batch: UploadBatch) ->
|
|
35
35
|
await asyncio.sleep(delay)
|
36
36
|
batch_response = await client.retrieve_batch(batch.id)
|
37
37
|
status = batch_response.status
|
38
|
-
batch.output_file_id = batch_response.output_file_id
|
39
|
-
batch.error_file_id = batch_response.error_file_id
|
40
38
|
delay = 30
|
41
39
|
if status == "completed":
|
42
40
|
return batch_response.output_file_id
|
@@ -4,34 +4,46 @@ from openai import AsyncAzureOpenAI
|
|
4
4
|
from openai._legacy_response import HttpxBinaryResponseContent
|
5
5
|
from openai.types import FileObject, Batch, FileDeleted
|
6
6
|
|
7
|
+
from parallex.file_management.remote_file_handler import RemoteFileHandler
|
7
8
|
from parallex.utils.logger import logger
|
8
9
|
|
9
10
|
|
10
11
|
# Exceptions for missing keys, etc
|
11
12
|
class OpenAIClient:
|
12
|
-
def __init__(self, model: str):
|
13
|
+
def __init__(self, model: str, remote_file_handler: RemoteFileHandler):
|
13
14
|
self.model = model
|
15
|
+
self.file_handler = remote_file_handler
|
14
16
|
|
15
17
|
self._client = AsyncAzureOpenAI(
|
16
|
-
azure_endpoint=os.getenv("
|
17
|
-
api_key=os.getenv("
|
18
|
-
api_version=os.getenv("
|
18
|
+
azure_endpoint=os.getenv("AZURE_API_BASE"),
|
19
|
+
api_key=os.getenv("AZURE_API_KEY"),
|
20
|
+
api_version=os.getenv("AZURE_API_VERSION"),
|
19
21
|
)
|
20
22
|
|
21
23
|
async def upload(self, file_path: str) -> FileObject:
|
22
|
-
|
24
|
+
file = await self._client.files.create(
|
23
25
|
file=open(file_path, "rb"), purpose="batch"
|
24
26
|
)
|
27
|
+
self.file_handler.add_file(file.id)
|
28
|
+
return file
|
25
29
|
|
26
30
|
async def create_batch(self, upload_file_id: str) -> Batch:
|
27
|
-
|
31
|
+
batch = await self._client.batches.create(
|
28
32
|
input_file_id=upload_file_id,
|
29
33
|
endpoint="/chat/completions",
|
30
34
|
completion_window="24h",
|
31
35
|
)
|
36
|
+
self.file_handler.add_file(batch.input_file_id)
|
37
|
+
self.file_handler.add_file(batch.output_file_id)
|
38
|
+
self.file_handler.add_file(batch.error_file_id)
|
39
|
+
return batch
|
32
40
|
|
33
41
|
async def retrieve_batch(self, batch_id: str) -> Batch:
|
34
|
-
|
42
|
+
batch = await self._client.batches.retrieve(batch_id)
|
43
|
+
self.file_handler.add_file(batch.input_file_id)
|
44
|
+
self.file_handler.add_file(batch.output_file_id)
|
45
|
+
self.file_handler.add_file(batch.error_file_id)
|
46
|
+
return batch
|
35
47
|
|
36
48
|
async def retrieve_file(self, file_id: str) -> HttpxBinaryResponseContent:
|
37
49
|
return await self._client.files.content(file_id)
|
@@ -16,7 +16,7 @@ async def upload_images_for_processing(
|
|
16
16
|
image_files: list[ImageFile],
|
17
17
|
temp_directory: str,
|
18
18
|
prompt_text: str,
|
19
|
-
):
|
19
|
+
) -> list[BatchFile]:
|
20
20
|
"""Base64 encodes image, converts to expected jsonl format and uploads"""
|
21
21
|
trace_id = image_files[0].trace_id
|
22
22
|
current_index = 0
|
@@ -71,7 +71,7 @@ def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
|
|
71
71
|
"method": "POST",
|
72
72
|
"url": "/chat/completions",
|
73
73
|
"body": {
|
74
|
-
"model": os.getenv("
|
74
|
+
"model": os.getenv("AZURE_API_DEPLOYMENT"),
|
75
75
|
"messages": [
|
76
76
|
{
|
77
77
|
"role": "user",
|
@@ -9,7 +9,8 @@ from parallex.ai.output_processor import process_output
|
|
9
9
|
from parallex.ai.uploader import upload_images_for_processing
|
10
10
|
from parallex.file_management.converter import convert_pdf_to_images
|
11
11
|
from parallex.file_management.file_finder import add_file_to_temp_directory
|
12
|
-
from parallex.
|
12
|
+
from parallex.file_management.remote_file_handler import RemoteFileHandler
|
13
|
+
from parallex.models.batch_file import BatchFile
|
13
14
|
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
14
15
|
from parallex.models.upload_batch import UploadBatch
|
15
16
|
from parallex.utils.constants import DEFAULT_PROMPT
|
@@ -26,9 +27,32 @@ async def parallex(
|
|
26
27
|
log_level: Optional[str] = "ERROR",
|
27
28
|
) -> ParallexCallableOutput:
|
28
29
|
setup_logger(log_level)
|
29
|
-
|
30
|
-
|
30
|
+
remote_file_handler = RemoteFileHandler()
|
31
|
+
open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
|
32
|
+
try:
|
33
|
+
return await _execute(
|
34
|
+
open_ai_client=open_ai_client,
|
35
|
+
pdf_source_url=pdf_source_url,
|
36
|
+
post_process_callable=post_process_callable,
|
37
|
+
concurrency=concurrency,
|
38
|
+
prompt_text=prompt_text,
|
39
|
+
)
|
40
|
+
except Exception as e:
|
41
|
+
logger.error(f"Error occurred: {e}")
|
42
|
+
finally:
|
43
|
+
for file in remote_file_handler.created_files:
|
44
|
+
logger.info(f"deleting - {file}")
|
45
|
+
await open_ai_client.delete_file(file)
|
31
46
|
|
47
|
+
|
48
|
+
async def _execute(
|
49
|
+
open_ai_client: OpenAIClient,
|
50
|
+
pdf_source_url: str,
|
51
|
+
post_process_callable: Optional[Callable[..., None]] = None,
|
52
|
+
concurrency: Optional[int] = 20,
|
53
|
+
prompt_text: Optional[str] = DEFAULT_PROMPT,
|
54
|
+
) -> ParallexCallableOutput:
|
55
|
+
with tempfile.TemporaryDirectory() as temp_directory:
|
32
56
|
raw_file = await add_file_to_temp_directory(
|
33
57
|
pdf_source_url=pdf_source_url, temp_directory=temp_directory
|
34
58
|
)
|
@@ -47,7 +71,7 @@ async def parallex(
|
|
47
71
|
start_batch_tasks = []
|
48
72
|
for file in batch_files:
|
49
73
|
batch_task = asyncio.create_task(
|
50
|
-
|
74
|
+
_create_batch_jobs(
|
51
75
|
batch_file=file,
|
52
76
|
client=open_ai_client,
|
53
77
|
trace_id=trace_id,
|
@@ -55,11 +79,11 @@ async def parallex(
|
|
55
79
|
)
|
56
80
|
)
|
57
81
|
start_batch_tasks.append(batch_task)
|
58
|
-
|
82
|
+
batch_jobs = await asyncio.gather(*start_batch_tasks)
|
59
83
|
|
60
84
|
pages_tasks = []
|
61
85
|
process_semaphore = asyncio.Semaphore(concurrency)
|
62
|
-
for batch in
|
86
|
+
for batch in batch_jobs:
|
63
87
|
page_task = asyncio.create_task(
|
64
88
|
_wait_and_create_pages(
|
65
89
|
batch=batch, client=open_ai_client, semaphore=process_semaphore
|
@@ -69,7 +93,7 @@ async def parallex(
|
|
69
93
|
page_groups = await asyncio.gather(*pages_tasks)
|
70
94
|
|
71
95
|
pages = [page for batch_pages in page_groups for page in batch_pages]
|
72
|
-
logger.
|
96
|
+
logger.info(f"pages done. total pages- {len(pages)} - {trace_id}")
|
73
97
|
sorted_pages = sorted(pages, key=lambda x: x.page_number)
|
74
98
|
|
75
99
|
# TODO add combined version of MD to output / save to file system
|
@@ -88,30 +112,23 @@ async def _wait_and_create_pages(
|
|
88
112
|
batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
|
89
113
|
):
|
90
114
|
async with semaphore:
|
91
|
-
logger.
|
115
|
+
logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
|
92
116
|
output_file_id = await wait_for_batch_completion(client=client, batch=batch)
|
93
|
-
logger.
|
117
|
+
logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
|
94
118
|
page_responses = await process_output(
|
95
119
|
client=client, output_file_id=output_file_id
|
96
120
|
)
|
97
|
-
await _remove_global_batch_files(client=client, batch=batch)
|
98
121
|
return page_responses
|
99
122
|
|
100
123
|
|
101
|
-
async def
|
102
|
-
|
103
|
-
for file_id in file_ids:
|
104
|
-
await client.delete_file(file_id)
|
105
|
-
|
106
|
-
|
107
|
-
async def _create_images_and_batch_jobs(
|
108
|
-
batch_file: ImageFile,
|
124
|
+
async def _create_batch_jobs(
|
125
|
+
batch_file: BatchFile,
|
109
126
|
client: OpenAIClient,
|
110
127
|
trace_id: UUID,
|
111
128
|
semaphore: asyncio.Semaphore,
|
112
129
|
):
|
113
130
|
async with semaphore:
|
114
|
-
|
131
|
+
upload_batch = await create_batch(
|
115
132
|
client=client, file_id=batch_file.id, trace_id=trace_id
|
116
133
|
)
|
117
|
-
return
|
134
|
+
return upload_batch
|
@@ -1,8 +1,8 @@
|
|
1
1
|
import logging
|
2
2
|
|
3
|
-
from aiologger import
|
3
|
+
from aiologger.loggers.json import JsonLogger
|
4
4
|
|
5
|
-
logger =
|
5
|
+
logger = JsonLogger.with_default_handlers(name="parallex")
|
6
6
|
|
7
7
|
|
8
8
|
def setup_logger(level: str = "ERROR"):
|
@@ -15,6 +15,4 @@ def setup_logger(level: str = "ERROR"):
|
|
15
15
|
"NOTSET": logging.NOTSET,
|
16
16
|
}.get(level, logging.INFO)
|
17
17
|
|
18
|
-
|
19
|
-
level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
20
|
-
)
|
18
|
+
logger.setLevel = level
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "parallex"
|
3
|
-
version = "0.1.3"
|
3
|
+
version = "0.2.0"
|
4
4
|
description = "PDF to markdown using Azure OpenAI batch processing"
|
5
5
|
authors = ["Jeff Hostetler <jeff@summed.ai>", "Kevin Bao <kevin@summed.ai>"]
|
6
6
|
repository = "https://github.com/Summed-AI/parallex"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|