parallex 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parallex/ai/batch_processor.py +0 -2
- parallex/ai/open_ai_client.py +19 -7
- parallex/ai/uploader.py +2 -2
- parallex/file_management/remote_file_handler.py +7 -0
- parallex/parallex.py +37 -20
- parallex/utils/logger.py +3 -5
- {parallex-0.1.3.dist-info → parallex-0.2.0.dist-info}/METADATA +5 -5
- {parallex-0.1.3.dist-info → parallex-0.2.0.dist-info}/RECORD +10 -9
- {parallex-0.1.3.dist-info → parallex-0.2.0.dist-info}/LICENSE +0 -0
- {parallex-0.1.3.dist-info → parallex-0.2.0.dist-info}/WHEEL +0 -0
parallex/ai/batch_processor.py
CHANGED
@@ -35,8 +35,6 @@ async def wait_for_batch_completion(client: OpenAIClient, batch: UploadBatch) ->
|
|
35
35
|
await asyncio.sleep(delay)
|
36
36
|
batch_response = await client.retrieve_batch(batch.id)
|
37
37
|
status = batch_response.status
|
38
|
-
batch.output_file_id = batch_response.output_file_id
|
39
|
-
batch.error_file_id = batch_response.error_file_id
|
40
38
|
delay = 30
|
41
39
|
if status == "completed":
|
42
40
|
return batch_response.output_file_id
|
parallex/ai/open_ai_client.py
CHANGED
@@ -4,34 +4,46 @@ from openai import AsyncAzureOpenAI
|
|
4
4
|
from openai._legacy_response import HttpxBinaryResponseContent
|
5
5
|
from openai.types import FileObject, Batch, FileDeleted
|
6
6
|
|
7
|
+
from parallex.file_management.remote_file_handler import RemoteFileHandler
|
7
8
|
from parallex.utils.logger import logger
|
8
9
|
|
9
10
|
|
10
11
|
# Exceptions for missing keys, etc
|
11
12
|
class OpenAIClient:
|
12
|
-
def __init__(self, model: str):
|
13
|
+
def __init__(self, model: str, remote_file_handler: RemoteFileHandler):
|
13
14
|
self.model = model
|
15
|
+
self.file_handler = remote_file_handler
|
14
16
|
|
15
17
|
self._client = AsyncAzureOpenAI(
|
16
|
-
azure_endpoint=os.getenv("
|
17
|
-
api_key=os.getenv("
|
18
|
-
api_version=os.getenv("
|
18
|
+
azure_endpoint=os.getenv("AZURE_API_BASE"),
|
19
|
+
api_key=os.getenv("AZURE_API_KEY"),
|
20
|
+
api_version=os.getenv("AZURE_API_VERSION"),
|
19
21
|
)
|
20
22
|
|
21
23
|
async def upload(self, file_path: str) -> FileObject:
|
22
|
-
|
24
|
+
file = await self._client.files.create(
|
23
25
|
file=open(file_path, "rb"), purpose="batch"
|
24
26
|
)
|
27
|
+
self.file_handler.add_file(file.id)
|
28
|
+
return file
|
25
29
|
|
26
30
|
async def create_batch(self, upload_file_id: str) -> Batch:
|
27
|
-
|
31
|
+
batch = await self._client.batches.create(
|
28
32
|
input_file_id=upload_file_id,
|
29
33
|
endpoint="/chat/completions",
|
30
34
|
completion_window="24h",
|
31
35
|
)
|
36
|
+
self.file_handler.add_file(batch.input_file_id)
|
37
|
+
self.file_handler.add_file(batch.output_file_id)
|
38
|
+
self.file_handler.add_file(batch.error_file_id)
|
39
|
+
return batch
|
32
40
|
|
33
41
|
async def retrieve_batch(self, batch_id: str) -> Batch:
|
34
|
-
|
42
|
+
batch = await self._client.batches.retrieve(batch_id)
|
43
|
+
self.file_handler.add_file(batch.input_file_id)
|
44
|
+
self.file_handler.add_file(batch.output_file_id)
|
45
|
+
self.file_handler.add_file(batch.error_file_id)
|
46
|
+
return batch
|
35
47
|
|
36
48
|
async def retrieve_file(self, file_id: str) -> HttpxBinaryResponseContent:
|
37
49
|
return await self._client.files.content(file_id)
|
parallex/ai/uploader.py
CHANGED
@@ -16,7 +16,7 @@ async def upload_images_for_processing(
|
|
16
16
|
image_files: list[ImageFile],
|
17
17
|
temp_directory: str,
|
18
18
|
prompt_text: str,
|
19
|
-
):
|
19
|
+
) -> list[BatchFile]:
|
20
20
|
"""Base64 encodes image, converts to expected jsonl format and uploads"""
|
21
21
|
trace_id = image_files[0].trace_id
|
22
22
|
current_index = 0
|
@@ -71,7 +71,7 @@ def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
|
|
71
71
|
"method": "POST",
|
72
72
|
"url": "/chat/completions",
|
73
73
|
"body": {
|
74
|
-
"model": os.getenv("
|
74
|
+
"model": os.getenv("AZURE_API_DEPLOYMENT"),
|
75
75
|
"messages": [
|
76
76
|
{
|
77
77
|
"role": "user",
|
parallex/parallex.py
CHANGED
@@ -9,7 +9,8 @@ from parallex.ai.output_processor import process_output
|
|
9
9
|
from parallex.ai.uploader import upload_images_for_processing
|
10
10
|
from parallex.file_management.converter import convert_pdf_to_images
|
11
11
|
from parallex.file_management.file_finder import add_file_to_temp_directory
|
12
|
-
from parallex.
|
12
|
+
from parallex.file_management.remote_file_handler import RemoteFileHandler
|
13
|
+
from parallex.models.batch_file import BatchFile
|
13
14
|
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
14
15
|
from parallex.models.upload_batch import UploadBatch
|
15
16
|
from parallex.utils.constants import DEFAULT_PROMPT
|
@@ -26,9 +27,32 @@ async def parallex(
|
|
26
27
|
log_level: Optional[str] = "ERROR",
|
27
28
|
) -> ParallexCallableOutput:
|
28
29
|
setup_logger(log_level)
|
29
|
-
|
30
|
-
|
30
|
+
remote_file_handler = RemoteFileHandler()
|
31
|
+
open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
|
32
|
+
try:
|
33
|
+
return await _execute(
|
34
|
+
open_ai_client=open_ai_client,
|
35
|
+
pdf_source_url=pdf_source_url,
|
36
|
+
post_process_callable=post_process_callable,
|
37
|
+
concurrency=concurrency,
|
38
|
+
prompt_text=prompt_text,
|
39
|
+
)
|
40
|
+
except Exception as e:
|
41
|
+
logger.error(f"Error occurred: {e}")
|
42
|
+
finally:
|
43
|
+
for file in remote_file_handler.created_files:
|
44
|
+
logger.info(f"deleting - {file}")
|
45
|
+
await open_ai_client.delete_file(file)
|
31
46
|
|
47
|
+
|
48
|
+
async def _execute(
|
49
|
+
open_ai_client: OpenAIClient,
|
50
|
+
pdf_source_url: str,
|
51
|
+
post_process_callable: Optional[Callable[..., None]] = None,
|
52
|
+
concurrency: Optional[int] = 20,
|
53
|
+
prompt_text: Optional[str] = DEFAULT_PROMPT,
|
54
|
+
) -> ParallexCallableOutput:
|
55
|
+
with tempfile.TemporaryDirectory() as temp_directory:
|
32
56
|
raw_file = await add_file_to_temp_directory(
|
33
57
|
pdf_source_url=pdf_source_url, temp_directory=temp_directory
|
34
58
|
)
|
@@ -47,7 +71,7 @@ async def parallex(
|
|
47
71
|
start_batch_tasks = []
|
48
72
|
for file in batch_files:
|
49
73
|
batch_task = asyncio.create_task(
|
50
|
-
|
74
|
+
_create_batch_jobs(
|
51
75
|
batch_file=file,
|
52
76
|
client=open_ai_client,
|
53
77
|
trace_id=trace_id,
|
@@ -55,11 +79,11 @@ async def parallex(
|
|
55
79
|
)
|
56
80
|
)
|
57
81
|
start_batch_tasks.append(batch_task)
|
58
|
-
|
82
|
+
batch_jobs = await asyncio.gather(*start_batch_tasks)
|
59
83
|
|
60
84
|
pages_tasks = []
|
61
85
|
process_semaphore = asyncio.Semaphore(concurrency)
|
62
|
-
for batch in
|
86
|
+
for batch in batch_jobs:
|
63
87
|
page_task = asyncio.create_task(
|
64
88
|
_wait_and_create_pages(
|
65
89
|
batch=batch, client=open_ai_client, semaphore=process_semaphore
|
@@ -69,7 +93,7 @@ async def parallex(
|
|
69
93
|
page_groups = await asyncio.gather(*pages_tasks)
|
70
94
|
|
71
95
|
pages = [page for batch_pages in page_groups for page in batch_pages]
|
72
|
-
logger.
|
96
|
+
logger.info(f"pages done. total pages- {len(pages)} - {trace_id}")
|
73
97
|
sorted_pages = sorted(pages, key=lambda x: x.page_number)
|
74
98
|
|
75
99
|
# TODO add combined version of MD to output / save to file system
|
@@ -88,30 +112,23 @@ async def _wait_and_create_pages(
|
|
88
112
|
batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
|
89
113
|
):
|
90
114
|
async with semaphore:
|
91
|
-
logger.
|
115
|
+
logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
|
92
116
|
output_file_id = await wait_for_batch_completion(client=client, batch=batch)
|
93
|
-
logger.
|
117
|
+
logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
|
94
118
|
page_responses = await process_output(
|
95
119
|
client=client, output_file_id=output_file_id
|
96
120
|
)
|
97
|
-
await _remove_global_batch_files(client=client, batch=batch)
|
98
121
|
return page_responses
|
99
122
|
|
100
123
|
|
101
|
-
async def
|
102
|
-
|
103
|
-
for file_id in file_ids:
|
104
|
-
await client.delete_file(file_id)
|
105
|
-
|
106
|
-
|
107
|
-
async def _create_images_and_batch_jobs(
|
108
|
-
batch_file: ImageFile,
|
124
|
+
async def _create_batch_jobs(
|
125
|
+
batch_file: BatchFile,
|
109
126
|
client: OpenAIClient,
|
110
127
|
trace_id: UUID,
|
111
128
|
semaphore: asyncio.Semaphore,
|
112
129
|
):
|
113
130
|
async with semaphore:
|
114
|
-
|
131
|
+
upload_batch = await create_batch(
|
115
132
|
client=client, file_id=batch_file.id, trace_id=trace_id
|
116
133
|
)
|
117
|
-
return
|
134
|
+
return upload_batch
|
parallex/utils/logger.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
import logging
|
2
2
|
|
3
|
-
from aiologger import
|
3
|
+
from aiologger.loggers.json import JsonLogger
|
4
4
|
|
5
|
-
logger =
|
5
|
+
logger = JsonLogger.with_default_handlers(name="parallex")
|
6
6
|
|
7
7
|
|
8
8
|
def setup_logger(level: str = "ERROR"):
|
@@ -15,6 +15,4 @@ def setup_logger(level: str = "ERROR"):
|
|
15
15
|
"NOTSET": logging.NOTSET,
|
16
16
|
}.get(level, logging.INFO)
|
17
17
|
|
18
|
-
|
19
|
-
level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
20
|
-
)
|
18
|
+
logger.setLevel = level
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: parallex
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.2.0
|
4
4
|
Summary: PDF to markdown using Azure OpenAI batch processing
|
5
5
|
Home-page: https://github.com/Summed-AI/parallex
|
6
6
|
Author: Jeff Hostetler
|
@@ -45,10 +45,10 @@ import os
|
|
45
45
|
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
46
46
|
from parallex.parallex import parallex
|
47
47
|
|
48
|
-
os.environ["
|
49
|
-
os.environ["
|
50
|
-
os.environ["
|
51
|
-
os.environ["
|
48
|
+
os.environ["AZURE_API_KEY"] = "key"
|
49
|
+
os.environ["AZURE_API_BASE"] = "your-endpoint.com"
|
50
|
+
os.environ["AZURE_API_VERSION"] = "deployment_version"
|
51
|
+
os.environ["AZURE_API_DEPLOYMENT"] = "deployment_name"
|
52
52
|
|
53
53
|
model = "gpt-4o"
|
54
54
|
|
@@ -1,10 +1,11 @@
|
|
1
1
|
parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
parallex/ai/batch_processor.py,sha256=
|
3
|
-
parallex/ai/open_ai_client.py,sha256=
|
2
|
+
parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
|
3
|
+
parallex/ai/open_ai_client.py,sha256=TRH78oYod_EWpp3hjEh097OT7hwsQmtv44_j3X9Frxo,2047
|
4
4
|
parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
|
5
|
-
parallex/ai/uploader.py,sha256=
|
5
|
+
parallex/ai/uploader.py,sha256=92P0LxLuRgtjtD4kLtM0n8WUww_8-GImLxb3pbl-kkg,3174
|
6
6
|
parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
|
7
7
|
parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
|
8
|
+
parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
|
8
9
|
parallex/file_management/utils.py,sha256=WMdXd9UOFbJDHnL2IWfDXyyD2jhwnGtpCVI_npiSlIk,98
|
9
10
|
parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M,405
|
10
11
|
parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
|
@@ -12,10 +13,10 @@ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaB
|
|
12
13
|
parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
|
13
14
|
parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
|
14
15
|
parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
|
15
|
-
parallex/parallex.py,sha256=
|
16
|
+
parallex/parallex.py,sha256=EkD_kZevDu0UBpRet3nsvIr826f7uBHiT0JA5hR3E8c,5117
|
16
17
|
parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
|
17
|
-
parallex/utils/logger.py,sha256=
|
18
|
-
parallex-0.
|
19
|
-
parallex-0.
|
20
|
-
parallex-0.
|
21
|
-
parallex-0.
|
18
|
+
parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
|
19
|
+
parallex-0.2.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
|
20
|
+
parallex-0.2.0.dist-info/METADATA,sha256=Aq2RRlLwkXcLZ_wNXLZAsydFYJqTSe47eVhAq78oja8,3416
|
21
|
+
parallex-0.2.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
22
|
+
parallex-0.2.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|