parallex 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- parallex/ai/batch_processor.py +0 -2
- parallex/ai/open_ai_client.py +16 -4
- parallex/ai/uploader.py +1 -1
- parallex/file_management/remote_file_handler.py +7 -0
- parallex/parallex.py +37 -20
- parallex/utils/logger.py +3 -5
- {parallex-0.1.2.dist-info → parallex-0.1.4.dist-info}/METADATA +7 -4
- {parallex-0.1.2.dist-info → parallex-0.1.4.dist-info}/RECORD +10 -9
- {parallex-0.1.2.dist-info → parallex-0.1.4.dist-info}/LICENSE +0 -0
- {parallex-0.1.2.dist-info → parallex-0.1.4.dist-info}/WHEEL +0 -0
parallex/ai/batch_processor.py
CHANGED
@@ -35,8 +35,6 @@ async def wait_for_batch_completion(client: OpenAIClient, batch: UploadBatch) ->
|
|
35
35
|
await asyncio.sleep(delay)
|
36
36
|
batch_response = await client.retrieve_batch(batch.id)
|
37
37
|
status = batch_response.status
|
38
|
-
batch.output_file_id = batch_response.output_file_id
|
39
|
-
batch.error_file_id = batch_response.error_file_id
|
40
38
|
delay = 30
|
41
39
|
if status == "completed":
|
42
40
|
return batch_response.output_file_id
|
parallex/ai/open_ai_client.py
CHANGED
@@ -4,13 +4,15 @@ from openai import AsyncAzureOpenAI
|
|
4
4
|
from openai._legacy_response import HttpxBinaryResponseContent
|
5
5
|
from openai.types import FileObject, Batch, FileDeleted
|
6
6
|
|
7
|
+
from parallex.file_management.remote_file_handler import RemoteFileHandler
|
7
8
|
from parallex.utils.logger import logger
|
8
9
|
|
9
10
|
|
10
11
|
# Exceptions for missing keys, etc
|
11
12
|
class OpenAIClient:
|
12
|
-
def __init__(self, model: str):
|
13
|
+
def __init__(self, model: str, remote_file_handler: RemoteFileHandler):
|
13
14
|
self.model = model
|
15
|
+
self.file_handler = remote_file_handler
|
14
16
|
|
15
17
|
self._client = AsyncAzureOpenAI(
|
16
18
|
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
|
@@ -19,19 +21,29 @@ class OpenAIClient:
|
|
19
21
|
)
|
20
22
|
|
21
23
|
async def upload(self, file_path: str) -> FileObject:
|
22
|
-
|
24
|
+
file = await self._client.files.create(
|
23
25
|
file=open(file_path, "rb"), purpose="batch"
|
24
26
|
)
|
27
|
+
self.file_handler.add_file(file.id)
|
28
|
+
return file
|
25
29
|
|
26
30
|
async def create_batch(self, upload_file_id: str) -> Batch:
|
27
|
-
|
31
|
+
batch = await self._client.batches.create(
|
28
32
|
input_file_id=upload_file_id,
|
29
33
|
endpoint="/chat/completions",
|
30
34
|
completion_window="24h",
|
31
35
|
)
|
36
|
+
self.file_handler.add_file(batch.input_file_id)
|
37
|
+
self.file_handler.add_file(batch.output_file_id)
|
38
|
+
self.file_handler.add_file(batch.error_file_id)
|
39
|
+
return batch
|
32
40
|
|
33
41
|
async def retrieve_batch(self, batch_id: str) -> Batch:
|
34
|
-
|
42
|
+
batch = await self._client.batches.retrieve(batch_id)
|
43
|
+
self.file_handler.add_file(batch.input_file_id)
|
44
|
+
self.file_handler.add_file(batch.output_file_id)
|
45
|
+
self.file_handler.add_file(batch.error_file_id)
|
46
|
+
return batch
|
35
47
|
|
36
48
|
async def retrieve_file(self, file_id: str) -> HttpxBinaryResponseContent:
|
37
49
|
return await self._client.files.content(file_id)
|
parallex/ai/uploader.py
CHANGED
@@ -16,7 +16,7 @@ async def upload_images_for_processing(
|
|
16
16
|
image_files: list[ImageFile],
|
17
17
|
temp_directory: str,
|
18
18
|
prompt_text: str,
|
19
|
-
):
|
19
|
+
) -> list[BatchFile]:
|
20
20
|
"""Base64 encodes image, converts to expected jsonl format and uploads"""
|
21
21
|
trace_id = image_files[0].trace_id
|
22
22
|
current_index = 0
|
parallex/parallex.py
CHANGED
@@ -9,7 +9,8 @@ from parallex.ai.output_processor import process_output
|
|
9
9
|
from parallex.ai.uploader import upload_images_for_processing
|
10
10
|
from parallex.file_management.converter import convert_pdf_to_images
|
11
11
|
from parallex.file_management.file_finder import add_file_to_temp_directory
|
12
|
-
from parallex.
|
12
|
+
from parallex.file_management.remote_file_handler import RemoteFileHandler
|
13
|
+
from parallex.models.batch_file import BatchFile
|
13
14
|
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
14
15
|
from parallex.models.upload_batch import UploadBatch
|
15
16
|
from parallex.utils.constants import DEFAULT_PROMPT
|
@@ -26,9 +27,32 @@ async def parallex(
|
|
26
27
|
log_level: Optional[str] = "ERROR",
|
27
28
|
) -> ParallexCallableOutput:
|
28
29
|
setup_logger(log_level)
|
29
|
-
|
30
|
-
|
30
|
+
remote_file_handler = RemoteFileHandler()
|
31
|
+
open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
|
32
|
+
try:
|
33
|
+
return await _execute(
|
34
|
+
open_ai_client=open_ai_client,
|
35
|
+
pdf_source_url=pdf_source_url,
|
36
|
+
post_process_callable=post_process_callable,
|
37
|
+
concurrency=concurrency,
|
38
|
+
prompt_text=prompt_text,
|
39
|
+
)
|
40
|
+
except Exception as e:
|
41
|
+
logger.error(f"Error occurred: {e}")
|
42
|
+
finally:
|
43
|
+
for file in remote_file_handler.created_files:
|
44
|
+
logger.info(f"deleting - {file}")
|
45
|
+
await open_ai_client.delete_file(file)
|
31
46
|
|
47
|
+
|
48
|
+
async def _execute(
|
49
|
+
open_ai_client: OpenAIClient,
|
50
|
+
pdf_source_url: str,
|
51
|
+
post_process_callable: Optional[Callable[..., None]] = None,
|
52
|
+
concurrency: Optional[int] = 20,
|
53
|
+
prompt_text: Optional[str] = DEFAULT_PROMPT,
|
54
|
+
) -> ParallexCallableOutput:
|
55
|
+
with tempfile.TemporaryDirectory() as temp_directory:
|
32
56
|
raw_file = await add_file_to_temp_directory(
|
33
57
|
pdf_source_url=pdf_source_url, temp_directory=temp_directory
|
34
58
|
)
|
@@ -47,7 +71,7 @@ async def parallex(
|
|
47
71
|
start_batch_tasks = []
|
48
72
|
for file in batch_files:
|
49
73
|
batch_task = asyncio.create_task(
|
50
|
-
|
74
|
+
_create_batch_jobs(
|
51
75
|
batch_file=file,
|
52
76
|
client=open_ai_client,
|
53
77
|
trace_id=trace_id,
|
@@ -55,11 +79,11 @@ async def parallex(
|
|
55
79
|
)
|
56
80
|
)
|
57
81
|
start_batch_tasks.append(batch_task)
|
58
|
-
|
82
|
+
batch_jobs = await asyncio.gather(*start_batch_tasks)
|
59
83
|
|
60
84
|
pages_tasks = []
|
61
85
|
process_semaphore = asyncio.Semaphore(concurrency)
|
62
|
-
for batch in
|
86
|
+
for batch in batch_jobs:
|
63
87
|
page_task = asyncio.create_task(
|
64
88
|
_wait_and_create_pages(
|
65
89
|
batch=batch, client=open_ai_client, semaphore=process_semaphore
|
@@ -69,7 +93,7 @@ async def parallex(
|
|
69
93
|
page_groups = await asyncio.gather(*pages_tasks)
|
70
94
|
|
71
95
|
pages = [page for batch_pages in page_groups for page in batch_pages]
|
72
|
-
logger.
|
96
|
+
logger.info(f"pages done. total pages- {len(pages)} - {trace_id}")
|
73
97
|
sorted_pages = sorted(pages, key=lambda x: x.page_number)
|
74
98
|
|
75
99
|
# TODO add combined version of MD to output / save to file system
|
@@ -88,30 +112,23 @@ async def _wait_and_create_pages(
|
|
88
112
|
batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
|
89
113
|
):
|
90
114
|
async with semaphore:
|
91
|
-
logger.
|
115
|
+
logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
|
92
116
|
output_file_id = await wait_for_batch_completion(client=client, batch=batch)
|
93
|
-
logger.
|
117
|
+
logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
|
94
118
|
page_responses = await process_output(
|
95
119
|
client=client, output_file_id=output_file_id
|
96
120
|
)
|
97
|
-
await _remove_global_batch_files(client=client, batch=batch)
|
98
121
|
return page_responses
|
99
122
|
|
100
123
|
|
101
|
-
async def
|
102
|
-
|
103
|
-
for file_id in file_ids:
|
104
|
-
await client.delete_file(file_id)
|
105
|
-
|
106
|
-
|
107
|
-
async def _create_images_and_batch_jobs(
|
108
|
-
batch_file: ImageFile,
|
124
|
+
async def _create_batch_jobs(
|
125
|
+
batch_file: BatchFile,
|
109
126
|
client: OpenAIClient,
|
110
127
|
trace_id: UUID,
|
111
128
|
semaphore: asyncio.Semaphore,
|
112
129
|
):
|
113
130
|
async with semaphore:
|
114
|
-
|
131
|
+
upload_batch = await create_batch(
|
115
132
|
client=client, file_id=batch_file.id, trace_id=trace_id
|
116
133
|
)
|
117
|
-
return
|
134
|
+
return upload_batch
|
parallex/utils/logger.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
import logging
|
2
2
|
|
3
|
-
from aiologger import
|
3
|
+
from aiologger.loggers.json import JsonLogger
|
4
4
|
|
5
|
-
logger =
|
5
|
+
logger = JsonLogger.with_default_handlers(name="parallex")
|
6
6
|
|
7
7
|
|
8
8
|
def setup_logger(level: str = "ERROR"):
|
@@ -15,6 +15,4 @@ def setup_logger(level: str = "ERROR"):
|
|
15
15
|
"NOTSET": logging.NOTSET,
|
16
16
|
}.get(level, logging.INFO)
|
17
17
|
|
18
|
-
|
19
|
-
level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
20
|
-
)
|
18
|
+
logger.setLevel = level
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: parallex
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.4
|
4
4
|
Summary: PDF to markdown using Azure OpenAI batch processing
|
5
5
|
Home-page: https://github.com/Summed-AI/parallex
|
6
6
|
Author: Jeff Hostetler
|
@@ -22,10 +22,10 @@ Description-Content-Type: text/markdown
|
|
22
22
|
|
23
23
|
### What it does
|
24
24
|
- Converts PDF into images
|
25
|
-
- Makes requests to Azure OpenAI to
|
25
|
+
- Makes requests to Azure OpenAI to convert the images to markdown using Batch API
|
26
26
|
- [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
|
27
27
|
- [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
|
28
|
-
- Polls for batch completion and then
|
28
|
+
- Polls for batch completion and then converts the AI responses into structured output, organized by the corresponding page of the PDF
|
29
29
|
- Post batch processing to do what you wish with the resulting markdown
|
30
30
|
|
31
31
|
### Requirements
|
@@ -34,9 +34,12 @@ Parallex uses `graphicsmagick` for the conversion of PDF to images.
|
|
34
34
|
brew install graphicsmagick
|
35
35
|
```
|
36
36
|
|
37
|
+
### Installation
|
38
|
+
```bash
|
39
|
+
pip install parallex
|
40
|
+
```
|
37
41
|
|
38
42
|
### Example usage
|
39
|
-
|
40
43
|
```python
|
41
44
|
import os
|
42
45
|
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
@@ -1,10 +1,11 @@
|
|
1
1
|
parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
parallex/ai/batch_processor.py,sha256=
|
3
|
-
parallex/ai/open_ai_client.py,sha256=
|
2
|
+
parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
|
3
|
+
parallex/ai/open_ai_client.py,sha256=UOb7tOCGCTVlA5Yj8eqOmuZKpOO4ioUw6GKbPY9zZwQ,2068
|
4
4
|
parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
|
5
|
-
parallex/ai/uploader.py,sha256=
|
5
|
+
parallex/ai/uploader.py,sha256=Vt7hF7dm-meimH6hVXbc3IZ2B7WPjWDMwoGiQQSB31Q,3181
|
6
6
|
parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
|
7
7
|
parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
|
8
|
+
parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
|
8
9
|
parallex/file_management/utils.py,sha256=WMdXd9UOFbJDHnL2IWfDXyyD2jhwnGtpCVI_npiSlIk,98
|
9
10
|
parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M,405
|
10
11
|
parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
|
@@ -12,10 +13,10 @@ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaB
|
|
12
13
|
parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
|
13
14
|
parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
|
14
15
|
parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
|
15
|
-
parallex/parallex.py,sha256=
|
16
|
+
parallex/parallex.py,sha256=EkD_kZevDu0UBpRet3nsvIr826f7uBHiT0JA5hR3E8c,5117
|
16
17
|
parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
|
17
|
-
parallex/utils/logger.py,sha256=
|
18
|
-
parallex-0.1.
|
19
|
-
parallex-0.1.
|
20
|
-
parallex-0.1.
|
21
|
-
parallex-0.1.
|
18
|
+
parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
|
19
|
+
parallex-0.1.4.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
|
20
|
+
parallex-0.1.4.dist-info/METADATA,sha256=x3dPVyDeYjMN0254jputI8-Tr7ruesZXSRc2_Yc--ik,3444
|
21
|
+
parallex-0.1.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
22
|
+
parallex-0.1.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|