parallex 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -35,8 +35,6 @@ async def wait_for_batch_completion(client: OpenAIClient, batch: UploadBatch) ->
35
35
  await asyncio.sleep(delay)
36
36
  batch_response = await client.retrieve_batch(batch.id)
37
37
  status = batch_response.status
38
- batch.output_file_id = batch_response.output_file_id
39
- batch.error_file_id = batch_response.error_file_id
40
38
  delay = 30
41
39
  if status == "completed":
42
40
  return batch_response.output_file_id
@@ -4,34 +4,46 @@ from openai import AsyncAzureOpenAI
4
4
  from openai._legacy_response import HttpxBinaryResponseContent
5
5
  from openai.types import FileObject, Batch, FileDeleted
6
6
 
7
+ from parallex.file_management.remote_file_handler import RemoteFileHandler
7
8
  from parallex.utils.logger import logger
8
9
 
9
10
 
10
11
  # Exceptions for missing keys, etc
11
12
  class OpenAIClient:
12
- def __init__(self, model: str):
13
+ def __init__(self, model: str, remote_file_handler: RemoteFileHandler):
13
14
  self.model = model
15
+ self.file_handler = remote_file_handler
14
16
 
15
17
  self._client = AsyncAzureOpenAI(
16
- azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
17
- api_key=os.getenv("AZURE_OPENAI_API_KEY"),
18
- api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
18
+ azure_endpoint=os.getenv("AZURE_API_BASE"),
19
+ api_key=os.getenv("AZURE_API_KEY"),
20
+ api_version=os.getenv("AZURE_API_VERSION"),
19
21
  )
20
22
 
21
23
  async def upload(self, file_path: str) -> FileObject:
22
- return await self._client.files.create(
24
+ file = await self._client.files.create(
23
25
  file=open(file_path, "rb"), purpose="batch"
24
26
  )
27
+ self.file_handler.add_file(file.id)
28
+ return file
25
29
 
26
30
  async def create_batch(self, upload_file_id: str) -> Batch:
27
- return await self._client.batches.create(
31
+ batch = await self._client.batches.create(
28
32
  input_file_id=upload_file_id,
29
33
  endpoint="/chat/completions",
30
34
  completion_window="24h",
31
35
  )
36
+ self.file_handler.add_file(batch.input_file_id)
37
+ self.file_handler.add_file(batch.output_file_id)
38
+ self.file_handler.add_file(batch.error_file_id)
39
+ return batch
32
40
 
33
41
  async def retrieve_batch(self, batch_id: str) -> Batch:
34
- return await self._client.batches.retrieve(batch_id)
42
+ batch = await self._client.batches.retrieve(batch_id)
43
+ self.file_handler.add_file(batch.input_file_id)
44
+ self.file_handler.add_file(batch.output_file_id)
45
+ self.file_handler.add_file(batch.error_file_id)
46
+ return batch
35
47
 
36
48
  async def retrieve_file(self, file_id: str) -> HttpxBinaryResponseContent:
37
49
  return await self._client.files.content(file_id)
parallex/ai/uploader.py CHANGED
@@ -16,7 +16,7 @@ async def upload_images_for_processing(
16
16
  image_files: list[ImageFile],
17
17
  temp_directory: str,
18
18
  prompt_text: str,
19
- ):
19
+ ) -> list[BatchFile]:
20
20
  """Base64 encodes image, converts to expected jsonl format and uploads"""
21
21
  trace_id = image_files[0].trace_id
22
22
  current_index = 0
@@ -71,7 +71,7 @@ def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
71
71
  "method": "POST",
72
72
  "url": "/chat/completions",
73
73
  "body": {
74
- "model": os.getenv("AZURE_OPENAI_API_DEPLOYMENT"),
74
+ "model": os.getenv("AZURE_API_DEPLOYMENT"),
75
75
  "messages": [
76
76
  {
77
77
  "role": "user",
@@ -0,0 +1,7 @@
1
+ class RemoteFileHandler:
2
+ def __init__(self):
3
+ self.created_files = set()
4
+
5
+ def add_file(self, file_name: str) -> None:
6
+ if file_name is not None:
7
+ self.created_files.add(file_name)
parallex/parallex.py CHANGED
@@ -9,7 +9,8 @@ from parallex.ai.output_processor import process_output
9
9
  from parallex.ai.uploader import upload_images_for_processing
10
10
  from parallex.file_management.converter import convert_pdf_to_images
11
11
  from parallex.file_management.file_finder import add_file_to_temp_directory
12
- from parallex.models.image_file import ImageFile
12
+ from parallex.file_management.remote_file_handler import RemoteFileHandler
13
+ from parallex.models.batch_file import BatchFile
13
14
  from parallex.models.parallex_callable_output import ParallexCallableOutput
14
15
  from parallex.models.upload_batch import UploadBatch
15
16
  from parallex.utils.constants import DEFAULT_PROMPT
@@ -26,9 +27,32 @@ async def parallex(
26
27
  log_level: Optional[str] = "ERROR",
27
28
  ) -> ParallexCallableOutput:
28
29
  setup_logger(log_level)
29
- with tempfile.TemporaryDirectory() as temp_directory:
30
- open_ai_client = OpenAIClient(model=model)
30
+ remote_file_handler = RemoteFileHandler()
31
+ open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
32
+ try:
33
+ return await _execute(
34
+ open_ai_client=open_ai_client,
35
+ pdf_source_url=pdf_source_url,
36
+ post_process_callable=post_process_callable,
37
+ concurrency=concurrency,
38
+ prompt_text=prompt_text,
39
+ )
40
+ except Exception as e:
41
+ logger.error(f"Error occurred: {e}")
42
+ finally:
43
+ for file in remote_file_handler.created_files:
44
+ logger.info(f"deleting - {file}")
45
+ await open_ai_client.delete_file(file)
31
46
 
47
+
48
+ async def _execute(
49
+ open_ai_client: OpenAIClient,
50
+ pdf_source_url: str,
51
+ post_process_callable: Optional[Callable[..., None]] = None,
52
+ concurrency: Optional[int] = 20,
53
+ prompt_text: Optional[str] = DEFAULT_PROMPT,
54
+ ) -> ParallexCallableOutput:
55
+ with tempfile.TemporaryDirectory() as temp_directory:
32
56
  raw_file = await add_file_to_temp_directory(
33
57
  pdf_source_url=pdf_source_url, temp_directory=temp_directory
34
58
  )
@@ -47,7 +71,7 @@ async def parallex(
47
71
  start_batch_tasks = []
48
72
  for file in batch_files:
49
73
  batch_task = asyncio.create_task(
50
- _create_images_and_batch_jobs(
74
+ _create_batch_jobs(
51
75
  batch_file=file,
52
76
  client=open_ai_client,
53
77
  trace_id=trace_id,
@@ -55,11 +79,11 @@ async def parallex(
55
79
  )
56
80
  )
57
81
  start_batch_tasks.append(batch_task)
58
- batches = await asyncio.gather(*start_batch_tasks)
82
+ batch_jobs = await asyncio.gather(*start_batch_tasks)
59
83
 
60
84
  pages_tasks = []
61
85
  process_semaphore = asyncio.Semaphore(concurrency)
62
- for batch in batches:
86
+ for batch in batch_jobs:
63
87
  page_task = asyncio.create_task(
64
88
  _wait_and_create_pages(
65
89
  batch=batch, client=open_ai_client, semaphore=process_semaphore
@@ -69,7 +93,7 @@ async def parallex(
69
93
  page_groups = await asyncio.gather(*pages_tasks)
70
94
 
71
95
  pages = [page for batch_pages in page_groups for page in batch_pages]
72
- logger.debug(f"pages done. total pages- {len(pages)} - {trace_id}")
96
+ logger.info(f"pages done. total pages- {len(pages)} - {trace_id}")
73
97
  sorted_pages = sorted(pages, key=lambda x: x.page_number)
74
98
 
75
99
  # TODO add combined version of MD to output / save to file system
@@ -88,30 +112,23 @@ async def _wait_and_create_pages(
88
112
  batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
89
113
  ):
90
114
  async with semaphore:
91
- logger.debug(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
115
+ logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
92
116
  output_file_id = await wait_for_batch_completion(client=client, batch=batch)
93
- logger.debug(f"batch completed - {batch.id} - {batch.trace_id}")
117
+ logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
94
118
  page_responses = await process_output(
95
119
  client=client, output_file_id=output_file_id
96
120
  )
97
- await _remove_global_batch_files(client=client, batch=batch)
98
121
  return page_responses
99
122
 
100
123
 
101
- async def _remove_global_batch_files(client: OpenAIClient, batch: UploadBatch):
102
- file_ids = [batch.input_file_id, batch.output_file_id, batch.error_file_id]
103
- for file_id in file_ids:
104
- await client.delete_file(file_id)
105
-
106
-
107
- async def _create_images_and_batch_jobs(
108
- batch_file: ImageFile,
124
+ async def _create_batch_jobs(
125
+ batch_file: BatchFile,
109
126
  client: OpenAIClient,
110
127
  trace_id: UUID,
111
128
  semaphore: asyncio.Semaphore,
112
129
  ):
113
130
  async with semaphore:
114
- batch = await create_batch(
131
+ upload_batch = await create_batch(
115
132
  client=client, file_id=batch_file.id, trace_id=trace_id
116
133
  )
117
- return batch
134
+ return upload_batch
parallex/utils/logger.py CHANGED
@@ -1,8 +1,8 @@
1
1
  import logging
2
2
 
3
- from aiologger import Logger
3
+ from aiologger.loggers.json import JsonLogger
4
4
 
5
- logger = Logger.with_default_handlers(name="parallex")
5
+ logger = JsonLogger.with_default_handlers(name="parallex")
6
6
 
7
7
 
8
8
  def setup_logger(level: str = "ERROR"):
@@ -15,6 +15,4 @@ def setup_logger(level: str = "ERROR"):
15
15
  "NOTSET": logging.NOTSET,
16
16
  }.get(level, logging.INFO)
17
17
 
18
- logging.basicConfig(
19
- level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
20
- )
18
+ logger.setLevel = level
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: parallex
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: PDF to markdown using Azure OpenAI batch processing
5
5
  Home-page: https://github.com/Summed-AI/parallex
6
6
  Author: Jeff Hostetler
@@ -45,10 +45,10 @@ import os
45
45
  from parallex.models.parallex_callable_output import ParallexCallableOutput
46
46
  from parallex.parallex import parallex
47
47
 
48
- os.environ["AZURE_OPENAI_API_KEY"] = "key"
49
- os.environ["AZURE_OPENAI_ENDPOINT"] = "your-endpoint.com"
50
- os.environ["AZURE_OPENAI_API_VERSION"] = "deployment_version"
51
- os.environ["AZURE_OPENAI_API_DEPLOYMENT"] = "deployment_name"
48
+ os.environ["AZURE_API_KEY"] = "key"
49
+ os.environ["AZURE_API_BASE"] = "your-endpoint.com"
50
+ os.environ["AZURE_API_VERSION"] = "deployment_version"
51
+ os.environ["AZURE_API_DEPLOYMENT"] = "deployment_name"
52
52
 
53
53
  model = "gpt-4o"
54
54
 
@@ -1,10 +1,11 @@
1
1
  parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- parallex/ai/batch_processor.py,sha256=wftq7-EKWbWO_tzz2PqVZa8XRVf7MoJlr7EcllX4-5I,1481
3
- parallex/ai/open_ai_client.py,sha256=Yvnvg5MGEyQrmN3HF5k8fEWse9Slthy3J-oumO6ZKkQ,1459
2
+ parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
3
+ parallex/ai/open_ai_client.py,sha256=TRH78oYod_EWpp3hjEh097OT7hwsQmtv44_j3X9Frxo,2047
4
4
  parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
5
- parallex/ai/uploader.py,sha256=M8g8dC_bwiGNDI_S5qxcRqJljDu6KSan_eIcQWA-ERA,3162
5
+ parallex/ai/uploader.py,sha256=92P0LxLuRgtjtD4kLtM0n8WUww_8-GImLxb3pbl-kkg,3174
6
6
  parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
7
7
  parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
8
+ parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
8
9
  parallex/file_management/utils.py,sha256=WMdXd9UOFbJDHnL2IWfDXyyD2jhwnGtpCVI_npiSlIk,98
9
10
  parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M,405
10
11
  parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
@@ -12,10 +13,10 @@ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaB
12
13
  parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
13
14
  parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
14
15
  parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
15
- parallex/parallex.py,sha256=RvSDNyjrE7Fd3LuatEt18_1sR_btPJ-i6uxjNr_dGh0,4502
16
+ parallex/parallex.py,sha256=EkD_kZevDu0UBpRet3nsvIr826f7uBHiT0JA5hR3E8c,5117
16
17
  parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
17
- parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
18
- parallex-0.1.3.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
19
- parallex-0.1.3.dist-info/METADATA,sha256=oOHuVOKgITHhEjzSQQW4Odo06tZjFfY1B2AVR8q9SDg,3444
20
- parallex-0.1.3.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
21
- parallex-0.1.3.dist-info/RECORD,,
18
+ parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
19
+ parallex-0.2.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
20
+ parallex-0.2.0.dist-info/METADATA,sha256=Aq2RRlLwkXcLZ_wNXLZAsydFYJqTSe47eVhAq78oja8,3416
21
+ parallex-0.2.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
22
+ parallex-0.2.0.dist-info/RECORD,,