parallex-0.1.3-py3-none-any.whl → parallex-0.2.0-py3-none-any.whl

parallex/ai/batch_processor.py CHANGED
@@ -35,8 +35,6 @@ async def wait_for_batch_completion(client: OpenAIClient, batch: UploadBatch) ->
         await asyncio.sleep(delay)
         batch_response = await client.retrieve_batch(batch.id)
         status = batch_response.status
-        batch.output_file_id = batch_response.output_file_id
-        batch.error_file_id = batch_response.error_file_id
         delay = 30
         if status == "completed":
             return batch_response.output_file_id
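The removed assignments mean this function no longer copies output_file_id and error_file_id back onto the UploadBatch; tracking of remote files moves to the RemoteFileHandler introduced below. For orientation, a minimal sketch of the polling pattern the surviving context lines belong to; the enclosing loop, the initial delay value, and failure handling are not visible in this hunk, so the framing below is an assumption rather than the published function body:

    import asyncio

    async def wait_for_batch_completion_sketch(client, batch) -> str:
        delay = 5  # assumed initial delay; not shown in the hunk
        while True:  # assumed loop structure
            await asyncio.sleep(delay)
            batch_response = await client.retrieve_batch(batch.id)
            status = batch_response.status
            delay = 30  # settle into a 30-second poll interval, as in the hunk
            if status == "completed":
                return batch_response.output_file_id
            # handling for failed/expired/cancelled statuses is not shown in the hunk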
parallex/ai/open_ai_client.py CHANGED
@@ -4,34 +4,46 @@ from openai import AsyncAzureOpenAI
 from openai._legacy_response import HttpxBinaryResponseContent
 from openai.types import FileObject, Batch, FileDeleted
 
+from parallex.file_management.remote_file_handler import RemoteFileHandler
 from parallex.utils.logger import logger
 
 
 # Exceptions for missing keys, etc
 class OpenAIClient:
-    def __init__(self, model: str):
+    def __init__(self, model: str, remote_file_handler: RemoteFileHandler):
         self.model = model
+        self.file_handler = remote_file_handler
 
         self._client = AsyncAzureOpenAI(
-            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
-            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
-            api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
+            azure_endpoint=os.getenv("AZURE_API_BASE"),
+            api_key=os.getenv("AZURE_API_KEY"),
+            api_version=os.getenv("AZURE_API_VERSION"),
         )
 
     async def upload(self, file_path: str) -> FileObject:
-        return await self._client.files.create(
+        file = await self._client.files.create(
             file=open(file_path, "rb"), purpose="batch"
         )
+        self.file_handler.add_file(file.id)
+        return file
 
     async def create_batch(self, upload_file_id: str) -> Batch:
-        return await self._client.batches.create(
+        batch = await self._client.batches.create(
             input_file_id=upload_file_id,
             endpoint="/chat/completions",
             completion_window="24h",
         )
+        self.file_handler.add_file(batch.input_file_id)
+        self.file_handler.add_file(batch.output_file_id)
+        self.file_handler.add_file(batch.error_file_id)
+        return batch
 
     async def retrieve_batch(self, batch_id: str) -> Batch:
-        return await self._client.batches.retrieve(batch_id)
+        batch = await self._client.batches.retrieve(batch_id)
+        self.file_handler.add_file(batch.input_file_id)
+        self.file_handler.add_file(batch.output_file_id)
+        self.file_handler.add_file(batch.error_file_id)
+        return batch
 
     async def retrieve_file(self, file_id: str) -> HttpxBinaryResponseContent:
         return await self._client.files.content(file_id)
parallex/ai/uploader.py CHANGED
@@ -16,7 +16,7 @@ async def upload_images_for_processing(
     image_files: list[ImageFile],
     temp_directory: str,
     prompt_text: str,
-):
+) -> list[BatchFile]:
     """Base64 encodes image, converts to expected jsonl format and uploads"""
     trace_id = image_files[0].trace_id
     current_index = 0
@@ -71,7 +71,7 @@ def _jsonl_format(prompt_custom_id: str, encoded_image: str, prompt_text: str):
         "method": "POST",
         "url": "/chat/completions",
         "body": {
-            "model": os.getenv("AZURE_OPENAI_API_DEPLOYMENT"),
+            "model": os.getenv("AZURE_API_DEPLOYMENT"),
             "messages": [
                 {
                     "role": "user",
parallex/file_management/remote_file_handler.py ADDED
@@ -0,0 +1,7 @@
+class RemoteFileHandler:
+    def __init__(self):
+        self.created_files = set()
+
+    def add_file(self, file_name: str) -> None:
+        if file_name is not None:
+            self.created_files.add(file_name)
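Combined with the OpenAIClient changes above, the handler is just a null-safe set of remote file ids that can be swept once a run finishes. A minimal usage sketch, assuming the same cleanup loop that parallex.py adds below; the file path is illustrative:

    from parallex.ai.open_ai_client import OpenAIClient
    from parallex.file_management.remote_file_handler import RemoteFileHandler

    async def run_and_clean_up() -> None:
        handler = RemoteFileHandler()
        client = OpenAIClient(model="gpt-4o", remote_file_handler=handler)
        try:
            uploaded = await client.upload("/tmp/pages.jsonl")  # id recorded by the handler
            await client.create_batch(uploaded.id)  # input/output/error ids recorded; None values skipped
        finally:
            # every remote file the client touched is remembered, so cleanup is one loop
            for file_id in handler.created_files:
                await client.delete_file(file_id)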
parallex/parallex.py CHANGED
@@ -9,7 +9,8 @@ from parallex.ai.output_processor import process_output
 from parallex.ai.uploader import upload_images_for_processing
 from parallex.file_management.converter import convert_pdf_to_images
 from parallex.file_management.file_finder import add_file_to_temp_directory
-from parallex.models.image_file import ImageFile
+from parallex.file_management.remote_file_handler import RemoteFileHandler
+from parallex.models.batch_file import BatchFile
 from parallex.models.parallex_callable_output import ParallexCallableOutput
 from parallex.models.upload_batch import UploadBatch
 from parallex.utils.constants import DEFAULT_PROMPT
@@ -26,9 +27,32 @@ async def parallex(
     log_level: Optional[str] = "ERROR",
 ) -> ParallexCallableOutput:
     setup_logger(log_level)
-    with tempfile.TemporaryDirectory() as temp_directory:
-        open_ai_client = OpenAIClient(model=model)
+    remote_file_handler = RemoteFileHandler()
+    open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
+    try:
+        return await _execute(
+            open_ai_client=open_ai_client,
+            pdf_source_url=pdf_source_url,
+            post_process_callable=post_process_callable,
+            concurrency=concurrency,
+            prompt_text=prompt_text,
+        )
+    except Exception as e:
+        logger.error(f"Error occurred: {e}")
+    finally:
+        for file in remote_file_handler.created_files:
+            logger.info(f"deleting - {file}")
+            await open_ai_client.delete_file(file)
 
+
+async def _execute(
+    open_ai_client: OpenAIClient,
+    pdf_source_url: str,
+    post_process_callable: Optional[Callable[..., None]] = None,
+    concurrency: Optional[int] = 20,
+    prompt_text: Optional[str] = DEFAULT_PROMPT,
+) -> ParallexCallableOutput:
+    with tempfile.TemporaryDirectory() as temp_directory:
         raw_file = await add_file_to_temp_directory(
             pdf_source_url=pdf_source_url, temp_directory=temp_directory
         )
@@ -47,7 +71,7 @@ async def parallex(
         start_batch_tasks = []
         for file in batch_files:
             batch_task = asyncio.create_task(
-                _create_images_and_batch_jobs(
+                _create_batch_jobs(
                     batch_file=file,
                     client=open_ai_client,
                     trace_id=trace_id,
@@ -55,11 +79,11 @@ async def parallex(
                 )
             )
             start_batch_tasks.append(batch_task)
-        batches = await asyncio.gather(*start_batch_tasks)
+        batch_jobs = await asyncio.gather(*start_batch_tasks)
 
         pages_tasks = []
         process_semaphore = asyncio.Semaphore(concurrency)
-        for batch in batches:
+        for batch in batch_jobs:
             page_task = asyncio.create_task(
                 _wait_and_create_pages(
                     batch=batch, client=open_ai_client, semaphore=process_semaphore
@@ -69,7 +93,7 @@ async def parallex(
         page_groups = await asyncio.gather(*pages_tasks)
 
         pages = [page for batch_pages in page_groups for page in batch_pages]
-        logger.debug(f"pages done. total pages- {len(pages)} - {trace_id}")
+        logger.info(f"pages done. total pages- {len(pages)} - {trace_id}")
         sorted_pages = sorted(pages, key=lambda x: x.page_number)
 
         # TODO add combined version of MD to output / save to file system
@@ -88,30 +112,23 @@ async def _wait_and_create_pages(
     batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
 ):
     async with semaphore:
-        logger.debug(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
+        logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
         output_file_id = await wait_for_batch_completion(client=client, batch=batch)
-        logger.debug(f"batch completed - {batch.id} - {batch.trace_id}")
+        logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
         page_responses = await process_output(
             client=client, output_file_id=output_file_id
         )
-        await _remove_global_batch_files(client=client, batch=batch)
         return page_responses
 
 
-async def _remove_global_batch_files(client: OpenAIClient, batch: UploadBatch):
-    file_ids = [batch.input_file_id, batch.output_file_id, batch.error_file_id]
-    for file_id in file_ids:
-        await client.delete_file(file_id)
-
-
-async def _create_images_and_batch_jobs(
-    batch_file: ImageFile,
+async def _create_batch_jobs(
+    batch_file: BatchFile,
     client: OpenAIClient,
     trace_id: UUID,
     semaphore: asyncio.Semaphore,
 ):
     async with semaphore:
-        batch = await create_batch(
+        upload_batch = await create_batch(
             client=client, file_id=batch_file.id, trace_id=trace_id
         )
-        return batch
+        return upload_batch
parallex/utils/logger.py CHANGED
@@ -1,8 +1,8 @@
 import logging
 
-from aiologger import Logger
+from aiologger.loggers.json import JsonLogger
 
-logger = Logger.with_default_handlers(name="parallex")
+logger = JsonLogger.with_default_handlers(name="parallex")
 
 
 def setup_logger(level: str = "ERROR"):
@@ -15,6 +15,4 @@ def setup_logger(level: str = "ERROR"):
         "NOTSET": logging.NOTSET,
     }.get(level, logging.INFO)
 
-    logging.basicConfig(
-        level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-    )
+    logger.setLevel = level
parallex-0.1.3.dist-info/METADATA → parallex-0.2.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: parallex
-Version: 0.1.3
+Version: 0.2.0
 Summary: PDF to markdown using Azure OpenAI batch processing
 Home-page: https://github.com/Summed-AI/parallex
 Author: Jeff Hostetler
@@ -45,10 +45,10 @@ import os
 from parallex.models.parallex_callable_output import ParallexCallableOutput
 from parallex.parallex import parallex
 
-os.environ["AZURE_OPENAI_API_KEY"] = "key"
-os.environ["AZURE_OPENAI_ENDPOINT"] = "your-endpoint.com"
-os.environ["AZURE_OPENAI_API_VERSION"] = "deployment_version"
-os.environ["AZURE_OPENAI_API_DEPLOYMENT"] = "deployment_name"
+os.environ["AZURE_API_KEY"] = "key"
+os.environ["AZURE_API_BASE"] = "your-endpoint.com"
+os.environ["AZURE_API_VERSION"] = "deployment_version"
+os.environ["AZURE_API_DEPLOYMENT"] = "deployment_name"
 
 model = "gpt-4o"
 
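Because the environment variable names changed, existing callers need to switch to the renamed variables when upgrading. A minimal calling sketch under the new names; the PDF URL and the asyncio wrapper are illustrative, and the output fields are not shown in this diff:

    import asyncio
    import os

    from parallex.parallex import parallex

    os.environ["AZURE_API_KEY"] = "key"
    os.environ["AZURE_API_BASE"] = "your-endpoint.com"
    os.environ["AZURE_API_VERSION"] = "deployment_version"
    os.environ["AZURE_API_DEPLOYMENT"] = "deployment_name"

    async def main() -> None:
        output = await parallex(
            model="gpt-4o",
            pdf_source_url="https://example.com/sample.pdf",  # illustrative URL
        )
        print(output)  # ParallexCallableOutput; field names are not shown in this diff

    asyncio.run(main())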
parallex-0.1.3.dist-info/RECORD → parallex-0.2.0.dist-info/RECORD
@@ -1,10 +1,11 @@
 parallex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parallex/ai/batch_processor.py,sha256=wftq7-EKWbWO_tzz2PqVZa8XRVf7MoJlr7EcllX4-5I,1481
-parallex/ai/open_ai_client.py,sha256=Yvnvg5MGEyQrmN3HF5k8fEWse9Slthy3J-oumO6ZKkQ,1459
+parallex/ai/batch_processor.py,sha256=O5q_jaIU0VI93p7Riq4aZ_qUiN9Omxp5GOfn0IqEYgo,1361
+parallex/ai/open_ai_client.py,sha256=TRH78oYod_EWpp3hjEh097OT7hwsQmtv44_j3X9Frxo,2047
 parallex/ai/output_processor.py,sha256=P6ak7cblRHnsR1W7oEtbOGM7zd7tzZbRKigixQaXWyw,966
-parallex/ai/uploader.py,sha256=M8g8dC_bwiGNDI_S5qxcRqJljDu6KSan_eIcQWA-ERA,3162
+parallex/ai/uploader.py,sha256=92P0LxLuRgtjtD4kLtM0n8WUww_8-GImLxb3pbl-kkg,3174
 parallex/file_management/converter.py,sha256=Rj-93LXNl2gCY-XUOCZv7DdCNI2-GyRpS5FobnTqwzo,1111
 parallex/file_management/file_finder.py,sha256=BPvrkxZlwOYmRXzzS138wGTsVzuhDIKfQZn0CISUj3o,1598
+parallex/file_management/remote_file_handler.py,sha256=jsI9NhOrKQR8K3yo536lGplVBGis9XY0G4dRpumgWFM,213
 parallex/file_management/utils.py,sha256=WMdXd9UOFbJDHnL2IWfDXyyD2jhwnGtpCVI_npiSlIk,98
 parallex/models/batch_file.py,sha256=JwARFB48sMOTN-wf7J5YbsWIac2rxXnZ4fBABFESA0M,405
 parallex/models/image_file.py,sha256=LjQne2b6rIDWpQpdYT41KXNDWpg5kv9bkM1SCx6jnAI,402
@@ -12,10 +13,10 @@ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaB
 parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
 parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
 parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
-parallex/parallex.py,sha256=RvSDNyjrE7Fd3LuatEt18_1sR_btPJ-i6uxjNr_dGh0,4502
+parallex/parallex.py,sha256=EkD_kZevDu0UBpRet3nsvIr826f7uBHiT0JA5hR3E8c,5117
 parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
-parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
-parallex-0.1.3.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
-parallex-0.1.3.dist-info/METADATA,sha256=oOHuVOKgITHhEjzSQQW4Odo06tZjFfY1B2AVR8q9SDg,3444
-parallex-0.1.3.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-parallex-0.1.3.dist-info/RECORD,,
+parallex/utils/logger.py,sha256=i3ZZ7YTUmhUStbvVME67F9ffnkLOv5ijm7wVUyJT8Ys,440
+parallex-0.2.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
+parallex-0.2.0.dist-info/METADATA,sha256=Aq2RRlLwkXcLZ_wNXLZAsydFYJqTSe47eVhAq78oja8,3416
+parallex-0.2.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+parallex-0.2.0.dist-info/RECORD,,