parallex 0.1.2__tar.gz → 0.1.4__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (22)
  1. {parallex-0.1.2 → parallex-0.1.4}/PKG-INFO +7 -4
  2. {parallex-0.1.2 → parallex-0.1.4}/README.md +6 -3
  3. {parallex-0.1.2 → parallex-0.1.4}/parallex/ai/batch_processor.py +0 -2
  4. {parallex-0.1.2 → parallex-0.1.4}/parallex/ai/open_ai_client.py +16 -4
  5. {parallex-0.1.2 → parallex-0.1.4}/parallex/ai/uploader.py +1 -1
  6. parallex-0.1.4/parallex/file_management/remote_file_handler.py +7 -0
  7. {parallex-0.1.2 → parallex-0.1.4}/parallex/parallex.py +37 -20
  8. {parallex-0.1.2 → parallex-0.1.4}/parallex/utils/logger.py +3 -5
  9. {parallex-0.1.2 → parallex-0.1.4}/pyproject.toml +1 -1
  10. {parallex-0.1.2 → parallex-0.1.4}/LICENSE +0 -0
  11. {parallex-0.1.2 → parallex-0.1.4}/parallex/__init__.py +0 -0
  12. {parallex-0.1.2 → parallex-0.1.4}/parallex/ai/output_processor.py +0 -0
  13. {parallex-0.1.2 → parallex-0.1.4}/parallex/file_management/converter.py +0 -0
  14. {parallex-0.1.2 → parallex-0.1.4}/parallex/file_management/file_finder.py +0 -0
  15. {parallex-0.1.2 → parallex-0.1.4}/parallex/file_management/utils.py +0 -0
  16. {parallex-0.1.2 → parallex-0.1.4}/parallex/models/batch_file.py +0 -0
  17. {parallex-0.1.2 → parallex-0.1.4}/parallex/models/image_file.py +0 -0
  18. {parallex-0.1.2 → parallex-0.1.4}/parallex/models/page_response.py +0 -0
  19. {parallex-0.1.2 → parallex-0.1.4}/parallex/models/parallex_callable_output.py +0 -0
  20. {parallex-0.1.2 → parallex-0.1.4}/parallex/models/raw_file.py +0 -0
  21. {parallex-0.1.2 → parallex-0.1.4}/parallex/models/upload_batch.py +0 -0
  22. {parallex-0.1.2 → parallex-0.1.4}/parallex/utils/constants.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: parallex
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: PDF to markdown using Azure OpenAI batch processing
5
5
  Home-page: https://github.com/Summed-AI/parallex
6
6
  Author: Jeff Hostetler
@@ -22,10 +22,10 @@ Description-Content-Type: text/markdown
22
22
 
23
23
  ### What it does
24
24
  - Converts PDF into images
25
- - Makes requests to Azure OpenAI to covert the images to markdown using Batch API
25
+ - Makes requests to Azure OpenAI to convert the images to markdown using Batch API
26
26
  - [Azure OpenAPI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
27
27
  - [OpenAPI Batch](https://platform.openai.com/docs/guides/batch)
28
- - Polls for batch completion and then coverts AI responses in structured output based on the page of the corresponding PDF
28
+ - Polls for batch completion and then converts AI responses in structured output based on the page of the corresponding PDF
29
29
  - Post batch processing to do what you wish with the resulting markdown
30
30
 
31
31
  ### Requirements
@@ -34,9 +34,12 @@ Parallex uses `graphicsmagick` for the conversion of PDF to images.
34
34
  brew install graphicsmagick
35
35
  ```
36
36
 
37
+ ### Installation
38
+ ```bash
39
+ pip install parallex
40
+ ```
37
41
 
38
42
  ### Example usage
39
-
40
43
  ```python
41
44
  import os
42
45
  from parallex.models.parallex_callable_output import ParallexCallableOutput
@@ -2,10 +2,10 @@
2
2
 
3
3
  ### What it does
4
4
  - Converts PDF into images
5
- - Makes requests to Azure OpenAI to covert the images to markdown using Batch API
5
+ - Makes requests to Azure OpenAI to convert the images to markdown using Batch API
6
6
  - [Azure OpenAPI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
7
7
  - [OpenAPI Batch](https://platform.openai.com/docs/guides/batch)
8
- - Polls for batch completion and then coverts AI responses in structured output based on the page of the corresponding PDF
8
+ - Polls for batch completion and then converts AI responses in structured output based on the page of the corresponding PDF
9
9
  - Post batch processing to do what you wish with the resulting markdown
10
10
 
11
11
  ### Requirements
@@ -14,9 +14,12 @@ Parallex uses `graphicsmagick` for the conversion of PDF to images.
14
14
  brew install graphicsmagick
15
15
  ```
16
16
 
17
+ ### Installation
18
+ ```bash
19
+ pip install parallex
20
+ ```
17
21
 
18
22
  ### Example usage
19
-
20
23
  ```python
21
24
  import os
22
25
  from parallex.models.parallex_callable_output import ParallexCallableOutput
@@ -35,8 +35,6 @@ async def wait_for_batch_completion(client: OpenAIClient, batch: UploadBatch) ->
35
35
  await asyncio.sleep(delay)
36
36
  batch_response = await client.retrieve_batch(batch.id)
37
37
  status = batch_response.status
38
- batch.output_file_id = batch_response.output_file_id
39
- batch.error_file_id = batch_response.error_file_id
40
38
  delay = 30
41
39
  if status == "completed":
42
40
  return batch_response.output_file_id
@@ -4,13 +4,15 @@ from openai import AsyncAzureOpenAI
4
4
  from openai._legacy_response import HttpxBinaryResponseContent
5
5
  from openai.types import FileObject, Batch, FileDeleted
6
6
 
7
+ from parallex.file_management.remote_file_handler import RemoteFileHandler
7
8
  from parallex.utils.logger import logger
8
9
 
9
10
 
10
11
  # Exceptions for missing keys, etc
11
12
  class OpenAIClient:
12
- def __init__(self, model: str):
13
+ def __init__(self, model: str, remote_file_handler: RemoteFileHandler):
13
14
  self.model = model
15
+ self.file_handler = remote_file_handler
14
16
 
15
17
  self._client = AsyncAzureOpenAI(
16
18
  azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
@@ -19,19 +21,29 @@ class OpenAIClient:
19
21
  )
20
22
 
21
23
  async def upload(self, file_path: str) -> FileObject:
22
- return await self._client.files.create(
24
+ file = await self._client.files.create(
23
25
  file=open(file_path, "rb"), purpose="batch"
24
26
  )
27
+ self.file_handler.add_file(file.id)
28
+ return file
25
29
 
26
30
  async def create_batch(self, upload_file_id: str) -> Batch:
27
- return await self._client.batches.create(
31
+ batch = await self._client.batches.create(
28
32
  input_file_id=upload_file_id,
29
33
  endpoint="/chat/completions",
30
34
  completion_window="24h",
31
35
  )
36
+ self.file_handler.add_file(batch.input_file_id)
37
+ self.file_handler.add_file(batch.output_file_id)
38
+ self.file_handler.add_file(batch.error_file_id)
39
+ return batch
32
40
 
33
41
  async def retrieve_batch(self, batch_id: str) -> Batch:
34
- return await self._client.batches.retrieve(batch_id)
42
+ batch = await self._client.batches.retrieve(batch_id)
43
+ self.file_handler.add_file(batch.input_file_id)
44
+ self.file_handler.add_file(batch.output_file_id)
45
+ self.file_handler.add_file(batch.error_file_id)
46
+ return batch
35
47
 
36
48
  async def retrieve_file(self, file_id: str) -> HttpxBinaryResponseContent:
37
49
  return await self._client.files.content(file_id)
@@ -16,7 +16,7 @@ async def upload_images_for_processing(
16
16
  image_files: list[ImageFile],
17
17
  temp_directory: str,
18
18
  prompt_text: str,
19
- ):
19
+ ) -> list[BatchFile]:
20
20
  """Base64 encodes image, converts to expected jsonl format and uploads"""
21
21
  trace_id = image_files[0].trace_id
22
22
  current_index = 0
@@ -0,0 +1,7 @@
1
+ class RemoteFileHandler:
2
+ def __init__(self):
3
+ self.created_files = set()
4
+
5
+ def add_file(self, file_name: str) -> None:
6
+ if file_name is not None:
7
+ self.created_files.add(file_name)
@@ -9,7 +9,8 @@ from parallex.ai.output_processor import process_output
9
9
  from parallex.ai.uploader import upload_images_for_processing
10
10
  from parallex.file_management.converter import convert_pdf_to_images
11
11
  from parallex.file_management.file_finder import add_file_to_temp_directory
12
- from parallex.models.image_file import ImageFile
12
+ from parallex.file_management.remote_file_handler import RemoteFileHandler
13
+ from parallex.models.batch_file import BatchFile
13
14
  from parallex.models.parallex_callable_output import ParallexCallableOutput
14
15
  from parallex.models.upload_batch import UploadBatch
15
16
  from parallex.utils.constants import DEFAULT_PROMPT
@@ -26,9 +27,32 @@ async def parallex(
26
27
  log_level: Optional[str] = "ERROR",
27
28
  ) -> ParallexCallableOutput:
28
29
  setup_logger(log_level)
29
- with tempfile.TemporaryDirectory() as temp_directory:
30
- open_ai_client = OpenAIClient(model=model)
30
+ remote_file_handler = RemoteFileHandler()
31
+ open_ai_client = OpenAIClient(model=model, remote_file_handler=remote_file_handler)
32
+ try:
33
+ return await _execute(
34
+ open_ai_client=open_ai_client,
35
+ pdf_source_url=pdf_source_url,
36
+ post_process_callable=post_process_callable,
37
+ concurrency=concurrency,
38
+ prompt_text=prompt_text,
39
+ )
40
+ except Exception as e:
41
+ logger.error(f"Error occurred: {e}")
42
+ finally:
43
+ for file in remote_file_handler.created_files:
44
+ logger.info(f"deleting - {file}")
45
+ await open_ai_client.delete_file(file)
31
46
 
47
+
48
+ async def _execute(
49
+ open_ai_client: OpenAIClient,
50
+ pdf_source_url: str,
51
+ post_process_callable: Optional[Callable[..., None]] = None,
52
+ concurrency: Optional[int] = 20,
53
+ prompt_text: Optional[str] = DEFAULT_PROMPT,
54
+ ) -> ParallexCallableOutput:
55
+ with tempfile.TemporaryDirectory() as temp_directory:
32
56
  raw_file = await add_file_to_temp_directory(
33
57
  pdf_source_url=pdf_source_url, temp_directory=temp_directory
34
58
  )
@@ -47,7 +71,7 @@ async def parallex(
47
71
  start_batch_tasks = []
48
72
  for file in batch_files:
49
73
  batch_task = asyncio.create_task(
50
- _create_images_and_batch_jobs(
74
+ _create_batch_jobs(
51
75
  batch_file=file,
52
76
  client=open_ai_client,
53
77
  trace_id=trace_id,
@@ -55,11 +79,11 @@ async def parallex(
55
79
  )
56
80
  )
57
81
  start_batch_tasks.append(batch_task)
58
- batches = await asyncio.gather(*start_batch_tasks)
82
+ batch_jobs = await asyncio.gather(*start_batch_tasks)
59
83
 
60
84
  pages_tasks = []
61
85
  process_semaphore = asyncio.Semaphore(concurrency)
62
- for batch in batches:
86
+ for batch in batch_jobs:
63
87
  page_task = asyncio.create_task(
64
88
  _wait_and_create_pages(
65
89
  batch=batch, client=open_ai_client, semaphore=process_semaphore
@@ -69,7 +93,7 @@ async def parallex(
69
93
  page_groups = await asyncio.gather(*pages_tasks)
70
94
 
71
95
  pages = [page for batch_pages in page_groups for page in batch_pages]
72
- logger.debug(f"pages done. total pages- {len(pages)} - {trace_id}")
96
+ logger.info(f"pages done. total pages- {len(pages)} - {trace_id}")
73
97
  sorted_pages = sorted(pages, key=lambda x: x.page_number)
74
98
 
75
99
  # TODO add combined version of MD to output / save to file system
@@ -88,30 +112,23 @@ async def _wait_and_create_pages(
88
112
  batch: UploadBatch, client: OpenAIClient, semaphore: asyncio.Semaphore
89
113
  ):
90
114
  async with semaphore:
91
- logger.debug(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
115
+ logger.info(f"waiting for batch to complete - {batch.id} - {batch.trace_id}")
92
116
  output_file_id = await wait_for_batch_completion(client=client, batch=batch)
93
- logger.debug(f"batch completed - {batch.id} - {batch.trace_id}")
117
+ logger.info(f"batch completed - {batch.id} - {batch.trace_id}")
94
118
  page_responses = await process_output(
95
119
  client=client, output_file_id=output_file_id
96
120
  )
97
- await _remove_global_batch_files(client=client, batch=batch)
98
121
  return page_responses
99
122
 
100
123
 
101
- async def _remove_global_batch_files(client: OpenAIClient, batch: UploadBatch):
102
- file_ids = [batch.input_file_id, batch.output_file_id, batch.error_file_id]
103
- for file_id in file_ids:
104
- await client.delete_file(file_id)
105
-
106
-
107
- async def _create_images_and_batch_jobs(
108
- batch_file: ImageFile,
124
+ async def _create_batch_jobs(
125
+ batch_file: BatchFile,
109
126
  client: OpenAIClient,
110
127
  trace_id: UUID,
111
128
  semaphore: asyncio.Semaphore,
112
129
  ):
113
130
  async with semaphore:
114
- batch = await create_batch(
131
+ upload_batch = await create_batch(
115
132
  client=client, file_id=batch_file.id, trace_id=trace_id
116
133
  )
117
- return batch
134
+ return upload_batch
@@ -1,8 +1,8 @@
1
1
  import logging
2
2
 
3
- from aiologger import Logger
3
+ from aiologger.loggers.json import JsonLogger
4
4
 
5
- logger = Logger.with_default_handlers(name="parallex")
5
+ logger = JsonLogger.with_default_handlers(name="parallex")
6
6
 
7
7
 
8
8
  def setup_logger(level: str = "ERROR"):
@@ -15,6 +15,4 @@ def setup_logger(level: str = "ERROR"):
15
15
  "NOTSET": logging.NOTSET,
16
16
  }.get(level, logging.INFO)
17
17
 
18
- logging.basicConfig(
19
- level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
20
- )
18
+ logger.setLevel = level
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "parallex"
3
- version = "0.1.2"
3
+ version = "0.1.4"
4
4
  description = "PDF to markdown using Azure OpenAI batch processing"
5
5
  authors = ["Jeff Hostetler <jeff@summed.ai>", "Kevin Bao <kevin@summed.ai>"]
6
6
  repository = "https://github.com/Summed-AI/parallex"
File without changes
File without changes