parallex 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- parallex/parallex.py +3 -3
- parallex-0.1.2.dist-info/METADATA +95 -0
- {parallex-0.1.0.dist-info → parallex-0.1.2.dist-info}/RECORD +5 -5
- parallex-0.1.0.dist-info/METADATA +0 -42
- {parallex-0.1.0.dist-info → parallex-0.1.2.dist-info}/LICENSE +0 -0
- {parallex-0.1.0.dist-info → parallex-0.1.2.dist-info}/WHEEL +0 -0
parallex/parallex.py
CHANGED
@@ -21,9 +21,9 @@ async def parallex(
|
|
21
21
|
model: str,
|
22
22
|
pdf_source_url: str,
|
23
23
|
post_process_callable: Optional[Callable[..., None]] = None,
|
24
|
-
concurrency: int = 20,
|
25
|
-
prompt_text: str = DEFAULT_PROMPT,
|
26
|
-
log_level: str = "ERROR",
|
24
|
+
concurrency: Optional[int] = 20,
|
25
|
+
prompt_text: Optional[str] = DEFAULT_PROMPT,
|
26
|
+
log_level: Optional[str] = "ERROR",
|
27
27
|
) -> ParallexCallableOutput:
|
28
28
|
setup_logger(log_level)
|
29
29
|
with tempfile.TemporaryDirectory() as temp_directory:
|
@@ -0,0 +1,95 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: parallex
|
3
|
+
Version: 0.1.2
|
4
|
+
Summary: PDF to markdown using Azure OpenAI batch processing
|
5
|
+
Home-page: https://github.com/Summed-AI/parallex
|
6
|
+
Author: Jeff Hostetler
|
7
|
+
Author-email: jeff@summed.ai
|
8
|
+
Requires-Python: >=3.12,<4.0
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
12
|
+
Requires-Dist: aiologger (>=0.7.0,<0.8.0)
|
13
|
+
Requires-Dist: asyncio (>=3.4.3,<4.0.0)
|
14
|
+
Requires-Dist: httpx (>=0.27.2,<0.28.0)
|
15
|
+
Requires-Dist: openai (>=1.54.4,<2.0.0)
|
16
|
+
Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
17
|
+
Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
18
|
+
Project-URL: Repository, https://github.com/Summed-AI/parallex
|
19
|
+
Description-Content-Type: text/markdown
|
20
|
+
|
21
|
+
# Parallex
|
22
|
+
|
23
|
+
### What it does
|
24
|
+
- Converts PDF into images
|
25
|
+
- Makes requests to Azure OpenAI to convert the images to markdown using the Batch API
|
26
|
+
- [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
|
27
|
+
- [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
|
28
|
+
- Polls for batch completion and then converts AI responses into structured output based on the page of the corresponding PDF
|
29
|
+
- Post batch processing to do what you wish with the resulting markdown
|
30
|
+
|
31
|
+
### Requirements
|
32
|
+
Parallex uses `graphicsmagick` for the conversion of PDF to images.
|
33
|
+
```bash
|
34
|
+
brew install graphicsmagick
|
35
|
+
```
|
36
|
+
|
37
|
+
|
38
|
+
### Example usage
|
39
|
+
|
40
|
+
```python
|
41
|
+
import os
|
42
|
+
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
43
|
+
from parallex.parallex import parallex
|
44
|
+
|
45
|
+
os.environ["AZURE_OPENAI_API_KEY"] = "key"
|
46
|
+
os.environ["AZURE_OPENAI_ENDPOINT"] = "your-endpoint.com"
|
47
|
+
os.environ["AZURE_OPENAI_API_VERSION"] = "deployment_version"
|
48
|
+
os.environ["AZURE_OPENAI_API_DEPLOYMENT"] = "deployment_name"
|
49
|
+
|
50
|
+
model = "gpt-4o"
|
51
|
+
|
52
|
+
async def some_operation(file_url: str) -> None:
|
53
|
+
response_data: ParallexCallableOutput = await parallex(
|
54
|
+
model=model,
|
55
|
+
pdf_source_url=file_url,
|
56
|
+
post_process_callable=example_post_process, # Optional
|
57
|
+
concurrency=2, # Optional
|
58
|
+
prompt_text="Turn images into markdown", # Optional
|
59
|
+
log_level="ERROR" # Optional
|
60
|
+
)
|
61
|
+
pages = response_data.pages
|
62
|
+
|
63
|
+
def example_post_process(output: ParallexCallableOutput) -> None:
|
64
|
+
file_name = output.file_name
|
65
|
+
pages = output.pages
|
66
|
+
for page in pages:
|
67
|
+
markdown_for_page = page.output_content
|
68
|
+
pdf_page_number = page.page_number
|
69
|
+
|
70
|
+
```
|
71
|
+
|
72
|
+
Responses have the following structure:
|
73
|
+
```python
|
74
|
+
class ParallexCallableOutput(BaseModel):
|
75
|
+
file_name: str = Field(description="Name of file that is processed")
|
76
|
+
pdf_source_url: str = Field(description="Given URL of the source of output")
|
77
|
+
trace_id: UUID = Field(description="Unique trace for each file")
|
78
|
+
pages: list[PageResponse] = Field(description="List of PageResponse objects")
|
79
|
+
|
80
|
+
class PageResponse(BaseModel):
|
81
|
+
output_content: str = Field(description="Markdown generated for the page")
|
82
|
+
page_number: int = Field(description="Page number of the associated PDF")
|
83
|
+
```
|
84
|
+
|
85
|
+
### Default prompt
|
86
|
+
```python
|
87
|
+
"""
|
88
|
+
Convert the following PDF page to markdown.
|
89
|
+
Return only the markdown with no explanation text.
|
90
|
+
Leave out any page numbers and redundant headers or footers.
|
91
|
+
Do not include any code blocks (e.g. "```markdown" or "```") in the response.
|
92
|
+
If unable to parse, return an empty string.
|
93
|
+
"""
|
94
|
+
```
|
95
|
+
|
@@ -12,10 +12,10 @@ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaB
|
|
12
12
|
parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
|
13
13
|
parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
|
14
14
|
parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
|
15
|
-
parallex/parallex.py,sha256=
|
15
|
+
parallex/parallex.py,sha256=RvSDNyjrE7Fd3LuatEt18_1sR_btPJ-i6uxjNr_dGh0,4502
|
16
16
|
parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
|
17
17
|
parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
|
18
|
-
parallex-0.1.
|
19
|
-
parallex-0.1.
|
20
|
-
parallex-0.1.
|
21
|
-
parallex-0.1.
|
18
|
+
parallex-0.1.2.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
|
19
|
+
parallex-0.1.2.dist-info/METADATA,sha256=MJ1bOEQ2MXvKhha6y1ehwMFzqf54DBKVuMxkipPM0tY,3393
|
20
|
+
parallex-0.1.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
21
|
+
parallex-0.1.2.dist-info/RECORD,,
|
@@ -1,42 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: parallex
|
3
|
-
Version: 0.1.0
|
4
|
-
Summary:
|
5
|
-
Author: Jeff Hostetler
|
6
|
-
Author-email: jeff@summed.ai
|
7
|
-
Requires-Python: >=3.12,<4.0
|
8
|
-
Classifier: Programming Language :: Python :: 3
|
9
|
-
Classifier: Programming Language :: Python :: 3.12
|
10
|
-
Classifier: Programming Language :: Python :: 3.13
|
11
|
-
Requires-Dist: aiologger (>=0.7.0,<0.8.0)
|
12
|
-
Requires-Dist: asyncio (>=3.4.3,<4.0.0)
|
13
|
-
Requires-Dist: httpx (>=0.27.2,<0.28.0)
|
14
|
-
Requires-Dist: openai (>=1.54.4,<2.0.0)
|
15
|
-
Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
16
|
-
Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
17
|
-
Description-Content-Type: text/markdown
|
18
|
-
|
19
|
-
# Parallex
|
20
|
-
|
21
|
-
### What it does
|
22
|
-
- Converts file into images
|
23
|
-
- Makes requests to OpenAI to covert the images to markdown
|
24
|
-
- [Azure OpenAPI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
|
25
|
-
- [OpenAPI Batch](https://platform.openai.com/docs/guides/batch)
|
26
|
-
- Post batch processing to do what you wish with the resulting markdown
|
27
|
-
|
28
|
-
|
29
|
-
# Notes for us as we build
|
30
|
-
### Poetry
|
31
|
-
- Using [poetry](https://python-poetry.org/docs/) for dependency management
|
32
|
-
- add dependency `poetry add pydantic`
|
33
|
-
- add dev dependency `poetry add --group dev black`
|
34
|
-
- run main script `poetry run python main.py`
|
35
|
-
- run dev commands `poetry run black parallex`
|
36
|
-
|
37
|
-
|
38
|
-
# General behavior
|
39
|
-
- parallex takes args to do things with file
|
40
|
-
- parallex takes args to specify llm model
|
41
|
-
- parallex takes a callable to execute once batch process is "ready"
|
42
|
-
|
File without changes
|
File without changes
|