parallex 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parallex/parallex.py +3 -3
- parallex-0.1.1.dist-info/METADATA +93 -0
- {parallex-0.1.0.dist-info → parallex-0.1.1.dist-info}/RECORD +5 -5
- parallex-0.1.0.dist-info/METADATA +0 -42
- {parallex-0.1.0.dist-info → parallex-0.1.1.dist-info}/LICENSE +0 -0
- {parallex-0.1.0.dist-info → parallex-0.1.1.dist-info}/WHEEL +0 -0
parallex/parallex.py
CHANGED
@@ -21,9 +21,9 @@ async def parallex(
|
|
21
21
|
model: str,
|
22
22
|
pdf_source_url: str,
|
23
23
|
post_process_callable: Optional[Callable[..., None]] = None,
|
24
|
-
concurrency: int = 20,
|
25
|
-
prompt_text: str = DEFAULT_PROMPT,
|
26
|
-
log_level: str = "ERROR",
|
24
|
+
concurrency: Optional[int] = 20,
|
25
|
+
prompt_text: Optional[str] = DEFAULT_PROMPT,
|
26
|
+
log_level: Optional[str] = "ERROR",
|
27
27
|
) -> ParallexCallableOutput:
|
28
28
|
setup_logger(log_level)
|
29
29
|
with tempfile.TemporaryDirectory() as temp_directory:
|
@@ -0,0 +1,93 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: parallex
|
3
|
+
Version: 0.1.1
|
4
|
+
Summary:
|
5
|
+
Author: Jeff Hostetler
|
6
|
+
Author-email: jeff@summed.ai
|
7
|
+
Requires-Python: >=3.12,<4.0
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
10
|
+
Classifier: Programming Language :: Python :: 3.13
|
11
|
+
Requires-Dist: aiologger (>=0.7.0,<0.8.0)
|
12
|
+
Requires-Dist: asyncio (>=3.4.3,<4.0.0)
|
13
|
+
Requires-Dist: httpx (>=0.27.2,<0.28.0)
|
14
|
+
Requires-Dist: openai (>=1.54.4,<2.0.0)
|
15
|
+
Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
16
|
+
Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
17
|
+
Description-Content-Type: text/markdown
|
18
|
+
|
19
|
+
# Parallex
|
20
|
+
|
21
|
+
### What it does
|
22
|
+
- Converts PDF into images
|
23
|
+
- Makes requests to Azure OpenAI to convert the images to markdown using the Batch API
|
24
|
+
- [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
|
25
|
+
- [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
|
26
|
+
- Polls for batch completion and then converts AI responses into structured output based on the page of the corresponding PDF
|
27
|
+
- Post batch processing to do what you wish with the resulting markdown
|
28
|
+
|
29
|
+
### Requirements
|
30
|
+
Parallex uses `graphicsmagick` for the conversion of PDF to images.
|
31
|
+
```bash
|
32
|
+
brew install graphicsmagick
|
33
|
+
```
|
34
|
+
|
35
|
+
|
36
|
+
### Example usage
|
37
|
+
|
38
|
+
```python
|
39
|
+
import os
|
40
|
+
from parallex.models.parallex_callable_output import ParallexCallableOutput
|
41
|
+
from parallex.parallex import parallex
|
42
|
+
|
43
|
+
os.environ["AZURE_OPENAI_API_KEY"] = "key"
|
44
|
+
os.environ["AZURE_OPENAI_ENDPOINT"] = "your-endpoint.com"
|
45
|
+
os.environ["AZURE_OPENAI_API_VERSION"] = "deployment_version"
|
46
|
+
os.environ["AZURE_OPENAI_API_DEPLOYMENT"] = "deployment_name"
|
47
|
+
|
48
|
+
model = "gpt-4o"
|
49
|
+
|
50
|
+
async def some_operation(file_url: str) -> None:
|
51
|
+
response_data: ParallexCallableOutput = await parallex(
|
52
|
+
model=model,
|
53
|
+
pdf_source_url=file_url,
|
54
|
+
post_process_callable=example_post_process, # Optional
|
55
|
+
concurrency=2, # Optional
|
56
|
+
prompt_text="Turn images into markdown", # Optional
|
57
|
+
log_level="ERROR" # Optional
|
58
|
+
)
|
59
|
+
pages = response_data.pages
|
60
|
+
|
61
|
+
def example_post_process(output: ParallexCallableOutput) -> None:
|
62
|
+
file_name = output.file_name
|
63
|
+
pages = output.pages
|
64
|
+
for page in pages:
|
65
|
+
markdown_for_page = page.output_content
|
66
|
+
pdf_page_number = page.page_number
|
67
|
+
|
68
|
+
```
|
69
|
+
|
70
|
+
Responses have the following structure:
|
71
|
+
```python
|
72
|
+
class ParallexCallableOutput(BaseModel):
|
73
|
+
file_name: str = Field(description="Name of file that is processed")
|
74
|
+
pdf_source_url: str = Field(description="Given URL of the source of output")
|
75
|
+
trace_id: UUID = Field(description="Unique trace for each file")
|
76
|
+
pages: list[PageResponse] = Field(description="List of PageResponse objects")
|
77
|
+
|
78
|
+
class PageResponse(BaseModel):
|
79
|
+
output_content: str = Field(description="Markdown generated for the page")
|
80
|
+
page_number: int = Field(description="Page number of the associated PDF")
|
81
|
+
```
|
82
|
+
|
83
|
+
### Default prompt is
|
84
|
+
```python
|
85
|
+
"""
|
86
|
+
Convert the following PDF page to markdown.
|
87
|
+
Return only the markdown with no explanation text.
|
88
|
+
Leave out any page numbers and redundant headers or footers.
|
89
|
+
Do not include any code blocks (e.g. "```markdown" or "```") in the response.
|
90
|
+
If unable to parse, return an empty string.
|
91
|
+
"""
|
92
|
+
```
|
93
|
+
|
@@ -12,10 +12,10 @@ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaB
|
|
12
12
|
parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
|
13
13
|
parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
|
14
14
|
parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
|
15
|
-
parallex/parallex.py,sha256=
|
15
|
+
parallex/parallex.py,sha256=RvSDNyjrE7Fd3LuatEt18_1sR_btPJ-i6uxjNr_dGh0,4502
|
16
16
|
parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
|
17
17
|
parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
|
18
|
-
parallex-0.1.
|
19
|
-
parallex-0.1.
|
20
|
-
parallex-0.1.
|
21
|
-
parallex-0.1.
|
18
|
+
parallex-0.1.1.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
|
19
|
+
parallex-0.1.1.dist-info/METADATA,sha256=-Sx_c_BTiA5GXur_eAXLaONi8MAjto5u4_NkRGTSKxU,3230
|
20
|
+
parallex-0.1.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
21
|
+
parallex-0.1.1.dist-info/RECORD,,
|
@@ -1,42 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: parallex
|
3
|
-
Version: 0.1.0
|
4
|
-
Summary:
|
5
|
-
Author: Jeff Hostetler
|
6
|
-
Author-email: jeff@summed.ai
|
7
|
-
Requires-Python: >=3.12,<4.0
|
8
|
-
Classifier: Programming Language :: Python :: 3
|
9
|
-
Classifier: Programming Language :: Python :: 3.12
|
10
|
-
Classifier: Programming Language :: Python :: 3.13
|
11
|
-
Requires-Dist: aiologger (>=0.7.0,<0.8.0)
|
12
|
-
Requires-Dist: asyncio (>=3.4.3,<4.0.0)
|
13
|
-
Requires-Dist: httpx (>=0.27.2,<0.28.0)
|
14
|
-
Requires-Dist: openai (>=1.54.4,<2.0.0)
|
15
|
-
Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
|
16
|
-
Requires-Dist: pydantic (>=2.9.2,<3.0.0)
|
17
|
-
Description-Content-Type: text/markdown
|
18
|
-
|
19
|
-
# Parallex
|
20
|
-
|
21
|
-
### What it does
|
22
|
-
- Converts file into images
|
23
|
-
- Makes requests to OpenAI to convert the images to markdown
|
24
|
-
- [Azure OpenAPI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
|
25
|
-
- [OpenAPI Batch](https://platform.openai.com/docs/guides/batch)
|
26
|
-
- Post batch processing to do what you wish with the resulting markdown
|
27
|
-
|
28
|
-
|
29
|
-
# Notes for us as we build
|
30
|
-
### Poetry
|
31
|
-
- Using [poetry](https://python-poetry.org/docs/) for dependency management
|
32
|
-
- add dependency `poetry add pydantic`
|
33
|
-
- add dev dependency `poetry add --group dev black`
|
34
|
-
- run main script `poetry run python main.py`
|
35
|
-
- run dev commands `poetry run black parallex`
|
36
|
-
|
37
|
-
|
38
|
-
# General behavior
|
39
|
-
- parallex takes args to do things with file
|
40
|
-
- parallex takes args to specify llm model
|
41
|
-
- parallex takes a callable to execute once batch process is "ready"
|
42
|
-
|
File without changes
|
File without changes
|