parallex 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
parallex/parallex.py CHANGED
@@ -21,9 +21,9 @@ async def parallex(
21
21
  model: str,
22
22
  pdf_source_url: str,
23
23
  post_process_callable: Optional[Callable[..., None]] = None,
24
- concurrency: int = 20,
25
- prompt_text: str = DEFAULT_PROMPT,
26
- log_level: str = "ERROR",
24
+ concurrency: Optional[int] = 20,
25
+ prompt_text: Optional[str] = DEFAULT_PROMPT,
26
+ log_level: Optional[str] = "ERROR",
27
27
  ) -> ParallexCallableOutput:
28
28
  setup_logger(log_level)
29
29
  with tempfile.TemporaryDirectory() as temp_directory:
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.1
2
+ Name: parallex
3
+ Version: 0.1.2
4
+ Summary: PDF to markdown using Azure OpenAI batch processing
5
+ Home-page: https://github.com/Summed-AI/parallex
6
+ Author: Jeff Hostetler
7
+ Author-email: jeff@summed.ai
8
+ Requires-Python: >=3.12,<4.0
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Requires-Dist: aiologger (>=0.7.0,<0.8.0)
13
+ Requires-Dist: asyncio (>=3.4.3,<4.0.0)
14
+ Requires-Dist: httpx (>=0.27.2,<0.28.0)
15
+ Requires-Dist: openai (>=1.54.4,<2.0.0)
16
+ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
17
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
18
+ Project-URL: Repository, https://github.com/Summed-AI/parallex
19
+ Description-Content-Type: text/markdown
20
+
21
+ # Parallex
22
+
23
+ ### What it does
24
+ - Converts PDF into images
25
+ - Makes requests to Azure OpenAI to convert the images to markdown using the Batch API
26
+ - [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
27
+ - [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
28
+ - Polls for batch completion and then converts AI responses into structured output based on the page of the corresponding PDF
29
+ - Post batch processing to do what you wish with the resulting markdown
30
+
31
+ ### Requirements
32
+ Parallex uses `graphicsmagick` for the conversion of PDF to images.
33
+ ```bash
34
+ brew install graphicsmagick
35
+ ```
36
+
37
+
38
+ ### Example usage
39
+
40
+ ```python
41
+ import os
42
+ from parallex.models.parallex_callable_output import ParallexCallableOutput
43
+ from parallex.parallex import parallex
44
+
45
+ os.environ["AZURE_OPENAI_API_KEY"] = "key"
46
+ os.environ["AZURE_OPENAI_ENDPOINT"] = "your-endpoint.com"
47
+ os.environ["AZURE_OPENAI_API_VERSION"] = "deployment_version"
48
+ os.environ["AZURE_OPENAI_API_DEPLOYMENT"] = "deployment_name"
49
+
50
+ model = "gpt-4o"
51
+
52
+ async def some_operation(file_url: str) -> None:
53
+ response_data: ParallexCallableOutput = await parallex(
54
+ model=model,
55
+ pdf_source_url=file_url,
56
+ post_process_callable=example_post_process, # Optional
57
+ concurrency=2, # Optional
58
+ prompt_text="Turn images into markdown", # Optional
59
+ log_level="ERROR" # Optional
60
+ )
61
+ pages = response_data.pages
62
+
63
+ def example_post_process(output: ParallexCallableOutput) -> None:
64
+ file_name = output.file_name
65
+ pages = output.pages
66
+ for page in pages:
67
+ markdown_for_page = page.output_content
68
+ pdf_page_number = page.page_number
69
+
70
+ ```
71
+
72
+ Responses have the following structure:
73
+ ```python
74
+ class ParallexCallableOutput(BaseModel):
75
+ file_name: str = Field(description="Name of file that is processed")
76
+ pdf_source_url: str = Field(description="Given URL of the source of output")
77
+ trace_id: UUID = Field(description="Unique trace for each file")
78
+ pages: list[PageResponse] = Field(description="List of PageResponse objects")
79
+
80
+ class PageResponse(BaseModel):
81
+ output_content: str = Field(description="Markdown generated for the page")
82
+ page_number: int = Field(description="Page number of the associated PDF")
83
+ ```
84
+
85
+ ### Default prompt is
86
+ ```python
87
+ """
88
+ Convert the following PDF page to markdown.
89
+ Return only the markdown with no explanation text.
90
+ Leave out any page numbers and redundant headers or footers.
91
+ Do not include any code blocks (e.g. "```markdown" or "```") in the response.
92
+ If unable to parse, return an empty string.
93
+ """
94
+ ```
95
+
@@ -12,10 +12,10 @@ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaB
12
12
  parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
13
13
  parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
14
14
  parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
15
- parallex/parallex.py,sha256=2XXmG54eXtXnw2ElC12zjbWDnwDIHPgzKY1ktP8V93M,4472
15
+ parallex/parallex.py,sha256=RvSDNyjrE7Fd3LuatEt18_1sR_btPJ-i6uxjNr_dGh0,4502
16
16
  parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
17
17
  parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
18
- parallex-0.1.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
19
- parallex-0.1.0.dist-info/METADATA,sha256=ICDxk_FnofGhJgeGXv_g-awaZm_g4zU23PkfuXOndNs,1485
20
- parallex-0.1.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
21
- parallex-0.1.0.dist-info/RECORD,,
18
+ parallex-0.1.2.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
19
+ parallex-0.1.2.dist-info/METADATA,sha256=MJ1bOEQ2MXvKhha6y1ehwMFzqf54DBKVuMxkipPM0tY,3393
20
+ parallex-0.1.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
21
+ parallex-0.1.2.dist-info/RECORD,,
@@ -1,42 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: parallex
3
- Version: 0.1.0
4
- Summary:
5
- Author: Jeff Hostetler
6
- Author-email: jeff@summed.ai
7
- Requires-Python: >=3.12,<4.0
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.12
10
- Classifier: Programming Language :: Python :: 3.13
11
- Requires-Dist: aiologger (>=0.7.0,<0.8.0)
12
- Requires-Dist: asyncio (>=3.4.3,<4.0.0)
13
- Requires-Dist: httpx (>=0.27.2,<0.28.0)
14
- Requires-Dist: openai (>=1.54.4,<2.0.0)
15
- Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
16
- Requires-Dist: pydantic (>=2.9.2,<3.0.0)
17
- Description-Content-Type: text/markdown
18
-
19
- # Parallex
20
-
21
- ### What it does
22
- - Converts file into images
23
- - Makes requests to OpenAI to covert the images to markdown
24
- - [Azure OpenAPI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
25
- - [OpenAPI Batch](https://platform.openai.com/docs/guides/batch)
26
- - Post batch processing to do what you wish with the resulting markdown
27
-
28
-
29
- # Notes for us as we build
30
- ### Poetry
31
- - Using [poetry](https://python-poetry.org/docs/) for dependency management
32
- - add dependency `poetry add pydantic`
33
- - add dev dependency `poetry add --group dev black`
34
- - run main script `poetry run python main.py`
35
- - run dev commands `poetry run black parallex`
36
-
37
-
38
- # General behavior
39
- - parallex takes args to do things with file
40
- - parallex takes args to specify llm model
41
- - parallex takes a callable to execute once batch process is "ready"
42
-