parallex 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
parallex/parallex.py CHANGED
@@ -21,9 +21,9 @@ async def parallex(
21
21
  model: str,
22
22
  pdf_source_url: str,
23
23
  post_process_callable: Optional[Callable[..., None]] = None,
24
- concurrency: int = 20,
25
- prompt_text: str = DEFAULT_PROMPT,
26
- log_level: str = "ERROR",
24
+ concurrency: Optional[int] = 20,
25
+ prompt_text: Optional[str] = DEFAULT_PROMPT,
26
+ log_level: Optional[str] = "ERROR",
27
27
  ) -> ParallexCallableOutput:
28
28
  setup_logger(log_level)
29
29
  with tempfile.TemporaryDirectory() as temp_directory:
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.1
2
+ Name: parallex
3
+ Version: 0.1.1
4
+ Summary:
5
+ Author: Jeff Hostetler
6
+ Author-email: jeff@summed.ai
7
+ Requires-Python: >=3.12,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: aiologger (>=0.7.0,<0.8.0)
12
+ Requires-Dist: asyncio (>=3.4.3,<4.0.0)
13
+ Requires-Dist: httpx (>=0.27.2,<0.28.0)
14
+ Requires-Dist: openai (>=1.54.4,<2.0.0)
15
+ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
16
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
17
+ Description-Content-Type: text/markdown
18
+
19
+ # Parallex
20
+
21
+ ### What it does
22
+ - Converts PDF into images
23
+ - Makes requests to Azure OpenAI to convert the images to markdown using Batch API
24
+ - [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
25
+ - [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
26
+ - Polls for batch completion and then converts AI responses into structured output based on the page of the corresponding PDF
27
+ - Post batch processing to do what you wish with the resulting markdown
28
+
29
+ ### Requirements
30
+ Parallex uses `graphicsmagick` for the conversion of PDF to images.
31
+ ```bash
32
+ brew install graphicsmagick
33
+ ```
34
+
35
+
36
+ ### Example usage
37
+
38
+ ```python
39
+ import os
40
+ from parallex.models.parallex_callable_output import ParallexCallableOutput
41
+ from parallex.parallex import parallex
42
+
43
+ os.environ["AZURE_OPENAI_API_KEY"] = "key"
44
+ os.environ["AZURE_OPENAI_ENDPOINT"] = "your-endpoint.com"
45
+ os.environ["AZURE_OPENAI_API_VERSION"] = "deployment_version"
46
+ os.environ["AZURE_OPENAI_API_DEPLOYMENT"] = "deployment_name"
47
+
48
+ model = "gpt-4o"
49
+
50
+ async def some_operation(file_url: str) -> None:
51
+ response_data: ParallexCallableOutput = await parallex(
52
+ model=model,
53
+ pdf_source_url=file_url,
54
+ post_process_callable=example_post_process, # Optional
55
+ concurrency=2, # Optional
56
+ prompt_text="Turn images into markdown", # Optional
57
+ log_level="ERROR" # Optional
58
+ )
59
+ pages = response_data.pages
60
+
61
+ def example_post_process(output: ParallexCallableOutput) -> None:
62
+ file_name = output.file_name
63
+ pages = output.pages
64
+ for page in pages:
65
+ markdown_for_page = page.output_content
66
+ pdf_page_number = page.page_number
67
+
68
+ ```
69
+
70
+ Responses have the following structure:
71
+ ```python
72
+ class ParallexCallableOutput(BaseModel):
73
+ file_name: str = Field(description="Name of file that is processed")
74
+ pdf_source_url: str = Field(description="Given URL of the source of output")
75
+ trace_id: UUID = Field(description="Unique trace for each file")
76
+ pages: list[PageResponse] = Field(description="List of PageResponse objects")
77
+
78
+ class PageResponse(BaseModel):
79
+ output_content: str = Field(description="Markdown generated for the page")
80
+ page_number: int = Field(description="Page number of the associated PDF")
81
+ ```
82
+
83
+ ### Default prompt is
84
+ ```python
85
+ """
86
+ Convert the following PDF page to markdown.
87
+ Return only the markdown with no explanation text.
88
+ Leave out any page numbers and redundant headers or footers.
89
+ Do not include any code blocks (e.g. "```markdown" or "```") in the response.
90
+ If unable to parse, return an empty string.
91
+ """
92
+ ```
93
+
@@ -12,10 +12,10 @@ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaB
12
12
  parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
13
13
  parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
14
14
  parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
15
- parallex/parallex.py,sha256=2XXmG54eXtXnw2ElC12zjbWDnwDIHPgzKY1ktP8V93M,4472
15
+ parallex/parallex.py,sha256=RvSDNyjrE7Fd3LuatEt18_1sR_btPJ-i6uxjNr_dGh0,4502
16
16
  parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
17
17
  parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
18
- parallex-0.1.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
19
- parallex-0.1.0.dist-info/METADATA,sha256=ICDxk_FnofGhJgeGXv_g-awaZm_g4zU23PkfuXOndNs,1485
20
- parallex-0.1.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
21
- parallex-0.1.0.dist-info/RECORD,,
18
+ parallex-0.1.1.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
19
+ parallex-0.1.1.dist-info/METADATA,sha256=-Sx_c_BTiA5GXur_eAXLaONi8MAjto5u4_NkRGTSKxU,3230
20
+ parallex-0.1.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
21
+ parallex-0.1.1.dist-info/RECORD,,
@@ -1,42 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: parallex
3
- Version: 0.1.0
4
- Summary:
5
- Author: Jeff Hostetler
6
- Author-email: jeff@summed.ai
7
- Requires-Python: >=3.12,<4.0
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.12
10
- Classifier: Programming Language :: Python :: 3.13
11
- Requires-Dist: aiologger (>=0.7.0,<0.8.0)
12
- Requires-Dist: asyncio (>=3.4.3,<4.0.0)
13
- Requires-Dist: httpx (>=0.27.2,<0.28.0)
14
- Requires-Dist: openai (>=1.54.4,<2.0.0)
15
- Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
16
- Requires-Dist: pydantic (>=2.9.2,<3.0.0)
17
- Description-Content-Type: text/markdown
18
-
19
- # Parallex
20
-
21
- ### What it does
22
- - Converts file into images
23
- - Makes requests to OpenAI to covert the images to markdown
24
- - [Azure OpenAPI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
25
- - [OpenAPI Batch](https://platform.openai.com/docs/guides/batch)
26
- - Post batch processing to do what you wish with the resulting markdown
27
-
28
-
29
- # Notes for us as we build
30
- ### Poetry
31
- - Using [poetry](https://python-poetry.org/docs/) for dependency management
32
- - add dependency `poetry add pydantic`
33
- - add dev dependency `poetry add --group dev black`
34
- - run main script `poetry run python main.py`
35
- - run dev commands `poetry run black parallex`
36
-
37
-
38
- # General behavior
39
- - parallex takes args to do things with file
40
- - parallex takes args to specify llm model
41
- - parallex takes a callable to execute once batch process is "ready"
42
-