parallex 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
parallex/parallex.py CHANGED
@@ -21,9 +21,9 @@ async def parallex(
21
21
  model: str,
22
22
  pdf_source_url: str,
23
23
  post_process_callable: Optional[Callable[..., None]] = None,
24
- concurrency: int = 20,
25
- prompt_text: str = DEFAULT_PROMPT,
26
- log_level: str = "ERROR",
24
+ concurrency: Optional[int] = 20,
25
+ prompt_text: Optional[str] = DEFAULT_PROMPT,
26
+ log_level: Optional[str] = "ERROR",
27
27
  ) -> ParallexCallableOutput:
28
28
  setup_logger(log_level)
29
29
  with tempfile.TemporaryDirectory() as temp_directory:
@@ -0,0 +1,95 @@
1
+ Metadata-Version: 2.1
2
+ Name: parallex
3
+ Version: 0.1.2
4
+ Summary: PDF to markdown using Azure OpenAI batch processing
5
+ Home-page: https://github.com/Summed-AI/parallex
6
+ Author: Jeff Hostetler
7
+ Author-email: jeff@summed.ai
8
+ Requires-Python: >=3.12,<4.0
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Requires-Dist: aiologger (>=0.7.0,<0.8.0)
13
+ Requires-Dist: asyncio (>=3.4.3,<4.0.0)
14
+ Requires-Dist: httpx (>=0.27.2,<0.28.0)
15
+ Requires-Dist: openai (>=1.54.4,<2.0.0)
16
+ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
17
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
18
+ Project-URL: Repository, https://github.com/Summed-AI/parallex
19
+ Description-Content-Type: text/markdown
20
+
21
+ # Parallex
22
+
23
+ ### What it does
24
+ - Converts PDF into images
25
+ - Makes requests to Azure OpenAI to convert the images to markdown using the Batch API
26
+ - [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
27
+ - [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
28
+ - Polls for batch completion and then converts the AI responses into structured output keyed to the corresponding page of the PDF
29
+ - Post batch processing to do what you wish with the resulting markdown
30
+
31
+ ### Requirements
32
+ Parallex uses `graphicsmagick` for the conversion of PDF to images.
33
+ ```bash
34
+ brew install graphicsmagick
35
+ ```
36
+
37
+
38
+ ### Example usage
39
+
40
+ ```python
41
+ import os
42
+ from parallex.models.parallex_callable_output import ParallexCallableOutput
43
+ from parallex.parallex import parallex
44
+
45
+ os.environ["AZURE_OPENAI_API_KEY"] = "key"
46
+ os.environ["AZURE_OPENAI_ENDPOINT"] = "your-endpoint.com"
47
+ os.environ["AZURE_OPENAI_API_VERSION"] = "deployment_version"
48
+ os.environ["AZURE_OPENAI_API_DEPLOYMENT"] = "deployment_name"
49
+
50
+ model = "gpt-4o"
51
+
52
+ async def some_operation(file_url: str) -> None:
53
+ response_data: ParallexCallableOutput = await parallex(
54
+ model=model,
55
+ pdf_source_url=file_url,
56
+ post_process_callable=example_post_process, # Optional
57
+ concurrency=2, # Optional
58
+ prompt_text="Turn images into markdown", # Optional
59
+ log_level="ERROR" # Optional
60
+ )
61
+ pages = response_data.pages
62
+
63
+ def example_post_process(output: ParallexCallableOutput) -> None:
64
+ file_name = output.file_name
65
+ pages = output.pages
66
+ for page in pages:
67
+ markdown_for_page = page.output_content
68
+ pdf_page_number = page.page_number
69
+
70
+ ```
71
+
72
+ Responses have the following structure:
73
+ ```python
74
+ class ParallexCallableOutput(BaseModel):
75
+ file_name: str = Field(description="Name of file that is processed")
76
+ pdf_source_url: str = Field(description="Given URL of the source of output")
77
+ trace_id: UUID = Field(description="Unique trace for each file")
78
+ pages: list[PageResponse] = Field(description="List of PageResponse objects")
79
+
80
+ class PageResponse(BaseModel):
81
+ output_content: str = Field(description="Markdown generated for the page")
82
+ page_number: int = Field(description="Page number of the associated PDF")
83
+ ```
84
+
85
+ ### Default prompt is
86
+ ```python
87
+ """
88
+ Convert the following PDF page to markdown.
89
+ Return only the markdown with no explanation text.
90
+ Leave out any page numbers and redundant headers or footers.
91
+ Do not include any code blocks (e.g. "```markdown" or "```") in the response.
92
+ If unable to parse, return an empty string.
93
+ """
94
+ ```
95
+
@@ -12,10 +12,10 @@ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaB
12
12
  parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
13
13
  parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
14
14
  parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
15
- parallex/parallex.py,sha256=2XXmG54eXtXnw2ElC12zjbWDnwDIHPgzKY1ktP8V93M,4472
15
+ parallex/parallex.py,sha256=RvSDNyjrE7Fd3LuatEt18_1sR_btPJ-i6uxjNr_dGh0,4502
16
16
  parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
17
17
  parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
18
- parallex-0.1.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
19
- parallex-0.1.0.dist-info/METADATA,sha256=ICDxk_FnofGhJgeGXv_g-awaZm_g4zU23PkfuXOndNs,1485
20
- parallex-0.1.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
21
- parallex-0.1.0.dist-info/RECORD,,
18
+ parallex-0.1.2.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
19
+ parallex-0.1.2.dist-info/METADATA,sha256=MJ1bOEQ2MXvKhha6y1ehwMFzqf54DBKVuMxkipPM0tY,3393
20
+ parallex-0.1.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
21
+ parallex-0.1.2.dist-info/RECORD,,
@@ -1,42 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: parallex
3
- Version: 0.1.0
4
- Summary:
5
- Author: Jeff Hostetler
6
- Author-email: jeff@summed.ai
7
- Requires-Python: >=3.12,<4.0
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.12
10
- Classifier: Programming Language :: Python :: 3.13
11
- Requires-Dist: aiologger (>=0.7.0,<0.8.0)
12
- Requires-Dist: asyncio (>=3.4.3,<4.0.0)
13
- Requires-Dist: httpx (>=0.27.2,<0.28.0)
14
- Requires-Dist: openai (>=1.54.4,<2.0.0)
15
- Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
16
- Requires-Dist: pydantic (>=2.9.2,<3.0.0)
17
- Description-Content-Type: text/markdown
18
-
19
- # Parallex
20
-
21
- ### What it does
22
- - Converts file into images
23
- - Makes requests to OpenAI to covert the images to markdown
24
- - [Azure OpenAPI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
25
- - [OpenAPI Batch](https://platform.openai.com/docs/guides/batch)
26
- - Post batch processing to do what you wish with the resulting markdown
27
-
28
-
29
- # Notes for us as we build
30
- ### Poetry
31
- - Using [poetry](https://python-poetry.org/docs/) for dependency management
32
- - add dependency `poetry add pydantic`
33
- - add dev dependency `poetry add --group dev black`
34
- - run main script `poetry run python main.py`
35
- - run dev commands `poetry run black parallex`
36
-
37
-
38
- # General behavior
39
- - parallex takes args to do things with file
40
- - parallex takes args to specify llm model
41
- - parallex takes a callable to execute once batch process is "ready"
42
-