parallex 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
parallex/parallex.py CHANGED
@@ -21,9 +21,9 @@ async def parallex(
21
21
  model: str,
22
22
  pdf_source_url: str,
23
23
  post_process_callable: Optional[Callable[..., None]] = None,
24
- concurrency: int = 20,
25
- prompt_text: str = DEFAULT_PROMPT,
26
- log_level: str = "ERROR",
24
+ concurrency: Optional[int] = 20,
25
+ prompt_text: Optional[str] = DEFAULT_PROMPT,
26
+ log_level: Optional[str] = "ERROR",
27
27
  ) -> ParallexCallableOutput:
28
28
  setup_logger(log_level)
29
29
  with tempfile.TemporaryDirectory() as temp_directory:
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.1
2
+ Name: parallex
3
+ Version: 0.1.1
4
+ Summary:
5
+ Author: Jeff Hostetler
6
+ Author-email: jeff@summed.ai
7
+ Requires-Python: >=3.12,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Programming Language :: Python :: 3.13
11
+ Requires-Dist: aiologger (>=0.7.0,<0.8.0)
12
+ Requires-Dist: asyncio (>=3.4.3,<4.0.0)
13
+ Requires-Dist: httpx (>=0.27.2,<0.28.0)
14
+ Requires-Dist: openai (>=1.54.4,<2.0.0)
15
+ Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
16
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
17
+ Description-Content-Type: text/markdown
18
+
19
+ # Parallex
20
+
21
+ ### What it does
22
+ - Converts PDF into images
23
+ - Makes requests to Azure OpenAI to convert the images to markdown using the Batch API
24
+ - [Azure OpenAI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
25
+ - [OpenAI Batch](https://platform.openai.com/docs/guides/batch)
26
+ - Polls for batch completion and then converts AI responses into structured output based on the page of the corresponding PDF
27
+ - Post batch processing to do what you wish with the resulting markdown
28
+
29
+ ### Requirements
30
+ Parallex uses `graphicsmagick` for the conversion of PDF to images.
31
+ ```bash
32
+ brew install graphicsmagick
33
+ ```
34
+
35
+
36
+ ### Example usage
37
+
38
+ ```python
39
+ import os
40
+ from parallex.models.parallex_callable_output import ParallexCallableOutput
41
+ from parallex.parallex import parallex
42
+
43
+ os.environ["AZURE_OPENAI_API_KEY"] = "key"
44
+ os.environ["AZURE_OPENAI_ENDPOINT"] = "your-endpoint.com"
45
+ os.environ["AZURE_OPENAI_API_VERSION"] = "deployment_version"
46
+ os.environ["AZURE_OPENAI_API_DEPLOYMENT"] = "deployment_name"
47
+
48
+ model = "gpt-4o"
49
+
50
+ async def some_operation(file_url: str) -> None:
51
+ response_data: ParallexCallableOutput = await parallex(
52
+ model=model,
53
+ pdf_source_url=file_url,
54
+ post_process_callable=example_post_process, # Optional
55
+ concurrency=2, # Optional
56
+ prompt_text="Turn images into markdown", # Optional
57
+ log_level="ERROR" # Optional
58
+ )
59
+ pages = response_data.pages
60
+
61
+ def example_post_process(output: ParallexCallableOutput) -> None:
62
+ file_name = output.file_name
63
+ pages = output.pages
64
+ for page in pages:
65
+ markdown_for_page = page.output_content
66
+ pdf_page_number = page.page_number
67
+
68
+ ```
69
+
70
+ Responses have the following structure:
71
+ ```python
72
+ class ParallexCallableOutput(BaseModel):
73
+ file_name: str = Field(description="Name of file that is processed")
74
+ pdf_source_url: str = Field(description="Given URL of the source of output")
75
+ trace_id: UUID = Field(description="Unique trace for each file")
76
+ pages: list[PageResponse] = Field(description="List of PageResponse objects")
77
+
78
+ class PageResponse(BaseModel):
79
+ output_content: str = Field(description="Markdown generated for the page")
80
+ page_number: int = Field(description="Page number of the associated PDF")
81
+ ```
82
+
83
+ ### Default prompt is
84
+ ```python
85
+ """
86
+ Convert the following PDF page to markdown.
87
+ Return only the markdown with no explanation text.
88
+ Leave out any page numbers and redundant headers or footers.
89
+ Do not include any code blocks (e.g. "```markdown" or "```") in the response.
90
+ If unable to parse, return an empty string.
91
+ """
92
+ ```
93
+
@@ -12,10 +12,10 @@ parallex/models/page_response.py,sha256=KADCAV3XnkqWm-q_FBCfbt5nqDbiHg9MroZvFXaB
12
12
  parallex/models/parallex_callable_output.py,sha256=CkJKA8mwsc5olNnG1K6nrWUu4xTkJvp8bp3SSPQEX5c,465
13
13
  parallex/models/raw_file.py,sha256=Nlv6u_jlDCXDgU2_Ff7DRbDCx27pB1NZugNhEoaBMQU,483
14
14
  parallex/models/upload_batch.py,sha256=jrnds9ryXg9drL4TF8TGimMVTCDfKaWsBzFv_ed0i88,2068
15
- parallex/parallex.py,sha256=2XXmG54eXtXnw2ElC12zjbWDnwDIHPgzKY1ktP8V93M,4472
15
+ parallex/parallex.py,sha256=RvSDNyjrE7Fd3LuatEt18_1sR_btPJ-i6uxjNr_dGh0,4502
16
16
  parallex/utils/constants.py,sha256=c6i_-OSfCXAzW9ILzddSSHfldqHnsPEID3G3VYGYXUg,362
17
17
  parallex/utils/logger.py,sha256=5dpTogztRq4NCgYWnbbkFNx3V2sFCN-Mtoagwj8i18Q,505
18
- parallex-0.1.0.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
19
- parallex-0.1.0.dist-info/METADATA,sha256=ICDxk_FnofGhJgeGXv_g-awaZm_g4zU23PkfuXOndNs,1485
20
- parallex-0.1.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
21
- parallex-0.1.0.dist-info/RECORD,,
18
+ parallex-0.1.1.dist-info/LICENSE,sha256=wPwCqGrisXnEcpaUxSO79C2mdOUTbtjhLjyy8mVW6p8,1046
19
+ parallex-0.1.1.dist-info/METADATA,sha256=-Sx_c_BTiA5GXur_eAXLaONi8MAjto5u4_NkRGTSKxU,3230
20
+ parallex-0.1.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
21
+ parallex-0.1.1.dist-info/RECORD,,
@@ -1,42 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: parallex
3
- Version: 0.1.0
4
- Summary:
5
- Author: Jeff Hostetler
6
- Author-email: jeff@summed.ai
7
- Requires-Python: >=3.12,<4.0
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.12
10
- Classifier: Programming Language :: Python :: 3.13
11
- Requires-Dist: aiologger (>=0.7.0,<0.8.0)
12
- Requires-Dist: asyncio (>=3.4.3,<4.0.0)
13
- Requires-Dist: httpx (>=0.27.2,<0.28.0)
14
- Requires-Dist: openai (>=1.54.4,<2.0.0)
15
- Requires-Dist: pdf2image (>=1.17.0,<2.0.0)
16
- Requires-Dist: pydantic (>=2.9.2,<3.0.0)
17
- Description-Content-Type: text/markdown
18
-
19
- # Parallex
20
-
21
- ### What it does
22
- - Converts file into images
23
- - Makes requests to OpenAI to covert the images to markdown
24
- - [Azure OpenAPI Batch](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/batch?tabs=standard-input%2Cpython-secure&pivots=programming-language-python)
25
- - [OpenAPI Batch](https://platform.openai.com/docs/guides/batch)
26
- - Post batch processing to do what you wish with the resulting markdown
27
-
28
-
29
- # Notes for us as we build
30
- ### Poetry
31
- - Using [poetry](https://python-poetry.org/docs/) for dependency management
32
- - add dependency `poetry add pydantic`
33
- - add dev dependency `poetry add --group dev black`
34
- - run main script `poetry run python main.py`
35
- - run dev commands `poetry run black parallex`
36
-
37
-
38
- # General behavior
39
- - parallex takes args to do things with file
40
- - parallex takes args to specify llm model
41
- - parallex takes a callable to execute once batch process is "ready"
42
-