pdf-invoke 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_invoke-0.1.0/LICENSE +21 -0
- pdf_invoke-0.1.0/PKG-INFO +24 -0
- pdf_invoke-0.1.0/README.md +1 -0
- pdf_invoke-0.1.0/pyproject.toml +29 -0
- pdf_invoke-0.1.0/src/pdf_invoke/__init__.py +2 -0
- pdf_invoke-0.1.0/src/pdf_invoke/converter.py +121 -0
- pdf_invoke-0.1.0/src/pdf_invoke/multimodal_llm.py +120 -0
- pdf_invoke-0.1.0/src/pdf_invoke/types.py +9 -0
- pdf_invoke-0.1.0/src/pdf_invoke/utils.py +33 -0
pdf_invoke-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Luciano Bermudez
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf-invoke
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A LLM utility for working with pdfs
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Author: Luciano Bermudez
|
|
8
|
+
Author-email: lberm007@ucr.edu
|
|
9
|
+
Requires-Python: >=3.10,<4.0.0
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
16
|
+
Requires-Dist: dotenv (>=0.9.9,<0.10.0)
|
|
17
|
+
Requires-Dist: langchain (>=1.2.10,<2.0.0)
|
|
18
|
+
Requires-Dist: langchain-core (>=1.2.11,<2.0.0)
|
|
19
|
+
Requires-Dist: pillow (>=12.1.1,<13.0.0)
|
|
20
|
+
Requires-Dist: pydantic (>=2.12.5,<3.0.0)
|
|
21
|
+
Requires-Dist: pymupdf (>=1.27.1,<2.0.0)
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# A package
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# A package
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pdf-invoke"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "An LLM utility for working with PDFs"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Luciano Bermudez",email = "lberm007@ucr.edu"}
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.10,<4.0.0"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"langchain-core (>=1.2.11,<2.0.0)",
|
|
12
|
+
"pydantic (>=2.12.5,<3.0.0)",
|
|
13
|
+
"pymupdf (>=1.27.1,<2.0.0)",
|
|
14
|
+
"pillow (>=12.1.1,<13.0.0)",
|
|
15
|
+
"langchain (>=1.2.10,<2.0.0)",
|
|
16
|
+
"dotenv (>=0.9.9,<0.10.0)"
|
|
17
|
+
]
|
|
18
|
+
license = "MIT"
|
|
19
|
+
license-files = ["LICEN[CS]E*"]
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
23
|
+
build-backend = "poetry.core.masonry.api"
|
|
24
|
+
|
|
25
|
+
[dependency-groups]
|
|
26
|
+
dev = [
|
|
27
|
+
"pytest (>=9.0.2,<10.0.0)",
|
|
28
|
+
"langchain-openai (>=1.1.9,<2.0.0)"
|
|
29
|
+
]
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Iterable, List
|
|
3
|
+
|
|
4
|
+
import pymupdf
|
|
5
|
+
|
|
6
|
+
from pdf_invoke.types import ImageExt, PDFInput
|
|
7
|
+
from pdf_invoke.utils import get_image_type, is_pdf_bytes, validate_image_bytes
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class PDFImageConverter:
    """Convert PDF documents to per-page images and images back into a PDF.

    Rendering and PDF assembly are delegated to ``pymupdf``; the methods here
    add input validation and make sure opened documents are always closed.
    """

    def pdf_to_images(
        self, pdf: PDFInput, zoom: float = 0.2, ext: ImageExt = "png"
    ) -> List[bytes]:
        """Render every page of ``pdf`` to encoded image bytes.

        Args:
            pdf: Raw PDF data (``bytes``, ``bytearray`` or ``memoryview``) or
                a path (``str`` or ``Path``) to a PDF file.
            zoom: Scale factor used for both axes of the render matrix.
            ext: Output image format handed to ``Pixmap.tobytes``.

        Returns:
            One encoded image per page, in page order.

        Raises:
            ValueError: If the input has an unsupported type, lacks the PDF
                signature, or cannot be opened.
        """
        try:
            if isinstance(pdf, (bytes, bytearray, memoryview)):
                pdf_bytes = bytes(pdf)
                self._validate_pdf_bytes(pdf_bytes)
                # Open the validated copy, not the caller's original buffer.
                doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
            elif isinstance(pdf, (Path, str)):
                doc = pymupdf.open(Path(pdf).as_posix())
            else:
                raise TypeError("PDF is not of expected type")
        except Exception as e:
            raise ValueError(f"Failed to open pdf {e}") from e

        try:
            # The original guarded with ``assert doc``, which is stripped
            # under ``python -O``. Keep rejecting a falsy document explicitly
            # (presumably one with zero pages -- confirm against pymupdf).
            if not doc:
                raise ValueError("Failed to open pdf document has no pages")
            matrix = pymupdf.Matrix(zoom, zoom)
            return [page.get_pixmap(matrix=matrix).tobytes(ext) for page in doc]
        finally:
            # Release the document even when rendering a page fails.
            doc.close()

    def images_to_pdf(
        self, images: Iterable[bytes], allowed_formats=("png", "jpeg")
    ) -> bytes:
        """Build a PDF with one page per image, each page sized to its image.

        Args:
            images: Encoded images, in the desired page order.
            allowed_formats: Lower-case image format names that are accepted.

        Returns:
            The assembled PDF as bytes.

        Raises:
            ValueError: If no images are given, an image is invalid or has a
                disallowed format, or the result is not a PDF.
        """
        # Materialize once: the images are validated and then inserted, and a
        # one-shot iterator would already be exhausted by the validation pass.
        image_list = list(images)
        if not image_list:
            raise ValueError("At least one image is required to build a pdf")
        # Validation already detects each format; reuse it instead of
        # decoding every image a second time.
        formats = validate_image_bytes(image_list, allowed_formats)

        doc = pymupdf.open()
        try:
            for img_bytes, fmt in zip(image_list, formats):
                img_doc = pymupdf.open(stream=img_bytes, filetype=fmt)
                try:
                    rect = img_doc[0].rect
                    page = doc.new_page(width=rect.width, height=rect.height)
                    page.insert_image(rect, stream=img_bytes)
                finally:
                    img_doc.close()
            pdf_bytes = doc.tobytes()
        finally:
            doc.close()

        # Should be PDF bytes either way, but verify before handing them out.
        self._validate_pdf_bytes(pdf_bytes)
        return pdf_bytes

    def save_pdf_to_images(
        self,
        pdf: PDFInput,
        output_path: str | Path,
        pdf_name: str | None = None,
        ext: ImageExt = "png",
        start: int = 0,
        zoom: float = 0.2,
    ) -> str:
        """Render ``pdf`` and write one image file per page.

        Files are named ``{pdf_name}_page_{i}.{ext}`` with ``i`` counting up
        from ``start``.

        Args:
            pdf: Raw PDF data or a path to a PDF file.
            output_path: Existing directory that receives the image files.
            pdf_name: File-name prefix; defaults to the stem of ``pdf`` when
                ``pdf`` is a path.
            ext: Image format used for both rendering and the file suffix.
            start: Number assigned to the first page.
            zoom: Scale factor forwarded to :meth:`pdf_to_images`.

        Returns:
            ``output_path`` as a POSIX-style string.

        Raises:
            ValueError: If the name cannot be determined, ``output_path`` does
                not exist, or the PDF cannot be opened.
        """
        pdf_name = self._validate_pdf_name(pdf, pdf_name)
        output_dir = self._validate_path(output_path)
        # Render in the requested format so file content matches its suffix.
        pages = self.pdf_to_images(pdf, zoom=zoom, ext=ext)
        for i, page_bytes in enumerate(pages, start=start):
            (output_dir / f"{pdf_name}_page_{i}.{ext}").write_bytes(page_bytes)
        return output_dir.as_posix()

    def save_images_to_pdf(
        self,
        images: Iterable[bytes],
        output_path: str | Path,
        pdf_name: str,
    ) -> str:
        """Combine ``images`` into a PDF and write it into ``output_path``.

        Args:
            images: Encoded images, in the desired page order.
            output_path: Existing directory that receives the PDF.
            pdf_name: Target file name; its suffix is replaced by ``.pdf``
                when it is not already a PDF suffix.

        Returns:
            The written file's path as a POSIX-style string.

        Raises:
            ValueError: If ``output_path`` does not exist or the images are
                rejected by :meth:`images_to_pdf`.
        """
        pdf_name = self._validate_pdf_name(name=pdf_name)
        output_dir = self._validate_path(output_path)
        pdf_path = output_dir / pdf_name

        if pdf_path.suffix.lower() != ".pdf":
            pdf_path = pdf_path.with_suffix(".pdf")

        pdf_path.write_bytes(self.images_to_pdf(images))
        return pdf_path.as_posix()

    def _validate_pdf_name(
        self,
        pdf: PDFInput | None = None,
        name: str | None = None,
    ) -> str:
        """Return ``name`` if given, otherwise the stem of a path-like ``pdf``.

        Raises:
            ValueError: If ``name`` is ``None`` and ``pdf`` is not a path.
        """
        if name is not None:
            return name

        if isinstance(pdf, (str, Path)):
            return Path(pdf).stem

        raise ValueError(
            "Unable to determine PDF name. "
            "Provide either a file path (str or Path) or explicitly pass `name` "
            "when supplying raw PDF bytes."
        )

    def _validate_pdf_bytes(self, data):
        """Raise ``ValueError`` unless ``data`` passes ``is_pdf_bytes``."""
        if not is_pdf_bytes(data):
            raise ValueError("Document is not pdf")

    def _validate_path(self, path: str | Path) -> Path:
        """Return ``path`` as a ``Path``; raise ``ValueError`` if it does not exist."""
        path = Path(path)
        if not path.exists():
            raise ValueError(f"Failed to validate pdf {path} cannot be resolved")
        return path
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
if __name__ == "__main__":
    # Manual smoke test: read one sample page image and report its format.
    print("Test")
    # Join path parts instead of hard-coding backslashes, which are only
    # treated as separators on Windows.
    sample_image = (
        Path("pdf_invoke") / "data" / "images" / "Lecture_02_03_page_1.png"
    )
    image = sample_image.read_bytes()
    print(image[:10])
    print(get_image_type(image))
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from base64 import b64encode
|
|
2
|
+
from typing import Iterable, Optional, Sequence, Type
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from langchain_core.language_models.chat_models import BaseChatModel
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
from pdf_invoke.converter import PDFImageConverter
|
|
8
|
+
from pdf_invoke.types import PDFInput, ALLOWED_MIME, ImageInput
|
|
9
|
+
from pdf_invoke.utils import validate_image_bytes
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseOutput(BaseModel):
    """Default structured-output schema used by ``MultiModalLLM.invoke``/``ainvoke``."""

    # Single string field; presumably the model's textual answer.
    data: str
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MultiModalLLM:
    """Send a fixed text prompt together with images to a chat model.

    The images come either from rendering a PDF page by page or from
    caller-supplied image bytes/paths.
    """

    def __init__(
        self,
        *,
        prompt: str,
        model: BaseChatModel,
    ):
        """Store the text prompt and the chat model used for every call."""
        # Base configuration
        self.prompt = prompt
        self.llm = model

    def _validate_input(
        self,
        pdf: PDFInput | None = None,
        images: Sequence[ImageInput] | None = None,
    ) -> Sequence[bytes]:
        """Return the image bytes for exactly one of ``pdf`` or ``images``.

        Raises:
            ValueError: If both or neither argument is given, if ``images``
                is empty, or if the input cannot be converted.
        """
        if pdf is None and images is None:
            raise ValueError("Either pdf or images must be provided")
        if pdf is not None and images is not None:
            raise ValueError("Provide only one of pdf or images")

        # Compare against None rather than truthiness so empty inputs reach a
        # specific error instead of a generic "unexpected" one.
        if pdf is not None:
            return PDFImageConverter().pdf_to_images(pdf)
        if not images:
            raise ValueError("images must contain at least one image")
        return [self._image_to_bytes(i) for i in images]

    def _select_runnable(self, output_model: Optional[Type[BaseModel]]):
        """Return the model, wrapped for structured output when a schema is given."""
        if output_model:
            return self.llm.with_structured_output(schema=output_model)
        return self.llm

    def invoke(
        self,
        pdf: PDFInput | None = None,
        images: Sequence[ImageInput] | None = None,
        output_model: Optional[Type[BaseModel]] = BaseOutput,
        mime: ALLOWED_MIME = "image/png",
    ):
        """Call the model synchronously with the prompt and the images.

        Args:
            pdf: PDF to render into images (mutually exclusive with ``images``).
            images: Images as bytes or paths (mutually exclusive with ``pdf``).
            output_model: Schema for structured output; a falsy value calls
                the model directly.
            mime: MIME type declared for every image in the payload.

        Returns:
            Whatever the (optionally structured) model call returns.

        Raises:
            ValueError: If the inputs are invalid.
            RuntimeError: If building the payload or calling the model fails.
        """
        image_bytes = self._validate_input(pdf, images)
        try:
            message = self.prepare_payload(image_bytes, mime)
            return self._select_runnable(output_model).invoke([message])
        except Exception as e:
            raise RuntimeError(f"Failed to invoke model {e}") from e

    async def ainvoke(
        self,
        pdf: PDFInput | None = None,
        images: Sequence[ImageInput] | None = None,
        output_model: Optional[Type[BaseModel]] = BaseOutput,
        mime: ALLOWED_MIME = "image/png",
    ):
        """Asynchronous counterpart of :meth:`invoke`.

        Takes the same arguments and raises the same errors as
        :meth:`invoke`, and returns the awaited model result.
        """
        image_bytes = self._validate_input(pdf, images)
        try:
            message = self.prepare_payload(image_bytes, mime)
            # Await here: returning the inner coroutine from an ``async def``
            # would hand callers an un-awaited coroutine instead of a result.
            return await self._select_runnable(output_model).ainvoke([message])
        except Exception as e:
            raise RuntimeError(f"Failed to invoke model {e}") from e

    def prepare_payload(self, data: Sequence[bytes], mime: ALLOWED_MIME = "image/png"):
        """Build the single user message: the text prompt followed by the images.

        Raises:
            RuntimeError: If the image payload cannot be prepared.
        """
        try:
            image_payload = self.prepare_image_payload(data, mime=mime)
            return {
                "role": "user",
                "content": [{"type": "text", "text": self.prompt}, *image_payload],
            }
        except Exception as e:
            raise RuntimeError(f"Failed to prepare payload for LLM. Error: {e}") from e

    def prepare_image_payload(
        self,
        payload: Iterable[bytes],
        mime: ALLOWED_MIME = "image/png",
    ):
        """Validate the images and wrap each one as a base64 ``image_url`` part.

        Args:
            payload: Encoded images.
            mime: MIME type of the images; its subtype is the only format
                accepted during validation.

        Returns:
            A list of ``{"type": "image_url", ...}`` dicts, one per image.

        Raises:
            ValueError: If an image is invalid or has a different format.
        """
        # Materialize once so a one-shot iterable survives validation.
        image_list = list(payload)
        allowed_format = mime.split("/")[-1]
        validate_image_bytes(image_list, allowed_formats={allowed_format})

        parts = []
        for image in image_list:
            # Encode outside the f-string: reusing the same quote character
            # inside an f-string is a SyntaxError before Python 3.12.
            encoded = b64encode(image).decode("utf-8")
            parts.append(
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime};base64,{encoded}"},
                }
            )
        return parts

    def _image_to_bytes(self, image: ImageInput) -> bytes:
        """Return ``image`` as ``bytes``, reading it from disk when it is a path.

        Raises:
            ValueError: If the type is unsupported or the file cannot be read.
        """
        try:
            if isinstance(image, (bytes, bytearray, memoryview)):
                # Normalize every buffer type to real bytes, as annotated.
                return bytes(image)
            if isinstance(image, (str, Path)):
                return Path(image).read_bytes()
            raise TypeError(
                f"Failed to conver image to bytes received incorrect type image is of type {type(image)}"
            )
        except Exception as e:
            raise ValueError(f"Failed to convert image to bytes {e}") from e
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from PIL import Image
|
|
2
|
+
import io
|
|
3
|
+
from typing import Iterable, List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def is_pdf_bytes(data: bytes) -> bool:
    """Return ``True`` when ``data`` begins with the ``%PDF-`` signature."""
    pdf_signature = b"%PDF-"
    has_signature = data.startswith(pdf_signature)
    return has_signature
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_image_type(image_bytes: bytes) -> str:
    """Return the image format Pillow detects for ``image_bytes``.

    Args:
        image_bytes: Encoded image data.

    Returns:
        The format name as reported by Pillow, e.g. ``'PNG'`` or ``'JPEG'``.

    Raises:
        ValueError: If the bytes cannot be opened as an image or no format
            is reported.
    """
    try:
        with Image.open(io.BytesIO(image_bytes)) as img:
            fmt = img.format
    except Exception as exc:
        raise ValueError("Provided bytes are not a valid image.") from exc

    # Explicit check instead of ``assert`` so a missing format is still
    # rejected when assertions are disabled (``python -O``).
    if not fmt:
        raise ValueError("Provided bytes are not a valid image.")
    return fmt
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def validate_image_bytes(
|
|
20
|
+
images: Iterable[bytes],
|
|
21
|
+
allowed_formats: set[str] | None = None,
|
|
22
|
+
) -> List[str]:
|
|
23
|
+
formats = []
|
|
24
|
+
for idx, img_bytes in enumerate(images):
|
|
25
|
+
try:
|
|
26
|
+
fmt = get_image_type(img_bytes)
|
|
27
|
+
|
|
28
|
+
if allowed_formats and fmt.lower() not in allowed_formats:
|
|
29
|
+
raise ValueError(f"Image at index {idx} has unsupported format: {fmt}")
|
|
30
|
+
formats.append(fmt)
|
|
31
|
+
except Exception as e:
|
|
32
|
+
raise ValueError(f"Invalid image at index {idx}: {e}") from e
|
|
33
|
+
return formats
|