pdf_segmentation 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_segmentation-0.1.0/PKG-INFO +19 -0
- pdf_segmentation-0.1.0/README.md +0 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/annotator/__init__.py +1 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/annotator/pdf_annotator.py +150 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/graph/__init__.py +0 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/graph/graph.py +176 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/helper.py +7 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/image_payload_builder.py +35 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_image_converter/__init__.py +1 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_image_converter/pdf_image_converter.py +65 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_llm/__init__.py +1 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_llm/pdf_llm.py +89 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_seperator/__init__.py +1 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_seperator/pdf_seperator.py +61 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/type/__init__.py +2 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/type/models.py +12 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/type/types.py +20 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/utils/__init__.py +3 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/utils/image_utils.py +37 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/utils/langchain_utils.py +19 -0
- pdf_segmentation-0.1.0/pdf_segmentation/src/utils/serialization_utils.py +30 -0
- pdf_segmentation-0.1.0/pyproject.toml +27 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf_segmentation
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary:
|
|
5
|
+
Author: Luciano Bermudez
|
|
6
|
+
Author-email: lucianobmecheng69@gmail.com
|
|
7
|
+
Requires-Python: >=3.11,<4.0.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
13
|
+
Requires-Dist: langchain (>=1.2.8,<2.0.0)
|
|
14
|
+
Requires-Dist: langgraph (>=1.0.7,<2.0.0)
|
|
15
|
+
Requires-Dist: pymupdf (>=1.26.7,<2.0.0)
|
|
16
|
+
Requires-Dist: python-dotenv (>=1.2.1,<2.0.0)
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .pdf_annotator import PDFAnnotator
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Literal, Optional, Tuple
|
|
3
|
+
|
|
4
|
+
import pymupdf
|
|
5
|
+
from pymupdf import Page
|
|
6
|
+
|
|
7
|
+
from pdf_image_converter import PDFImageConverter
|
|
8
|
+
from type import Anchor, AnchorPos
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PDFAnnotator:
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
pdf_path: str | Path,
|
|
15
|
+
anchor: Anchor | AnchorPos = Anchor.BOTTOM_LEFT,
|
|
16
|
+
margin_frac: float = 1 / 10,
|
|
17
|
+
offset: Tuple[int, int] = (10, 10),
|
|
18
|
+
zoom: float = 2.0,
|
|
19
|
+
):
|
|
20
|
+
self.pdf = Path(pdf_path).resolve()
|
|
21
|
+
self.anchor = Anchor(anchor)
|
|
22
|
+
self.margin_frac = margin_frac
|
|
23
|
+
self.offset = offset
|
|
24
|
+
self.zoom = zoom
|
|
25
|
+
self._validate()
|
|
26
|
+
|
|
27
|
+
def annotate_and_render_pages(
|
|
28
|
+
self,
|
|
29
|
+
) -> bytes:
|
|
30
|
+
doc = pymupdf.open(self.pdf)
|
|
31
|
+
try:
|
|
32
|
+
# Annotate all pages
|
|
33
|
+
for page in doc:
|
|
34
|
+
assert isinstance(page, Page)
|
|
35
|
+
self._annotate_page(page)
|
|
36
|
+
# Return annotated PDF as bytes
|
|
37
|
+
return doc.tobytes()
|
|
38
|
+
finally:
|
|
39
|
+
doc.close()
|
|
40
|
+
|
|
41
|
+
def _annotate_and_save(
|
|
42
|
+
self,
|
|
43
|
+
method: Literal["pdf", "image"] = "image",
|
|
44
|
+
output_path: Optional[str | Path] = None,
|
|
45
|
+
) -> str:
|
|
46
|
+
data = self.annotate_and_render_pages()
|
|
47
|
+
output_path = self.get_output_path(path, method)
|
|
48
|
+
if method == "pdf":
|
|
49
|
+
if not isinstance(data, (bytes, bytearray)):
|
|
50
|
+
raise ValueError("Expected PDF data to be bytes")
|
|
51
|
+
output_path.write_bytes(data)
|
|
52
|
+
return output_path.as_posix()
|
|
53
|
+
elif method == "image":
|
|
54
|
+
PDFImageConverter().save_to_images(
|
|
55
|
+
data, output_path, pdf_name=self.pdf.stem
|
|
56
|
+
)
|
|
57
|
+
return output_path.as_posix()
|
|
58
|
+
|
|
59
|
+
def _annotate_page(
|
|
60
|
+
self,
|
|
61
|
+
page: Page,
|
|
62
|
+
):
|
|
63
|
+
if page.rotation != 0:
|
|
64
|
+
page.set_rotation(0)
|
|
65
|
+
|
|
66
|
+
rect = page.rect
|
|
67
|
+
cx, cy = self._get_annotation_coords(
|
|
68
|
+
(rect.width, rect.height),
|
|
69
|
+
self.margin_frac,
|
|
70
|
+
self.offset,
|
|
71
|
+
self.anchor,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
radius = rect.width * self.margin_frac
|
|
75
|
+
|
|
76
|
+
# draw circle
|
|
77
|
+
page.draw_circle(
|
|
78
|
+
center=(cx, cy),
|
|
79
|
+
radius=radius,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# draw centered number
|
|
83
|
+
box_size = radius
|
|
84
|
+
label_rect = pymupdf.Rect(
|
|
85
|
+
cx - box_size,
|
|
86
|
+
cy - box_size,
|
|
87
|
+
cx + box_size,
|
|
88
|
+
cy + box_size,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
page.insert_textbox(
|
|
92
|
+
label_rect,
|
|
93
|
+
str(page.number),
|
|
94
|
+
fontsize=radius,
|
|
95
|
+
align=pymupdf.TEXT_ALIGN_CENTER,
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
def _get_annotation_coords(
|
|
99
|
+
self,
|
|
100
|
+
size: tuple[int, int],
|
|
101
|
+
margin_frac: float = 1 / 10,
|
|
102
|
+
offset: tuple[int, int] = (10, 10),
|
|
103
|
+
anchor: Anchor = Anchor.BOTTOM_LEFT,
|
|
104
|
+
) -> Tuple[float, float]:
|
|
105
|
+
width, height = size
|
|
106
|
+
x_off, y_off = offset
|
|
107
|
+
|
|
108
|
+
match anchor.value:
|
|
109
|
+
case "top-left":
|
|
110
|
+
cx = width * margin_frac + x_off
|
|
111
|
+
cy = height * margin_frac + y_off
|
|
112
|
+
case "top-right":
|
|
113
|
+
cx = width - (width * margin_frac) - x_off
|
|
114
|
+
cy = height * margin_frac + y_off
|
|
115
|
+
case "bottom-left":
|
|
116
|
+
cx = width * margin_frac + x_off
|
|
117
|
+
cy = height - (height * margin_frac) - y_off
|
|
118
|
+
case "bottom-right":
|
|
119
|
+
cx = width - (width * margin_frac) - x_off
|
|
120
|
+
cy = height - (height * margin_frac) - y_off
|
|
121
|
+
case _:
|
|
122
|
+
raise ValueError(f"Invalid anchor: {anchor}")
|
|
123
|
+
return cx, cy
|
|
124
|
+
|
|
125
|
+
def get_output_path(
|
|
126
|
+
self,
|
|
127
|
+
path: Optional[str | Path] = None,
|
|
128
|
+
method: Literal["image", "pdf"] = "image",
|
|
129
|
+
) -> Path:
|
|
130
|
+
if path:
|
|
131
|
+
return Path(path).resolve()
|
|
132
|
+
if method == "pdf":
|
|
133
|
+
output_path = self.pdf.with_name(f"{self.pdf.stem}_annotated.pdf")
|
|
134
|
+
elif method == "image":
|
|
135
|
+
output_path = self.pdf.with_name(f"{self.pdf.stem}_annotated_pages")
|
|
136
|
+
output_path.mkdir(parents=True, exist_ok=True)
|
|
137
|
+
|
|
138
|
+
return output_path
|
|
139
|
+
|
|
140
|
+
def _validate(self):
|
|
141
|
+
if not self.pdf.exists():
|
|
142
|
+
raise FileNotFoundError(f"PDF Path {self.pdf} does not exist")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
if __name__ == "__main__":
    # Ad-hoc smoke test: annotate a sample lecture PDF and dump page images
    # next to the source file.
    path = "data/Lecture_02_03.pdf"
    # NOTE(review): Windows-style path, and ``output`` is never used below.
    output = Path(r"src\data\images").resolve()
    PDFAnnotator(path, anchor="bottom-left", margin_frac=1 / 20)._annotate_and_save(
        method="image"
    )
|
|
File without changes
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import List, Type
|
|
3
|
+
from typing import Generic, TypeVar, List
|
|
4
|
+
import base64
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, field_serializer, Field
|
|
7
|
+
from langgraph.graph import StateGraph, START, END
|
|
8
|
+
from langchain.chat_models import init_chat_model
|
|
9
|
+
from dotenv import load_dotenv
|
|
10
|
+
|
|
11
|
+
from type import PDFInput, PageRange
|
|
12
|
+
from annotator.pdf_annotator import PDFAnnotator
|
|
13
|
+
from pdf_llm.pdf_llm import PDFMultiModalLLM
|
|
14
|
+
from pdf_seperator.pdf_seperator import PDFSeperator
|
|
15
|
+
from pdf_image_converter import PDFImageConverter
|
|
16
|
+
|
|
17
|
+
load_dotenv()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Section(BaseModel):
    """
    Base class for any structured unit extracted from a document.
    This should contain ONLY semantic fields produced by the LLM.
    """

    # 1-based page span locating this unit in the (annotated) source PDF
    page_range: PageRange
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
T = TypeVar("T", bound=Section)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ListOutput(BaseModel, Generic[T]):
    """Generic structured-output wrapper: the LLM returns a list of sections."""

    items: List[T]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ParsedUnit(
    BaseModel,
    Generic[T],
):
    """
    A semantic unit enriched with pipeline-generated artifacts.
    """

    # LLM-produced semantic fields
    data: T
    # Sub-PDF (the unit's page range) as raw bytes; filled in by the pipeline
    pdf_bytes: bytes | None = None

    @field_serializer("pdf_bytes")
    def serialize_pdf_bytes(self, value: bytes | None):
        """Serialize the raw PDF bytes as base64 text so the model is JSON-safe."""
        # BUG FIX: pdf_bytes defaults to None, and b64encode(None) raised
        # TypeError whenever an unfilled unit was dumped.
        if value is None:
            return None
        return base64.b64encode(value).decode("ascii")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# NOTE(review): constructed at import time -- requires provider credentials to
# be configured before this module is imported; consider lazy initialization.
model = init_chat_model(model="gpt-4o", model_provider="openai")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class State(BaseModel, Generic[T]):
    """Shared LangGraph state threaded through the segmentation pipeline."""

    # --- Inputs ---
    pdf: str | Path
    prompt: str
    # Annotated PDF produced by prepare_pdf; None until that node runs
    pdf_bytes: bytes | None = None

    # --- Schema configuration ---
    # Concrete ListOutput subclass the LLM output is validated against
    output_schema: Type[ListOutput[T]] = Field(exclude=True)
    raw_output: List[T] = []

    # Units enriched with their extracted sub-PDFs
    parsed: list[ParsedUnit[T]] = Field(default_factory=list, exclude=False)

    @field_serializer("pdf_bytes")
    def serialize_pdf_bytes(self, value: bytes | None):
        """Serialize raw PDF bytes as base64 text (JSON-safe)."""
        # BUG FIX: pdf_bytes defaults to None; b64encode(None) raised TypeError
        # whenever the state was dumped before prepare_pdf had run.
        if value is None:
            return None
        return base64.b64encode(value).decode("ascii")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def prepare_pdf(state: State):
    """Resolve the input path and produce annotated PDF bytes for the state."""
    state.pdf = Path(state.pdf)
    if not state.pdf.exists():
        raise ValueError("PDF path cannot be resolved")
    annotated = PDFAnnotator(state.pdf).annotate_and_render_pages()
    return {"pdf_bytes": annotated}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def get_sections(state: State):
    """Ask the multimodal LLM for sections matching the configured schema."""
    extractor = PDFMultiModalLLM(
        prompt=state.prompt,
        pdf=state.pdf_bytes,
        model=model,
    )
    raw = extractor.invoke(state.output_schema)
    validated = state.output_schema.model_validate(raw)
    return {"raw_output": validated.items}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def seperate_pages(state: State[T]):
    """Split the annotated PDF into one sub-PDF per extracted unit."""
    collected = []
    if not state.pdf_bytes:
        raise ValueError("PDF bytes is None")
    splitter = PDFSeperator(pdf_bytes=state.pdf_bytes)
    for item in state.raw_output:
        span = getattr(item, "page_range", None)
        if span is None:
            raise ValueError("Unit does not define a page_range")
        # Page numbers from the LLM are 1-based; the separator is 0-based.
        enriched = ParsedUnit[T](
            data=item,
            pdf_bytes=splitter.extract_page_range(
                start=span.start_page - 1,
                end=span.end_page - 1,
            ),
        )
        collected.append(enriched)
    return {"parsed": collected}
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# Wire the three-step pipeline: annotate pages -> LLM section extraction ->
# split the PDF into per-section documents. Compiled at import time.
graph = StateGraph(State)
graph.add_node(prepare_pdf)
graph.add_node(get_sections)
graph.add_node(seperate_pages)

graph.add_edge(START, "prepare_pdf")
graph.add_edge("prepare_pdf", "get_sections")
graph.add_edge("get_sections", "seperate_pages")
graph.add_edge("seperate_pages", END)
graph = graph.compile()
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
if __name__ == "__main__":
    # Ad-hoc end-to-end run over a sample lecture PDF; writes output.json.
    path = "data/Lecture_02_03.pdf"
    from typing import Literal

    # Concrete section schema for this run: derivations and practice questions.
    class MySection(Section, BaseModel):
        title: str
        description: str
        section_type: Literal["derivation", "question"]

    class MySections(ListOutput[MySection]):
        items: List[MySection]

    result = graph.invoke(
        State(
            output_schema=MySections,
            pdf=path,
            prompt="""You are analyzing a set of lecture notes.

Your task is to identify and extract distinct sections that fall into ONE of the following two categories only:

1. **Derivation**
- A derivation is a mathematical development that proceeds step-by-step using equations, formulas, algebra, calculus, or symbolic manipulation.
- It typically starts from assumptions, definitions, or governing equations and arrives at a derived result.
- Only include content that is explicitly part of the mathematical derivation.
- Do NOT explain, summarize, or add interpretation beyond what is written.

2. **Question**
- A question is a clearly defined problem or practice exercise posed to the reader.
- It may begin with phrases such as “Find”, “Determine”, “Calculate”, “Show that”, or be labeled as an example, problem, or practice question.
- Only include the question statement itself.
- Do NOT include solution steps unless they are explicitly written as part of the question.

For each identified section:
- Create a separate section entry.
- Assign the appropriate `section_type` (`"derivation"` or `"question"`).
- Use the section’s visible heading or a concise descriptive title.
- Provide a short description that closely reflects the original content without adding new information.

Page indexing:
- The lecture pages are annotated with a circled page number in the bottom-left corner.
- Use this circled page number as the authoritative reference when determining where a section begins and ends.

Important constraints:
- Do not merge multiple derivations or questions into a single section.
- Do not invent structure that is not present in the lecture.
- If content is ambiguous, only include it if it clearly fits one of the two categories.""",
        )
    )
    # graph.invoke returns a plain dict; re-validate it back into State.
    result = State.model_validate(result)
    from utils import to_serializable
    import json

    output = Path("output.json").resolve()
    output.write_text(json.dumps(to_serializable(result)))
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
from pathlib import Path
from graph.graph import State
import json
from utils import save_base64_image

# Load the serialized pipeline state and write out the last parsed unit's PDF.
path = Path("output.json").resolve()
data = State.model_validate(json.loads(path.read_text()))
# NOTE(review): pdf_bytes was serialized as base64 text; re-validation may give
# utf-8-encoded bytes of that text rather than decoded PDF bytes -- confirm the
# round-trip before relying on this output.
save_base64_image(data.parsed[-1].pdf_bytes, "data_output.pdf")
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import base64
|
|
3
|
+
from typing import Sequence, List
|
|
4
|
+
from type import PDFInput
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ImagePayloadBuilder:
    """Turn raw image bytes (or image file paths) into LLM image payload dicts."""

    @staticmethod
    def _to_bytes(data: PDFInput) -> bytes:
        """Return *data* as bytes, reading it from disk when it is a path."""
        if isinstance(data, (bytes, bytearray, memoryview)):
            return bytes(data)

        source = Path(data)
        if not source.exists():
            raise FileNotFoundError(f"Image path not found: {source}")
        return source.read_bytes()

    @staticmethod
    def encode(data: bytes) -> str:
        """Base64-encode *data* into ASCII-safe text."""
        return base64.b64encode(data).decode("utf-8")

    @classmethod
    def prepare_llm_payload(
        cls,
        payload: Sequence[PDFInput],
        mime: str = "image/jpeg",
    ) -> List[dict[str, str | dict[str, str]]]:
        """Build one OpenAI-style ``image_url`` entry per item in *payload*."""
        raw_images = [cls._to_bytes(item) for item in payload]
        entries = []
        for blob in raw_images:
            data_uri = f"data:{mime};base64,{cls.encode(blob)}"
            entries.append({"type": "image_url", "image_url": {"url": data_uri}})
        return entries
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .pdf_image_converter import PDFImageConverter
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
from type import PDFInput, ImageExt
|
|
2
|
+
import pymupdf
|
|
3
|
+
from typing import List, Optional, Iterable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PDFImageConverter:
    """Convert PDFs to per-page raster images and images back into a PDF."""

    def convert_to_images(
        self, pdf: PDFInput, zoom: float = 0.2, ext: ImageExt = "png"
    ) -> List[bytes]:
        """Render every page of *pdf* to *ext*-encoded bytes at the given zoom.

        Raises:
            ValueError: If the document cannot be opened.
        """
        try:
            if isinstance(pdf, (bytes, bytearray, memoryview)):
                doc = pymupdf.open(stream=pdf, filetype="pdf")
            elif isinstance(pdf, (Path, str)):
                doc = pymupdf.open(Path(pdf).as_posix())
            else:
                raise TypeError("PDF is not of expected type")
        except Exception as e:
            # Chain the original error so the real cause stays visible.
            raise ValueError(f"Failed to open document {e}") from e
        try:
            matrix = pymupdf.Matrix(zoom, zoom)
            return [page.get_pixmap(matrix=matrix).tobytes(ext) for page in doc]
        finally:
            # BUG FIX: the document was previously leaked if rendering raised.
            doc.close()

    def save_to_images(
        self,
        pdf: PDFInput,
        output_path: str | Path,
        pdf_name: str | None = None,
        ext: ImageExt = "png",
        start: int = 0,
    ) -> None:
        """Render *pdf* and write one ``<name>_page_<i>.<ext>`` file per page.

        ``output_path`` must be an existing directory; ``start`` offsets the
        page index used in filenames.
        """
        if pdf_name is None:
            if isinstance(pdf, (str, Path)):
                pdf_name = Path(pdf).stem
            else:
                raise ValueError("pdf_name must be provided when pdf is not a path")

        output_path = self._validate(output_path)
        # BUG FIX: ``ext`` was previously ignored during rendering, so files
        # named .jpeg could contain PNG data.
        data = self.convert_to_images(pdf, ext=ext)
        for i, b in enumerate(data, start=start):
            out = output_path / f"{pdf_name}_page_{i}.{ext}"
            out.write_bytes(b)

    def images_to_pdf(self, images: Iterable[bytes]) -> bytes:
        """Assemble encoded images into a single PDF, one image per page."""
        doc = pymupdf.open()
        try:
            for img_bytes in images:
                # NOTE(review): filetype="png" is assumed here -- confirm callers
                # only ever pass PNG data.
                img_doc = pymupdf.open(stream=img_bytes, filetype="png")
                try:
                    rect = img_doc[0].rect
                    page = doc.new_page(width=rect.width, height=rect.height)
                    page.insert_image(rect, stream=img_bytes)
                finally:
                    # BUG FIX: close each image doc even if insertion fails.
                    img_doc.close()
            return doc.tobytes()
        finally:
            doc.close()

    def _validate(self, path: str | Path) -> Path:
        """Return *path* as a Path, raising if it does not exist."""
        path = Path(path)
        if not path.exists():
            raise ValueError(f"Failed to validate pdf {path} cannot be resolved")
        return path
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .pdf_llm import PDFMultiModalLLM
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Optional, Sequence, Type
|
|
3
|
+
|
|
4
|
+
import pymupdf
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
from langchain_core.language_models.chat_models import BaseChatModel
|
|
7
|
+
from langchain.chat_models import init_chat_model
|
|
8
|
+
from type import PDFInput, BaseOutput
|
|
9
|
+
from image_payload_builder import ImagePayloadBuilder
|
|
10
|
+
from pdf_image_converter import PDFImageConverter
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PDFMultiModalLLM:
    """Send a PDF (rendered to images) or raw images plus a prompt to a chat model."""

    def __init__(
        self,
        *,
        prompt: str,
        model: BaseChatModel,
        pdf: PDFInput | None = None,
        image_bytes: Sequence[bytes] | None = None,
    ):
        """Exactly one of *pdf* or *image_bytes* must be supplied."""
        if pdf is None and image_bytes is None:
            raise ValueError("Either pdf_path or image_bytes must be provided")
        if pdf is not None and image_bytes is not None:
            raise ValueError("Provide only one of pdf_path or image_bytes")

        self.prompt = prompt
        self.builder = ImagePayloadBuilder()
        self.llm = model

        if pdf is not None:
            # Render each PDF page to image bytes for the multimodal payload.
            self.pdf_bytes = PDFImageConverter().convert_to_images(pdf)

        elif image_bytes:
            self.pdf_bytes = list(image_bytes)
        else:
            raise RuntimeError("Unexpected Error Occured ")

    def prepare_payload(self, mime="image/png"):
        """Build a single user message combining the prompt and all page images."""
        try:
            image_payload = self.builder.prepare_llm_payload(self.pdf_bytes, mime=mime)
            message = {
                "role": "user",
                "content": [{"type": "text", "text": self.prompt}, *image_payload],
            }
            return message
        except Exception as e:
            # Chain so the underlying error is not lost.
            raise RuntimeError(f"Failed to prepare payload for LLM. Error: {e}") from e

    def invoke(
        self,
        output_model: Optional[Type[BaseModel]] = BaseOutput,
        mime: str = "image/png",
    ):
        """Invoke the model; structured output is used when *output_model* is set."""
        try:
            message = self.prepare_payload(mime)
            if output_model:
                chain = self.llm.with_structured_output(schema=output_model)
                return chain.invoke([message])
            else:
                return self.llm.invoke([message])
        except Exception as e:
            raise RuntimeError(f"Failed to invoke model {e}") from e

    async def ainvoke(
        self,
        output_model: Optional[Type[BaseModel]] = BaseOutput,
    ):
        """Async variant of :meth:`invoke`."""
        message = self.prepare_payload()
        if output_model:
            chain = self.llm.with_structured_output(
                schema=output_model,
            )
            # BUG FIX: the coroutine was previously returned without awaiting,
            # so awaiting ainvoke() yielded another coroutine, not the result.
            return await chain.ainvoke([message])
        else:
            return await self.llm.ainvoke([message])
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
if __name__ == "__main__":
    from dotenv import load_dotenv

    # Ad-hoc smoke test: describe a sample lecture PDF with gpt-4o.
    load_dotenv()
    path = "data/Lecture_02_03.pdf"
    # NOTE(review): ``output`` is never used below.
    output = Path(r"src\data\images").resolve()
    model = init_chat_model(model="gpt-4o", model_provider="openai")
    data = PDFMultiModalLLM(
        prompt="What is in the image", pdf=path, model=model
    ).invoke()
    print(data)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .pdf_seperator import PDFSeperator
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import pymupdf
|
|
3
|
+
from type import PDFInput
|
|
4
|
+
from typing import Sequence, Literal
|
|
5
|
+
from pdf_image_converter import PDFImageConverter
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PDFSeperator:
    """Hold a PDF (from a path, raw bytes, or images) and extract page ranges."""

    def __init__(
        self,
        pdf_path: PDFInput | None = None,
        pdf_bytes: bytes | None = None,
        image_bytes: Sequence[bytes] | None = None,
        pdf_name: str | None = "input_pdf",
    ):
        """Exactly one of *pdf_path*, *pdf_bytes*, or *image_bytes* must be given."""
        self.pdf_name = pdf_name
        sources = [s for s in (pdf_path, pdf_bytes, image_bytes) if s is not None]
        if not sources:
            raise ValueError(
                "Either pdf_path or image_bytes or pdf_bytes must be provided"
            )
        # BUG FIX: the old check only rejected pdf_path+pdf_bytes when images
        # were absent; any combination of two or more inputs is ambiguous.
        if len(sources) > 1:
            raise ValueError("Provide only one of pdf_path or image_bytes")

        if pdf_path is not None:
            src = pymupdf.open(pdf_path)
            try:
                self.pdf = src.tobytes()
            finally:
                # BUG FIX: the opened document was previously never closed.
                src.close()
        elif pdf_bytes:
            self.pdf = pdf_bytes
        elif image_bytes:
            self.pdf = PDFImageConverter().images_to_pdf(image_bytes)
        else:
            raise ValueError("Unexpected Error Occured")

    def extract_page_range(self, start: int, end: int) -> bytes:
        """Return pages ``start..end`` (0-based, inclusive) as a standalone PDF."""
        src = pymupdf.open(stream=self.pdf, filetype="pdf")
        dst = pymupdf.open()
        try:
            dst.insert_pdf(src, from_page=start, to_page=end, rotate=0)
            return dst.tobytes()
        finally:
            # BUG FIX: both documents were previously leaked on every call.
            dst.close()
            src.close()

    def _extract_and_save(
        self,
        start: int,
        end: int,
        output_dir: str | Path,
        pdf_name: str,
        method: Literal["pdf", "image"] = "pdf",
    ) -> str:
        """Extract a page range and persist it as a PDF or as page images."""
        output_dir = Path(output_dir).resolve()
        if not output_dir.exists():
            # BUG FIX: the message was missing its f-prefix and printed the
            # literal text "{output_dir}".
            raise ValueError(f"Failed to extract pdf path {output_dir} does not exist")
        data = self.extract_page_range(start, end)
        output_path = output_dir / (f"{self.pdf_name}_extracted_{start}_{end}.pdf")
        if method == "pdf":
            # BUG FIX: ``method`` was previously ignored -- images were always
            # written and the returned PDF path never existed.
            output_path.write_bytes(data)
        else:
            PDFImageConverter().save_to_images(data, output_dir, pdf_name)
        return output_path.as_posix()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
if __name__ == "__main__":
    # Ad-hoc smoke test: extract pages 1-3 of a sample PDF into ./data.
    path = r"data\Lecture_02_03.pdf"
    PDFSeperator(path)._extract_and_save(
        1, 3, output_dir="./data", pdf_name="pdf_extracted"
    )
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Literal
|
|
3
|
+
from enum import Enum
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Allowed corner positions for page annotations (string form of ``Anchor``).
AnchorPos = Literal["top-left", "top-right", "bottom-right", "bottom-left"]
# Raster formats supported when exporting PDF pages to images.
ImageExt = Literal["png", "jpeg"]


class Anchor(str, Enum):
    """Corner of the page where the annotation circle is drawn."""

    TOP_LEFT = "top-left"
    TOP_RIGHT = "top-right"
    BOTTOM_RIGHT = "bottom-right"
    BOTTOM_LEFT = "bottom-left"


# A PDF supplied as a filesystem path or as in-memory bytes.
PDFInput = str | Path | bytes
# Raw encoded image data (e.g. PNG/JPEG bytes).
ImageBytes = bytes

# Anything accepted where an image is expected: a path or raw bytes.
ImageInput = PDFInput | ImageBytes
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import base64
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def save_base64_image(b64_data: str, output_path: str | Path) -> Path:
|
|
6
|
+
"""
|
|
7
|
+
Decode a base64-encoded image and save it to disk.
|
|
8
|
+
|
|
9
|
+
Args:
|
|
10
|
+
b64_data: Base64-encoded image string (no data URI prefix).
|
|
11
|
+
output_path: Path where the image will be saved.
|
|
12
|
+
|
|
13
|
+
Returns:
|
|
14
|
+
Path to the saved image.
|
|
15
|
+
"""
|
|
16
|
+
output_path = Path(output_path).resolve()
|
|
17
|
+
|
|
18
|
+
image_bytes = base64.b64decode(b64_data)
|
|
19
|
+
output_path.write_bytes(image_bytes)
|
|
20
|
+
|
|
21
|
+
return output_path
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def write_image_data(image_bytes: bytes, folder_path: str | Path, filename: str) -> str:
|
|
25
|
+
try:
|
|
26
|
+
path = Path(folder_path).resolve()
|
|
27
|
+
path.mkdir(exist_ok=True)
|
|
28
|
+
save_path = path / filename
|
|
29
|
+
|
|
30
|
+
if save_path.suffix != ".png":
|
|
31
|
+
raise ValueError(
|
|
32
|
+
"Suffix allowed is only PNG either missing or nnot allowed"
|
|
33
|
+
)
|
|
34
|
+
save_path.write_bytes(image_bytes)
|
|
35
|
+
return save_path.as_posix()
|
|
36
|
+
except Exception as e:
|
|
37
|
+
raise ValueError(f"Could not save image {str(e)}")
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any
|
|
3
|
+
from langgraph.graph.state import CompiledStateGraph
|
|
4
|
+
from .image_utils import write_image_data
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def save_graph_visualization(
    graph: CompiledStateGraph | Any,
    folder_path: str | Path,
    filename: str,
):
    """Render *graph* as a Mermaid PNG and persist it under *folder_path*.

    Validation errors from the writer propagate; any other failure is
    reported to stdout instead of raising (best-effort visualization).
    """
    try:
        png_bytes = graph.get_graph().draw_mermaid_png()
        saved_at = write_image_data(png_bytes, folder_path, filename)
        print(f"✅ Saved graph visualization at: {saved_at}")
    except ValueError:
        raise
    except Exception as error:
        print(f"❌ Graph visualization failed: {error}")
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import base64
|
|
3
|
+
from datetime import date, datetime, time
|
|
4
|
+
from typing import Any
|
|
5
|
+
from uuid import UUID
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def to_serializable(obj: Any) -> Any:
    """
    Recursively convert Pydantic models (and nested dicts/lists thereof)
    into plain Python data structures.
    """
    if isinstance(obj, BaseModel):
        return obj.model_dump(mode="json")
    if isinstance(obj, dict):
        return {key: to_serializable(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [to_serializable(element) for element in obj]

    # --- Special cases: scalar types JSON cannot represent directly ---
    if isinstance(obj, (datetime, date, time)):
        return obj.isoformat()
    if isinstance(obj, UUID):
        return str(obj)
    if isinstance(obj, Path):
        return obj.as_posix()
    if isinstance(obj, (bytes, bytearray, memoryview)):
        return base64.b64encode(obj).decode("utf-8")
    return obj
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pdf_segmentation"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = ""
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Luciano Bermudez",email = "lucianobmecheng69@gmail.com"}
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.11,<4.0.0"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"langchain (>=1.2.8,<2.0.0)",
|
|
12
|
+
"pymupdf (>=1.26.7,<2.0.0)",
|
|
13
|
+
"langgraph (>=1.0.7,<2.0.0)",
|
|
14
|
+
"python-dotenv (>=1.2.1,<2.0.0)"
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
[build-system]
|
|
19
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
20
|
+
build-backend = "poetry.core.masonry.api"
|
|
21
|
+
|
|
22
|
+
[dependency-groups]
|
|
23
|
+
dev = [
|
|
24
|
+
"langchain-openai (>=1.1.7,<2.0.0)",
|
|
25
|
+
"ipykernel (>=7.2.0,<8.0.0)"
|
|
26
|
+
]
|
|
27
|
+
|