pdf_segmentation 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. pdf_segmentation-0.1.0/PKG-INFO +19 -0
  2. pdf_segmentation-0.1.0/README.md +0 -0
  3. pdf_segmentation-0.1.0/pdf_segmentation/src/annotator/__init__.py +1 -0
  4. pdf_segmentation-0.1.0/pdf_segmentation/src/annotator/pdf_annotator.py +150 -0
  5. pdf_segmentation-0.1.0/pdf_segmentation/src/graph/__init__.py +0 -0
  6. pdf_segmentation-0.1.0/pdf_segmentation/src/graph/graph.py +176 -0
  7. pdf_segmentation-0.1.0/pdf_segmentation/src/helper.py +7 -0
  8. pdf_segmentation-0.1.0/pdf_segmentation/src/image_payload_builder.py +35 -0
  9. pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_image_converter/__init__.py +1 -0
  10. pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_image_converter/pdf_image_converter.py +65 -0
  11. pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_llm/__init__.py +1 -0
  12. pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_llm/pdf_llm.py +89 -0
  13. pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_seperator/__init__.py +1 -0
  14. pdf_segmentation-0.1.0/pdf_segmentation/src/pdf_seperator/pdf_seperator.py +61 -0
  15. pdf_segmentation-0.1.0/pdf_segmentation/src/type/__init__.py +2 -0
  16. pdf_segmentation-0.1.0/pdf_segmentation/src/type/models.py +12 -0
  17. pdf_segmentation-0.1.0/pdf_segmentation/src/type/types.py +20 -0
  18. pdf_segmentation-0.1.0/pdf_segmentation/src/utils/__init__.py +3 -0
  19. pdf_segmentation-0.1.0/pdf_segmentation/src/utils/image_utils.py +37 -0
  20. pdf_segmentation-0.1.0/pdf_segmentation/src/utils/langchain_utils.py +19 -0
  21. pdf_segmentation-0.1.0/pdf_segmentation/src/utils/serialization_utils.py +30 -0
  22. pdf_segmentation-0.1.0/pyproject.toml +27 -0
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf_segmentation
3
+ Version: 0.1.0
4
+ Summary:
5
+ Author: Luciano Bermudez
6
+ Author-email: lucianobmecheng69@gmail.com
7
+ Requires-Python: >=3.11,<4.0.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Classifier: Programming Language :: Python :: 3.14
13
+ Requires-Dist: langchain (>=1.2.8,<2.0.0)
14
+ Requires-Dist: langgraph (>=1.0.7,<2.0.0)
15
+ Requires-Dist: pymupdf (>=1.26.7,<2.0.0)
16
+ Requires-Dist: python-dotenv (>=1.2.1,<2.0.0)
17
+ Description-Content-Type: text/markdown
18
+
19
+
File without changes
@@ -0,0 +1 @@
1
+ from .pdf_annotator import PDFAnnotator
@@ -0,0 +1,150 @@
1
+ from pathlib import Path
2
+ from typing import Literal, Optional, Tuple
3
+
4
+ import pymupdf
5
+ from pymupdf import Page
6
+
7
+ from pdf_image_converter import PDFImageConverter
8
+ from type import Anchor, AnchorPos
9
+
10
+
11
+ class PDFAnnotator:
12
+ def __init__(
13
+ self,
14
+ pdf_path: str | Path,
15
+ anchor: Anchor | AnchorPos = Anchor.BOTTOM_LEFT,
16
+ margin_frac: float = 1 / 10,
17
+ offset: Tuple[int, int] = (10, 10),
18
+ zoom: float = 2.0,
19
+ ):
20
+ self.pdf = Path(pdf_path).resolve()
21
+ self.anchor = Anchor(anchor)
22
+ self.margin_frac = margin_frac
23
+ self.offset = offset
24
+ self.zoom = zoom
25
+ self._validate()
26
+
27
+ def annotate_and_render_pages(
28
+ self,
29
+ ) -> bytes:
30
+ doc = pymupdf.open(self.pdf)
31
+ try:
32
+ # Annotate all pages
33
+ for page in doc:
34
+ assert isinstance(page, Page)
35
+ self._annotate_page(page)
36
+ # Return annotated PDF as bytes
37
+ return doc.tobytes()
38
+ finally:
39
+ doc.close()
40
+
41
+ def _annotate_and_save(
42
+ self,
43
+ method: Literal["pdf", "image"] = "image",
44
+ output_path: Optional[str | Path] = None,
45
+ ) -> str:
46
+ data = self.annotate_and_render_pages()
47
+ output_path = self.get_output_path(path, method)
48
+ if method == "pdf":
49
+ if not isinstance(data, (bytes, bytearray)):
50
+ raise ValueError("Expected PDF data to be bytes")
51
+ output_path.write_bytes(data)
52
+ return output_path.as_posix()
53
+ elif method == "image":
54
+ PDFImageConverter().save_to_images(
55
+ data, output_path, pdf_name=self.pdf.stem
56
+ )
57
+ return output_path.as_posix()
58
+
59
+ def _annotate_page(
60
+ self,
61
+ page: Page,
62
+ ):
63
+ if page.rotation != 0:
64
+ page.set_rotation(0)
65
+
66
+ rect = page.rect
67
+ cx, cy = self._get_annotation_coords(
68
+ (rect.width, rect.height),
69
+ self.margin_frac,
70
+ self.offset,
71
+ self.anchor,
72
+ )
73
+
74
+ radius = rect.width * self.margin_frac
75
+
76
+ # draw circle
77
+ page.draw_circle(
78
+ center=(cx, cy),
79
+ radius=radius,
80
+ )
81
+
82
+ # draw centered number
83
+ box_size = radius
84
+ label_rect = pymupdf.Rect(
85
+ cx - box_size,
86
+ cy - box_size,
87
+ cx + box_size,
88
+ cy + box_size,
89
+ )
90
+
91
+ page.insert_textbox(
92
+ label_rect,
93
+ str(page.number),
94
+ fontsize=radius,
95
+ align=pymupdf.TEXT_ALIGN_CENTER,
96
+ )
97
+
98
+ def _get_annotation_coords(
99
+ self,
100
+ size: tuple[int, int],
101
+ margin_frac: float = 1 / 10,
102
+ offset: tuple[int, int] = (10, 10),
103
+ anchor: Anchor = Anchor.BOTTOM_LEFT,
104
+ ) -> Tuple[float, float]:
105
+ width, height = size
106
+ x_off, y_off = offset
107
+
108
+ match anchor.value:
109
+ case "top-left":
110
+ cx = width * margin_frac + x_off
111
+ cy = height * margin_frac + y_off
112
+ case "top-right":
113
+ cx = width - (width * margin_frac) - x_off
114
+ cy = height * margin_frac + y_off
115
+ case "bottom-left":
116
+ cx = width * margin_frac + x_off
117
+ cy = height - (height * margin_frac) - y_off
118
+ case "bottom-right":
119
+ cx = width - (width * margin_frac) - x_off
120
+ cy = height - (height * margin_frac) - y_off
121
+ case _:
122
+ raise ValueError(f"Invalid anchor: {anchor}")
123
+ return cx, cy
124
+
125
+ def get_output_path(
126
+ self,
127
+ path: Optional[str | Path] = None,
128
+ method: Literal["image", "pdf"] = "image",
129
+ ) -> Path:
130
+ if path:
131
+ return Path(path).resolve()
132
+ if method == "pdf":
133
+ output_path = self.pdf.with_name(f"{self.pdf.stem}_annotated.pdf")
134
+ elif method == "image":
135
+ output_path = self.pdf.with_name(f"{self.pdf.stem}_annotated_pages")
136
+ output_path.mkdir(parents=True, exist_ok=True)
137
+
138
+ return output_path
139
+
140
+ def _validate(self):
141
+ if not self.pdf.exists():
142
+ raise FileNotFoundError(f"PDF Path {self.pdf} does not exist")
143
+
144
+
145
if __name__ == "__main__":
    # Demo: badge every page of the sample lecture and dump page images.
    path = "data/Lecture_02_03.pdf"
    output = Path(r"src\data\images").resolve()  # resolved but unused below
    annotator = PDFAnnotator(path, anchor="bottom-left", margin_frac=1 / 20)
    annotator._annotate_and_save(method="image")
@@ -0,0 +1,176 @@
1
+ from pathlib import Path
2
+ from typing import List, Type
3
+ from typing import Generic, TypeVar, List
4
+ import base64
5
+
6
+ from pydantic import BaseModel, field_serializer, Field
7
+ from langgraph.graph import StateGraph, START, END
8
+ from langchain.chat_models import init_chat_model
9
+ from dotenv import load_dotenv
10
+
11
+ from type import PDFInput, PageRange
12
+ from annotator.pdf_annotator import PDFAnnotator
13
+ from pdf_llm.pdf_llm import PDFMultiModalLLM
14
+ from pdf_seperator.pdf_seperator import PDFSeperator
15
+ from pdf_image_converter import PDFImageConverter
16
+
17
+ load_dotenv()
18
+
19
+
20
class Section(BaseModel):
    """
    Base class for any structured unit extracted from a document.

    Contains ONLY semantic fields produced by the LLM.
    """

    # Inclusive page span reported by the LLM (1-based; see seperate_pages).
    page_range: PageRange


T = TypeVar("T", bound=Section)


class ListOutput(BaseModel, Generic[T]):
    """Generic wrapper so the LLM can return a list of sections."""

    items: List[T]


class ParsedUnit(
    BaseModel,
    Generic[T],
):
    """
    A semantic unit enriched with pipeline-generated artifacts.
    """

    data: T
    # Extracted sub-PDF for this unit; serialized as base64 ASCII.
    pdf_bytes: bytes | None = None

    @field_serializer("pdf_bytes")
    def serialize_pdf_bytes(self, value: bytes | None):
        # BUG FIX: b64encode(None) raised TypeError when pdf_bytes was unset,
        # even though the field explicitly allows None.
        if value is None:
            return None
        return base64.b64encode(value).decode("ascii")
50
+
51
+
52
# Shared chat model used by the segmentation node.
# NOTE(review): constructed at import time; confirm that requiring OpenAI
# credentials on import (via load_dotenv above) is intended.
model = init_chat_model(model="gpt-4o", model_provider="openai")


class State(BaseModel, Generic[T]):
    """LangGraph pipeline state shared across all nodes."""

    # --- Inputs ---
    pdf: str | Path
    prompt: str
    pdf_bytes: bytes | None = None

    # --- Schema configuration ---
    output_schema: Type[ListOutput[T]] = Field(exclude=True)
    raw_output: List[T] = []

    # Per-unit extraction results produced by seperate_pages.
    parsed: list[ParsedUnit[T]] = Field(default_factory=list, exclude=False)

    @field_serializer("pdf_bytes")
    def serialize_pdf_bytes(self, value: bytes | None):
        # BUG FIX: b64encode(None) raised TypeError when pdf_bytes was unset
        # (e.g. when serializing the state before prepare_pdf ran).
        if value is None:
            return None
        return base64.b64encode(value).decode("ascii")
70
+
71
+
72
def prepare_pdf(state: State):
    """Resolve the input path and produce annotated PDF bytes."""
    state.pdf = Path(state.pdf)
    if not state.pdf.exists():
        raise ValueError("PDF path cannot be resolved")
    annotated = PDFAnnotator(state.pdf).annotate_and_render_pages()
    return {"pdf_bytes": annotated}
78
+
79
+
80
def get_sections(state: State):
    """Ask the multimodal LLM to segment the annotated PDF pages."""
    llm = PDFMultiModalLLM(
        prompt=state.prompt,
        pdf=state.pdf_bytes,
        model=model,
    )
    raw = llm.invoke(state.output_schema)
    validated = state.output_schema.model_validate(raw)
    return {"raw_output": validated.items}
89
+
90
+
91
def seperate_pages(state: State[T]):
    """Slice the annotated PDF into one sub-PDF per extracted unit."""
    if not state.pdf_bytes:
        raise ValueError("PDF bytes is None")
    separator = PDFSeperator(pdf_bytes=state.pdf_bytes)
    results: list[ParsedUnit[T]] = []
    for unit in state.raw_output:
        page_range = getattr(unit, "page_range", None)
        if page_range is None:
            raise ValueError("Unit does not define a page_range")
        # LLM page numbers are 1-based; pymupdf pages are 0-based.
        sub_pdf = separator.extract_page_range(
            start=page_range.start_page - 1,
            end=page_range.end_page - 1,
        )
        results.append(ParsedUnit[T](data=unit, pdf_bytes=sub_pdf))
    return {"parsed": results}
109
+
110
+
111
# Linear pipeline: annotate -> segment with LLM -> slice per section.
_builder = StateGraph(State)
_builder.add_node(prepare_pdf)
_builder.add_node(get_sections)
_builder.add_node(seperate_pages)

_builder.add_edge(START, "prepare_pdf")
_builder.add_edge("prepare_pdf", "get_sections")
_builder.add_edge("get_sections", "seperate_pages")
_builder.add_edge("seperate_pages", END)
graph = _builder.compile()
121
+
122
+
123
if __name__ == "__main__":
    path = "data/Lecture_02_03.pdf"
    from typing import Literal

    class MySection(Section, BaseModel):
        # Example schema: each lecture section is a derivation or a question.
        title: str
        description: str
        section_type: Literal["derivation", "question"]

    class MySections(ListOutput[MySection]):
        items: List[MySection]

    result = graph.invoke(
        State(
            output_schema=MySections,
            pdf=path,
            prompt="""You are analyzing a set of lecture notes.

Your task is to identify and extract distinct sections that fall into ONE of the following two categories only:

1. **Derivation**
- A derivation is a mathematical development that proceeds step-by-step using equations, formulas, algebra, calculus, or symbolic manipulation.
- It typically starts from assumptions, definitions, or governing equations and arrives at a derived result.
- Only include content that is explicitly part of the mathematical derivation.
- Do NOT explain, summarize, or add interpretation beyond what is written.

2. **Question**
- A question is a clearly defined problem or practice exercise posed to the reader.
- It may begin with phrases such as “Find”, “Determine”, “Calculate”, “Show that”, or be labeled as an example, problem, or practice question.
- Only include the question statement itself.
- Do NOT include solution steps unless they are explicitly written as part of the question.

For each identified section:
- Create a separate section entry.
- Assign the appropriate `section_type` (`"derivation"` or `"question"`).
- Use the section’s visible heading or a concise descriptive title.
- Provide a short description that closely reflects the original content without adding new information.

Page indexing:
- The lecture pages are annotated with a circled page number in the bottom-left corner.
- Use this circled page number as the authoritative reference when determining where a section begins and ends.

Important constraints:
- Do not merge multiple derivations or questions into a single section.
- Do not invent structure that is not present in the lecture.
- If content is ambiguous, only include it if it clearly fits one of the two categories.""",
        )
    )
    result = State.model_validate(result)
    from utils import to_serializable
    import json

    output = Path("output.json").resolve()
    # BUG FIX: pin UTF-8 so the dump does not depend on the platform's
    # locale encoding (write_text otherwise uses the locale default).
    output.write_text(json.dumps(to_serializable(result)), encoding="utf-8")
@@ -0,0 +1,7 @@
1
"""Reload the pipeline output and dump the last parsed unit's PDF bytes."""
from pathlib import Path
import json

from graph.graph import State
from utils import save_base64_image

path = Path("output.json").resolve()
data = State.model_validate(json.loads(path.read_text()))
save_base64_image(data.parsed[-1].pdf_bytes, "data_output.pdf")
@@ -0,0 +1,35 @@
1
+ from pathlib import Path
2
+ import base64
3
+ from typing import Sequence, List
4
+ from type import PDFInput
5
+
6
+
7
class ImagePayloadBuilder:
    """Builds OpenAI-style ``image_url`` message parts from raw image data."""

    @staticmethod
    def _to_bytes(data: PDFInput) -> bytes:
        """Return raw bytes, reading from disk when given a path."""
        if isinstance(data, (bytes, bytearray, memoryview)):
            return bytes(data)

        source = Path(data)
        if not source.exists():
            raise FileNotFoundError(f"Image path not found: {source}")
        return source.read_bytes()

    @staticmethod
    def encode(data: bytes) -> str:
        """Base64-encode raw bytes as an ASCII-safe string."""
        return base64.b64encode(data).decode("utf-8")

    @classmethod
    def prepare_llm_payload(
        cls,
        payload: Sequence[PDFInput],
        mime: str = "image/jpeg",
    ) -> List[dict[str, str | dict[str, str]]]:
        """Convert each item into a data-URI ``image_url`` content part."""
        blobs = (cls._to_bytes(item) for item in payload)
        return [
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime};base64,{cls.encode(blob)}"},
            }
            for blob in blobs
        ]
@@ -0,0 +1 @@
1
+ from .pdf_image_converter import PDFImageConverter
@@ -0,0 +1,65 @@
1
+ from type import PDFInput, ImageExt
2
+ import pymupdf
3
+ from typing import List, Optional, Iterable
4
+ from pathlib import Path
5
+
6
+
7
class PDFImageConverter:
    """Converts PDFs to per-page images and images back into a PDF."""

    def convert_to_images(
        self, pdf: PDFInput, zoom: float = 0.2, ext: ImageExt = "png"
    ) -> List[bytes]:
        """Render every page of ``pdf`` to encoded image bytes.

        Args:
            pdf: PDF path or raw PDF bytes.
            zoom: Scale factor applied to both axes when rasterizing.
            ext: Encoded image format for each page.

        Raises:
            ValueError: If the document cannot be opened.
        """
        try:
            if isinstance(pdf, (bytes, bytearray, memoryview)):
                doc = pymupdf.open(stream=pdf, filetype="pdf")
            elif isinstance(pdf, (Path, str)):
                doc = pymupdf.open(Path(pdf).as_posix())
            else:
                raise TypeError("PDF is not of expected type")
        except Exception as e:
            # Keep the original exception chained for debugging.
            raise ValueError(f"Failed to open document {e}") from e
        try:
            matrix = pymupdf.Matrix(zoom, zoom)
            return [page.get_pixmap(matrix=matrix).tobytes(ext) for page in doc]
        finally:
            # BUG FIX: the document previously leaked if rendering raised.
            doc.close()

    def save_to_images(
        self,
        pdf: PDFInput,
        output_path: str | Path,
        pdf_name: str | None = None,
        ext: ImageExt = "png",
        start: int = 0,
    ) -> None:
        """Render ``pdf`` and write one image per page into ``output_path``.

        Args:
            pdf: PDF path or raw bytes.
            output_path: Existing directory to write into.
            pdf_name: Filename stem; derived from the path when omitted.
            ext: Image format/extension.
            start: First page index used in the generated filenames.

        Raises:
            ValueError: If ``pdf_name`` cannot be derived or the directory
                does not exist.
        """
        if pdf_name is None:
            if isinstance(pdf, (str, Path)):
                pdf_name = Path(pdf).stem
            else:
                raise ValueError("pdf_name must be provided when pdf is not a path")

        target_dir = self._validate(output_path)
        for index, image in enumerate(self.convert_to_images(pdf), start=start):
            (target_dir / f"{pdf_name}_page_{index}.{ext}").write_bytes(image)

    def images_to_pdf(self, images: Iterable[bytes]) -> bytes:
        """Stack standalone PNG images into one PDF (one page per image)."""
        doc = pymupdf.open()
        try:
            for img_bytes in images:
                img_doc = pymupdf.open(stream=img_bytes, filetype="png")
                try:
                    rect = img_doc[0].rect
                    page = doc.new_page(width=rect.width, height=rect.height)
                    page.insert_image(rect, stream=img_bytes)
                finally:
                    # BUG FIX: intermediate docs leaked when insertion raised.
                    img_doc.close()
            return doc.tobytes()
        finally:
            doc.close()

    def _validate(self, path: str | Path) -> Path:
        """Ensure the output directory exists and return it as a ``Path``."""
        path = Path(path)
        if not path.exists():
            raise ValueError(f"Failed to validate pdf {path} cannot be resolved")
        return path
@@ -0,0 +1 @@
1
+ from .pdf_llm import PDFMultiModalLLM
@@ -0,0 +1,89 @@
1
+ from pathlib import Path
2
+ from typing import Optional, Sequence, Type
3
+
4
+ import pymupdf
5
+ from pydantic import BaseModel
6
+ from langchain_core.language_models.chat_models import BaseChatModel
7
+ from langchain.chat_models import init_chat_model
8
+ from type import PDFInput, BaseOutput
9
+ from image_payload_builder import ImagePayloadBuilder
10
+ from pdf_image_converter import PDFImageConverter
11
+
12
+
13
class PDFMultiModalLLM:
    """Sends a prompt plus rendered PDF page images to a multimodal model."""

    def __init__(
        self,
        *,
        prompt: str,
        model: BaseChatModel,
        pdf: PDFInput | None = None,
        image_bytes: Sequence[bytes] | None = None,
    ):
        """
        Args:
            prompt: Instruction text sent alongside the images.
            model: Chat model used for invocation.
            pdf: PDF path or bytes; rendered to page images when given.
            image_bytes: Pre-rendered page images (alternative to ``pdf``).

        Raises:
            ValueError: If neither or both of ``pdf``/``image_bytes`` is given.
        """
        if pdf is None and image_bytes is None:
            raise ValueError("Either pdf_path or image_bytes must be provided")
        if pdf is not None and image_bytes is not None:
            raise ValueError("Provide only one of pdf_path or image_bytes")

        self.prompt = prompt
        self.builder = ImagePayloadBuilder()
        self.llm = model

        if pdf is not None:
            # Render the PDF into per-page image bytes for the LLM payload.
            self.pdf_bytes = PDFImageConverter().convert_to_images(pdf)
        elif image_bytes:
            self.pdf_bytes = list(image_bytes)
        else:
            raise RuntimeError("Unexpected Error Occured ")

    def prepare_payload(self, mime="image/png"):
        """Build the user message: prompt text followed by all page images."""
        try:
            image_payload = self.builder.prepare_llm_payload(self.pdf_bytes, mime=mime)
            message = {
                "role": "user",
                "content": [{"type": "text", "text": self.prompt}, *image_payload],
            }
            return message
        except Exception as e:
            raise RuntimeError(f"Failed to prepare payload for LLM. Error: {e}") from e

    def invoke(
        self,
        output_model: Optional[Type[BaseModel]] = BaseOutput,
        mime: str = "image/png",
    ):
        """Invoke the model, optionally with structured output.

        Raises:
            RuntimeError: If payload preparation or invocation fails.
        """
        try:
            message = self.prepare_payload(mime)
            if output_model:
                chain = self.llm.with_structured_output(schema=output_model)
                return chain.invoke([message])
            return self.llm.invoke([message])
        except Exception as e:
            raise RuntimeError(f"Failed to invoke model {e}") from e

    async def ainvoke(
        self,
        output_model: Optional[Type[BaseModel]] = BaseOutput,
    ):
        """Async variant of :meth:`invoke`."""
        message = self.prepare_payload()
        if output_model:
            chain = self.llm.with_structured_output(
                schema=output_model,
            )
            # BUG FIX: these calls previously returned un-awaited coroutines,
            # so callers got a coroutine object instead of the model result.
            return await chain.ainvoke([message])
        return await self.llm.ainvoke([message])
77
+
78
+
79
if __name__ == "__main__":
    from dotenv import load_dotenv

    load_dotenv()
    # Demo: describe the sample lecture PDF with the default prompt.
    path = "data/Lecture_02_03.pdf"
    output = Path(r"src\data\images").resolve()  # resolved but unused below
    model = init_chat_model(model="gpt-4o", model_provider="openai")
    llm = PDFMultiModalLLM(prompt="What is in the image", pdf=path, model=model)
    data = llm.invoke()
    print(data)
@@ -0,0 +1 @@
1
+ from .pdf_seperator import PDFSeperator
@@ -0,0 +1,61 @@
1
+ from pathlib import Path
2
+ import pymupdf
3
+ from type import PDFInput
4
+ from typing import Sequence, Literal
5
+ from pdf_image_converter import PDFImageConverter
6
+
7
+
8
class PDFSeperator:
    """Holds a PDF (from a path, raw bytes, or images) and slices page ranges."""

    def __init__(
        self,
        pdf_path: PDFInput | None = None,
        pdf_bytes: bytes | None = None,
        image_bytes: Sequence[bytes] | None = None,
        pdf_name: str | None = "input_pdf",
    ):
        """
        Exactly one of ``pdf_path``, ``pdf_bytes`` or ``image_bytes`` must be
        provided.

        Raises:
            ValueError: If zero or more than one source is given.
        """
        self.pdf_name = pdf_name
        provided = [s for s in (pdf_path, pdf_bytes, image_bytes) if s is not None]
        if not provided:
            raise ValueError(
                "Either pdf_path or image_bytes or pdf_bytes must be provided"
            )
        # BUG FIX: the original check required image_bytes to be None as well,
        # so most invalid combinations slipped through silently.
        if len(provided) > 1:
            raise ValueError("Provide only one of pdf_path or image_bytes")

        if pdf_path is not None:
            # Normalize to raw bytes; close the document promptly.
            with pymupdf.open(pdf_path) as doc:
                self.pdf = doc.tobytes()
        elif pdf_bytes:
            self.pdf = pdf_bytes
        elif image_bytes:
            self.pdf = PDFImageConverter().images_to_pdf(image_bytes)
        else:
            raise ValueError("Unexpected Error Occured")

    def extract_page_range(self, start: int, end: int) -> bytes:
        """Return pages ``start``..``end`` (0-based, inclusive) as a new PDF."""
        src = pymupdf.open(stream=self.pdf, filetype="pdf")
        try:
            dst = pymupdf.open()
            try:
                dst.insert_pdf(src, from_page=start, to_page=end, rotate=0)
                return dst.tobytes()
            finally:
                # BUG FIX: both documents previously leaked.
                dst.close()
        finally:
            src.close()

    def _extract_and_save(
        self,
        start: int,
        end: int,
        output_dir: str | Path,
        pdf_name: str,
        method: Literal["pdf", "image"] = "pdf",
    ) -> str:
        """Extract a page range and save it as page images in ``output_dir``.

        NOTE(review): ``method`` is currently unused — images are always
        written; confirm whether a "pdf" branch was intended.

        Returns:
            POSIX path where the extracted PDF would live.

        Raises:
            ValueError: If ``output_dir`` does not exist.
        """
        output_dir = Path(output_dir).resolve()
        if not output_dir.exists():
            # BUG FIX: message was missing its f-prefix, so the path
            # placeholder was printed literally.
            raise ValueError(f"Failed to extract pdf path {output_dir} does not exist")
        data = self.extract_page_range(start, end)
        output_path = output_dir / f"{self.pdf_name}_extracted_{start}_{end}.pdf"
        PDFImageConverter().save_to_images(data, output_dir, pdf_name)
        return output_path.as_posix()
55
+
56
+
57
if __name__ == "__main__":
    # Demo: extract pages 1-3 of the sample lecture as images into ./data.
    path = r"data\Lecture_02_03.pdf"
    seperator = PDFSeperator(path)
    seperator._extract_and_save(1, 3, output_dir="./data", pdf_name="pdf_extracted")
@@ -0,0 +1,2 @@
1
+ from .types import *
2
+ from .models import *
@@ -0,0 +1,12 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
class BaseOutput(BaseModel):
    """Default structured-output schema: a single free-text payload."""

    data: str


class PageRange(BaseModel):
    """Inclusive page span of an extracted section (1-based in LLM output)."""

    start_page: int
    end_page: int
11
+
12
+
@@ -0,0 +1,20 @@
1
+ from pathlib import Path
2
+ from typing import Literal
3
+ from enum import Enum
4
+
5
+
6
+ AnchorPos = Literal["top-left", "top-right", "bottom-right", "bottom-left"]
7
+ ImageExt = Literal["png", "jpeg"]
8
+
9
+
10
+ class Anchor(str, Enum):
11
+ TOP_LEFT = "top-left"
12
+ TOP_RIGHT = "top-right"
13
+ BOTTOM_RIGHT = "bottom-right"
14
+ BOTTOM_LEFT = "bottom-left"
15
+
16
+
17
+ PDFInput = str | Path | bytes
18
+ ImageBytes = bytes
19
+
20
+ ImageInput = PDFInput | ImageBytes
@@ -0,0 +1,3 @@
1
+ from .image_utils import *
2
+ from .langchain_utils import *
3
+ from .serialization_utils import *
@@ -0,0 +1,37 @@
1
+ from pathlib import Path
2
+ import base64
3
+
4
+
5
+ def save_base64_image(b64_data: str, output_path: str | Path) -> Path:
6
+ """
7
+ Decode a base64-encoded image and save it to disk.
8
+
9
+ Args:
10
+ b64_data: Base64-encoded image string (no data URI prefix).
11
+ output_path: Path where the image will be saved.
12
+
13
+ Returns:
14
+ Path to the saved image.
15
+ """
16
+ output_path = Path(output_path).resolve()
17
+
18
+ image_bytes = base64.b64decode(b64_data)
19
+ output_path.write_bytes(image_bytes)
20
+
21
+ return output_path
22
+
23
+
24
+ def write_image_data(image_bytes: bytes, folder_path: str | Path, filename: str) -> str:
25
+ try:
26
+ path = Path(folder_path).resolve()
27
+ path.mkdir(exist_ok=True)
28
+ save_path = path / filename
29
+
30
+ if save_path.suffix != ".png":
31
+ raise ValueError(
32
+ "Suffix allowed is only PNG either missing or nnot allowed"
33
+ )
34
+ save_path.write_bytes(image_bytes)
35
+ return save_path.as_posix()
36
+ except Exception as e:
37
+ raise ValueError(f"Could not save image {str(e)}")
@@ -0,0 +1,19 @@
1
+ from pathlib import Path
2
+ from typing import Any
3
+ from langgraph.graph.state import CompiledStateGraph
4
+ from .image_utils import write_image_data
5
+
6
+
7
def save_graph_visualization(
    graph: CompiledStateGraph | Any,
    folder_path: str | Path,
    filename: str,
):
    """Render the compiled graph as a Mermaid PNG and save it.

    Best-effort: unexpected rendering failures are reported via print, while
    filename/IO validation errors (ValueError) still propagate.
    """
    try:
        png_bytes = graph.get_graph().draw_mermaid_png()
        saved_at = write_image_data(png_bytes, folder_path, filename)
        print(f"✅ Saved graph visualization at: {saved_at}")
    except ValueError:
        raise
    except Exception as error:
        print(f"❌ Graph visualization failed: {error}")
@@ -0,0 +1,30 @@
1
+ from pathlib import Path
2
+ import base64
3
+ from datetime import date, datetime, time
4
+ from typing import Any
5
+ from uuid import UUID
6
+ from pydantic import BaseModel
7
+
8
+
9
def to_serializable(obj: Any) -> Any:
    """
    Recursively convert Pydantic models (and nested containers thereof)
    into plain JSON-friendly Python data structures.
    """
    if isinstance(obj, BaseModel):
        return obj.model_dump(mode="json")
    if isinstance(obj, dict):
        return {k: to_serializable(v) for k, v in obj.items()}
    # Generalized: tuples and sets also become JSON-safe lists (backward
    # compatible — lists behave exactly as before).
    if isinstance(obj, (list, tuple, set)):
        return [to_serializable(v) for v in obj]

    # --- Special cases ---
    if isinstance(obj, (datetime, date, time)):
        return obj.isoformat()
    if isinstance(obj, UUID):
        return str(obj)
    if isinstance(obj, Path):
        return obj.as_posix()
    if isinstance(obj, (bytes, bytearray, memoryview)):
        return base64.b64encode(obj).decode("utf-8")
    return obj
@@ -0,0 +1,27 @@
1
+ [project]
2
+ name = "pdf_segmentation"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = [
6
+ {name = "Luciano Bermudez",email = "lucianobmecheng69@gmail.com"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = ">=3.11,<4.0.0"
10
+ dependencies = [
11
+ "langchain (>=1.2.8,<2.0.0)",
12
+ "pymupdf (>=1.26.7,<2.0.0)",
13
+ "langgraph (>=1.0.7,<2.0.0)",
14
+ "python-dotenv (>=1.2.1,<2.0.0)"
15
+ ]
16
+
17
+
18
+ [build-system]
19
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
20
+ build-backend = "poetry.core.masonry.api"
21
+
22
+ [dependency-groups]
23
+ dev = [
24
+ "langchain-openai (>=1.1.7,<2.0.0)",
25
+ "ipykernel (>=7.2.0,<8.0.0)"
26
+ ]
27
+