pdf-kintsugi 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. pdf_kintsugi-0.1.0/PKG-INFO +88 -0
  2. pdf_kintsugi-0.1.0/README.md +73 -0
  3. pdf_kintsugi-0.1.0/pyproject.toml +43 -0
  4. pdf_kintsugi-0.1.0/src/pdf_kintsugi/__init__.py +6 -0
  5. pdf_kintsugi-0.1.0/src/pdf_kintsugi/line/__init__.py +0 -0
  6. pdf_kintsugi-0.1.0/src/pdf_kintsugi/line/extractor.py +203 -0
  7. pdf_kintsugi-0.1.0/src/pdf_kintsugi/line/merger.py +136 -0
  8. pdf_kintsugi-0.1.0/src/pdf_kintsugi/page_parser.py +104 -0
  9. pdf_kintsugi-0.1.0/src/pdf_kintsugi/parser.py +170 -0
  10. pdf_kintsugi-0.1.0/src/pdf_kintsugi/py.typed +0 -0
  11. pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/__init__.py +0 -0
  12. pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/cell_unmerger.py +144 -0
  13. pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/char_assigner.py +40 -0
  14. pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/detector.py +61 -0
  15. pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/docling_integrator.py +101 -0
  16. pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/merger.py +120 -0
  17. pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/omitted_line_detector.py +117 -0
  18. pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/table.py +280 -0
  19. pdf_kintsugi-0.1.0/src/pdf_kintsugi/text/__init__.py +0 -0
  20. pdf_kintsugi-0.1.0/src/pdf_kintsugi/text/char_extractor.py +49 -0
  21. pdf_kintsugi-0.1.0/src/pdf_kintsugi/text/corrector.py +198 -0
  22. pdf_kintsugi-0.1.0/src/pdf_kintsugi/text/extract_target_chars.py +8 -0
  23. pdf_kintsugi-0.1.0/src/pdf_kintsugi/text/row_builder.py +91 -0
  24. pdf_kintsugi-0.1.0/src/pdf_kintsugi/utils/__init__.py +0 -0
  25. pdf_kintsugi-0.1.0/src/pdf_kintsugi/utils/coord.py +2 -0
  26. pdf_kintsugi-0.1.0/src/pdf_kintsugi/utils/union_find.py +30 -0
@@ -0,0 +1,88 @@
1
+ Metadata-Version: 2.3
2
+ Name: pdf-kintsugi
3
+ Version: 0.1.0
4
+ Summary: PDF table extraction with Docling and pdfplumber
5
+ Author: moss (moss-tms)
6
+ License: MIT
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Dist: docling>=2.73.0
12
+ Requires-Dist: pdfplumber>=0.11.9
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+
16
+ # pdf-kintsugi
17
+
18
+ **pdf-kintsugi** is a Python library that enhances PDF table extraction and text recovery by combining the high-level document understanding of **Docling** with the precise, character-level bounding box analysis of **pdfplumber**.
19
+
20
+ Like the Japanese art of *kintsugi* (repairing broken pottery with gold), this library takes the initial parsing results from Docling and "repairs" complex table structures and text with garbled characters using pdfplumber's precise geometric layout analysis.
21
+
22
+ ## Features
23
+
24
+ - **Table Parsing:** Improves Docling's table extraction by using pdfplumber to detect omitted geometric lines and infer table edges.
25
+ - **Text Correction:** Optionally corrects garbled text using fine-grained character bounding boxes.
26
+ - **Seamless Integration with Docling:** Acts as a post-processor for Docling. It takes a `ConversionResult` from Docling, enhances the tables and text in place, and returns the updated document model.
27
+
28
+ ## Installation
29
+
30
+ The project requires **Python 3.10+**.
31
+
32
+ You can install it using `pip` or your favorite package manager:
33
+
34
+ ```bash
35
+ pip install pdf-kintsugi
36
+ ```
37
+
38
+ ## Quick Start
39
+
40
+ `pdf-kintsugi` works alongside `docling`. Here is a basic example of how to use `PDFKintsugi`:
41
+
42
+ ```python
43
+ from docling.document_converter import DocumentConverter
44
+ from pdf_kintsugi import PDFKintsugi
45
+
46
+ pdf_path = "path/to/your/document.pdf"
47
+
48
+ # 1. Parse the document using Docling first
49
+ converter = DocumentConverter()
50
+ docling_result = converter.convert(pdf_path)
51
+
52
+ # 2. Initialize PDFKintsugi with the source PDF and the Docling result
53
+ kintsugi_parser = PDFKintsugi(
54
+ source=pdf_path,
55
+ docling_result=docling_result,
56
+ tolerance=3.0, # Adjust merging tolerance for table lines
57
+ replace_text=False, # Set to True to enable text/character correction
58
+ replace_table=True # Set to True to enable table structure repair
59
+ )
60
+
61
+ # 3. Get the parsing result
62
+ kintsugi_result = kintsugi_parser.parse()
63
+
64
+ # Now you can use kintsugi_result just like a regular Docling document
65
+ print(kintsugi_result.document.export_to_markdown())
66
+ ```
67
+
68
+ ## Configuration
69
+
70
+ The `PDFKintsugi` class accepts several parameters to tune the extraction:
71
+
72
+ - `source` (`str`): The file path to the source PDF document.
73
+ - `docling_result` (`ConversionResult`): The parsed document object returned by Docling.
74
+ - `tolerance` (`float`, default `3.0`): The line-merging tolerance. A smaller value (e.g., `1.5`) helps prevent adjacent tables from being incorrectly merged, while a larger value (e.g., `5.0`) can help stitch together fragmented tables.
75
+ - `replace_text` (`bool`, default `False`): If `True`, utilizes `pdfplumber` to extract characters and correct garbled text.
76
+ - `replace_table` (`bool`, default `True`): If `True`, rebuilds and overrides the Docling table representations using geometric line intersections and edge inference.
77
+
78
+ ## Contributing
79
+
80
+ Contributions are welcome! This project uses `uv` for dependency management.
81
+
82
+ 1. Clone the repository.
83
+ 2. Setup the environment: `uv sync`
84
+ 3. Run tests before submitting a Pull Request.
85
+
86
+ ## License
87
+
88
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,73 @@
1
+ # pdf-kintsugi
2
+
3
+ **pdf-kintsugi** is a Python library that enhances PDF table extraction and text recovery by combining the high-level document understanding of **Docling** with the precise, character-level bounding box analysis of **pdfplumber**.
4
+
5
+ Like the Japanese art of *kintsugi* (repairing broken pottery with gold), this library takes the initial parsing results from Docling and "repairs" complex table structures and text with garbled characters using pdfplumber's precise geometric layout analysis.
6
+
7
+ ## Features
8
+
9
+ - **Table Parsing:** Improves Docling's table extraction by using pdfplumber to detect omitted geometric lines and infer table edges.
10
+ - **Text Correction:** Optionally corrects garbled text using fine-grained character bounding boxes.
11
+ - **Seamless Integration with Docling:** Acts as a post-processor for Docling. It takes a `ConversionResult` from Docling, enhances the tables and text in place, and returns the updated document model.
12
+
13
+ ## Installation
14
+
15
+ The project requires **Python 3.10+**.
16
+
17
+ You can install it using `pip` or your favorite package manager:
18
+
19
+ ```bash
20
+ pip install pdf-kintsugi
21
+ ```
22
+
23
+ ## Quick Start
24
+
25
+ `pdf-kintsugi` works alongside `docling`. Here is a basic example of how to use `PDFKintsugi`:
26
+
27
+ ```python
28
+ from docling.document_converter import DocumentConverter
29
+ from pdf_kintsugi import PDFKintsugi
30
+
31
+ pdf_path = "path/to/your/document.pdf"
32
+
33
+ # 1. Parse the document using Docling first
34
+ converter = DocumentConverter()
35
+ docling_result = converter.convert(pdf_path)
36
+
37
+ # 2. Initialize PDFKintsugi with the source PDF and the Docling result
38
+ kintsugi_parser = PDFKintsugi(
39
+ source=pdf_path,
40
+ docling_result=docling_result,
41
+ tolerance=3.0, # Adjust merging tolerance for table lines
42
+ replace_text=False, # Set to True to enable text/character correction
43
+ replace_table=True # Set to True to enable table structure repair
44
+ )
45
+
46
+ # 3. Get the parsing result
47
+ kintsugi_result = kintsugi_parser.parse()
48
+
49
+ # Now you can use kintsugi_result just like a regular Docling document
50
+ print(kintsugi_result.document.export_to_markdown())
51
+ ```
52
+
53
+ ## Configuration
54
+
55
+ The `PDFKintsugi` class accepts several parameters to tune the extraction:
56
+
57
+ - `source` (`str`): The file path to the source PDF document.
58
+ - `docling_result` (`ConversionResult`): The parsed document object returned by Docling.
59
+ - `tolerance` (`float`, default `3.0`): The line-merging tolerance. A smaller value (e.g., `1.5`) helps prevent adjacent tables from being incorrectly merged, while a larger value (e.g., `5.0`) can help stitch together fragmented tables.
60
+ - `replace_text` (`bool`, default `False`): If `True`, utilizes `pdfplumber` to extract characters and correct garbled text.
61
+ - `replace_table` (`bool`, default `True`): If `True`, rebuilds and overrides the Docling table representations using geometric line intersections and edge inference.
62
+
63
+ ## Contributing
64
+
65
+ Contributions are welcome! This project uses `uv` for dependency management.
66
+
67
+ 1. Clone the repository.
68
+ 2. Setup the environment: `uv sync`
69
+ 3. Run tests before submitting a Pull Request.
70
+
71
+ ## License
72
+
73
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,43 @@
1
+ [project]
2
+ name = "pdf-kintsugi"
3
+ version = "0.1.0"
4
+ description = "PDF table extraction with Docling and pdfplumber"
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ authors = [
8
+ { name = "moss (moss-tms)" },
9
+ ]
10
+ classifiers = [
11
+ "Development Status :: 3 - Alpha",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3",
15
+ ]
16
+ requires-python = ">=3.10"
17
+ dependencies = [
18
+ "docling>=2.73.0",
19
+ "pdfplumber>=0.11.9",
20
+ ]
21
+
22
+ [build-system]
23
+ requires = ["uv_build>=0.9.9,<0.10.0"]
24
+ build-backend = "uv_build"
25
+
26
+ [tool.ruff]
27
+ line-length = 88
28
+ target-version = "py310"
29
+
30
+ [tool.ruff.lint]
31
+ select = [
32
+ "E", "F",
33
+ "I",
34
+ ]
35
+
36
+ [tool.ruff.format]
37
+ quote-style = "double"
38
+ indent-style = "space"
39
+
40
+ [dependency-groups]
41
+ dev = [
42
+ "ruff>=0.15.8",
43
+ ]
@@ -0,0 +1,6 @@
1
+ """pdf-kintsugi - Enhanced PDF table extraction with Docling and pdfplumber."""
2
+
3
+ from pdf_kintsugi.parser import PDFKintsugi
4
+
5
+ __version__ = "0.1.0"
6
+ __all__ = ["PDFKintsugi"]
File without changes
@@ -0,0 +1,203 @@
1
+ from operator import itemgetter
2
+
3
+ from pdfplumber.page import Page
4
+
5
+
6
class LineExtractor:
    """Collect axis-aligned ruling lines from a pdfplumber page.

    Thin, elongated objects (lines, rects, curves, images) are normalized
    into horizontal ("x") and vertical ("y") segments; box-shaped rects are
    decomposed into their four border lines.  ``extract()`` then merges
    collinear fragments and drops very short segments.
    """

    def __init__(self, page: Page | None, lines: list[dict] | None = None) -> None:
        # Final merged/filtered segments, populated by extract().
        self.lines: list[dict] = []
        # Raw horizontal/vertical segments gathered before merging.
        self.horizontal_lines: list[dict] = []
        self.vertical_lines: list[dict] = []

        if page:
            self.init_from_page(page)
        if lines:
            self.init_from_lines(lines)

    def init_from_page(self, page: Page) -> None:
        """Seed the extractor from every line-like object on a page."""
        self.extract_lines(page.lines + page.rects + page.curves + page.images)
        self.extract_rects(page.rects)

    def init_from_lines(self, lines: list[dict]) -> None:
        """Seed the extractor from already-extracted line dicts."""
        self.extract_lines(lines)

    def get_orientation(self, line: dict) -> str:
        """Classify an object as horizontal ("x"), vertical ("y"), or "none".

        An object counts as a line only when it is clearly elongated
        (aspect ratio above ``ratio``) and thin (below ``max_thickness``).
        """
        width = abs(line["x1"] - line["x0"])
        height = abs(line["bottom"] - line["top"])

        # Minimum elongation and maximum thickness for a line-like shape.
        ratio = 1.5
        max_thickness = 5.0

        if width > height * ratio and height < max_thickness:
            return "x"
        if height > width * ratio and width < max_thickness:
            return "y"

        return "none"

    def extract_lines(self, lines: list[dict]) -> None:
        """we process page.lines, page.rects, page.curves, page.images in bulk"""
        for line in lines:
            orientation = self.get_orientation(line)
            if orientation == "x":
                # Collapse the (thin) thickness to one representative y.
                rep_y = (line["top"] + line["bottom"]) / 2
                self.horizontal_lines.append(
                    {
                        "x0": line["x0"],
                        "x1": line["x1"],
                        "top": rep_y,
                        "bottom": rep_y,
                        "orientation": orientation,
                    }
                )
            if orientation == "y":
                # Collapse the (thin) thickness to one representative x.
                rep_x = (line["x0"] + line["x1"]) / 2
                self.vertical_lines.append(
                    {
                        "x0": rep_x,
                        "x1": rep_x,
                        "top": line["top"],
                        "bottom": line["bottom"],
                        "orientation": orientation,
                    }
                )

    def extract_rects(self, rects: list[dict]) -> None:
        """we extract page.rects which can be disassembled to 4 lines"""
        for rect in rects:
            # Skip rects already handled as single lines, and tiny rects.
            if self.get_orientation(rect) != "none" or (
                rect["width"] < 5.0 and rect["height"] < 5.0
            ):
                continue

            # Skip rects with no border stroke, or rects used only for fill color.
            if rect["stroking_color"] is None or rect["fill"] is True:
                continue

            # Top and bottom borders become horizontal lines.
            for y_key in ["top", "bottom"]:
                self.horizontal_lines.append(
                    {
                        "x0": rect["x0"],
                        "x1": rect["x1"],
                        "top": rect[y_key],
                        "bottom": rect[y_key],
                        "orientation": "x",
                    }
                )

            # Left and right borders become vertical lines.
            for x_key in ["x0", "x1"]:
                self.vertical_lines.append(
                    {
                        "x0": rect[x_key],
                        "x1": rect[x_key],
                        "top": rect["top"],
                        "bottom": rect["bottom"],
                        "orientation": "y",
                    }
                )

    def can_merge(
        self, line1_: dict, line2_: dict, main_coord, sub_start, sub_end, tolerance=1.5
    ) -> bool:
        """whether two lines can be merged"""
        # Order the pair so line1 starts first along the sub axis.
        if line1_[sub_start] < line2_[sub_start]:
            line1, line2 = line1_, line2_
        else:
            line1, line2 = line2_, line1_

        # Mergeable when they lie on the same axis position and touch/overlap.
        is_collinear = abs(line1[main_coord] - line2[main_coord]) <= tolerance
        is_connected = line2[sub_start] <= line1[sub_end] + tolerance

        return is_collinear and is_connected

    def merge_lines(
        self, lines: list[dict], orientation: str, tolerance=1.5
    ) -> list[dict]:
        """merge collinear line segments"""
        if not lines:
            return []

        # 1. Sort the lines by coordinate (NOTE: sorts the caller's list in place).
        if orientation == "y":
            # Merging vertical segments: group by equal x, then sort by y.
            lines.sort(key=itemgetter("x0", "top"))
            main_coord = "x0"
            sub_start = "top"
            sub_end = "bottom"
        else:
            # Merging horizontal segments: group by equal y, then sort by x.
            lines.sort(key=itemgetter("top", "x0"))
            main_coord = "top"
            sub_start = "x0"
            sub_end = "x1"

        merged = []
        current = lines[0]

        # Single sweep: extend `current` while neighbors are mergeable,
        # otherwise emit it and start a new run.
        for next_line in lines[1:]:
            if self.can_merge(
                current, next_line, main_coord, sub_start, sub_end, tolerance
            ):
                current[sub_start] = min(current[sub_start], next_line[sub_start])
                current[sub_end] = max(current[sub_end], next_line[sub_end])
            else:
                merged.append(current)
                current = next_line

        merged.append(current)
        return merged

    def remove_included_lines(
        self, lines: list[dict], orientation: str, tolerance=1.5
    ) -> list[dict]:
        """Drop segments fully contained within a collinear longer segment."""
        if orientation == "y":
            # Vertical lines: collinear means equal x.
            main_coord = "x0"
            sub_start = "top"
            sub_end = "bottom"
        else:
            # Horizontal lines: collinear means equal y.
            main_coord = "top"
            sub_start = "x0"
            sub_end = "x1"

        # O(n^2) pairwise containment check; `skip` holds indices to drop.
        skip = set()
        for i, long in enumerate(lines):
            for j, short in enumerate(lines):
                if i in skip or j in skip or i == j:
                    continue

                is_collinear = abs(long[main_coord] - short[main_coord]) <= tolerance
                include_start = long[sub_start] - tolerance <= short[sub_start]
                include_end = short[sub_end] <= long[sub_end] + tolerance

                if is_collinear and include_start and include_end:
                    skip.add(j)

        extracted = [line for i, line in enumerate(lines) if i not in skip]
        return extracted

    def extract_by_length(self, min_line_length=3.0) -> list[dict]:
        """exclude the lines whose length is shorter than min_line_length"""
        extract_lines: list[dict] = []
        for line in self.lines:
            orientation = line["orientation"]
            length = (
                line["x1"] - line["x0"]
                if orientation == "x"
                else line["bottom"] - line["top"]
            )
            if length >= min_line_length:
                extract_lines.append(line)

        return extract_lines

    def extract(self) -> list[dict]:
        """extract the lines contained in the page"""
        merged_horizontal = self.merge_lines(self.horizontal_lines, "x")
        merged_vertical = self.merge_lines(self.vertical_lines, "y")

        self.lines = merged_horizontal + merged_vertical
        self.lines = self.extract_by_length()

        return self.lines
@@ -0,0 +1,136 @@
1
+ from pdf_kintsugi.utils.coord import is_same
2
+ from pdf_kintsugi.utils.union_find import UnionFind
3
+
4
+
5
class LineMerger:
    """Cluster crossing line segments into table candidates.

    Perpendicular segments that intersect (within a tolerance) are grouped
    with a union-find; ``_infer_table_edges`` then repairs missing outer
    borders of each table-like group, either by adopting a nearby loose
    line or by interpolating an edge from the group's bounding box.
    """

    def __init__(self, lines: list[dict], tolerance: float) -> None:
        self.lines: list[dict] = lines
        # Coordinate tolerance used when adopting loose edge lines.
        self.tolerance: float = tolerance

    def merge(self) -> list[list[dict]]:
        """Return groups of lines; each group is one table candidate."""
        line_num = len(self.lines)
        uf = UnionFind(line_num)

        # Union every pair of crossing (perpendicular, touching) segments.
        for i in range(line_num):
            for j in range(i + 1, line_num):
                if self._is_crossing_lines(i, j):
                    uf.merge(i, j)

        # group_ids: index groups; groups: the corresponding line dicts.
        self.group_ids: list[list[int]] = uf.groups()
        self.groups: list[list[dict]] = [
            [self.lines[id] for id in group] for group in self.group_ids
        ]

        self._infer_table_edges()

        return self.groups

    def _infer_table_edges(self) -> None:
        """Ensure each table-like group has all four outer border lines.

        Groups containing both orientations and more than 4 lines are
        treated as table "cores".  For each missing edge of a core, a
        single loose line that overlaps the core is adopted; failing that,
        an edge is interpolated from the core's logical bounding box.
        NOTE: rebuilds ``self.groups`` as cores plus remaining loose
        singletons, discarding other small groups.
        """
        core_groups = []
        loose_lines = []

        for group in self.groups:
            has_x = any(line["orientation"] == "x" for line in group)
            has_y = any(line["orientation"] == "y" for line in group)
            if has_x and has_y and len(group) > 4:
                core_groups.append(group)
            elif len(group) == 1:
                loose_lines.extend(group)

        for group in core_groups:
            x_lines = [line for line in group if line["orientation"] == "x"]
            y_lines = [line for line in group if line["orientation"] == "y"]

            # Logical bounding box implied by the perpendicular lines.
            logical_left = min(line["x0"] for line in x_lines)
            logical_right = max(line["x1"] for line in x_lines)
            logical_top = min(line["top"] for line in y_lines)
            logical_bottom = max(line["bottom"] for line in y_lines)

            # Check y-edges (left and right)
            for target_x in (logical_left, logical_right):
                has_edge = any(is_same(line["x0"], target_x, 1.5) for line in y_lines)

                if not has_edge:
                    found = False
                    # Try to adopt a loose vertical line sitting at the edge.
                    for i, loose in enumerate(loose_lines):
                        if loose["orientation"] == "y" and is_same(
                            loose["x0"], target_x, self.tolerance
                        ):
                            # Positive overlap means the loose line spans
                            # part of the core vertically.
                            overlap = min(loose["bottom"], logical_bottom) - max(
                                loose["top"], logical_top
                            )
                            if overlap > 0:
                                group.append(loose)
                                y_lines.append(loose)
                                loose_lines.pop(i)
                                found = True
                                break
                    if not found:
                        # No candidate: synthesize the full-height edge.
                        interpolated = {
                            "orientation": "y",
                            "x0": target_x,
                            "x1": target_x,
                            "top": logical_top,
                            "bottom": logical_bottom,
                        }
                        group.append(interpolated)
                        y_lines.append(interpolated)

            # Check x-edges (top and bottom)
            for target_y in (logical_top, logical_bottom):
                has_edge = any(is_same(line["top"], target_y, 1.5) for line in x_lines)
                if not has_edge:
                    found = False
                    # Try to adopt a loose horizontal line sitting at the edge.
                    for i, loose in enumerate(loose_lines):
                        if loose["orientation"] == "x" and is_same(
                            loose["top"], target_y, self.tolerance
                        ):
                            overlap = min(loose["x1"], logical_right) - max(
                                loose["x0"], logical_left
                            )
                            if overlap > 0:
                                group.append(loose)
                                x_lines.append(loose)
                                loose_lines.pop(i)
                                found = True
                                break
                    if not found:
                        # No candidate: synthesize the full-width edge.
                        interpolated = {
                            "orientation": "x",
                            "x0": logical_left,
                            "x1": logical_right,
                            "top": target_y,
                            "bottom": target_y,
                        }
                        group.append(interpolated)
                        x_lines.append(interpolated)

        self.groups = core_groups
        if loose_lines:
            self.groups.extend([[line] for line in loose_lines])

    def _is_crossing_lines(self, id1: int, id2: int, tolerance: float = 1.5) -> bool:
        """True when the two segments are perpendicular and intersect."""
        # Parallel segments never cross.
        if self.lines[id1]["orientation"] == self.lines[id2]["orientation"]:
            return False

        # Normalize so id1 is the horizontal ("x") segment.
        if self.lines[id1]["orientation"] == "y":
            id1, id2 = id2, id1

        x_line = self.lines[id1]
        y_line = self.lines[id2]

        # The vertical line's x must fall inside the horizontal span, and
        # the horizontal line's y inside the vertical span (with tolerance).
        return (
            x_line["x0"] - tolerance <= y_line["x0"] <= x_line["x1"] + tolerance
            and y_line["top"] - tolerance
            <= x_line["top"]
            <= y_line["bottom"] + tolerance
        )

    def extract_independent_horizontal_lines(self) -> list[dict]:
        """Return horizontal lines that crossed nothing (singleton groups)."""
        independent_horizontal_lines = [
            self.lines[group[0]]
            for group in self.group_ids
            if len(group) == 1 and self.lines[group[0]]["orientation"] == "x"
        ]

        return independent_horizontal_lines
@@ -0,0 +1,104 @@
1
+ from pdfplumber.page import Page
2
+
3
+ from pdf_kintsugi.line.extractor import LineExtractor
4
+ from pdf_kintsugi.line.merger import LineMerger
5
+ from pdf_kintsugi.table.detector import TableDetector
6
+ from pdf_kintsugi.table.merger import TableMerger
7
+ from pdf_kintsugi.table.table import Table
8
+ from pdf_kintsugi.text.char_extractor import CharExtractor
9
+
10
+
11
class PageParser:
    """Parse one pdfplumber page and reconcile detected tables with Docling's.

    On construction, characters and ruling lines are extracted from the
    page and ``build(counter=0)`` runs a first pass that merges fragmented
    tables in place.  A subsequent ``build(counter=1)`` call (presumably
    made by the top-level parser — confirm against caller) re-extracts the
    merged lines and returns tables grouped per Docling table.
    """

    def __init__(self, page: Page, docling_tables: list[dict], tolerance) -> None:
        self.page: Page = page
        self.page_number: int = page.page_number
        # Docling's table records for this page; each has a "bbox" entry.
        self.doclings: list[dict] = docling_tables
        # Line-merging tolerance forwarded to the detection pipeline.
        self.tol: float = tolerance

        char_extractor = CharExtractor(page)
        self.chars: list[dict] = char_extractor.extract()

        line_extractor = LineExtractor(self.page)
        self.lines: list[dict] = line_extractor.extract()

        # First pass: merge fragmented tables (result discarded).
        self.build(counter=0)

    def build(self, counter: int = 1) -> list[list[Table]]:
        """Detect tables and link them to Docling's tables.

        counter == 0: first pass, merges fragmented tables in place and
        returns an empty list.  counter == 1: second pass, re-merges the
        (now mutated) lines and returns, for each Docling table, the list
        of successfully built detected tables.
        """
        if counter == 1:
            # Re-run merging over lines mutated by the first pass.
            line_extractor = LineExtractor(None, self.lines)
            self.lines = line_extractor.extract()

        my_tables: list[Table] = self._detect_tables()
        links: list[list[int]] = self._link_docling(my_tables)

        # Keep only detected tables that overlap exactly one Docling table;
        # all other detected tables are discarded.
        valid_indices = [i for i in range(len(my_tables)) if len(links[i]) == 1]
        my_tables = [my_tables[i] for i in valid_indices]
        links = [links[i] for i in valid_indices]
        build_success_tables: list[Table] = []

        # build
        for i, table in enumerate(my_tables):
            docling = self.doclings[links[i][0]]
            success = table.build(self.chars, docling)

            if success:
                build_success_tables.append(table)

        # Re-link: building may have changed the tables' geometry.
        my_tables = build_success_tables
        links = self._link_docling(my_tables)

        if counter == 0:
            self._merge_tables(my_tables, links)
            return []

        # Invert the mapping: Docling table index -> detected tables.
        rev_links: list[list[Table]] = [[] for _ in range(len(self.doclings))]
        for my_idx, doc_idx in enumerate(links):
            rev_links[doc_idx[0]].append(my_tables[my_idx])

        return rev_links

    def _merge_tables(self, my_tables: list[Table], links: list[list[int]]) -> None:
        """Pairwise-merge detected tables linked to the same Docling table."""
        # Bucket detected tables by the Docling table they belong to.
        same_cluster: dict[int, list[Table]] = {}
        for i, table in enumerate(my_tables):
            idx = links[i][0]
            if idx not in same_cluster:
                same_cluster[idx] = []
            same_cluster[idx].append(table)

        table_merger = TableMerger()
        for tables in same_cluster.values():
            sz = len(tables)
            for i in range(sz):
                for j in range(i + 1, sz):
                    table_merger.merge(tables[i], tables[j])

    def _link_docling(self, tables: list[Table]) -> list[list[int]]:
        """
        Link each detected table to the Docling tables whose bbox overlaps
        it; the result is indexed per detected table, each entry holding
        the indices of the overlapping Docling tables.
        """
        # Overlap test: intervals intersect on both the x and y axes
        # (bbox layout assumed to be (x0, y0, x1, y1)).
        links: list[list[int]] = [
            [
                i
                for i, doc_table in enumerate(self.doclings)
                if max(doc_table["bbox"][0], table.bbox[0])
                < min(doc_table["bbox"][2], table.bbox[2])
                and max(doc_table["bbox"][1], table.bbox[1])
                < min(doc_table["bbox"][3], table.bbox[3])
            ]
            for table in tables
        ]

        return links

    def _detect_tables(self) -> list[Table]:
        """Group the page's lines and detect table candidates from them."""
        line_groups = self._merge_lines()

        table_detector = TableDetector(line_groups, self.page_number, self.tol)
        tables: list[Table] = table_detector.detect()
        return tables

    def _merge_lines(self) -> list[list[dict]]:
        """Cluster the page's lines into table-candidate groups."""
        line_merger = LineMerger(self.lines, self.tol)
        line_groups: list[list[dict]] = line_merger.merge()
        return line_groups