pdf-kintsugi 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf_kintsugi-0.1.0/PKG-INFO +88 -0
- pdf_kintsugi-0.1.0/README.md +73 -0
- pdf_kintsugi-0.1.0/pyproject.toml +43 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/__init__.py +6 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/line/__init__.py +0 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/line/extractor.py +203 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/line/merger.py +136 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/page_parser.py +104 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/parser.py +170 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/py.typed +0 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/__init__.py +0 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/cell_unmerger.py +144 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/char_assigner.py +40 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/detector.py +61 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/docling_integrator.py +101 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/merger.py +120 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/omitted_line_detector.py +117 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/table/table.py +280 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/text/__init__.py +0 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/text/char_extractor.py +49 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/text/corrector.py +198 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/text/extract_target_chars.py +8 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/text/row_builder.py +91 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/utils/__init__.py +0 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/utils/coord.py +2 -0
- pdf_kintsugi-0.1.0/src/pdf_kintsugi/utils/union_find.py +30 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: pdf-kintsugi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PDF table extraction with Docling and pdfplumber
|
|
5
|
+
Author: moss (moss-tms)
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Requires-Dist: docling>=2.73.0
|
|
12
|
+
Requires-Dist: pdfplumber>=0.11.9
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# pdf-kintsugi
|
|
17
|
+
|
|
18
|
+
**pdf-kintsugi** is a Python library that improves PDF table extraction and text recovery by combining the high-level document understanding of **Docling** with the precise, character-level bounding box analysis of **pdfplumber**.
|
|
19
|
+
|
|
20
|
+
Like the Japanese art of *kintsugi* (repairing broken pottery with gold), this library takes the initial parsing results from Docling and "repairs" complex table structures and text with garbled characters using pdfplumber's precise geometric layout analysis.
|
|
21
|
+
|
|
22
|
+
## Features
|
|
23
|
+
|
|
24
|
+
- **Table Parsing:** Improves Docling's table extraction by using pdfplumber to detect omitted geometric lines and infer table edges.
|
|
25
|
+
- **Text Correction:** Optionally corrects garbled text using fine-grained character bounding boxes.
|
|
26
|
+
- **Seamless Integration with Docling:** Acts as a post-processor for Docling. It takes a `ConversionResult` from Docling, enhances the tables and text in place, and returns the updated document model.
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
The project requires **Python 3.10+**.
|
|
31
|
+
|
|
32
|
+
You can install it using `pip` or your favorite package manager:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install pdf-kintsugi
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
`pdf-kintsugi` works alongside `docling`. Here is a basic example of how to use `PDFKintsugi`:
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from docling.document_converter import DocumentConverter
|
|
44
|
+
from pdf_kintsugi import PDFKintsugi
|
|
45
|
+
|
|
46
|
+
pdf_path = "path/to/your/document.pdf"
|
|
47
|
+
|
|
48
|
+
# 1. Parse the document using Docling first
|
|
49
|
+
converter = DocumentConverter()
|
|
50
|
+
docling_result = converter.convert(pdf_path)
|
|
51
|
+
|
|
52
|
+
# 2. Initialize PDFKintsugi with the source PDF and the Docling result
|
|
53
|
+
kintsugi_parser = PDFKintsugi(
|
|
54
|
+
source=pdf_path,
|
|
55
|
+
docling_result=docling_result,
|
|
56
|
+
tolerance=3.0, # Adjust merging tolerance for table lines
|
|
57
|
+
replace_text=False, # Set to True to enable text/character correction
|
|
58
|
+
replace_table=True # Set to True to enable table structure repair
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
# 3. Get the parsing result
|
|
62
|
+
kintsugi_result = kintsugi_parser.parse()
|
|
63
|
+
|
|
64
|
+
# Now you can use kintsugi_result just like a regular Docling document
|
|
65
|
+
print(kintsugi_result.document.export_to_markdown())
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Configuration
|
|
69
|
+
|
|
70
|
+
The `PDFKintsugi` class accepts several parameters to tune the extraction:
|
|
71
|
+
|
|
72
|
+
- `source` (`str`): The file path to the source PDF document.
|
|
73
|
+
- `docling_result` (`ConversionResult`): The parsed document object returned by Docling.
|
|
74
|
+
- `tolerance` (`float`, default `3.0`): The line-merging tolerance. A smaller value (e.g., `1.5`) helps prevent adjacent tables from being incorrectly merged, while a larger value (e.g., `5.0`) can help stitch together fragmented tables.
|
|
75
|
+
- `replace_text` (`bool`, default `False`): If `True`, utilizes `pdfplumber` to extract characters and correct garbled text.
|
|
76
|
+
- `replace_table` (`bool`, default `True`): If `True`, rebuilds and overrides the Docling table representations using geometric line intersections and edge inference.
|
|
77
|
+
|
|
78
|
+
## Contributing
|
|
79
|
+
|
|
80
|
+
Contributions are welcome! This project uses `uv` for dependency management.
|
|
81
|
+
|
|
82
|
+
1. Clone the repository.
|
|
83
|
+
2. Setup the environment: `uv sync`
|
|
84
|
+
3. Run tests before submitting a Pull Request.
|
|
85
|
+
|
|
86
|
+
## License
|
|
87
|
+
|
|
88
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# pdf-kintsugi
|
|
2
|
+
|
|
3
|
+
**pdf-kintsugi** is a Python library that improves PDF table extraction and text recovery by combining the high-level document understanding of **Docling** with the precise, character-level bounding box analysis of **pdfplumber**.
|
|
4
|
+
|
|
5
|
+
Like the Japanese art of *kintsugi* (repairing broken pottery with gold), this library takes the initial parsing results from Docling and "repairs" complex table structures and text with garbled characters using pdfplumber's precise geometric layout analysis.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Table Parsing:** Improves Docling's table extraction by using pdfplumber to detect omitted geometric lines and infer table edges.
|
|
10
|
+
- **Text Correction:** Optionally corrects garbled text using fine-grained character bounding boxes.
|
|
11
|
+
- **Seamless Integration with Docling:** Acts as a post-processor for Docling. It takes a `ConversionResult` from Docling, enhances the tables and text in place, and returns the updated document model.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
The project requires **Python 3.10+**.
|
|
16
|
+
|
|
17
|
+
You can install it using `pip` or your favorite package manager:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install pdf-kintsugi
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
`pdf-kintsugi` works alongside `docling`. Here is a basic example of how to use `PDFKintsugi`:
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from docling.document_converter import DocumentConverter
|
|
29
|
+
from pdf_kintsugi import PDFKintsugi
|
|
30
|
+
|
|
31
|
+
pdf_path = "path/to/your/document.pdf"
|
|
32
|
+
|
|
33
|
+
# 1. Parse the document using Docling first
|
|
34
|
+
converter = DocumentConverter()
|
|
35
|
+
docling_result = converter.convert(pdf_path)
|
|
36
|
+
|
|
37
|
+
# 2. Initialize PDFKintsugi with the source PDF and the Docling result
|
|
38
|
+
kintsugi_parser = PDFKintsugi(
|
|
39
|
+
source=pdf_path,
|
|
40
|
+
docling_result=docling_result,
|
|
41
|
+
tolerance=3.0, # Adjust merging tolerance for table lines
|
|
42
|
+
replace_text=False, # Set to True to enable text/character correction
|
|
43
|
+
replace_table=True # Set to True to enable table structure repair
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# 3. Get the parsing result
|
|
47
|
+
kintsugi_result = kintsugi_parser.parse()
|
|
48
|
+
|
|
49
|
+
# Now you can use kintsugi_result just like a regular Docling document
|
|
50
|
+
print(kintsugi_result.document.export_to_markdown())
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Configuration
|
|
54
|
+
|
|
55
|
+
The `PDFKintsugi` class accepts several parameters to tune the extraction:
|
|
56
|
+
|
|
57
|
+
- `source` (`str`): The file path to the source PDF document.
|
|
58
|
+
- `docling_result` (`ConversionResult`): The parsed document object returned by Docling.
|
|
59
|
+
- `tolerance` (`float`, default `3.0`): The line-merging tolerance. A smaller value (e.g., `1.5`) helps prevent adjacent tables from being incorrectly merged, while a larger value (e.g., `5.0`) can help stitch together fragmented tables.
|
|
60
|
+
- `replace_text` (`bool`, default `False`): If `True`, utilizes `pdfplumber` to extract characters and correct garbled text.
|
|
61
|
+
- `replace_table` (`bool`, default `True`): If `True`, rebuilds and overrides the Docling table representations using geometric line intersections and edge inference.
|
|
62
|
+
|
|
63
|
+
## Contributing
|
|
64
|
+
|
|
65
|
+
Contributions are welcome! This project uses `uv` for dependency management.
|
|
66
|
+
|
|
67
|
+
1. Clone the repository.
|
|
68
|
+
2. Setup the environment: `uv sync`
|
|
69
|
+
3. Run tests before submitting a Pull Request.
|
|
70
|
+
|
|
71
|
+
## License
|
|
72
|
+
|
|
73
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pdf-kintsugi"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "PDF table extraction with Docling and pdfplumber"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "moss (moss-tms)" },
|
|
9
|
+
]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 3 - Alpha",
|
|
12
|
+
"Intended Audience :: Developers",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
]
|
|
16
|
+
requires-python = ">=3.10"
|
|
17
|
+
dependencies = [
|
|
18
|
+
"docling>=2.73.0",
|
|
19
|
+
"pdfplumber>=0.11.9",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["uv_build>=0.9.9,<0.10.0"]
|
|
24
|
+
build-backend = "uv_build"
|
|
25
|
+
|
|
26
|
+
[tool.ruff]
|
|
27
|
+
line-length = 88
|
|
28
|
+
target-version = "py310"
|
|
29
|
+
|
|
30
|
+
[tool.ruff.lint]
|
|
31
|
+
select = [
|
|
32
|
+
"E", "F",
|
|
33
|
+
"I",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[tool.ruff.format]
|
|
37
|
+
quote-style = "double"
|
|
38
|
+
indent-style = "space"
|
|
39
|
+
|
|
40
|
+
[dependency-groups]
|
|
41
|
+
dev = [
|
|
42
|
+
"ruff>=0.15.8",
|
|
43
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
from operator import itemgetter
|
|
2
|
+
|
|
3
|
+
from pdfplumber.page import Page
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class LineExtractor:
    """Extract and normalize ruling-line segments from a pdfplumber page.

    Segments are plain dicts with keys ``x0``, ``x1``, ``top``, ``bottom`` and
    ``orientation`` ("x" = horizontal, "y" = vertical). Thin elongated shapes
    are collapsed onto their centerline; larger stroked rectangles are split
    into their four edges. ``extract()`` merges collinear fragments and drops
    very short segments.
    """

    def __init__(self, page: Page | None, lines: list[dict] | None = None) -> None:
        """Seed the extractor from a page, a pre-extracted line list, or both.

        Args:
            page: pdfplumber page to scan; pass ``None`` to skip page scanning.
            lines: already-extracted segment dicts to re-process (used for a
                second merge pass).
        """
        self.lines: list[dict] = []  # final merged output of extract()
        self.horizontal_lines: list[dict] = []
        self.vertical_lines: list[dict] = []

        if page:
            self.init_from_page(page)
        if lines:
            self.init_from_lines(lines)

    def init_from_page(self, page: Page) -> None:
        """Collect candidate segments from every drawable object on the page."""
        # Thin rects/curves/images can themselves be rendered rules, so all
        # object kinds are classified in one bulk pass; rects additionally get
        # decomposed into edges below.
        self.extract_lines(page.lines + page.rects + page.curves + page.images)
        self.extract_rects(page.rects)

    def init_from_lines(self, lines: list[dict]) -> None:
        """Collect candidate segments from an already-extracted line list."""
        self.extract_lines(lines)

    def get_orientation(self, line: dict) -> str:
        """Classify a bbox-like dict as "x" (horizontal), "y" (vertical) or "none".

        A shape counts as a line only when it is sufficiently elongated
        (aspect ratio above ``ratio``) and thin (shorter side below
        ``max_thickness`` points).
        """
        width = abs(line["x1"] - line["x0"])
        height = abs(line["bottom"] - line["top"])

        ratio = 1.5
        max_thickness = 5.0

        if width > height * ratio and height < max_thickness:
            return "x"
        if height > width * ratio and width < max_thickness:
            return "y"

        return "none"

    def extract_lines(self, lines: list[dict]) -> None:
        """Bucket page.lines / page.rects / page.curves / page.images in bulk.

        Each thin shape is collapsed to zero thickness: horizontals onto the
        vertical midpoint of their bbox, verticals onto the horizontal one.
        """
        for line in lines:
            orientation = self.get_orientation(line)
            if orientation == "x":
                # Representative y: collapse the (thin) bbox onto its midline.
                rep_y = (line["top"] + line["bottom"]) / 2
                self.horizontal_lines.append(
                    {
                        "x0": line["x0"],
                        "x1": line["x1"],
                        "top": rep_y,
                        "bottom": rep_y,
                        "orientation": orientation,
                    }
                )
            if orientation == "y":
                rep_x = (line["x0"] + line["x1"]) / 2
                self.vertical_lines.append(
                    {
                        "x0": rep_x,
                        "x1": rep_x,
                        "top": line["top"],
                        "bottom": line["bottom"],
                        "orientation": orientation,
                    }
                )

    def extract_rects(self, rects: list[dict]) -> None:
        """Disassemble genuine rectangles (e.g. cell frames) into four edges."""
        for rect in rects:
            # Skip rects already classified as single lines, and tiny rects.
            if self.get_orientation(rect) != "none" or (
                rect["width"] < 5.0 and rect["height"] < 5.0
            ):
                continue

            # Skip borderless rects and filled rects used purely for
            # background colouring — they carry no table structure.
            if rect["stroking_color"] is None or rect["fill"] is True:
                continue

            for y_key in ["top", "bottom"]:
                self.horizontal_lines.append(
                    {
                        "x0": rect["x0"],
                        "x1": rect["x1"],
                        "top": rect[y_key],
                        "bottom": rect[y_key],
                        "orientation": "x",
                    }
                )

            for x_key in ["x0", "x1"]:
                self.vertical_lines.append(
                    {
                        "x0": rect[x_key],
                        "x1": rect[x_key],
                        "top": rect["top"],
                        "bottom": rect["bottom"],
                        "orientation": "y",
                    }
                )

    def can_merge(
        self, line1_: dict, line2_: dict, main_coord, sub_start, sub_end, tolerance=1.5
    ) -> bool:
        """Return True when two segments are collinear and touching/overlapping.

        ``main_coord`` is the coordinate shared by collinear segments;
        ``sub_start``/``sub_end`` delimit each segment along its axis.
        """
        # Order the pair so line1 starts first along the axis.
        if line1_[sub_start] < line2_[sub_start]:
            line1, line2 = line1_, line2_
        else:
            line1, line2 = line2_, line1_

        is_collinear = abs(line1[main_coord] - line2[main_coord]) <= tolerance
        is_connected = line2[sub_start] <= line1[sub_end] + tolerance

        return is_collinear and is_connected

    def merge_lines(
        self, lines: list[dict], orientation: str, tolerance=1.5
    ) -> list[dict]:
        """Merge collinear segment fragments into single longer segments.

        Sorts ``lines`` in place, then sweeps once, extending the current
        segment while the next fragment is mergeable.
        """
        if not lines:
            return []

        # 1. Sort so collinear segments are adjacent and ordered along their axis.
        if orientation == "y":
            # Vertical merge: group by equal x, then sort by y.
            lines.sort(key=itemgetter("x0", "top"))
            main_coord = "x0"
            sub_start = "top"
            sub_end = "bottom"
        else:
            # Horizontal merge: group by equal y, then sort by x.
            lines.sort(key=itemgetter("top", "x0"))
            main_coord = "top"
            sub_start = "x0"
            sub_end = "x1"

        merged = []
        current = lines[0]

        for next_line in lines[1:]:
            if self.can_merge(
                current, next_line, main_coord, sub_start, sub_end, tolerance
            ):
                current[sub_start] = min(current[sub_start], next_line[sub_start])
                current[sub_end] = max(current[sub_end], next_line[sub_end])
            else:
                merged.append(current)
                current = next_line

        merged.append(current)
        return merged

    def remove_included_lines(
        self, lines: list[dict], orientation: str, tolerance=1.5
    ) -> list[dict]:
        """Drop segments fully contained (within tolerance) in a longer collinear one."""
        if orientation == "y":
            # Vertical: collinearity is shared x.
            main_coord = "x0"
            sub_start = "top"
            sub_end = "bottom"
        else:
            # Horizontal: collinearity is shared y.
            main_coord = "top"
            sub_start = "x0"
            sub_end = "x1"

        skip = set()
        for i, long in enumerate(lines):
            for j, short in enumerate(lines):
                if i in skip or j in skip or i == j:
                    continue

                is_collinear = abs(long[main_coord] - short[main_coord]) <= tolerance
                include_start = long[sub_start] - tolerance <= short[sub_start]
                include_end = short[sub_end] <= long[sub_end] + tolerance

                if is_collinear and include_start and include_end:
                    skip.add(j)

        extracted = [line for i, line in enumerate(lines) if i not in skip]
        return extracted

    def extract_by_length(self, min_line_length=3.0) -> list[dict]:
        """Exclude the lines whose length is shorter than ``min_line_length``."""
        extract_lines: list[dict] = []
        for line in self.lines:
            orientation = line["orientation"]
            length = (
                line["x1"] - line["x0"]
                if orientation == "x"
                else line["bottom"] - line["top"]
            )
            if length >= min_line_length:
                extract_lines.append(line)

        return extract_lines

    def extract(self) -> list[dict]:
        """Extract the merged, length-filtered lines contained in the page."""
        merged_horizontal = self.merge_lines(self.horizontal_lines, "x")
        merged_vertical = self.merge_lines(self.vertical_lines, "y")

        self.lines = merged_horizontal + merged_vertical
        self.lines = self.extract_by_length()

        return self.lines
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from pdf_kintsugi.utils.coord import is_same
|
|
2
|
+
from pdf_kintsugi.utils.union_find import UnionFind
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class LineMerger:
    """Group crossing line segments into table candidates and infer missing edges.

    Lines are the dicts produced by LineExtractor ("x0", "x1", "top",
    "bottom", "orientation"). Segments that intersect are unioned into
    groups; groups with both orientations and enough members are treated as
    table cores and get any missing outer border lines attached or
    interpolated.
    """

    def __init__(self, lines: list[dict], tolerance: float) -> None:
        # Note: self.lines is kept by reference; merge() indexes into it.
        self.lines: list[dict] = lines
        self.tolerance: float = tolerance

    def merge(self) -> list[list[dict]]:
        """Union-find all pairwise-crossing lines, then infer table edges.

        Returns the final list of groups (after edge inference). Also sets
        ``self.group_ids`` (pre-inference index groups, used later by
        extract_independent_horizontal_lines) and ``self.groups``.
        """
        line_num = len(self.lines)
        uf = UnionFind(line_num)

        # O(n^2) pairwise crossing test; n is the per-page line count.
        for i in range(line_num):
            for j in range(i + 1, line_num):
                if self._is_crossing_lines(i, j):
                    uf.merge(i, j)

        self.group_ids: list[list[int]] = uf.groups()
        self.groups: list[list[dict]] = [
            [self.lines[id] for id in group] for group in self.group_ids
        ]

        # Mutates self.groups: keeps core groups + leftover singletons.
        self._infer_table_edges()

        return self.groups

    def _infer_table_edges(self) -> None:
        """Attach or interpolate missing outer borders for core table groups.

        A "core" group has both orientations and more than 4 lines. For each
        missing border, first try to adopt a nearby loose (singleton) line;
        otherwise synthesize an interpolated border line.

        NOTE(review): groups that are neither core nor singletons (e.g. 2-4
        crossing lines) are dropped from self.groups here — presumably
        deliberate noise filtering; confirm against TableDetector's needs.
        """
        core_groups = []
        loose_lines = []

        for group in self.groups:
            has_x = any(line["orientation"] == "x" for line in group)
            has_y = any(line["orientation"] == "y" for line in group)
            if has_x and has_y and len(group) > 4:
                core_groups.append(group)
            elif len(group) == 1:
                loose_lines.extend(group)

        for group in core_groups:
            x_lines = [line for line in group if line["orientation"] == "x"]
            y_lines = [line for line in group if line["orientation"] == "y"]

            # Logical bounding box of the table, from the extents of its lines.
            logical_left = min(line["x0"] for line in x_lines)
            logical_right = max(line["x1"] for line in x_lines)
            logical_top = min(line["top"] for line in y_lines)
            logical_bottom = max(line["bottom"] for line in y_lines)

            # Check y-edges (left and right)
            for target_x in (logical_left, logical_right):
                # A fixed 1.5pt tolerance decides "edge already present";
                # self.tolerance (user-tunable) decides loose-line adoption.
                has_edge = any(is_same(line["x0"], target_x, 1.5) for line in y_lines)

                if not has_edge:
                    found = False
                    for i, loose in enumerate(loose_lines):
                        if loose["orientation"] == "y" and is_same(
                            loose["x0"], target_x, self.tolerance
                        ):
                            # Adopt only if it vertically overlaps the table.
                            overlap = min(loose["bottom"], logical_bottom) - max(
                                loose["top"], logical_top
                            )
                            if overlap > 0:
                                group.append(loose)
                                y_lines.append(loose)
                                # pop(i) is safe: we break right after.
                                loose_lines.pop(i)
                                found = True
                                break
                    if not found:
                        # No candidate: synthesize a full-height border line.
                        interpolated = {
                            "orientation": "y",
                            "x0": target_x,
                            "x1": target_x,
                            "top": logical_top,
                            "bottom": logical_bottom,
                        }
                        group.append(interpolated)
                        y_lines.append(interpolated)

            # Check x-edges (top and bottom)
            for target_y in (logical_top, logical_bottom):
                has_edge = any(is_same(line["top"], target_y, 1.5) for line in x_lines)
                if not has_edge:
                    found = False
                    for i, loose in enumerate(loose_lines):
                        if loose["orientation"] == "x" and is_same(
                            loose["top"], target_y, self.tolerance
                        ):
                            overlap = min(loose["x1"], logical_right) - max(
                                loose["x0"], logical_left
                            )
                            if overlap > 0:
                                group.append(loose)
                                x_lines.append(loose)
                                loose_lines.pop(i)
                                found = True
                                break
                    if not found:
                        interpolated = {
                            "orientation": "x",
                            "x0": logical_left,
                            "x1": logical_right,
                            "top": target_y,
                            "bottom": target_y,
                        }
                        group.append(interpolated)
                        x_lines.append(interpolated)

        # Unadopted loose lines survive as their own singleton groups.
        self.groups = core_groups
        if loose_lines:
            self.groups.extend([[line] for line in loose_lines])

    def _is_crossing_lines(self, id1: int, id2: int, tolerance: float = 1.5) -> bool:
        """Return True when lines id1/id2 are perpendicular and intersect.

        Intersection is tested with a tolerance so lines that nearly touch
        (e.g. a T-junction with a small gap) still count as crossing.
        """
        if self.lines[id1]["orientation"] == self.lines[id2]["orientation"]:
            return False

        # Normalize so id1 is the horizontal ("x") line.
        if self.lines[id1]["orientation"] == "y":
            id1, id2 = id2, id1

        x_line = self.lines[id1]
        y_line = self.lines[id2]

        return (
            x_line["x0"] - tolerance <= y_line["x0"] <= x_line["x1"] + tolerance
            and y_line["top"] - tolerance
            <= x_line["top"]
            <= y_line["bottom"] + tolerance
        )

    def extract_independent_horizontal_lines(self) -> list[dict]:
        """Return horizontal lines that crossed nothing (pre-edge-inference).

        Uses ``self.group_ids`` from merge(), so merge() must run first.
        """
        independent_horizontal_lines = [
            self.lines[group[0]]
            for group in self.group_ids
            if len(group) == 1 and self.lines[group[0]]["orientation"] == "x"
        ]

        return independent_horizontal_lines
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from pdfplumber.page import Page
|
|
2
|
+
|
|
3
|
+
from pdf_kintsugi.line.extractor import LineExtractor
|
|
4
|
+
from pdf_kintsugi.line.merger import LineMerger
|
|
5
|
+
from pdf_kintsugi.table.detector import TableDetector
|
|
6
|
+
from pdf_kintsugi.table.merger import TableMerger
|
|
7
|
+
from pdf_kintsugi.table.table import Table
|
|
8
|
+
from pdf_kintsugi.text.char_extractor import CharExtractor
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PageParser:
    """Parse one pdfplumber page: detect tables from ruling lines and link
    them to Docling's tables for the same page.

    Construction runs a first build pass (counter=0) that only merges
    fragmented table candidates belonging to the same Docling table; calling
    build() again (counter=1, the default) re-extracts the merged lines and
    returns the detected tables grouped per Docling table.
    """

    def __init__(self, page: Page, docling_tables: list[dict], tolerance: float) -> None:
        self.page: Page = page
        self.page_number: int = page.page_number
        # Docling table dicts; each is assumed to carry a "bbox" 4-tuple
        # (x0, y0, x1, y1) — see _link_docling.
        self.doclings: list[dict] = docling_tables
        self.tol: float = tolerance

        char_extractor = CharExtractor(page)
        self.chars: list[dict] = char_extractor.extract()

        line_extractor = LineExtractor(self.page)
        self.lines: list[dict] = line_extractor.extract()

        # First pass: merge fragmented tables in place (returns []).
        self.build(counter=0)

    def build(self, counter: int = 1) -> list[list[Table]]:
        """Detect tables and map them to Docling tables.

        Args:
            counter: 0 on the constructor's pre-pass (merge fragments, return
                []); 1 on the real pass (re-merge lines first, return one
                list of Tables per Docling table).
        """
        if counter == 1:
            # Second pass: the pre-pass merged fragments, so re-run the
            # line extractor over the (possibly updated) line set.
            line_extractor = LineExtractor(None, self.lines)
            self.lines = line_extractor.extract()

        my_tables: list[Table] = self._detect_tables()
        links: list[list[int]] = self._link_docling(my_tables)

        # Keep only detected tables that overlap exactly one Docling table;
        # tables with zero or multiple matches are discarded.
        valid_indices = [i for i in range(len(my_tables)) if len(links[i]) == 1]
        my_tables = [my_tables[i] for i in valid_indices]
        links = [links[i] for i in valid_indices]
        build_success_tables: list[Table] = []

        # build
        for i, table in enumerate(my_tables):
            docling = self.doclings[links[i][0]]
            success = table.build(self.chars, docling)

            if success:
                build_success_tables.append(table)

        my_tables = build_success_tables
        links = self._link_docling(my_tables)

        if counter == 0:
            # Pre-pass: merge same-cluster fragments; result is ignored.
            self._merge_tables(my_tables, links)
            return []

        # Invert the mapping: Docling index -> list of detected Tables.
        # NOTE(review): doc_idx[0] assumes every surviving table still links
        # to at least one Docling table after build(); an empty link list
        # here would raise IndexError — confirm table.build() cannot shrink
        # the overlap below the bbox test.
        rev_links: list[list[Table]] = [[] for _ in range(len(self.doclings))]
        for my_idx, doc_idx in enumerate(links):
            rev_links[doc_idx[0]].append(my_tables[my_idx])

        return rev_links

    def _merge_tables(self, my_tables: list[Table], links: list[list[int]]) -> None:
        """Pairwise-merge detected tables that map to the same Docling table."""
        # Bucket tables by the Docling index they link to.
        same_cluster: dict[int, list[Table]] = {}
        for i, table in enumerate(my_tables):
            idx = links[i][0]
            if idx not in same_cluster:
                same_cluster[idx] = []
            same_cluster[idx].append(table)

        table_merger = TableMerger()
        for tables in same_cluster.values():
            sz = len(tables)
            for i in range(sz):
                for j in range(i + 1, sz):
                    # Merge is presumably in-place/side-effecting on the
                    # Table objects; the return value is not used.
                    table_merger.merge(tables[i], tables[j])

    def _link_docling(self, tables: list[Table]) -> list[list[int]]:
        """
        Link each detected table to the Docling tables whose bbox overlaps it.

        Returns one list per detected table, containing the indices of all
        Docling tables with a strictly positive bbox intersection.
        """
        links: list[list[int]] = [
            [
                i
                for i, doc_table in enumerate(self.doclings)
                # Strict overlap test on both axes (shared edges don't count).
                if max(doc_table["bbox"][0], table.bbox[0])
                < min(doc_table["bbox"][2], table.bbox[2])
                and max(doc_table["bbox"][1], table.bbox[1])
                < min(doc_table["bbox"][3], table.bbox[3])
            ]
            for table in tables
        ]

        return links

    def _detect_tables(self) -> list[Table]:
        """Group this page's lines and detect Table candidates from the groups."""
        line_groups = self._merge_lines()

        table_detector = TableDetector(line_groups, self.page_number, self.tol)
        tables: list[Table] = table_detector.detect()
        return tables

    def _merge_lines(self) -> list[list[dict]]:
        """Cluster this page's lines into crossing groups via LineMerger."""
        line_merger = LineMerger(self.lines, self.tol)
        line_groups: list[list[dict]] = line_merger.merge()
        return line_groups
|