doc-page-extractor 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- doc_page_extractor/extractor.py +7 -6
- doc_page_extractor/ocr.py +1 -1
- doc_page_extractor/rectangle.py +6 -0
- {doc_page_extractor-0.1.0.dist-info → doc_page_extractor-0.1.1.dist-info}/METADATA +1 -1
- {doc_page_extractor-0.1.0.dist-info → doc_page_extractor-0.1.1.dist-info}/RECORD +8 -8
- {doc_page_extractor-0.1.0.dist-info → doc_page_extractor-0.1.1.dist-info}/WHEEL +0 -0
- {doc_page_extractor-0.1.0.dist-info → doc_page_extractor-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {doc_page_extractor-0.1.0.dist-info → doc_page_extractor-0.1.1.dist-info}/top_level.txt +0 -0
doc_page_extractor/extractor.py
CHANGED
|
@@ -116,12 +116,13 @@ class DocExtractor:
|
|
|
116
116
|
lb=(x1, y2),
|
|
117
117
|
rb=(x2, y2),
|
|
118
118
|
)
|
|
119
|
-
if
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
119
|
+
if rect.is_valid:
|
|
120
|
+
if cls == LayoutClass.TABLE:
|
|
121
|
+
yield TableLayout(cls=cls, rect=rect, fragments=[], parsed=None)
|
|
122
|
+
elif cls == LayoutClass.ISOLATE_FORMULA:
|
|
123
|
+
yield FormulaLayout(cls=cls, rect=rect, fragments=[], latex=None)
|
|
124
|
+
else:
|
|
125
|
+
yield PlainLayout(cls=cls, rect=rect, fragments=[])
|
|
125
126
|
|
|
126
127
|
def _layouts_matched_by_fragments(self, fragments: list[OCRFragment], layouts: list[Layout]):
|
|
127
128
|
layouts_group = self._split_layouts_by_group(layouts)
|
doc_page_extractor/ocr.py
CHANGED
doc_page_extractor/rectangle.py
CHANGED
|
@@ -19,6 +19,10 @@ class Rectangle:
|
|
|
19
19
|
yield self.rb
|
|
20
20
|
yield self.rt
|
|
21
21
|
|
|
22
|
+
@property
|
|
23
|
+
def is_valid(self) -> bool:
|
|
24
|
+
return Polygon(self).is_valid
|
|
25
|
+
|
|
22
26
|
@property
|
|
23
27
|
def segments(self) -> Generator[tuple[Point, Point], None, None]:
|
|
24
28
|
yield (self.lt, self.lb)
|
|
@@ -60,6 +64,8 @@ class Rectangle:
|
|
|
60
64
|
def intersection_area(rect1: Rectangle, rect2: Rectangle) -> float:
|
|
61
65
|
poly1 = Polygon(rect1)
|
|
62
66
|
poly2 = Polygon(rect2)
|
|
67
|
+
if not poly1.is_valid or not poly2.is_valid:
|
|
68
|
+
return 0.0
|
|
63
69
|
intersection = poly1.intersection(poly2)
|
|
64
70
|
if intersection.is_empty:
|
|
65
71
|
return 0.0
|
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
doc_page_extractor/__init__.py,sha256=9rWKSMTgzP7Xv15zA4upsyPaR8S8JeNpMyhWElRCW0M,311
|
|
2
2
|
doc_page_extractor/clipper.py,sha256=5S1TI0aqMebwlPv_Ih4Nxpp6MchEjOih-CiZfMWUAhI,3201
|
|
3
3
|
doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
|
|
4
|
-
doc_page_extractor/extractor.py,sha256=
|
|
4
|
+
doc_page_extractor/extractor.py,sha256=V0S9Nn65lKpkz8DnTUMcsAzUJGwSQvupIBTiVqzLpJ8,7303
|
|
5
5
|
doc_page_extractor/latex.py,sha256=W_zAcksNRuru-WjCq4CSn07s_SWrDhikadJSy_Cg3Do,1954
|
|
6
6
|
doc_page_extractor/layout_order.py,sha256=NwMzTPr4xsriz4slCwqwhw2-vrMu-qfwtcFsDu8d1yM,7426
|
|
7
7
|
doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
|
|
8
|
-
doc_page_extractor/ocr.py,sha256=
|
|
8
|
+
doc_page_extractor/ocr.py,sha256=hQhT9bdsJmWESqt1FODCoE19wfOroM8uHZiFoZZrkQU,5182
|
|
9
9
|
doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
|
|
10
10
|
doc_page_extractor/overlap.py,sha256=z1DF4_2OPvauDHwmz1SC1WosULkE84HKaRfNEgexPzc,5337
|
|
11
11
|
doc_page_extractor/plot.py,sha256=4uibjS_x1SyEyjaJJd0YsBbzkgldDOCct4Ry2cOhdXU,2556
|
|
12
12
|
doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
|
|
13
|
-
doc_page_extractor/rectangle.py,sha256=
|
|
13
|
+
doc_page_extractor/rectangle.py,sha256=yeW6srdrsxaJg1eb3nn8oxtY0sfgeBk3hMiuJGaRXwY,1678
|
|
14
14
|
doc_page_extractor/rotation.py,sha256=QCZ-HqfDxIhnQw8KRHki2myj6-UusvNY7Mpjsu-wI-4,4334
|
|
15
15
|
doc_page_extractor/table.py,sha256=AWymTRbOet55uImW8QJqb90Qs_v2V2U1mZv0U6rSz3c,1891
|
|
16
16
|
doc_page_extractor/types.py,sha256=7blT8YNKrOsc4qQdAhM7J7MEQjFcBwE0QV8-lipZBeQ,1305
|
|
@@ -35,10 +35,10 @@ doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py,sha256=ACHxFntxS
|
|
|
35
35
|
doc_page_extractor/struct_eqtable/pix2s/__init__.py,sha256=cXRo4eg6u1-TXktZ8rQf0HIzLmmScIwYQhbxMKl-MyA,76
|
|
36
36
|
doc_page_extractor/struct_eqtable/pix2s/pix2s.py,sha256=fCNve8PNeJ3-AWJIhSeGtp-mYKoMXfW0CIpszkQnAaA,2535
|
|
37
37
|
doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py,sha256=zSGw45JhWdZ3iuJel5Chsy-NzsOHx9QyPQIUAzzCjFE,43880
|
|
38
|
-
doc_page_extractor-0.1.
|
|
38
|
+
doc_page_extractor-0.1.1.dist-info/licenses/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
|
|
39
39
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
40
|
tests/test_history_bus.py,sha256=WaCUW3U75SESMcLq0f5FKnpVUVRDvmfxLFE7Zo83e48,2517
|
|
41
|
-
doc_page_extractor-0.1.
|
|
42
|
-
doc_page_extractor-0.1.
|
|
43
|
-
doc_page_extractor-0.1.
|
|
44
|
-
doc_page_extractor-0.1.
|
|
41
|
+
doc_page_extractor-0.1.1.dist-info/METADATA,sha256=5bQtvYgjNghsYER1zmc19_6BH4JrgrLj9_KxUmnLnHc,2436
|
|
42
|
+
doc_page_extractor-0.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
|
43
|
+
doc_page_extractor-0.1.1.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
|
|
44
|
+
doc_page_extractor-0.1.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|