doc-page-extractor 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -116,12 +116,13 @@ class DocExtractor:
116
116
  lb=(x1, y2),
117
117
  rb=(x2, y2),
118
118
  )
119
- if cls == LayoutClass.TABLE:
120
- yield TableLayout(cls=cls, rect=rect, fragments=[], parsed=None)
121
- elif cls == LayoutClass.ISOLATE_FORMULA:
122
- yield FormulaLayout(cls=cls, rect=rect, fragments=[], latex=None)
123
- else:
124
- yield PlainLayout(cls=cls, rect=rect, fragments=[])
119
+ if rect.is_valid:
120
+ if cls == LayoutClass.TABLE:
121
+ yield TableLayout(cls=cls, rect=rect, fragments=[], parsed=None)
122
+ elif cls == LayoutClass.ISOLATE_FORMULA:
123
+ yield FormulaLayout(cls=cls, rect=rect, fragments=[], latex=None)
124
+ else:
125
+ yield PlainLayout(cls=cls, rect=rect, fragments=[])
125
126
 
126
127
  def _layouts_matched_by_fragments(self, fragments: list[OCRFragment], layouts: list[Layout]):
127
128
  layouts_group = self._split_layouts_by_group(layouts)
doc_page_extractor/ocr.py CHANGED
@@ -69,7 +69,7 @@ class OCR:
69
69
  rb=(box[2][0], box[2][1]),
70
70
  lb=(box[3][0], box[3][1]),
71
71
  )
72
- if rect.area == 0.0:
72
+ if not rect.is_valid or rect.area == 0.0:
73
73
  continue
74
74
 
75
75
  yield OCRFragment(
@@ -19,6 +19,10 @@ class Rectangle:
19
19
  yield self.rb
20
20
  yield self.rt
21
21
 
22
+ @property
23
+ def is_valid(self) -> bool:
24
+ return Polygon(self).is_valid
25
+
22
26
  @property
23
27
  def segments(self) -> Generator[tuple[Point, Point], None, None]:
24
28
  yield (self.lt, self.lb)
@@ -60,6 +64,8 @@ class Rectangle:
60
64
  def intersection_area(rect1: Rectangle, rect2: Rectangle) -> float:
61
65
  poly1 = Polygon(rect1)
62
66
  poly2 = Polygon(rect2)
67
+ if not poly1.is_valid or not poly2.is_valid:
68
+ return 0.0
63
69
  intersection = poly1.intersection(poly2)
64
70
  if intersection.is_empty:
65
71
  return 0.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doc-page-extractor
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -1,16 +1,16 @@
1
1
  doc_page_extractor/__init__.py,sha256=9rWKSMTgzP7Xv15zA4upsyPaR8S8JeNpMyhWElRCW0M,311
2
2
  doc_page_extractor/clipper.py,sha256=5S1TI0aqMebwlPv_Ih4Nxpp6MchEjOih-CiZfMWUAhI,3201
3
3
  doc_page_extractor/downloader.py,sha256=NbGN9ARnER8-gd4T1uc3W98WMEClVxMrqnShq8HibTw,455
4
- doc_page_extractor/extractor.py,sha256=njLl8VdOgm-noWPDYTfjIAUU1giNc-wLvCSR1pHkfS8,7267
4
+ doc_page_extractor/extractor.py,sha256=V0S9Nn65lKpkz8DnTUMcsAzUJGwSQvupIBTiVqzLpJ8,7303
5
5
  doc_page_extractor/latex.py,sha256=W_zAcksNRuru-WjCq4CSn07s_SWrDhikadJSy_Cg3Do,1954
6
6
  doc_page_extractor/layout_order.py,sha256=NwMzTPr4xsriz4slCwqwhw2-vrMu-qfwtcFsDu8d1yM,7426
7
7
  doc_page_extractor/layoutreader.py,sha256=BdC4oPbtpXoLmYhjuSFrKn6SNoT2zWw_gi95sGAUwrk,4031
8
- doc_page_extractor/ocr.py,sha256=KJ5PqtBa4_n8LAfMLGApUVNPUS1DBEwVKcC-zck283I,5161
8
+ doc_page_extractor/ocr.py,sha256=hQhT9bdsJmWESqt1FODCoE19wfOroM8uHZiFoZZrkQU,5182
9
9
  doc_page_extractor/ocr_corrector.py,sha256=RfRA1jESEuqC8_a2kUEvHblT_B4xBjE0OApLMl1JiRg,3917
10
10
  doc_page_extractor/overlap.py,sha256=z1DF4_2OPvauDHwmz1SC1WosULkE84HKaRfNEgexPzc,5337
11
11
  doc_page_extractor/plot.py,sha256=4uibjS_x1SyEyjaJJd0YsBbzkgldDOCct4Ry2cOhdXU,2556
12
12
  doc_page_extractor/raw_optimizer.py,sha256=1KghECq_rJwuZZITTLQnGTKYivFKg_qDvMLN9g17sks,2844
13
- doc_page_extractor/rectangle.py,sha256=Tp__NPiY6JlYwYxejST7BUXhv_bl8tkmDXi4JgHCK6E,1539
13
+ doc_page_extractor/rectangle.py,sha256=yeW6srdrsxaJg1eb3nn8oxtY0sfgeBk3hMiuJGaRXwY,1678
14
14
  doc_page_extractor/rotation.py,sha256=QCZ-HqfDxIhnQw8KRHki2myj6-UusvNY7Mpjsu-wI-4,4334
15
15
  doc_page_extractor/table.py,sha256=AWymTRbOet55uImW8QJqb90Qs_v2V2U1mZv0U6rSz3c,1891
16
16
  doc_page_extractor/types.py,sha256=7blT8YNKrOsc4qQdAhM7J7MEQjFcBwE0QV8-lipZBeQ,1305
@@ -35,10 +35,10 @@ doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py,sha256=ACHxFntxS
35
35
  doc_page_extractor/struct_eqtable/pix2s/__init__.py,sha256=cXRo4eg6u1-TXktZ8rQf0HIzLmmScIwYQhbxMKl-MyA,76
36
36
  doc_page_extractor/struct_eqtable/pix2s/pix2s.py,sha256=fCNve8PNeJ3-AWJIhSeGtp-mYKoMXfW0CIpszkQnAaA,2535
37
37
  doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py,sha256=zSGw45JhWdZ3iuJel5Chsy-NzsOHx9QyPQIUAzzCjFE,43880
38
- doc_page_extractor-0.1.0.dist-info/licenses/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
38
+ doc_page_extractor-0.1.1.dist-info/licenses/LICENSE,sha256=TfPDBt3ar0uv_f9cqCDMZ5rIzW3CY8anRRd4PkL6ejs,34522
39
39
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
40
  tests/test_history_bus.py,sha256=WaCUW3U75SESMcLq0f5FKnpVUVRDvmfxLFE7Zo83e48,2517
41
- doc_page_extractor-0.1.0.dist-info/METADATA,sha256=8AM05x2gY75j70rD2HNi5zyaKE1okNJG7Pw5iLuIlnQ,2436
42
- doc_page_extractor-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
43
- doc_page_extractor-0.1.0.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
44
- doc_page_extractor-0.1.0.dist-info/RECORD,,
41
+ doc_page_extractor-0.1.1.dist-info/METADATA,sha256=5bQtvYgjNghsYER1zmc19_6BH4JrgrLj9_KxUmnLnHc,2436
42
+ doc_page_extractor-0.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
43
+ doc_page_extractor-0.1.1.dist-info/top_level.txt,sha256=ErNybD_lBzAmw8mVBAK4htsAH_hp14jioZVex-tUqvM,25
44
+ doc_page_extractor-0.1.1.dist-info/RECORD,,