doc-page-extractor 0.1.0__tar.gz → 0.1.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

Files changed (49)
  1. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/PKG-INFO +1 -1
  2. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/extractor.py +7 -6
  3. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/ocr.py +1 -1
  4. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/rectangle.py +6 -0
  5. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor.egg-info/PKG-INFO +1 -1
  6. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/setup.py +1 -1
  7. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/LICENSE +0 -0
  8. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/README.md +0 -0
  9. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/__init__.py +0 -0
  10. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/clipper.py +0 -0
  11. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/downloader.py +0 -0
  12. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/latex.py +0 -0
  13. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/layout_order.py +0 -0
  14. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/layoutreader.py +0 -0
  15. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/ocr_corrector.py +0 -0
  16. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/__init__.py +0 -0
  17. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
  18. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
  19. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/imaug.py +0 -0
  20. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/operators.py +0 -0
  21. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/predict_base.py +0 -0
  22. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
  23. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/predict_det.py +0 -0
  24. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
  25. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/predict_system.py +0 -0
  26. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
  27. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/onnxocr/utils.py +0 -0
  28. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/overlap.py +0 -0
  29. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/plot.py +0 -0
  30. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/raw_optimizer.py +0 -0
  31. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/rotation.py +0 -0
  32. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/struct_eqtable/__init__.py +0 -0
  33. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/struct_eqtable/internvl/__init__.py +0 -0
  34. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/struct_eqtable/internvl/conversation.py +0 -0
  35. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/struct_eqtable/internvl/internvl.py +0 -0
  36. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/struct_eqtable/internvl/internvl_lmdeploy.py +0 -0
  37. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/struct_eqtable/pix2s/__init__.py +0 -0
  38. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/struct_eqtable/pix2s/pix2s.py +0 -0
  39. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/struct_eqtable/pix2s/pix2s_trt.py +0 -0
  40. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/table.py +0 -0
  41. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/types.py +0 -0
  42. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor/utils.py +0 -0
  43. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor.egg-info/SOURCES.txt +0 -0
  44. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor.egg-info/dependency_links.txt +0 -0
  45. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor.egg-info/requires.txt +0 -0
  46. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/doc_page_extractor.egg-info/top_level.txt +0 -0
  47. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/setup.cfg +0 -0
  48. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/tests/__init__.py +0 -0
  49. {doc_page_extractor-0.1.0 → doc_page_extractor-0.1.1}/tests/test_history_bus.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doc-page-extractor
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -116,12 +116,13 @@ class DocExtractor:
116
116
  lb=(x1, y2),
117
117
  rb=(x2, y2),
118
118
  )
119
- if cls == LayoutClass.TABLE:
120
- yield TableLayout(cls=cls, rect=rect, fragments=[], parsed=None)
121
- elif cls == LayoutClass.ISOLATE_FORMULA:
122
- yield FormulaLayout(cls=cls, rect=rect, fragments=[], latex=None)
123
- else:
124
- yield PlainLayout(cls=cls, rect=rect, fragments=[])
119
+ if rect.is_valid:
120
+ if cls == LayoutClass.TABLE:
121
+ yield TableLayout(cls=cls, rect=rect, fragments=[], parsed=None)
122
+ elif cls == LayoutClass.ISOLATE_FORMULA:
123
+ yield FormulaLayout(cls=cls, rect=rect, fragments=[], latex=None)
124
+ else:
125
+ yield PlainLayout(cls=cls, rect=rect, fragments=[])
125
126
 
126
127
  def _layouts_matched_by_fragments(self, fragments: list[OCRFragment], layouts: list[Layout]):
127
128
  layouts_group = self._split_layouts_by_group(layouts)
@@ -69,7 +69,7 @@ class OCR:
69
69
  rb=(box[2][0], box[2][1]),
70
70
  lb=(box[3][0], box[3][1]),
71
71
  )
72
- if rect.area == 0.0:
72
+ if not rect.is_valid or rect.area == 0.0:
73
73
  continue
74
74
 
75
75
  yield OCRFragment(
@@ -19,6 +19,10 @@ class Rectangle:
19
19
  yield self.rb
20
20
  yield self.rt
21
21
 
22
+ @property
23
+ def is_valid(self) -> bool:
24
+ return Polygon(self).is_valid
25
+
22
26
  @property
23
27
  def segments(self) -> Generator[tuple[Point, Point], None, None]:
24
28
  yield (self.lt, self.lb)
@@ -60,6 +64,8 @@ class Rectangle:
60
64
  def intersection_area(rect1: Rectangle, rect2: Rectangle) -> float:
61
65
  poly1 = Polygon(rect1)
62
66
  poly2 = Polygon(rect2)
67
+ if not poly1.is_valid or not poly2.is_valid:
68
+ return 0.0
63
69
  intersection = poly1.intersection(poly2)
64
70
  if intersection.is_empty:
65
71
  return 0.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doc-page-extractor
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -5,7 +5,7 @@ if "doc_page_extractor.struct_eqtable" not in find_packages():
5
5
 
6
6
  setup(
7
7
  name="doc-page-extractor",
8
- version="0.1.0",
8
+ version="0.1.1",
9
9
  author="Tao Zeyu",
10
10
  author_email="i@taozeyu.com",
11
11
  url="https://github.com/Moskize91/doc-page-extractor",