doc-page-extractor 0.0.8.tar.gz → 0.0.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic; more details are available on the original diff page.

Files changed (39)
  1. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/PKG-INFO +3 -2
  2. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/overlap.py +10 -8
  3. doc_page_extractor-0.0.9/doc_page_extractor/plot.py +91 -0
  4. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor.egg-info/PKG-INFO +3 -2
  5. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/setup.py +1 -1
  6. doc_page_extractor-0.0.8/doc_page_extractor/plot.py +0 -38
  7. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/LICENSE +0 -0
  8. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/README.md +0 -0
  9. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/__init__.py +0 -0
  10. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/clipper.py +0 -0
  11. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/downloader.py +0 -0
  12. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/extractor.py +0 -0
  13. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/layoutreader.py +0 -0
  14. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/ocr.py +0 -0
  15. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/ocr_corrector.py +0 -0
  16. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/__init__.py +0 -0
  17. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/cls_postprocess.py +0 -0
  18. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/db_postprocess.py +0 -0
  19. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/imaug.py +0 -0
  20. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/operators.py +0 -0
  21. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/predict_base.py +0 -0
  22. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/predict_cls.py +0 -0
  23. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/predict_det.py +0 -0
  24. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/predict_rec.py +0 -0
  25. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/predict_system.py +0 -0
  26. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/rec_postprocess.py +0 -0
  27. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/onnxocr/utils.py +0 -0
  28. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/raw_optimizer.py +0 -0
  29. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/rectangle.py +0 -0
  30. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/rotation.py +0 -0
  31. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/types.py +0 -0
  32. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor/utils.py +0 -0
  33. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor.egg-info/SOURCES.txt +0 -0
  34. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor.egg-info/dependency_links.txt +0 -0
  35. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor.egg-info/requires.txt +0 -0
  36. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/doc_page_extractor.egg-info/top_level.txt +0 -0
  37. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/setup.cfg +0 -0
  38. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/tests/__init__.py +0 -0
  39. {doc_page_extractor-0.0.8 → doc_page_extractor-0.0.9}/tests/test_history_bus.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: doc-page-extractor
3
- Version: 0.0.8
3
+ Version: 0.0.9
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -19,6 +19,7 @@ Dynamic: author-email
19
19
  Dynamic: description
20
20
  Dynamic: description-content-type
21
21
  Dynamic: home-page
22
+ Dynamic: license-file
22
23
  Dynamic: requires-dist
23
24
  Dynamic: summary
24
25
 
@@ -88,7 +88,7 @@ def regroup_lines(origin_fragments: list[OCRFragment]) -> list[OCRFragment]:
88
88
  x2 = max(x2, x)
89
89
  y2 = max(y2, y)
90
90
 
91
- if len(proto_texts_len) == 0:
91
+ if proto_texts_len == 0:
92
92
  continue
93
93
 
94
94
  fragments.append(OCRFragment(
@@ -115,17 +115,19 @@ def _split_fragments_into_groups(fragments: list[OCRFragment]) -> Generator[list
115
115
  height = y2 - y1
116
116
  median = (y1 + y2) / 2.0
117
117
 
118
+ if height == 0:
119
+ continue
120
+
118
121
  if len(group) > 0:
119
122
  next_mean_median = (sum_median + median) / (len(group) + 1)
120
123
  next_mean_height = (sum_height + height) / (len(group) + 1)
121
124
 
122
- if next_mean_height > 0:
123
- deviation_rate = abs(median - next_mean_median) / next_mean_height
124
- if deviation_rate > max_deviation_rate:
125
- yield group
126
- group = []
127
- sum_height = 0.0
128
- sum_median = 0.0
125
+ deviation_rate = abs(median - next_mean_median) / next_mean_height
126
+ if deviation_rate > max_deviation_rate:
127
+ yield group
128
+ group = []
129
+ sum_height = 0.0
130
+ sum_median = 0.0
129
131
 
130
132
  group.append(fragment)
131
133
  sum_height += height
@@ -0,0 +1,91 @@
1
+ from typing import Iterable
2
+ from PIL import ImageDraw
3
+ from PIL.ImageFont import load_default, FreeTypeFont
4
+ from PIL.Image import Image
5
+ from .types import Layout, LayoutClass
6
+ from .rectangle import Point
7
+
8
+ _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
9
+ _Color = tuple[int, int, int]
10
+
11
+ def plot(image: Image, layouts: Iterable[Layout]) -> None:
12
+ layout_font = load_default(size=35)
13
+ fragment_font = load_default(size=25)
14
+ draw = ImageDraw.Draw(image, mode="RGBA")
15
+
16
+ def _draw_number(position: Point, number: int, font: FreeTypeFont, bold: bool, color: _Color) -> None:
17
+ nonlocal draw
18
+ x, y = position
19
+ text = str(object=number)
20
+ width = len(text) * font.size
21
+ offset = round(font.size * 0.15)
22
+
23
+ for dx, dy in _generate_delta(bold):
24
+ draw.text(
25
+ xy=(x + dx - width - offset, y + dy),
26
+ text=text,
27
+ font=font,
28
+ fill=color,
29
+ )
30
+
31
+ for layout in layouts:
32
+ draw.polygon(
33
+ xy=[p for p in layout.rect],
34
+ outline=_layout_color(layout),
35
+ width=5,
36
+ )
37
+
38
+ for layout in layouts:
39
+ for fragment in layout.fragments:
40
+ draw.polygon(
41
+ xy=[p for p in fragment.rect],
42
+ outline=_FRAGMENT_COLOR,
43
+ width=3,
44
+ )
45
+ _draw_number(
46
+ position=fragment.rect.lt,
47
+ number=fragment.order + 1,
48
+ font=fragment_font,
49
+ bold=False,
50
+ color=_FRAGMENT_COLOR,
51
+ )
52
+
53
+ for i, layout in enumerate(layouts):
54
+ _draw_number(
55
+ position=layout.rect.lt,
56
+ number=i + 1,
57
+ font=layout_font,
58
+ bold=True,
59
+ color=_layout_color(layout),
60
+ )
61
+
62
+ def _generate_delta(bold: bool):
63
+ if bold:
64
+ for dx in range(-1, 2):
65
+ for dy in range(-1, 2):
66
+ yield dx, dy
67
+ else:
68
+ yield 0, 0
69
+
70
+ def _layout_color(layout: Layout) -> _Color:
71
+ cls = layout.cls
72
+ if cls == LayoutClass.TITLE:
73
+ return (0x0A, 0x12, 0x2C) # Dark
74
+ elif cls == LayoutClass.PLAIN_TEXT:
75
+ return (0x3C, 0x67, 0x90) # Blue
76
+ elif cls == LayoutClass.ABANDON:
77
+ return (0xC0, 0xBB, 0xA9) # Gray
78
+ elif cls == LayoutClass.FIGURE:
79
+ return (0x5B, 0x91, 0x3C) # Dark Green
80
+ elif cls == LayoutClass.FIGURE_CAPTION:
81
+ return (0x77, 0xB3, 0x54) # Green
82
+ elif cls == LayoutClass.TABLE:
83
+ return (0x44, 0x17, 0x52) # Dark Purple
84
+ elif cls == LayoutClass.TABLE_CAPTION:
85
+ return (0x81, 0x75, 0xA0) # Purple
86
+ elif cls == LayoutClass.TABLE_FOOTNOTE:
87
+ return (0xEF, 0xB6, 0xC9) # Pink Purple
88
+ elif cls == LayoutClass.ISOLATE_FORMULA:
89
+ return (0xFA, 0x38, 0x27) # Red
90
+ elif cls == LayoutClass.FORMULA_CAPTION:
91
+ return (0xFF, 0x9D, 0x24) # Orange
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: doc-page-extractor
3
- Version: 0.0.8
3
+ Version: 0.0.9
4
4
  Summary: doc page extractor can identify text and format in images and return structured data.
5
5
  Home-page: https://github.com/Moskize91/doc-page-extractor
6
6
  Author: Tao Zeyu
@@ -19,6 +19,7 @@ Dynamic: author-email
19
19
  Dynamic: description
20
20
  Dynamic: description-content-type
21
21
  Dynamic: home-page
22
+ Dynamic: license-file
22
23
  Dynamic: requires-dist
23
24
  Dynamic: summary
24
25
 
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="doc-page-extractor",
5
- version="0.0.8",
5
+ version="0.0.9",
6
6
  author="Tao Zeyu",
7
7
  author_email="i@taozeyu.com",
8
8
  url="https://github.com/Moskize91/doc-page-extractor",
@@ -1,38 +0,0 @@
1
- from typing import Iterable
2
- from PIL import ImageDraw
3
- from PIL.Image import Image
4
- from .types import Layout, LayoutClass
5
-
6
- _FRAGMENT_COLOR = (0x49, 0xCF, 0xCB) # Light Green
7
-
8
- def plot(image: Image, layouts: Iterable[Layout]):
9
- draw = ImageDraw.Draw(image, mode="RGBA")
10
- for layout in layouts:
11
- draw.polygon([p for p in layout.rect], outline=_layout_color(layout), width=5)
12
-
13
- for layout in layouts:
14
- for fragments in layout.fragments:
15
- draw.polygon([p for p in fragments.rect], outline=_FRAGMENT_COLOR, width=3)
16
-
17
- def _layout_color(layout: Layout) -> tuple[int, int, int]:
18
- cls = layout.cls
19
- if cls == LayoutClass.TITLE:
20
- return (0x0A, 0x12, 0x2C) # Dark
21
- elif cls == LayoutClass.PLAIN_TEXT:
22
- return (0x3C, 0x67, 0x90) # Blue
23
- elif cls == LayoutClass.ABANDON:
24
- return (0xC0, 0xBB, 0xA9) # Gray
25
- elif cls == LayoutClass.FIGURE:
26
- return (0x5B, 0x91, 0x3C) # Dark Green
27
- elif cls == LayoutClass.FIGURE_CAPTION:
28
- return (0x77, 0xB3, 0x54) # Green
29
- elif cls == LayoutClass.TABLE:
30
- return (0x44, 0x17, 0x52) # Dark Purple
31
- elif cls == LayoutClass.TABLE_CAPTION:
32
- return (0x81, 0x75, 0xA0) # Purple
33
- elif cls == LayoutClass.TABLE_FOOTNOTE:
34
- return (0xEF, 0xB6, 0xC9) # Pink Purple
35
- elif cls == LayoutClass.ISOLATE_FORMULA:
36
- return (0xFA, 0x38, 0x27) # Red
37
- elif cls == LayoutClass.FORMULA_CAPTION:
38
- return (0xFF, 0x9D, 0x24) # Orange