doc-page-extractor 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of doc-page-extractor might be problematic. Click here for more details.

@@ -1,2 +0,0 @@
1
- doc_page_extractor
2
- tests
tests/__init__.py DELETED
File without changes
tests/test_history_bus.py DELETED
@@ -1,55 +0,0 @@
1
- import os
2
- import unittest
3
-
4
- from PIL import Image
5
- from doc_page_extractor import DocExtractor, Layout, LayoutClass
6
-
7
-
8
- class TestGroup(unittest.TestCase):
9
- def test_history_bugs(self):
10
- model_path = os.path.join(self._project_path(), "model")
11
- image_path = os.path.join(self._project_path(), "tests", "images", "figure.png")
12
- os.makedirs(model_path, exist_ok=True)
13
-
14
- extractor = DocExtractor(model_path, "cpu")
15
- layouts: list[tuple[LayoutClass, list[str]]]
16
-
17
- with Image.open(image_path) as image:
18
- result = extractor.extract(image, extract_formula=False)
19
- layouts = [self._format_Layout(layout) for layout in result.layouts]
20
-
21
- self.assertEqual(layouts, [
22
- (LayoutClass.PLAIN_TEXT, [
23
- "口的11.8%①。这既是江南农业落后的反映,又是它的原因。当战国以",
24
- "后黄河流域因铁器牛耕的普及获得基本的开发,农区联结成一大片的",
25
- "时候,南方农业开发始终没有突破星点状或斑块状分布的格局。由于",
26
- "地旷人稀,耕作相当粗放,许多水田采取火耕水瓣的方式,旱田则多",
27
- "行刀耕火种②。司马迁在《史记·货殖列传》中说:“总之,楚越之",
28
- "地,地厂人希,饭稻囊鱼,或火耕而水瓣,果隋(蕨)赢(螺)蛤,",
29
- "不待贾而足,地势饶食,无饥谨之患,以故皆偷生,无积聚而多",
30
- "贫。”这种概括虽然未免太突出了南方经济的落后面,有一定片面性,",
31
- "但大体还是反映了实际情形的。战国秦汉时期,南方与黄河流域农业",
32
- "的差距显然拉大了。",
33
- ]),
34
- (LayoutClass.FIGURE, []),
35
- (LayoutClass.FIGURE_CAPTION, [
36
- "西晋陶水田犁耙模型(广东连县出土)"
37
- ]),
38
- (LayoutClass.FIGURE, []),
39
- (LayoutClass.FIGURE_CAPTION, [
40
- "南朝陶耙田模型 (广西苍梧倒水出土)"
41
- ]),
42
- (LayoutClass.PLAIN_TEXT, [
43
- "①据赵文林、谢淑君:《中国人口史》(人民出版社1988年)有关资料统计。",
44
- "②《盐铁论·通有》:“荆扬…………伐木而树谷,焚莱而播粟,火耕而水。”"
45
- ]),
46
- (LayoutClass.ABANDON, [
47
- "136"
48
- ]),
49
- ])
50
-
51
- def _format_Layout(self, layout: Layout) -> tuple[LayoutClass, list[str]]:
52
- return layout.cls, [f.text.strip() for f in layout.fragments]
53
-
54
- def _project_path(self) -> str:
55
- return os.path.abspath(os.path.join(__file__, "..", ".."))