fmtr.tools 1.0.32__py3-none-any.whl → 1.0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fmtr.tools might be problematic. Click here for more details.

fmtr/tools/__init__.py CHANGED
@@ -143,6 +143,11 @@ try:
143
143
  except ImportError as exception:
144
144
  caching = MissingExtraMockModule('caching', exception)
145
145
 
146
# Optional PDF tooling: when importing `pdf_tools` raises ImportError, `pdf` is
# bound to a MissingExtraMockModule built from the name 'pdf' and the caught
# exception instead of letting the ImportError propagate.
try:
    from fmtr.tools import pdf_tools as pdf
except ImportError as exception:
    pdf = MissingExtraMockModule('pdf', exception)
150
+
146
151
 
147
152
  __all__ = [
148
153
  'config',
@@ -0,0 +1,172 @@
1
+ import pymupdf as pm
2
+ import pymupdf4llm
3
+ from typing import List, Tuple, Dict, Any, Self
4
+
5
+ from fmtr.tools import data_modelling_tools
6
+
7
+
8
class BoundingBox(data_modelling_tools.Base):
    """

    Rectangle described by its four edge coordinates.

    The declaration order of the fields (left, top, right, bottom) matters:
    `from_dict` pairs positional values with the fields in this order.

    """
    left: float
    top: float
    right: float
    bottom: float

    @property
    def order(self) -> Tuple[Tuple[float, float], Tuple[float, float]]:
        """

        Approximate natural reading order

        Returns the pair ((top, left), (bottom, right)), usable as a sort key
        that compares vertical position before horizontal position.

        """
        return (self.top, self.left), (self.bottom, self.right)

    @property
    def rect(self) -> pm.Rect:
        """

        Position as a PyMuPDF Rect

        """
        return pm.Rect(self.left, self.top, self.right, self.bottom)

    @classmethod
    def from_dict(cls, data: Tuple[float, float, float, float]) -> Self:
        """

        Instantiate from a PyMuPDF bbox, i.e. a sequence of four coordinates
        given in field order (left, top, right, bottom).

        Despite the method name, `data` is a positional sequence, not a
        mapping; values are matched to the model fields by position.

        """
        # NOTE(review): relies on `cls.model_fields` listing exactly the four
        # coordinate fields in declaration order - confirm that
        # `data_modelling_tools.Base` adds no fields of its own.
        return cls(**dict(zip(cls.model_fields.keys(), data)))
41
+
42
+
43
class Span(data_modelling_tools.Base):
    """

    One span entry from PyMuPDF dictionary output: a piece of text together
    with its font attributes, origin and bounding box.

    """
    size: float
    flags: int
    font: str
    color: int
    ascender: float
    descender: float
    text: str
    origin: Tuple[float, float]
    bbox: BoundingBox

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Self:
        """

        Instantiate from PyMuPDF dictionary data

        The raw `bbox` entry is converted to a `BoundingBox`. The conversion
        is done on a copy, so the mapping passed in is not modified.

        """
        # Build a new mapping instead of assigning into `data`, so the
        # caller's dictionary keeps its raw bbox value.
        fields = {**data, 'bbox': BoundingBox.from_dict(data['bbox'])}
        return cls(**fields)
58
+
59
+
60
class Line(data_modelling_tools.Base):
    """

    One line entry from PyMuPDF dictionary output: an ordered list of spans
    plus the line's writing mode, direction and bounding box.

    """
    spans: List[Span]
    wmode: int
    dir: Tuple[float, float]
    bbox: BoundingBox

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Self:
        """

        Instantiate from PyMuPDF dictionary data

        The raw `spans` and `bbox` entries are converted to `Span` and
        `BoundingBox` objects. This method does not assign into `data`.

        """
        # Assemble a new mapping rather than overwriting keys of `data`, so
        # the caller's dictionary is not reassigned by this method.
        fields = {
            **data,
            'spans': [Span.from_dict(span) for span in data['spans']],
            'bbox': BoundingBox.from_dict(data['bbox']),
        }
        return cls(**fields)

    @property
    def text(self) -> str:
        """

        Simple text representation

        The span texts joined with single spaces.

        """
        return ' '.join(span.text for span in self.spans)
85
+
86
+
87
class Block(data_modelling_tools.Base):
    """

    One block entry from PyMuPDF dictionary output: its number, type,
    bounding box and the lines it contains.

    """
    number: int
    type: int
    bbox: BoundingBox
    lines: List[Line]

    @property
    def text(self) -> str:
        """

        Simple text representation

        The text of each line joined with single spaces.

        """
        # A Block holds lines, not spans (it has no `spans` field), so the
        # text is assembled from each line's own text.
        return ' '.join(line.text for line in self.lines)

    @classmethod
    def from_dict(cls, data: Dict) -> Self:
        """

        Instantiate from PyMuPDF dictionary data

        The raw `lines` and `bbox` entries are converted to `Line` and
        `BoundingBox` objects. This method does not assign into `data`.
        A missing `lines` key is treated as an empty list of lines.

        """
        # NOTE(review): PyMuPDF image blocks (type 1) are documented as having
        # no 'lines' key; defaulting to no lines avoids a KeyError on them.
        # Their extra keys are still passed to the model - confirm that
        # `data_modelling_tools.Base` tolerates unknown fields.
        fields = {
            **data,
            'lines': [Line.from_dict(line) for line in data.get('lines', ())],
            'bbox': BoundingBox.from_dict(data['bbox']),
        }
        return cls(**fields)

    @property
    def rect(self) -> pm.Rect:
        """

        Position as a PyMuPDF Rect

        """
        return self.bbox.rect
121
+
122
+
123
class Document(pm.Document):
    """

    PyMuPDF Document subclass adding a data-modelled view of the document's
    blocks and a Markdown export.

    """

    @property
    def data(self) -> List[Block]:
        """

        Every block of every page, in page order, as `Block` objects.

        Blocks are read from each page's "dict" text extraction.

        """
        return [
            Block.from_dict(raw_block)
            for page in self
            for raw_block in page.get_text("dict")["blocks"]
        ]

    def to_markdown(self, **kwargs) -> str:
        """

        Render the document as Markdown using `pymupdf4llm`.

        Keyword arguments are forwarded unchanged to
        `pymupdf4llm.to_markdown`.

        """
        markdown = pymupdf4llm.to_markdown(self, **kwargs)
        return markdown
154
+
155
+
156
if __name__ == '__main__':
    # Manual smoke run against a sample PDF; not executed on import.
    from fmtr.tools.path_tools import Path

    PATH_DATA = Path.data()
    # PATH_PDF=PATH_DATA/'chib.pdf'
    PATH_PDF = PATH_DATA / 'kvm.pdf'
    assert PATH_PDF.exists()

    doc = Document(PATH_PDF)
    # Evaluate the property so every block is parsed; the result is discarded.
    doc.data

    # Dump each page in both the dictionary and the HTML representation.
    for page in doc:
        for fmt in ('dict', 'html'):
            print(page.get_text(fmt))

    md = doc.to_markdown()
    # Bare expression kept for inspection in an interactive session or debugger.
    md
fmtr/tools/version CHANGED
@@ -1 +1 @@
1
- 1.0.32
1
+ 1.0.33
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fmtr.tools
3
- Version: 1.0.32
3
+ Version: 1.0.33
4
4
  Summary: Collection of high-level tools to simplify everyday development tasks, with a focus on AI/ML
5
5
  Home-page: https://github.com/fmtr/fmtr.tools
6
6
  Author: Frontmatter
@@ -67,6 +67,10 @@ Provides-Extra: parallel
67
67
  Requires-Dist: bokeh ; extra == 'parallel'
68
68
  Requires-Dist: dask[bag] ; extra == 'parallel'
69
69
  Requires-Dist: distributed ; extra == 'parallel'
70
+ Provides-Extra: pdf
71
+ Requires-Dist: pydantic ; extra == 'pdf'
72
+ Requires-Dist: pymupdf ; extra == 'pdf'
73
+ Requires-Dist: pymupdf4llm ; extra == 'pdf'
70
74
  Provides-Extra: process
71
75
  Requires-Dist: logfire ; extra == 'process'
72
76
  Requires-Dist: semver ; extra == 'process'
@@ -105,6 +109,8 @@ Requires-Dist: pandas ; extra == 'test'
105
109
  Requires-Dist: peft ; extra == 'test'
106
110
  Requires-Dist: pydantic ; extra == 'test'
107
111
  Requires-Dist: pydantic-ai[logfire,openai] ; extra == 'test'
112
+ Requires-Dist: pymupdf ; extra == 'test'
113
+ Requires-Dist: pymupdf4llm ; extra == 'test'
108
114
  Requires-Dist: pytest-cov ; extra == 'test'
109
115
  Requires-Dist: pyyaml ; extra == 'test'
110
116
  Requires-Dist: semver ; extra == 'test'
@@ -1,4 +1,4 @@
1
- fmtr/tools/__init__.py,sha256=imKjjZg5HypnRy5bD4Xe6HBkQhavg6BOkwIeFzItG_8,5052
1
+ fmtr/tools/__init__.py,sha256=3eJEUYhp82YvQUmoaFdNYaiNeQnyoRuxmez0uqKgZdU,5186
2
2
  fmtr/tools/ai_tools.py,sha256=hN7DzuATXfurCDHugaluUsbmF_PzeKu3BTc2WXhG59g,11806
3
3
  fmtr/tools/api_tools.py,sha256=u5YEdKyKto8MKY8legULLU7xeJ7lY2Bgyaep_xa8iZg,2089
4
4
  fmtr/tools/async_tools.py,sha256=ewz757WcveQJd-G5SVr2JDOQVbdLGecCgl-tsBGVZz4,284
@@ -31,6 +31,7 @@ fmtr/tools/netrc_tools.py,sha256=PpNpz_mWlQi6VHGromKwFfTyLpHUXsd4LY6-OKLCbeI,376
31
31
  fmtr/tools/openai_tools.py,sha256=6SUgejgzUzmlKKct2_ePXntvMegu3FJgfk9x7aqtqYc,742
32
32
  fmtr/tools/parallel_tools.py,sha256=G__ZbLRRx4cP5OyqY1hKwnE-VI3m5prYABB0tnZHnes,3132
33
33
  fmtr/tools/path_tools.py,sha256=1GeWXdhV5rH99IfLI5ZFEnOJfs4Q4mYTT2R-rA791iQ,4273
34
+ fmtr/tools/pdf_tools.py,sha256=3XZ6Tpvuf46oeYENsYQsYn-6hbTqHDSJesqNZeJ1Rko,3563
34
35
  fmtr/tools/platform_tools.py,sha256=7p69CmAHe_sF68Fx9uVhns1k5EewTHTWgUYzkl6ZQKA,308
35
36
  fmtr/tools/process_tools.py,sha256=Ysh5Dk2QFBhXQerArjKdt7xZd3JrN5Ho02AaOjH0Nnw,1425
36
37
  fmtr/tools/profiling_tools.py,sha256=jpXVjaNKPydTasEQVNXvxzGtMhXPit08AnJddkU8uIc,46
@@ -41,7 +42,7 @@ fmtr/tools/string_tools.py,sha256=U2EptMWR6KDOP22ZQ4ReUHV4i25SP7xwCmZScI1sy4M,32
41
42
  fmtr/tools/tokenization_tools.py,sha256=9FP5vgPufWv0XA961eVKObFll0d_2mM0W3ut3rtZyeo,4329
42
43
  fmtr/tools/tools.py,sha256=xnfUrOnrT4OxFYez6vV5tAhydzCICJFiGVnviiZDEQo,796
43
44
  fmtr/tools/unicode_tools.py,sha256=yS_9wpu8ogNoiIL7s1G_8bETFFO_YQlo4LNPv1NLDeY,52
44
- fmtr/tools/version,sha256=lJlaFE-y4dFjsYr6jbTGb5uzUQ_dp_LO5e9G4R2TeMk,6
45
+ fmtr/tools/version,sha256=V-CY1GAptD08dwDUv8tP0IXGjOPKFQq8iTv5XUR7qhc,6
45
46
  fmtr/tools/version_tools.py,sha256=axzzHBS9V1n6YuSacsDKG3VfAvRqR8qr6aENCibR8vs,1248
46
47
  fmtr/tools/yaml_tools.py,sha256=Ol43ZwbnSXGnn1K98Uxx61KPGSqfC4axE-X2q1LKMwk,349
47
48
  fmtr/tools/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -52,9 +53,9 @@ fmtr/tools/tests/test_environment.py,sha256=iHaiMQfECYZPkPKwfuIZV9uHuWe3aE-p_dN_
52
53
  fmtr/tools/tests/test_json.py,sha256=IeSP4ziPvRcmS8kq7k9tHonC9rN5YYq9GSNT2ul6Msk,287
53
54
  fmtr/tools/tests/test_path.py,sha256=AkZQa6_8BQ-VaCyL_J-iKmdf2ZaM-xFYR37Kun3k4_g,2188
54
55
  fmtr/tools/tests/test_yaml.py,sha256=jc0TwwKu9eC0LvFGNMERdgBue591xwLxYXFbtsRwXVM,287
55
- fmtr.tools-1.0.32.dist-info/LICENSE,sha256=FW9aa6vVN5IjRQWLT43hs4_koYSmpcbIovlKeAJ0_cI,10757
56
- fmtr.tools-1.0.32.dist-info/METADATA,sha256=iPCoZOCJ1UbUM0x2NVVcWQiEg-Co8RZNSQIy1cgHErQ,13207
57
- fmtr.tools-1.0.32.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
58
- fmtr.tools-1.0.32.dist-info/entry_points.txt,sha256=CEStVkwJ1mTFvhN1WV5RdW83SkNW1d5Syj-KZ6A19ng,72
59
- fmtr.tools-1.0.32.dist-info/top_level.txt,sha256=t5341a8ii3n4RFizwTeXGmcq_pf4GqL1h9ylE5LIWRk,12
60
- fmtr.tools-1.0.32.dist-info/RECORD,,
56
+ fmtr.tools-1.0.33.dist-info/LICENSE,sha256=FW9aa6vVN5IjRQWLT43hs4_koYSmpcbIovlKeAJ0_cI,10757
57
+ fmtr.tools-1.0.33.dist-info/METADATA,sha256=G6ybzpAyBJfpfyoTPLQBXB6HJRrG-3lKwc84YaSI9nI,13438
58
+ fmtr.tools-1.0.33.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
59
+ fmtr.tools-1.0.33.dist-info/entry_points.txt,sha256=CEStVkwJ1mTFvhN1WV5RdW83SkNW1d5Syj-KZ6A19ng,72
60
+ fmtr.tools-1.0.33.dist-info/top_level.txt,sha256=t5341a8ii3n4RFizwTeXGmcq_pf4GqL1h9ylE5LIWRk,12
61
+ fmtr.tools-1.0.33.dist-info/RECORD,,