fmtr.tools 1.0.31__py3-none-any.whl → 1.0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fmtr.tools might be problematic. Click here for more details.

fmtr/tools/__init__.py CHANGED
@@ -143,6 +143,11 @@ try:
143
143
  except ImportError as exception:
144
144
  caching = MissingExtraMockModule('caching', exception)
145
145
 
146
+ try:
147
+ from fmtr.tools import pdf_tools as pdf
148
+ except ImportError as exception:
149
+ pdf = MissingExtraMockModule('pdf', exception)
150
+
146
151
 
147
152
  __all__ = [
148
153
  'config',
fmtr/tools/api_tools.py CHANGED
@@ -83,6 +83,11 @@ class ApiBase:
83
83
 
84
84
  @classmethod
85
85
  def launch(cls):
86
+ """
87
+
88
+ Initialise self and launch.
89
+
90
+ """
86
91
  self = cls()
87
92
  logger.info(f'Launching API {cls.TITLE}...')
88
93
  uvicorn.run(self.app, host=self.HOST, port=self.PORT)
fmtr/tools/path_tools.py CHANGED
@@ -96,6 +96,29 @@ class Path(type(Path())):
96
96
  """
97
97
  return cls(gettempdir())
98
98
 
99
@classmethod
def data(cls, name='data') -> 'Path':
    """

    Fetch canonical "data"/"artifacts" path, whether calling package is regular or namespace package.

    Two locations are tried, in order: `name` inside the directory two levels above the path returned by
    `get_call_path()`, then `name` inside the directory one level above that.

    :param name: Name of the directory to look for.
    :return: The first candidate that exists.
    :raises FileNotFoundError: If neither candidate exists. The message lists both searched locations.

    """
    from fmtr.tools.inspection_tools import get_call_path

    # NOTE(review): presumably `get_call_path` inspects the call stack to locate the caller's file, so it is
    # invoked directly in this frame rather than from a helper or comprehension - confirm against its source.
    base = get_call_path().absolute().parent.parent

    candidates = (base / name, base.parent / name)

    for candidate in candidates:
        if candidate.exists():
            return candidate

    # Report every location searched, not only the last one tried.
    searched = '" or "'.join([str(candidate) for candidate in candidates])
    raise FileNotFoundError(f'No "{name}" directory found at "{searched}"')
121
+
99
122
  def write_json(self, obj) -> int:
100
123
  """
101
124
 
@@ -0,0 +1,172 @@
1
+ import pymupdf as pm
2
+ import pymupdf4llm
3
+ from typing import List, Tuple, Dict, Any, Self
4
+
5
+ from fmtr.tools import data_modelling_tools
6
+
7
+
8
# Rectangle described by its four edge coordinates, in the order left, top, right, bottom.
class BoundingBox(data_modelling_tools.Base):
    left: float
    top: float
    right: float
    bottom: float

    @property
    def order(self) -> Tuple[Tuple[float, float], Tuple[float, float]]:
        """

        Approximate natural reading order

        Returns a sort key: (top, left) first, then (bottom, right).

        """
        return (self.top, self.left), (self.bottom, self.right)

    @property
    def rect(self) -> pm.Rect:
        """

        Position as a PyMuPDF Rect

        """
        return pm.Rect(self.left, self.top, self.right, self.bottom)

    @classmethod
    def from_dict(cls, data: Tuple[float, float, float, float]) -> Self:
        """

        Instantiate from PyMuPDF dictionary data

        :param data: Coordinate sequence, paired positionally with the model fields in declaration order.

        """
        # `zip` pairs field names with coordinates positionally; iterating `model_fields` yields its keys.
        return cls(**dict(zip(cls.model_fields, data)))
41
+
42
+
43
# Data model for one span entry; presumably mirrors the keys of a PyMuPDF span dictionary - confirm.
class Span(data_modelling_tools.Base):
    size: float
    flags: int
    font: str
    color: int
    ascender: float
    descender: float
    text: str
    origin: Tuple[float, float]
    bbox: BoundingBox

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Self:
        """

        Instantiate from PyMuPDF dictionary data

        The input mapping is left untouched: the converted `bbox` goes into a copy, so the same dictionary
        can be parsed more than once.

        """
        fields = {**data, 'bbox': BoundingBox.from_dict(data['bbox'])}
        return cls(**fields)
58
+
59
+
60
# Data model for one line entry: a list of spans plus the line's own attributes.
class Line(data_modelling_tools.Base):
    spans: List[Span]
    wmode: int
    dir: Tuple[float, float]
    bbox: BoundingBox

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Self:
        """

        Instantiate from PyMuPDF dictionary data

        The input mapping is left untouched: converted `spans` and `bbox` go into a copy, so the same
        dictionary can be parsed more than once.

        """
        fields = {
            **data,
            'spans': [Span.from_dict(span) for span in data['spans']],
            'bbox': BoundingBox.from_dict(data['bbox']),
        }
        return cls(**fields)

    @property
    def text(self) -> str:
        """

        Simple text representation

        Span texts joined with single spaces.

        """
        return ' '.join(span.text for span in self.spans)
85
+
86
+
87
# Data model for one block entry: its number, type, bounding box and lines.
class Block(data_modelling_tools.Base):
    number: int
    type: int
    bbox: BoundingBox
    lines: List[Line]

    @property
    def text(self) -> str:
        """

        Simple text representation

        Line texts joined with single spaces. A block declares `lines`, not `spans`, so the text is built
        from each line's own `text`.

        """
        return ' '.join(line.text for line in self.lines)

    @classmethod
    def from_dict(cls, data: Dict) -> Self:
        """

        Instantiate from PyMuPDF dictionary data

        The input mapping is left untouched: converted `lines` and `bbox` go into a copy.

        """
        # PyMuPDF image blocks carry no 'lines' key, so a missing key is treated as "no lines".
        # NOTE(review): any extra image-block keys are passed through to the model; this assumes the
        # base model tolerates unknown fields - confirm against `data_modelling_tools.Base`.
        fields = {
            **data,
            'lines': [Line.from_dict(line) for line in data.get('lines', [])],
            'bbox': BoundingBox.from_dict(data['bbox']),
        }
        return cls(**fields)

    @property
    def rect(self) -> pm.Rect:
        """

        Position as a PyMuPDF Rect

        """
        return self.bbox.rect
121
+
122
+
123
class Document(pm.Document):
    """

    PyMuPDF Document subclass that exposes its blocks as data-model objects and converts to markdown.

    """

    @property
    def data(self) -> List[Block]:
        """

        Build a flat list of `Block` objects, page by page, from each page's "dict" text extraction.

        """
        return [
            Block.from_dict(raw_block)
            for page in self
            for raw_block in page.get_text("dict")["blocks"]
        ]

    def to_markdown(self, **kwargs) -> str:
        """

        Render the document as markdown by delegating to `pymupdf4llm.to_markdown`, forwarding any keyword arguments.

        """
        markdown = pymupdf4llm.to_markdown(self, **kwargs)
        return markdown
154
+
155
+
156
if __name__ == '__main__':
    # Manual scratch run: load a sample PDF, build the data model, dump each page and convert to markdown.
    from fmtr.tools.path_tools import Path

    PATH_DATA = Path.data()
    # Alternative sample: PATH_PDF=PATH_DATA/'chib.pdf'
    PATH_PDF = PATH_DATA / 'kvm.pdf'
    assert PATH_PDF.exists()

    doc = Document(PATH_PDF)
    doc.data  # evaluated only to exercise the property

    for page in doc:
        for text_format in ('dict', 'html'):
            print(page.get_text(text_format))

    md = doc.to_markdown()
    md
@@ -145,3 +145,10 @@ class Mask:
145
145
  return text
146
146
  except (KeyError, IndexError):
147
147
  return self
148
+
149
+
150
if __name__ == '__main__':
    # Manual scratch run: call `join` on a mix of numbers, None, NaN, empty and non-empty strings.
    import numpy as np

    samples = [1, None, 'test', np.nan, 0, '', 'yeah']
    st = join(samples)
    st
fmtr/tools/version CHANGED
@@ -1 +1 @@
1
- 1.0.31
1
+ 1.0.33
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fmtr.tools
3
- Version: 1.0.31
3
+ Version: 1.0.33
4
4
  Summary: Collection of high-level tools to simplify everyday development tasks, with a focus on AI/ML
5
5
  Home-page: https://github.com/fmtr/fmtr.tools
6
6
  Author: Frontmatter
@@ -67,6 +67,10 @@ Provides-Extra: parallel
67
67
  Requires-Dist: bokeh ; extra == 'parallel'
68
68
  Requires-Dist: dask[bag] ; extra == 'parallel'
69
69
  Requires-Dist: distributed ; extra == 'parallel'
70
+ Provides-Extra: pdf
71
+ Requires-Dist: pydantic ; extra == 'pdf'
72
+ Requires-Dist: pymupdf ; extra == 'pdf'
73
+ Requires-Dist: pymupdf4llm ; extra == 'pdf'
70
74
  Provides-Extra: process
71
75
  Requires-Dist: logfire ; extra == 'process'
72
76
  Requires-Dist: semver ; extra == 'process'
@@ -105,6 +109,8 @@ Requires-Dist: pandas ; extra == 'test'
105
109
  Requires-Dist: peft ; extra == 'test'
106
110
  Requires-Dist: pydantic ; extra == 'test'
107
111
  Requires-Dist: pydantic-ai[logfire,openai] ; extra == 'test'
112
+ Requires-Dist: pymupdf ; extra == 'test'
113
+ Requires-Dist: pymupdf4llm ; extra == 'test'
108
114
  Requires-Dist: pytest-cov ; extra == 'test'
109
115
  Requires-Dist: pyyaml ; extra == 'test'
110
116
  Requires-Dist: semver ; extra == 'test'
@@ -1,6 +1,6 @@
1
- fmtr/tools/__init__.py,sha256=imKjjZg5HypnRy5bD4Xe6HBkQhavg6BOkwIeFzItG_8,5052
1
+ fmtr/tools/__init__.py,sha256=3eJEUYhp82YvQUmoaFdNYaiNeQnyoRuxmez0uqKgZdU,5186
2
2
  fmtr/tools/ai_tools.py,sha256=hN7DzuATXfurCDHugaluUsbmF_PzeKu3BTc2WXhG59g,11806
3
- fmtr/tools/api_tools.py,sha256=KU_qetI7WQRZijZ0l1nCKjPcRbseMMCWj0W8n_NTJCM,2027
3
+ fmtr/tools/api_tools.py,sha256=u5YEdKyKto8MKY8legULLU7xeJ7lY2Bgyaep_xa8iZg,2089
4
4
  fmtr/tools/async_tools.py,sha256=ewz757WcveQJd-G5SVr2JDOQVbdLGecCgl-tsBGVZz4,284
5
5
  fmtr/tools/augmentation_tools.py,sha256=-6ESbO4CDlKqVOV1J1V6qBeoBMzbFIinkDHRHnCBej0,55
6
6
  fmtr/tools/caching_tools.py,sha256=UOCYUNvLQ-NofR_dhqBmZF96-HRPf4At5MmxVk3gAIk,2943
@@ -30,18 +30,19 @@ fmtr/tools/name_tools.py,sha256=5CB_phqhHjl66iI8oLxOGPF2odC1apdul-M8Fv2xBhs,5514
30
30
  fmtr/tools/netrc_tools.py,sha256=PpNpz_mWlQi6VHGromKwFfTyLpHUXsd4LY6-OKLCbeI,376
31
31
  fmtr/tools/openai_tools.py,sha256=6SUgejgzUzmlKKct2_ePXntvMegu3FJgfk9x7aqtqYc,742
32
32
  fmtr/tools/parallel_tools.py,sha256=G__ZbLRRx4cP5OyqY1hKwnE-VI3m5prYABB0tnZHnes,3132
33
- fmtr/tools/path_tools.py,sha256=iNIG-nxBX-uiyfyOapJZaUiS4pI8sHsxm6WhVHixQ0M,3700
33
+ fmtr/tools/path_tools.py,sha256=1GeWXdhV5rH99IfLI5ZFEnOJfs4Q4mYTT2R-rA791iQ,4273
34
+ fmtr/tools/pdf_tools.py,sha256=3XZ6Tpvuf46oeYENsYQsYn-6hbTqHDSJesqNZeJ1Rko,3563
34
35
  fmtr/tools/platform_tools.py,sha256=7p69CmAHe_sF68Fx9uVhns1k5EewTHTWgUYzkl6ZQKA,308
35
36
  fmtr/tools/process_tools.py,sha256=Ysh5Dk2QFBhXQerArjKdt7xZd3JrN5Ho02AaOjH0Nnw,1425
36
37
  fmtr/tools/profiling_tools.py,sha256=jpXVjaNKPydTasEQVNXvxzGtMhXPit08AnJddkU8uIc,46
37
38
  fmtr/tools/random_tools.py,sha256=4VlQdk5THbR8ka4pZaLbk_ZO_4yy6PF_lHZes_rgenY,2223
38
39
  fmtr/tools/semantic_tools.py,sha256=cxY9NSAHWj4nEc6Oj4qA1omR3dWbl2OuH7_PkINc6_E,1386
39
40
  fmtr/tools/spaces_tools.py,sha256=D_he3mve6DruB3OPS6QyzqD05ChHnRTb4buViKPe7To,1099
40
- fmtr/tools/string_tools.py,sha256=w0lw70bgzJ8tAHj_4lMrjtMyefE5kELgpCBgGzGcalo,3117
41
+ fmtr/tools/string_tools.py,sha256=U2EptMWR6KDOP22ZQ4ReUHV4i25SP7xwCmZScI1sy4M,3233
41
42
  fmtr/tools/tokenization_tools.py,sha256=9FP5vgPufWv0XA961eVKObFll0d_2mM0W3ut3rtZyeo,4329
42
43
  fmtr/tools/tools.py,sha256=xnfUrOnrT4OxFYez6vV5tAhydzCICJFiGVnviiZDEQo,796
43
44
  fmtr/tools/unicode_tools.py,sha256=yS_9wpu8ogNoiIL7s1G_8bETFFO_YQlo4LNPv1NLDeY,52
44
- fmtr/tools/version,sha256=VmLTmBbOULW3noulizLqu_B8fqxN0ySSfEW-8f3y1Sk,6
45
+ fmtr/tools/version,sha256=V-CY1GAptD08dwDUv8tP0IXGjOPKFQq8iTv5XUR7qhc,6
45
46
  fmtr/tools/version_tools.py,sha256=axzzHBS9V1n6YuSacsDKG3VfAvRqR8qr6aENCibR8vs,1248
46
47
  fmtr/tools/yaml_tools.py,sha256=Ol43ZwbnSXGnn1K98Uxx61KPGSqfC4axE-X2q1LKMwk,349
47
48
  fmtr/tools/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -52,9 +53,9 @@ fmtr/tools/tests/test_environment.py,sha256=iHaiMQfECYZPkPKwfuIZV9uHuWe3aE-p_dN_
52
53
  fmtr/tools/tests/test_json.py,sha256=IeSP4ziPvRcmS8kq7k9tHonC9rN5YYq9GSNT2ul6Msk,287
53
54
  fmtr/tools/tests/test_path.py,sha256=AkZQa6_8BQ-VaCyL_J-iKmdf2ZaM-xFYR37Kun3k4_g,2188
54
55
  fmtr/tools/tests/test_yaml.py,sha256=jc0TwwKu9eC0LvFGNMERdgBue591xwLxYXFbtsRwXVM,287
55
- fmtr.tools-1.0.31.dist-info/LICENSE,sha256=FW9aa6vVN5IjRQWLT43hs4_koYSmpcbIovlKeAJ0_cI,10757
56
- fmtr.tools-1.0.31.dist-info/METADATA,sha256=VOhy728mlqRmzO9jP8wUEN8rWHYap9dcf9AA0beu9r8,13207
57
- fmtr.tools-1.0.31.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
58
- fmtr.tools-1.0.31.dist-info/entry_points.txt,sha256=CEStVkwJ1mTFvhN1WV5RdW83SkNW1d5Syj-KZ6A19ng,72
59
- fmtr.tools-1.0.31.dist-info/top_level.txt,sha256=t5341a8ii3n4RFizwTeXGmcq_pf4GqL1h9ylE5LIWRk,12
60
- fmtr.tools-1.0.31.dist-info/RECORD,,
56
+ fmtr.tools-1.0.33.dist-info/LICENSE,sha256=FW9aa6vVN5IjRQWLT43hs4_koYSmpcbIovlKeAJ0_cI,10757
57
+ fmtr.tools-1.0.33.dist-info/METADATA,sha256=G6ybzpAyBJfpfyoTPLQBXB6HJRrG-3lKwc84YaSI9nI,13438
58
+ fmtr.tools-1.0.33.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
59
+ fmtr.tools-1.0.33.dist-info/entry_points.txt,sha256=CEStVkwJ1mTFvhN1WV5RdW83SkNW1d5Syj-KZ6A19ng,72
60
+ fmtr.tools-1.0.33.dist-info/top_level.txt,sha256=t5341a8ii3n4RFizwTeXGmcq_pf4GqL1h9ylE5LIWRk,12
61
+ fmtr.tools-1.0.33.dist-info/RECORD,,