fmtr.tools 1.0.31__tar.gz → 1.0.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fmtr.tools might be problematic. Click here for more details.

Files changed (67) hide show
  1. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/PKG-INFO +2 -1
  2. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/__init__.py +5 -0
  3. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/api_tools.py +5 -0
  4. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/path_tools.py +23 -0
  5. fmtr.tools-1.0.33/fmtr/tools/pdf_tools.py +172 -0
  6. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/string_tools.py +7 -0
  7. fmtr.tools-1.0.33/fmtr/tools/version +1 -0
  8. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/PKG-INFO +2 -1
  9. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/SOURCES.txt +1 -0
  10. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/requires.txt +7 -0
  11. fmtr.tools-1.0.31/fmtr/tools/version +0 -1
  12. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/LICENSE +0 -0
  13. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/README.md +0 -0
  14. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/ai_tools.py +0 -0
  15. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/async_tools.py +0 -0
  16. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/augmentation_tools.py +0 -0
  17. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/caching_tools.py +0 -0
  18. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/config.py +0 -0
  19. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/config_tools.py +0 -0
  20. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/console_script_tools.py +0 -0
  21. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/data_modelling_tools.py +0 -0
  22. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/dataclass_tools.py +0 -0
  23. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/datatype_tools.py +0 -0
  24. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/docker_tools.py +0 -0
  25. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/environment_tools.py +0 -0
  26. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/function_tools.py +0 -0
  27. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/google_api_tools.py +0 -0
  28. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/hash_tools.py +0 -0
  29. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/hfh_tools.py +0 -0
  30. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/html_tools.py +0 -0
  31. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/import_tools.py +0 -0
  32. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/inspection_tools.py +0 -0
  33. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/interface_tools.py +0 -0
  34. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/iterator_tools.py +0 -0
  35. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/json_fix_tools.py +0 -0
  36. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/json_tools.py +0 -0
  37. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/logging_tools.py +0 -0
  38. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/merging_tools.py +0 -0
  39. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/metric_tools.py +0 -0
  40. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/name_tools.py +0 -0
  41. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/netrc_tools.py +0 -0
  42. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/openai_tools.py +0 -0
  43. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/parallel_tools.py +0 -0
  44. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/platform_tools.py +0 -0
  45. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/process_tools.py +0 -0
  46. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/profiling_tools.py +0 -0
  47. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/random_tools.py +0 -0
  48. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/semantic_tools.py +0 -0
  49. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/spaces_tools.py +0 -0
  50. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/__init__.py +0 -0
  51. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/conftest.py +0 -0
  52. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/helpers.py +0 -0
  53. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/test_datatype.py +0 -0
  54. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/test_environment.py +0 -0
  55. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/test_json.py +0 -0
  56. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/test_path.py +0 -0
  57. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/test_yaml.py +0 -0
  58. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tokenization_tools.py +0 -0
  59. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tools.py +0 -0
  60. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/unicode_tools.py +0 -0
  61. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/version_tools.py +0 -0
  62. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/yaml_tools.py +0 -0
  63. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/dependency_links.txt +0 -0
  64. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/entry_points.txt +0 -0
  65. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/top_level.txt +0 -0
  66. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/setup.cfg +0 -0
  67. {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fmtr.tools
3
- Version: 1.0.31
3
+ Version: 1.0.33
4
4
  Summary: Collection of high-level tools to simplify everyday development tasks, with a focus on AI/ML
5
5
  Home-page: https://github.com/fmtr/fmtr.tools
6
6
  Author: Frontmatter
@@ -25,6 +25,7 @@ Provides-Extra: metric
25
25
  Provides-Extra: netrc
26
26
  Provides-Extra: openai.api
27
27
  Provides-Extra: parallel
28
+ Provides-Extra: pdf
28
29
  Provides-Extra: process
29
30
  Provides-Extra: profiling
30
31
  Provides-Extra: semantic
@@ -143,6 +143,11 @@ try:
143
143
  except ImportError as exception:
144
144
  caching = MissingExtraMockModule('caching', exception)
145
145
 
146
+ try:
147
+ from fmtr.tools import pdf_tools as pdf
148
+ except ImportError as exception:
149
+ pdf = MissingExtraMockModule('pdf', exception)
150
+
146
151
 
147
152
  __all__ = [
148
153
  'config',
@@ -83,6 +83,11 @@ class ApiBase:
83
83
 
84
84
  @classmethod
85
85
  def launch(cls):
86
+ """
87
+
88
+ Initialise self and launch.
89
+
90
+ """
86
91
  self = cls()
87
92
  logger.info(f'Launching API {cls.TITLE}...')
88
93
  uvicorn.run(self.app, host=self.HOST, port=self.PORT)
@@ -96,6 +96,29 @@ class Path(type(Path())):
96
96
  """
97
97
  return cls(gettempdir())
98
98
 
99
@classmethod
def data(cls, name='data') -> 'Path':
    """

    Locate the "data"/"artifacts" directory belonging to the calling package.

    The directory is first looked for two levels above the path reported by `get_call_path`
    (presumably the caller's own file - confirm against `inspection_tools`), then one further
    level up, so that both regular and namespace package layouts are handled.

    Raises FileNotFoundError, naming the last location tried, if neither candidate exists.

    """
    from fmtr.tools.inspection_tools import get_call_path

    # NOTE: keep this call directly in this method body, in case the helper inspects the call stack.
    base = get_call_path().absolute().parent.parent

    candidate = base / name
    if candidate.exists():
        return candidate

    # Fall back to the same directory name one level further up.
    candidate = candidate.parent.parent / name
    if not candidate.exists():
        raise FileNotFoundError(f'No "{name}" directory found at "{candidate}"')

    return candidate
121
+
99
122
  def write_json(self, obj) -> int:
100
123
  """
101
124
 
@@ -0,0 +1,172 @@
1
+ import pymupdf as pm
2
+ import pymupdf4llm
3
+ from typing import List, Tuple, Dict, Any, Self
4
+
5
+ from fmtr.tools import data_modelling_tools
6
+
7
+
8
class BoundingBox(data_modelling_tools.Base):
    # Rectangle described by the coordinates of its four edges.
    left: float
    top: float
    right: float
    bottom: float

    @property
    def order(self):
        """

        Sort key approximating natural reading order: the (top, left) corner first, then the (bottom, right) corner.

        """
        upper_left = self.top, self.left
        lower_right = self.bottom, self.right
        return upper_left, lower_right

    @property
    def rect(self) -> pm.Rect:
        """

        The same rectangle expressed as a PyMuPDF Rect

        """
        return pm.Rect(self.left, self.top, self.right, self.bottom)

    @classmethod
    def from_dict(cls, data: Tuple[float]) -> Self:
        """

        Instantiate from a PyMuPDF bbox sequence.

        Values are paired positionally with the model fields, so they are expected in
        left, top, right, bottom order.

        """
        # Iterating `model_fields` yields the field names in declaration order.
        return cls(**dict(zip(cls.model_fields, data)))
41
+
42
+
43
class Span(data_modelling_tools.Base):
    # A run of text together with its font attributes and position.
    size: float
    flags: int
    font: str
    color: int
    ascender: float
    descender: float
    text: str
    origin: Tuple[float, float]
    bbox: BoundingBox

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> Self:
        """

        Instantiate from PyMuPDF dictionary data

        The raw `bbox` entry is converted to a `BoundingBox`. The converted value is placed in a
        new mapping rather than assigned back into `data`, so the same dictionary can be passed
        in again without its `bbox` already having been replaced by a model object.

        """
        fields = {**data, 'bbox': BoundingBox.from_dict(data['bbox'])}
        return cls(**fields)
58
+
59
+
60
class Line(data_modelling_tools.Base):
    # A line of text made up of one or more spans.
    spans: List[Span]
    wmode: int
    dir: Tuple[float, float]
    bbox: BoundingBox

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'Line':
        """

        Instantiate from PyMuPDF dictionary data

        The raw `spans` and `bbox` entries are converted to model objects. The converted values
        are placed in a new mapping rather than assigned back into `data`, so the `spans` and
        `bbox` entries of the dictionary passed in are not overwritten.

        """
        fields = {
            **data,
            'spans': [Span.from_dict(span) for span in data['spans']],
            'bbox': BoundingBox.from_dict(data['bbox']),
        }
        return cls(**fields)

    @property
    def text(self) -> str:
        """

        Simple text representation: the text of each span, joined by single spaces

        """
        return ' '.join(span.text for span in self.spans)
85
+
86
+
87
class Block(data_modelling_tools.Base):
    # A page block: its number, type code, position and the lines it contains.
    number: int
    type: int
    bbox: BoundingBox
    lines: List[Line]

    @property
    def text(self) -> str:
        """

        Simple text representation: the text of every span of every line, joined by single spaces

        """
        # A Block has no `spans` field of its own; the spans live on its lines.
        return ' '.join(span.text for line in self.lines for span in line.spans)

    @classmethod
    def from_dict(cls, data: Dict) -> Self:
        """

        Instantiate from PyMuPDF dictionary data

        The raw `lines` and `bbox` entries are converted to model objects. The converted values
        are placed in a new mapping rather than assigned back into `data`, so the `lines` and
        `bbox` entries of the dictionary passed in are not overwritten.

        """
        fields = {
            **data,
            # PyMuPDF documents image blocks (type 1) as carrying no 'lines' entry; treat those as
            # blocks without text lines instead of failing with a KeyError.
            # NOTE(review): whether their image-specific keys are accepted depends on how Base
            # handles extra fields - confirm.
            'lines': [Line.from_dict(line) for line in data.get('lines', ())],
            'bbox': BoundingBox.from_dict(data['bbox']),
        }
        return cls(**fields)

    @property
    def rect(self) -> pm.Rect:
        """

        Position as a PyMuPDF Rect

        """
        return self.bbox.rect
121
+
122
+
123
class Document(pm.Document):
    """

    PyMuPDF Document extended with a data-modelled view of its blocks and a markdown export.

    """

    @property
    def data(self) -> List[Block]:
        """

        Every block of every page, in page order, converted to a Block object.

        """
        return [
            Block.from_dict(raw_block)
            for page in self
            for raw_block in page.get_text("dict")["blocks"]
        ]

    def to_markdown(self, **kwargs) -> str:
        """

        Render the document as markdown by delegating to `pymupdf4llm.to_markdown`.

        Keyword arguments are passed through unchanged.

        """
        markdown = pymupdf4llm.to_markdown(self, **kwargs)
        return markdown
154
+
155
+
156
if __name__ == '__main__':
    # Ad-hoc manual check: load a sample PDF from the data directory and dump its contents.
    from fmtr.tools.path_tools import Path

    PATH_DATA = Path.data()
    # Alternative sample: PATH_DATA / 'chib.pdf'
    PATH_PDF = PATH_DATA / 'kvm.pdf'
    assert PATH_PDF.exists()

    doc = Document(PATH_PDF)
    # Evaluated only to exercise the modelled conversion; the result is discarded.
    doc.data

    for page in doc:
        for output_format in ('dict', 'html'):
            print(page.get_text(output_format))

    md = doc.to_markdown()
    md
@@ -145,3 +145,10 @@ class Mask:
145
145
  return text
146
146
  except (KeyError, IndexError):
147
147
  return self
148
+
149
+
150
if __name__ == '__main__':
    # Ad-hoc manual check of `join` on a mix of empty, missing and regular values.
    import numpy as np

    sample_items = [1, None, 'test', np.nan, 0, '', 'yeah']
    st = join(sample_items)
    st
@@ -0,0 +1 @@
1
+ 1.0.33
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fmtr.tools
3
- Version: 1.0.31
3
+ Version: 1.0.33
4
4
  Summary: Collection of high-level tools to simplify everyday development tasks, with a focus on AI/ML
5
5
  Home-page: https://github.com/fmtr/fmtr.tools
6
6
  Author: Frontmatter
@@ -25,6 +25,7 @@ Provides-Extra: metric
25
25
  Provides-Extra: netrc
26
26
  Provides-Extra: openai.api
27
27
  Provides-Extra: parallel
28
+ Provides-Extra: pdf
28
29
  Provides-Extra: process
29
30
  Provides-Extra: profiling
30
31
  Provides-Extra: semantic
@@ -34,6 +34,7 @@ setup.py
34
34
  ./fmtr/tools/openai_tools.py
35
35
  ./fmtr/tools/parallel_tools.py
36
36
  ./fmtr/tools/path_tools.py
37
+ ./fmtr/tools/pdf_tools.py
37
38
  ./fmtr/tools/platform_tools.py
38
39
  ./fmtr/tools/process_tools.py
39
40
  ./fmtr/tools/profiling_tools.py
@@ -76,6 +76,11 @@ bokeh
76
76
  dask[bag]
77
77
  distributed
78
78
 
79
+ [pdf]
80
+ pydantic
81
+ pymupdf
82
+ pymupdf4llm
83
+
79
84
  [process]
80
85
  logfire
81
86
  semver
@@ -118,6 +123,8 @@ pandas
118
123
  peft
119
124
  pydantic
120
125
  pydantic-ai[logfire,openai]
126
+ pymupdf
127
+ pymupdf4llm
121
128
  pytest-cov
122
129
  pyyaml
123
130
  semver
@@ -1 +0,0 @@
1
- 1.0.31
File without changes
File without changes
File without changes
File without changes