fmtr.tools 1.0.31__tar.gz → 1.0.33__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fmtr.tools might be problematic. Click here for more details.
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/PKG-INFO +2 -1
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/__init__.py +5 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/api_tools.py +5 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/path_tools.py +23 -0
- fmtr.tools-1.0.33/fmtr/tools/pdf_tools.py +172 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/string_tools.py +7 -0
- fmtr.tools-1.0.33/fmtr/tools/version +1 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/PKG-INFO +2 -1
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/SOURCES.txt +1 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/requires.txt +7 -0
- fmtr.tools-1.0.31/fmtr/tools/version +0 -1
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/LICENSE +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/README.md +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/ai_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/async_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/augmentation_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/caching_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/config.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/config_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/console_script_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/data_modelling_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/dataclass_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/datatype_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/docker_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/environment_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/function_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/google_api_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/hash_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/hfh_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/html_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/import_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/inspection_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/interface_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/iterator_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/json_fix_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/json_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/logging_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/merging_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/metric_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/name_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/netrc_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/openai_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/parallel_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/platform_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/process_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/profiling_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/random_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/semantic_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/spaces_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/__init__.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/conftest.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/helpers.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/test_datatype.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/test_environment.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/test_json.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/test_path.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tests/test_yaml.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tokenization_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/unicode_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/version_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr/tools/yaml_tools.py +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/dependency_links.txt +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/entry_points.txt +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/fmtr.tools.egg-info/top_level.txt +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/setup.cfg +0 -0
- {fmtr.tools-1.0.31 → fmtr.tools-1.0.33}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: fmtr.tools
|
|
3
|
-
Version: 1.0.31
|
|
3
|
+
Version: 1.0.33
|
|
4
4
|
Summary: Collection of high-level tools to simplify everyday development tasks, with a focus on AI/ML
|
|
5
5
|
Home-page: https://github.com/fmtr/fmtr.tools
|
|
6
6
|
Author: Frontmatter
|
|
@@ -25,6 +25,7 @@ Provides-Extra: metric
|
|
|
25
25
|
Provides-Extra: netrc
|
|
26
26
|
Provides-Extra: openai.api
|
|
27
27
|
Provides-Extra: parallel
|
|
28
|
+
Provides-Extra: pdf
|
|
28
29
|
Provides-Extra: process
|
|
29
30
|
Provides-Extra: profiling
|
|
30
31
|
Provides-Extra: semantic
|
|
@@ -143,6 +143,11 @@ try:
|
|
|
143
143
|
except ImportError as exception:
|
|
144
144
|
caching = MissingExtraMockModule('caching', exception)
|
|
145
145
|
|
|
146
|
+
try:
|
|
147
|
+
from fmtr.tools import pdf_tools as pdf
|
|
148
|
+
except ImportError as exception:
|
|
149
|
+
pdf = MissingExtraMockModule('pdf', exception)
|
|
150
|
+
|
|
146
151
|
|
|
147
152
|
__all__ = [
|
|
148
153
|
'config',
|
|
@@ -96,6 +96,29 @@ class Path(type(Path())):
|
|
|
96
96
|
"""
|
|
97
97
|
return cls(gettempdir())
|
|
98
98
|
|
|
99
|
+
@classmethod
|
|
100
|
+
def data(cls, name='data') -> 'Path':
|
|
101
|
+
"""
|
|
102
|
+
|
|
103
|
+
Fetch canonical "data"/"artifacts" path, whether calling package is regular or namespace package.
|
|
104
|
+
|
|
105
|
+
"""
|
|
106
|
+
from fmtr.tools.inspection_tools import get_call_path
|
|
107
|
+
path = get_call_path()
|
|
108
|
+
path = path.absolute().parent.parent
|
|
109
|
+
|
|
110
|
+
path /= name
|
|
111
|
+
|
|
112
|
+
if path.exists():
|
|
113
|
+
return path
|
|
114
|
+
|
|
115
|
+
path = path.parent.parent / name
|
|
116
|
+
|
|
117
|
+
if path.exists():
|
|
118
|
+
return path
|
|
119
|
+
|
|
120
|
+
raise FileNotFoundError(f'No "{name}" directory found at "{path}"')
|
|
121
|
+
|
|
99
122
|
def write_json(self, obj) -> int:
|
|
100
123
|
"""
|
|
101
124
|
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import pymupdf as pm
|
|
2
|
+
import pymupdf4llm
|
|
3
|
+
from typing import List, Tuple, Dict, Any, Self
|
|
4
|
+
|
|
5
|
+
from fmtr.tools import data_modelling_tools
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BoundingBox(data_modelling_tools.Base):
|
|
9
|
+
left: float
|
|
10
|
+
top: float
|
|
11
|
+
right: float
|
|
12
|
+
bottom: float
|
|
13
|
+
|
|
14
|
+
@property
|
|
15
|
+
def order(self):
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
Approximate natural reading order
|
|
19
|
+
|
|
20
|
+
"""
|
|
21
|
+
return (self.top, self.left), (self.bottom, self.right)
|
|
22
|
+
|
|
23
|
+
@property
|
|
24
|
+
def rect(self) -> pm.Rect:
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
Position as a PyMuPDF Rect
|
|
28
|
+
|
|
29
|
+
"""
|
|
30
|
+
return pm.Rect(self.left, self.top, self.right, self.bottom)
|
|
31
|
+
|
|
32
|
+
@classmethod
|
|
33
|
+
def from_dict(cls, data: Tuple[float]) -> Self:
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
Instantiate from PyMuPDF dictionary data
|
|
37
|
+
|
|
38
|
+
"""
|
|
39
|
+
data = {key: value for key, value in zip(cls.model_fields.keys(), data)}
|
|
40
|
+
return cls(**data)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class Span(data_modelling_tools.Base):
|
|
44
|
+
size: float
|
|
45
|
+
flags: int
|
|
46
|
+
font: str
|
|
47
|
+
color: int
|
|
48
|
+
ascender: float
|
|
49
|
+
descender: float
|
|
50
|
+
text: str
|
|
51
|
+
origin: Tuple[float, float]
|
|
52
|
+
bbox: BoundingBox
|
|
53
|
+
|
|
54
|
+
@classmethod
|
|
55
|
+
def from_dict(cls, data: Dict[str, Any]) -> Self:
|
|
56
|
+
data['bbox'] = BoundingBox.from_dict(data['bbox'])
|
|
57
|
+
return cls(**data)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class Line(data_modelling_tools.Base):
|
|
61
|
+
spans: List[Span]
|
|
62
|
+
wmode: int
|
|
63
|
+
dir: Tuple[float, float]
|
|
64
|
+
bbox: BoundingBox
|
|
65
|
+
|
|
66
|
+
@classmethod
|
|
67
|
+
def from_dict(cls, data: Dict[str, Any]) -> 'Line':
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
Instantiate from PyMuPDF dictionary data
|
|
71
|
+
|
|
72
|
+
"""
|
|
73
|
+
data['spans'] = [Span.from_dict(span) for span in data['spans']]
|
|
74
|
+
data['bbox'] = BoundingBox.from_dict(data['bbox'])
|
|
75
|
+
return cls(**data)
|
|
76
|
+
|
|
77
|
+
@property
|
|
78
|
+
def text(self) -> str:
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
Simple text representation
|
|
82
|
+
|
|
83
|
+
"""
|
|
84
|
+
return ' '.join([span.text for span in self.spans])
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class Block(data_modelling_tools.Base):
|
|
88
|
+
number: int
|
|
89
|
+
type: int
|
|
90
|
+
bbox: BoundingBox
|
|
91
|
+
lines: List[Line]
|
|
92
|
+
|
|
93
|
+
@property
|
|
94
|
+
def text(self) -> str:
|
|
95
|
+
"""
|
|
96
|
+
|
|
97
|
+
Simple text representation
|
|
98
|
+
|
|
99
|
+
"""
|
|
100
|
+
return ' '.join([span.text for span in self.spans])
|
|
101
|
+
|
|
102
|
+
@classmethod
|
|
103
|
+
def from_dict(cls, data: Dict) -> Self:
|
|
104
|
+
"""
|
|
105
|
+
|
|
106
|
+
Instantiate from PyMuPDF dictionary data
|
|
107
|
+
|
|
108
|
+
"""
|
|
109
|
+
data['lines'] = [Line.from_dict(line) for line in data['lines']]
|
|
110
|
+
data['bbox'] = BoundingBox.from_dict(data['bbox'])
|
|
111
|
+
return cls(**data)
|
|
112
|
+
|
|
113
|
+
@property
|
|
114
|
+
def rect(self) -> pm.Rect:
|
|
115
|
+
"""
|
|
116
|
+
|
|
117
|
+
Position as a PyMuPDF Rect
|
|
118
|
+
|
|
119
|
+
"""
|
|
120
|
+
return self.bbox.rect
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class Document(pm.Document):
|
|
124
|
+
"""
|
|
125
|
+
|
|
126
|
+
Subclassed Document object with data-modelled elements property and markdown conversion.
|
|
127
|
+
|
|
128
|
+
"""
|
|
129
|
+
|
|
130
|
+
@property
|
|
131
|
+
def data(self) -> List[Block]:
|
|
132
|
+
"""
|
|
133
|
+
|
|
134
|
+
Get representation of Document elements as Python objects.
|
|
135
|
+
|
|
136
|
+
"""
|
|
137
|
+
|
|
138
|
+
blocks = []
|
|
139
|
+
|
|
140
|
+
for page in self:
|
|
141
|
+
for block in page.get_text("dict")["blocks"]:
|
|
142
|
+
obj = Block.from_dict(block)
|
|
143
|
+
blocks.append(obj)
|
|
144
|
+
|
|
145
|
+
return blocks
|
|
146
|
+
|
|
147
|
+
def to_markdown(self, **kwargs) -> str:
|
|
148
|
+
"""
|
|
149
|
+
|
|
150
|
+
Markdown output via `pymupdf4llm`
|
|
151
|
+
|
|
152
|
+
"""
|
|
153
|
+
return pymupdf4llm.to_markdown(self, **kwargs)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
if __name__ == '__main__':
|
|
157
|
+
from fmtr.tools.path_tools import Path
|
|
158
|
+
|
|
159
|
+
PATH_DATA = Path.data()
|
|
160
|
+
# PATH_PDF=PATH_DATA/'chib.pdf'
|
|
161
|
+
PATH_PDF = PATH_DATA / 'kvm.pdf'
|
|
162
|
+
assert PATH_PDF.exists()
|
|
163
|
+
|
|
164
|
+
doc = Document(PATH_PDF)
|
|
165
|
+
doc.data
|
|
166
|
+
|
|
167
|
+
for page in doc:
|
|
168
|
+
print(page.get_text('dict'))
|
|
169
|
+
print(page.get_text('html'))
|
|
170
|
+
|
|
171
|
+
md = doc.to_markdown()
|
|
172
|
+
md
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
1.0.33
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: fmtr.tools
|
|
3
|
-
Version: 1.0.31
|
|
3
|
+
Version: 1.0.33
|
|
4
4
|
Summary: Collection of high-level tools to simplify everyday development tasks, with a focus on AI/ML
|
|
5
5
|
Home-page: https://github.com/fmtr/fmtr.tools
|
|
6
6
|
Author: Frontmatter
|
|
@@ -25,6 +25,7 @@ Provides-Extra: metric
|
|
|
25
25
|
Provides-Extra: netrc
|
|
26
26
|
Provides-Extra: openai.api
|
|
27
27
|
Provides-Extra: parallel
|
|
28
|
+
Provides-Extra: pdf
|
|
28
29
|
Provides-Extra: process
|
|
29
30
|
Provides-Extra: profiling
|
|
30
31
|
Provides-Extra: semantic
|
|
@@ -76,6 +76,11 @@ bokeh
|
|
|
76
76
|
dask[bag]
|
|
77
77
|
distributed
|
|
78
78
|
|
|
79
|
+
[pdf]
|
|
80
|
+
pydantic
|
|
81
|
+
pymupdf
|
|
82
|
+
pymupdf4llm
|
|
83
|
+
|
|
79
84
|
[process]
|
|
80
85
|
logfire
|
|
81
86
|
semver
|
|
@@ -118,6 +123,8 @@ pandas
|
|
|
118
123
|
peft
|
|
119
124
|
pydantic
|
|
120
125
|
pydantic-ai[logfire,openai]
|
|
126
|
+
pymupdf
|
|
127
|
+
pymupdf4llm
|
|
121
128
|
pytest-cov
|
|
122
129
|
pyyaml
|
|
123
130
|
semver
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
1.0.31
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|