mmar-mapi 1.0.18__py3-none-any.whl → 1.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmar-mapi might be problematic. Click here for more details.
- mmar_mapi/api.py +92 -3
- mmar_mapi/file_storage.py +11 -2
- {mmar_mapi-1.0.18.dist-info → mmar_mapi-1.0.20.dist-info}/METADATA +1 -1
- {mmar_mapi-1.0.18.dist-info → mmar_mapi-1.0.20.dist-info}/RECORD +6 -6
- {mmar_mapi-1.0.18.dist-info → mmar_mapi-1.0.20.dist-info}/WHEEL +0 -0
- {mmar_mapi-1.0.18.dist-info → mmar_mapi-1.0.20.dist-info}/licenses/LICENSE +0 -0
mmar_mapi/api.py
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
|
+
from enum import StrEnum
|
|
2
|
+
from typing import Annotated
|
|
3
|
+
|
|
4
|
+
from pydantic import AfterValidator, BaseModel
|
|
5
|
+
|
|
6
|
+
from mmar_mapi.file_storage import ResourceId
|
|
1
7
|
from mmar_mapi.models.chat import Chat, ChatMessage
|
|
2
8
|
from mmar_mapi.models.tracks import DomainInfo, TrackInfo
|
|
3
|
-
from pydantic import BaseModel
|
|
4
|
-
|
|
5
9
|
|
|
6
10
|
Value = str
|
|
7
11
|
Interpretation = str
|
|
8
|
-
ResourceId = str
|
|
9
12
|
|
|
10
13
|
|
|
11
14
|
class ChatManagerAPI:
|
|
@@ -60,3 +63,89 @@ class ContentInterpreterAPI:
|
|
|
60
63
|
class TextProcessorAPI:
    """Interface stub for text processing.

    The base class provides no behaviour; presumably concrete services
    override ``process``.
    """

    def process(self, *, text: str, chat: Chat | None = None) -> str:
        """Process ``text`` (optionally accompanied by ``chat``) and return a string.

        Both arguments are keyword-only. This base implementation always
        raises ``NotImplementedError``.
        """
        raise NotImplementedError()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TextExtractorAPI:
    """Interface stub for turning a stored resource into a text file."""

    def extract(self, *, resource_id: ResourceId) -> ResourceId:
        """Return the resource id of a file that holds the extracted text.

        ``resource_id`` is keyword-only and identifies the input resource.
        This base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _validate_page_range(rng: tuple[int, int]) -> tuple[int, int]:
    """Reject a reversed page range and hand the tuple back unchanged.

    Pydantic stores whatever an ``AfterValidator`` callable returns as the
    field value, so the validator has to return the tuple itself. Returning
    the comparison result would replace the range with ``True``/``False``
    and never reject anything.

    Raises:
        ValueError: if the first element is greater than the second.
    """
    if rng[0] > rng[1]:
        raise ValueError(f"page range start {rng[0]} must not exceed end {rng[1]}")
    return rng


# Two-int tuple whose first element must not exceed the second.
PageRange = Annotated[tuple[int, int], AfterValidator(_validate_page_range)]
# String enums built with the functional API; members are ENABLED / DISABLED / AUTO.
ForceOCR = StrEnum("ForceOCR", ["ENABLED", "DISABLED", "AUTO"])
# Members are RAW / PLAIN / MARKDOWN.
OutputType = StrEnum("OutputType", ["RAW", "PLAIN", "MARKDOWN"])
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class ExtractionEngineSpec(BaseModel):
    # Engine-level extraction options. How each switch is honoured is up to
    # the extractor implementation, which is not part of this module.
    # (Kept as comments: a class docstring would become the JSON-schema
    # description of the model.)

    # Output format and OCR policy; defaults are MARKDOWN and AUTO.
    output_type: OutputType = OutputType.MARKDOWN
    force_ocr: ForceOCR = ForceOCR.AUTO

    # Feature switches, all off unless explicitly enabled.
    do_ocr: bool = False
    do_table_structure: bool = False
    do_cell_matching: bool = False
    do_annotations: bool = False
    do_image_extraction: bool = False
    generate_page_images: bool = False

    # Scale factor for images; defaults to 2.0.
    images_scale: float = 2.0
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class DocExtractionSpec(BaseModel):
    # Extraction request: an optional page range plus engine options.
    # Every ``with_*`` helper returns a modified copy built via ``model_copy``.
    page_range: PageRange | None = None
    engine: ExtractionEngineSpec = ExtractionEngineSpec()

    def _update(self, **update):
        """Return a copy of this spec with the given top-level fields replaced."""
        # NOTE(review): pydantic's model_copy(update=...) does not validate the
        # new values, so PageRange's validator is not applied on this path.
        return self.model_copy(update=update)

    def _update_engine(self, **engine_update):
        """Return a copy of this spec whose ``engine`` has the given fields replaced."""
        patched_engine = self.engine.model_copy(update=engine_update)
        return self._update(engine=patched_engine)

    def with_output_type_raw(self):
        """Return a copy with ``engine.output_type`` set to ``OutputType.RAW``."""
        return self._update_engine(output_type=OutputType.RAW)

    def with_output_type_plain(self):
        """Return a copy with ``engine.output_type`` set to ``OutputType.PLAIN``."""
        return self._update_engine(output_type=OutputType.PLAIN)

    def with_ocr(self):
        """Return a copy with ``engine.do_ocr`` enabled."""
        return self._update_engine(do_ocr=True)

    def with_tables(self):
        """Return a copy with both ``do_table_structure`` and ``do_cell_matching`` enabled."""
        return self._update_engine(do_table_structure=True, do_cell_matching=True)

    def with_images(self):
        """Return a copy with ``engine.do_image_extraction`` enabled."""
        return self._update_engine(do_image_extraction=True)

    def with_annotations(self):
        """Return a copy with ``engine.do_annotations`` enabled."""
        return self._update_engine(do_annotations=True)

    def with_force_ocr_enabled(self):
        """Return a copy with ``engine.force_ocr`` set to ``ForceOCR.ENABLED``."""
        return self._update_engine(force_ocr=ForceOCR.ENABLED)

    def with_force_ocr_disabled(self):
        """Return a copy with ``engine.force_ocr`` set to ``ForceOCR.DISABLED``."""
        return self._update_engine(force_ocr=ForceOCR.DISABLED)

    def with_page_images(self):
        """Return a copy with ``engine.generate_page_images`` enabled."""
        return self._update_engine(generate_page_images=True)

    def with_page_range(self, page_range: PageRange):
        """Return a copy with ``page_range`` replaced by the given value."""
        return self._update(page_range=page_range)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class ExtractedImage(BaseModel):
    # Base record for an image-like extraction result.
    # ``page`` is required; the image resource id is optional and defaults to None.
    page: int
    image_resource_id: ResourceId | None = None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class ExtractedImageMetadata(BaseModel):
    # Optional descriptive data for an extracted image.
    # Text fields default to the empty string, dimensions to None.
    annotation: str = ""
    caption: str = ""
    width: int | None = None
    height: int | None = None
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class ExtractedPicture(ExtractedImage, ExtractedImageMetadata):
    # Combines the fields of ExtractedImage and ExtractedImageMetadata;
    # declares nothing of its own.
    pass
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class ExtractedTable(ExtractedImage, ExtractedImageMetadata):
    # Inherits the image and metadata fields and adds one required string field.
    formatted_str: str
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class ExtractedPageImage(ExtractedImage):
    # Same fields as ExtractedImage (no metadata mixin); declares nothing of its own.
    pass
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class DocExtractionOutput(BaseModel):
    # Result container: the spec (required) together with the extracted
    # text and the lists of tables, pictures and page images.
    spec: DocExtractionSpec
    text: str = ""
    # Bare list defaults are kept as written; pydantic copies field defaults
    # per instance, so these lists are not shared between models.
    tables: list[ExtractedTable] = []
    pictures: list[ExtractedPicture] = []
    page_images: list[ExtractedPageImage] = []
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class DocumentExtractorAPI:
    """Interface stub for spec-driven document extraction."""

    def extract(self, *, resource_id: ResourceId, spec: DocExtractionSpec) -> ResourceId | None:
        """Return the resource id of a file holding a ``DocExtractionOutput``.

        Both arguments are keyword-only. The signature also permits ``None``;
        when that is returned is not specified here. This base implementation
        always raises ``NotImplementedError``.
        """
        raise NotImplementedError()
|
mmar_mapi/file_storage.py
CHANGED
|
@@ -45,7 +45,7 @@ class FileStorage:
|
|
|
45
45
|
resource_id = self.upload(content, fname)
|
|
46
46
|
return resource_id
|
|
47
47
|
|
|
48
|
-
def upload(self, content: bytes | str, fname: str) -> ResourceId:
|
|
48
|
+
def upload(self, content: bytes | str, fname: str, origin: str | None = None) -> ResourceId:
|
|
49
49
|
if isinstance(content, str):
|
|
50
50
|
content = content.encode()
|
|
51
51
|
|
|
@@ -56,7 +56,7 @@ class FileStorage:
|
|
|
56
56
|
|
|
57
57
|
fpath_md = fpath.with_suffix(SUFFIX_METADATA)
|
|
58
58
|
update_date = f"{datetime.now():%Y-%m-%d--%H-%M-%S}"
|
|
59
|
-
metadata = {"fname": fname, "update_date": update_date, "size": len(content)}
|
|
59
|
+
metadata = {"fname": fname, "update_date": update_date, "size": len(content), "origin": origin}
|
|
60
60
|
fpath_md.write_text(json.dumps(metadata, ensure_ascii=False))
|
|
61
61
|
|
|
62
62
|
return str(fpath)
|
|
@@ -67,6 +67,12 @@ class FileStorage:
|
|
|
67
67
|
return None
|
|
68
68
|
return json.loads(metadata_path.read_text())
|
|
69
69
|
|
|
70
|
+
def get_fname(self, resource_id: ResourceId) -> str | None:
    """Return the ``fname`` entry of the resource's metadata.

    Yields ``None`` when ``get_metadata`` returns ``None`` or when the
    metadata has no ``fname`` key.
    """
    md = self.get_metadata(resource_id)
    return None if md is None else md.get("fname")
|
|
75
|
+
|
|
70
76
|
async def upload_async(self, content: bytes | str, fname: str, origin: str | None = None) -> ResourceId:
    """Coroutine wrapper around ``upload``.

    ``origin`` is optional and forwarded unchanged, so callers of the async
    variant can record the same metadata as callers of ``upload``. The
    underlying call is made directly (not awaited or offloaded), so it runs
    synchronously inside the coroutine.
    """
    return self.upload(content, fname, origin)
|
|
72
78
|
|
|
@@ -90,6 +96,9 @@ class FileStorage:
|
|
|
90
96
|
res = self.download_text(resource_id).split("\n")
|
|
91
97
|
return res
|
|
92
98
|
|
|
99
|
+
def get_path(self, resource_id: ResourceId | None) -> Path | None:
    """Public entry point for path lookup; delegates to ``_get_path`` unchanged."""
    path = self._get_path(resource_id)
    return path
|
|
101
|
+
|
|
93
102
|
def _get_path(self, resource_id: ResourceId | None) -> Path | None:
|
|
94
103
|
if not resource_id:
|
|
95
104
|
return None
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
mmar_mapi/__init__.py,sha256=9Q5xsrj26uUnn7ZWvvJUvdVIuzC2oCIeNB4dEoqjF-o,1256
|
|
2
|
-
mmar_mapi/api.py,sha256=
|
|
2
|
+
mmar_mapi/api.py,sha256=LCWO4HN8mAmgpZI5KJ5MSZwI55Y7hWuplOo1e3EGC_I,4670
|
|
3
3
|
mmar_mapi/decorators_maybe_lru_cache.py,sha256=eO2I6t1fHLUNRABClK1c8EZzHAmCeSK6O-hbJGb2c9E,444
|
|
4
|
-
mmar_mapi/file_storage.py,sha256=
|
|
4
|
+
mmar_mapi/file_storage.py,sha256=xJU59HmXFsfc53XALdx53IwyqV_k4218AzzXq1Q65Js,5052
|
|
5
5
|
mmar_mapi/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
mmar_mapi/models/base.py,sha256=mKtXV2x51XVj7W-et9tjGcPMDUUUMelW-BywMgFc2p0,411
|
|
7
7
|
mmar_mapi/models/chat.py,sha256=-XilkiderIOFG1oSKRDG9NDOEN21sBpbTPHUrqVPjc4,15400
|
|
@@ -13,7 +13,7 @@ mmar_mapi/type_union.py,sha256=diwmzcnbqkpGFckPHNw9o8zyQ955mOGNvhTlcBJ0RMI,1905
|
|
|
13
13
|
mmar_mapi/utils.py,sha256=FlW9n-84xz2zSHsahHzJ3Y4Wu5mjpFer6t9z6PF6lS0,488
|
|
14
14
|
mmar_mapi/utils_import.py,sha256=pUyMFd8SItTxBKI-GO9JhRmy43jG_OQlUPr8QCBOSwg,1682
|
|
15
15
|
mmar_mapi/xml_parser.py,sha256=VvLIX_XCZao9i0qqpTVx8nx0vbFXSe8pEbdJdXnj97g,568
|
|
16
|
-
mmar_mapi-1.0.
|
|
17
|
-
mmar_mapi-1.0.
|
|
18
|
-
mmar_mapi-1.0.
|
|
19
|
-
mmar_mapi-1.0.
|
|
16
|
+
mmar_mapi-1.0.20.dist-info/licenses/LICENSE,sha256=2A90w8WjhOgQXnFuUijKJYazaqZ4_NTokYb9Po4y-9k,1061
|
|
17
|
+
mmar_mapi-1.0.20.dist-info/WHEEL,sha256=-neZj6nU9KAMg2CnCY6T3w8J53nx1kFGw_9HfoSzM60,79
|
|
18
|
+
mmar_mapi-1.0.20.dist-info/METADATA,sha256=IoMO8Hdd07-s3F4aC3OuA7dgYgZddbXiK7ES5Jwnyck,944
|
|
19
|
+
mmar_mapi-1.0.20.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|