mmar-mapi 1.0.19__py3-none-any.whl → 1.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmar-mapi might be problematic. Click here for more details.
- mmar_mapi/api.py +21 -13
- mmar_mapi/file_storage.py +3 -0
- {mmar_mapi-1.0.19.dist-info → mmar_mapi-1.0.20.dist-info}/METADATA +1 -1
- {mmar_mapi-1.0.19.dist-info → mmar_mapi-1.0.20.dist-info}/RECORD +6 -6
- {mmar_mapi-1.0.19.dist-info → mmar_mapi-1.0.20.dist-info}/WHEEL +0 -0
- {mmar_mapi-1.0.19.dist-info → mmar_mapi-1.0.20.dist-info}/licenses/LICENSE +0 -0
mmar_mapi/api.py
CHANGED
|
@@ -76,8 +76,7 @@ ForceOCR = StrEnum("ForceOCR", ["ENABLED", "DISABLED", "AUTO"])
|
|
|
76
76
|
OutputType = StrEnum("OutputType", ["RAW", "PLAIN", "MARKDOWN"])
|
|
77
77
|
|
|
78
78
|
|
|
79
|
-
class DocExtractionSpec(BaseModel):
|
|
80
|
-
page_range: PageRange | None = None
|
|
79
|
+
class ExtractionEngineSpec(BaseModel):
|
|
81
80
|
output_type: OutputType = OutputType.MARKDOWN
|
|
82
81
|
force_ocr: ForceOCR = ForceOCR.AUTO
|
|
83
82
|
do_ocr: bool = False
|
|
@@ -88,19 +87,28 @@ class DocExtractionSpec(BaseModel):
|
|
|
88
87
|
generate_page_images: bool = False
|
|
89
88
|
images_scale: float = 2.0
|
|
90
89
|
|
|
90
|
+
|
|
91
|
+
class DocExtractionSpec(BaseModel):
|
|
92
|
+
page_range: PageRange | None = None
|
|
93
|
+
engine: ExtractionEngineSpec = ExtractionEngineSpec()
|
|
94
|
+
|
|
91
95
|
def _update(self, **update):
|
|
92
96
|
return self.model_copy(update=update)
|
|
93
97
|
|
|
98
|
+
def _update_engine(self, **engine_update):
|
|
99
|
+
return self._update(engine=self.engine.model_copy(update=engine_update))
|
|
100
|
+
|
|
94
101
|
# fmt: off
|
|
95
|
-
def with_output_type_raw(self): return self.
|
|
96
|
-
def with_output_type_plain(self): return self.
|
|
97
|
-
def with_ocr(self): return self.
|
|
98
|
-
def with_tables(self): return self.
|
|
99
|
-
def with_images(self): return self.
|
|
100
|
-
def with_annotations(self): return self.
|
|
101
|
-
def with_force_ocr_enabled(self): return self.
|
|
102
|
-
def with_force_ocr_disabled(self): return self.
|
|
103
|
-
def with_page_images(self): return self.
|
|
102
|
+
def with_output_type_raw(self): return self._update_engine(output_type=OutputType.RAW)
|
|
103
|
+
def with_output_type_plain(self): return self._update_engine(output_type=OutputType.PLAIN)
|
|
104
|
+
def with_ocr(self): return self._update_engine(do_ocr=True)
|
|
105
|
+
def with_tables(self): return self._update_engine(do_table_structure=True, do_cell_matching=True)
|
|
106
|
+
def with_images(self): return self._update_engine(do_image_extraction=True)
|
|
107
|
+
def with_annotations(self): return self._update_engine(do_annotations=True)
|
|
108
|
+
def with_force_ocr_enabled(self): return self._update_engine(force_ocr=ForceOCR.ENABLED)
|
|
109
|
+
def with_force_ocr_disabled(self): return self._update_engine(force_ocr=ForceOCR.DISABLED)
|
|
110
|
+
def with_page_images(self): return self._update_engine(generate_page_images=True)
|
|
111
|
+
|
|
104
112
|
def with_page_range(self, page_range: PageRange): return self._update(page_range=page_range)
|
|
105
113
|
# fmt: on
|
|
106
114
|
|
|
@@ -130,7 +138,7 @@ class ExtractedPageImage(ExtractedImage):
|
|
|
130
138
|
|
|
131
139
|
|
|
132
140
|
class DocExtractionOutput(BaseModel):
|
|
133
|
-
|
|
141
|
+
spec: DocExtractionSpec
|
|
134
142
|
text: str = ""
|
|
135
143
|
tables: list[ExtractedTable] = []
|
|
136
144
|
pictures: list[ExtractedPicture] = []
|
|
@@ -138,6 +146,6 @@ class DocExtractionOutput(BaseModel):
|
|
|
138
146
|
|
|
139
147
|
|
|
140
148
|
class DocumentExtractorAPI:
|
|
141
|
-
def extract(self, *, resource_id: ResourceId, spec: DocExtractionSpec) -> ResourceId:
|
|
149
|
+
def extract(self, *, resource_id: ResourceId, spec: DocExtractionSpec) -> ResourceId | None:
|
|
142
150
|
"""returns file with DocExtractionOutput"""
|
|
143
151
|
raise NotImplementedError
|
mmar_mapi/file_storage.py
CHANGED
|
@@ -96,6 +96,9 @@ class FileStorage:
|
|
|
96
96
|
res = self.download_text(resource_id).split("\n")
|
|
97
97
|
return res
|
|
98
98
|
|
|
99
|
+
def get_path(self, resource_id: ResourceId | None) -> Path | None:
|
|
100
|
+
return self._get_path(resource_id)
|
|
101
|
+
|
|
99
102
|
def _get_path(self, resource_id: ResourceId | None) -> Path | None:
|
|
100
103
|
if not resource_id:
|
|
101
104
|
return None
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
mmar_mapi/__init__.py,sha256=9Q5xsrj26uUnn7ZWvvJUvdVIuzC2oCIeNB4dEoqjF-o,1256
|
|
2
|
-
mmar_mapi/api.py,sha256=
|
|
2
|
+
mmar_mapi/api.py,sha256=LCWO4HN8mAmgpZI5KJ5MSZwI55Y7hWuplOo1e3EGC_I,4670
|
|
3
3
|
mmar_mapi/decorators_maybe_lru_cache.py,sha256=eO2I6t1fHLUNRABClK1c8EZzHAmCeSK6O-hbJGb2c9E,444
|
|
4
|
-
mmar_mapi/file_storage.py,sha256=
|
|
4
|
+
mmar_mapi/file_storage.py,sha256=xJU59HmXFsfc53XALdx53IwyqV_k4218AzzXq1Q65Js,5052
|
|
5
5
|
mmar_mapi/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
mmar_mapi/models/base.py,sha256=mKtXV2x51XVj7W-et9tjGcPMDUUUMelW-BywMgFc2p0,411
|
|
7
7
|
mmar_mapi/models/chat.py,sha256=-XilkiderIOFG1oSKRDG9NDOEN21sBpbTPHUrqVPjc4,15400
|
|
@@ -13,7 +13,7 @@ mmar_mapi/type_union.py,sha256=diwmzcnbqkpGFckPHNw9o8zyQ955mOGNvhTlcBJ0RMI,1905
|
|
|
13
13
|
mmar_mapi/utils.py,sha256=FlW9n-84xz2zSHsahHzJ3Y4Wu5mjpFer6t9z6PF6lS0,488
|
|
14
14
|
mmar_mapi/utils_import.py,sha256=pUyMFd8SItTxBKI-GO9JhRmy43jG_OQlUPr8QCBOSwg,1682
|
|
15
15
|
mmar_mapi/xml_parser.py,sha256=VvLIX_XCZao9i0qqpTVx8nx0vbFXSe8pEbdJdXnj97g,568
|
|
16
|
-
mmar_mapi-1.0.
|
|
17
|
-
mmar_mapi-1.0.
|
|
18
|
-
mmar_mapi-1.0.
|
|
19
|
-
mmar_mapi-1.0.
|
|
16
|
+
mmar_mapi-1.0.20.dist-info/licenses/LICENSE,sha256=2A90w8WjhOgQXnFuUijKJYazaqZ4_NTokYb9Po4y-9k,1061
|
|
17
|
+
mmar_mapi-1.0.20.dist-info/WHEEL,sha256=-neZj6nU9KAMg2CnCY6T3w8J53nx1kFGw_9HfoSzM60,79
|
|
18
|
+
mmar_mapi-1.0.20.dist-info/METADATA,sha256=IoMO8Hdd07-s3F4aC3OuA7dgYgZddbXiK7ES5Jwnyck,944
|
|
19
|
+
mmar_mapi-1.0.20.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|