mmar-mapi 1.0.19__tar.gz → 1.0.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mmar-mapi might be problematic. Click here for more details.
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/PKG-INFO +1 -1
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/pyproject.toml +1 -1
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/api.py +28 -14
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/file_storage.py +3 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/LICENSE +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/README.md +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/__init__.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/decorators_maybe_lru_cache.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/models/__init__.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/models/base.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/models/chat.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/models/chat_item.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/models/enums.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/models/tracks.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/models/widget.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/type_union.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/utils.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/utils_import.py +0 -0
- {mmar_mapi-1.0.19 → mmar_mapi-1.0.21}/src/mmar_mapi/xml_parser.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "mmar-mapi"
|
|
3
3
|
# dynamic version is not supported yet on uv_build
|
|
4
|
-
version = "1.0.19"
|
|
4
|
+
version = "1.0.21"
|
|
5
5
|
description = "Common pure/IO utilities for multi-modal architectures team"
|
|
6
6
|
authors = [{name = "Eugene Tagin", email = "tagin@airi.net"}]
|
|
7
7
|
license = "MIT"
|
|
@@ -71,13 +71,18 @@ class TextExtractorAPI:
|
|
|
71
71
|
raise NotImplementedError
|
|
72
72
|
|
|
73
73
|
|
|
74
|
-
|
|
74
|
+
def _validate_page_range(v: tuple[int, int]) -> tuple[int, int]:
|
|
75
|
+
if v[0] < 1 or v[1] < v[0]:
|
|
76
|
+
raise ValueError("Invalid page range: start must be ≥ 1 and end must be ≥ start.")
|
|
77
|
+
return v
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
PageRange = Annotated[tuple[int, int], AfterValidator(_validate_page_range)]
|
|
75
81
|
ForceOCR = StrEnum("ForceOCR", ["ENABLED", "DISABLED", "AUTO"])
|
|
76
82
|
OutputType = StrEnum("OutputType", ["RAW", "PLAIN", "MARKDOWN"])
|
|
77
83
|
|
|
78
84
|
|
|
79
|
-
class DocExtractionSpec(BaseModel):
|
|
80
|
-
page_range: PageRange | None = None
|
|
85
|
+
class ExtractionEngineSpec(BaseModel):
|
|
81
86
|
output_type: OutputType = OutputType.MARKDOWN
|
|
82
87
|
force_ocr: ForceOCR = ForceOCR.AUTO
|
|
83
88
|
do_ocr: bool = False
|
|
@@ -88,19 +93,28 @@ class DocExtractionSpec(BaseModel):
|
|
|
88
93
|
generate_page_images: bool = False
|
|
89
94
|
images_scale: float = 2.0
|
|
90
95
|
|
|
96
|
+
|
|
97
|
+
class DocExtractionSpec(BaseModel):
|
|
98
|
+
page_range: PageRange | None = None
|
|
99
|
+
engine: ExtractionEngineSpec = ExtractionEngineSpec()
|
|
100
|
+
|
|
91
101
|
def _update(self, **update):
|
|
92
102
|
return self.model_copy(update=update)
|
|
93
103
|
|
|
104
|
+
def _update_engine(self, **engine_update):
|
|
105
|
+
return self._update(engine=self.engine.model_copy(update=engine_update))
|
|
106
|
+
|
|
94
107
|
# fmt: off
|
|
95
|
-
def with_output_type_raw(self): return self._update(output_type=OutputType.RAW)
|
|
96
|
-
def with_output_type_plain(self): return self._update(output_type=OutputType.PLAIN)
|
|
97
|
-
def with_ocr(self): return self._update(do_ocr=True)
|
|
98
|
-
def with_tables(self): return self._update(do_table_structure=True, do_cell_matching=True)
|
|
99
|
-
def with_images(self): return self._update(do_image_extraction=True)
|
|
100
|
-
def with_annotations(self): return self._update(do_annotations=True)
|
|
101
|
-
def with_force_ocr_enabled(self): return self._update(force_ocr=ForceOCR.ENABLED)
|
|
102
|
-
def with_force_ocr_disabled(self): return self._update(force_ocr=ForceOCR.DISABLED)
|
|
103
|
-
def with_page_images(self): return self._update(generate_page_images=True)
|
|
108
|
+
def with_output_type_raw(self): return self._update_engine(output_type=OutputType.RAW)
|
|
109
|
+
def with_output_type_plain(self): return self._update_engine(output_type=OutputType.PLAIN)
|
|
110
|
+
def with_ocr(self): return self._update_engine(do_ocr=True)
|
|
111
|
+
def with_tables(self): return self._update_engine(do_table_structure=True, do_cell_matching=True)
|
|
112
|
+
def with_images(self): return self._update_engine(do_image_extraction=True)
|
|
113
|
+
def with_annotations(self): return self._update_engine(do_annotations=True)
|
|
114
|
+
def with_force_ocr_enabled(self): return self._update_engine(force_ocr=ForceOCR.ENABLED)
|
|
115
|
+
def with_force_ocr_disabled(self): return self._update_engine(force_ocr=ForceOCR.DISABLED)
|
|
116
|
+
def with_page_images(self): return self._update_engine(generate_page_images=True)
|
|
117
|
+
|
|
104
118
|
def with_page_range(self, page_range: PageRange): return self._update(page_range=page_range)
|
|
105
119
|
# fmt: on
|
|
106
120
|
|
|
@@ -130,7 +144,7 @@ class ExtractedPageImage(ExtractedImage):
|
|
|
130
144
|
|
|
131
145
|
|
|
132
146
|
class DocExtractionOutput(BaseModel):
|
|
133
|
-
|
|
147
|
+
spec: DocExtractionSpec
|
|
134
148
|
text: str = ""
|
|
135
149
|
tables: list[ExtractedTable] = []
|
|
136
150
|
pictures: list[ExtractedPicture] = []
|
|
@@ -138,6 +152,6 @@ class DocExtractionOutput(BaseModel):
|
|
|
138
152
|
|
|
139
153
|
|
|
140
154
|
class DocumentExtractorAPI:
|
|
141
|
-
def extract(self, *, resource_id: ResourceId, spec: DocExtractionSpec) -> ResourceId:
|
|
155
|
+
def extract(self, *, resource_id: ResourceId, spec: DocExtractionSpec) -> ResourceId | None:
|
|
142
156
|
"""returns file with DocExtractionOutput"""
|
|
143
157
|
raise NotImplementedError
|
|
@@ -96,6 +96,9 @@ class FileStorage:
|
|
|
96
96
|
res = self.download_text(resource_id).split("\n")
|
|
97
97
|
return res
|
|
98
98
|
|
|
99
|
+
def get_path(self, resource_id: ResourceId | None) -> Path | None:
|
|
100
|
+
return self._get_path(resource_id)
|
|
101
|
+
|
|
99
102
|
def _get_path(self, resource_id: ResourceId | None) -> Path | None:
|
|
100
103
|
if not resource_id:
|
|
101
104
|
return None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|