mmar-mapi 1.0.19__py3-none-any.whl → 1.0.21__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mmar-mapi might be problematic. Click here for more details.

mmar_mapi/api.py CHANGED
@@ -71,13 +71,18 @@ class TextExtractorAPI:
71
71
  raise NotImplementedError
72
72
 
73
73
 
74
- PageRange = Annotated[tuple[int, int], AfterValidator(lambda rng: rng[0] <= rng[1])]
74
+ def _validate_page_range(v: tuple[int, int]) -> tuple[int, int]:
75
+ if v[0] < 1 or v[1] < v[0]:
76
+ raise ValueError("Invalid page range: start must be ≥ 1 and end must be ≥ start.")
77
+ return v
78
+
79
+
80
+ PageRange = Annotated[tuple[int, int], AfterValidator(_validate_page_range)]
75
81
  ForceOCR = StrEnum("ForceOCR", ["ENABLED", "DISABLED", "AUTO"])
76
82
  OutputType = StrEnum("OutputType", ["RAW", "PLAIN", "MARKDOWN"])
77
83
 
78
84
 
79
- class DocExtractionSpec(BaseModel):
80
- page_range: PageRange | None = None
85
+ class ExtractionEngineSpec(BaseModel):
81
86
  output_type: OutputType = OutputType.MARKDOWN
82
87
  force_ocr: ForceOCR = ForceOCR.AUTO
83
88
  do_ocr: bool = False
@@ -88,19 +93,28 @@ class DocExtractionSpec(BaseModel):
88
93
  generate_page_images: bool = False
89
94
  images_scale: float = 2.0
90
95
 
96
+
97
+ class DocExtractionSpec(BaseModel):
98
+ page_range: PageRange | None = None
99
+ engine: ExtractionEngineSpec = ExtractionEngineSpec()
100
+
91
101
  def _update(self, **update):
92
102
  return self.model_copy(update=update)
93
103
 
104
+ def _update_engine(self, **engine_update):
105
+ return self._update(engine=self.engine.model_copy(update=engine_update))
106
+
94
107
  # fmt: off
95
- def with_output_type_raw(self): return self._update(output_type=OutputType.RAW)
96
- def with_output_type_plain(self): return self._update(output_type=OutputType.PLAIN)
97
- def with_ocr(self): return self._update(do_ocr=True)
98
- def with_tables(self): return self._update(do_table_structure=True, do_cell_matching=True)
99
- def with_images(self): return self._update(do_image_extraction=True)
100
- def with_annotations(self): return self._update(do_annotations=True)
101
- def with_force_ocr_enabled(self): return self._update(force_ocr=ForceOCR.ENABLED)
102
- def with_force_ocr_disabled(self): return self._update(force_ocr=ForceOCR.DISABLED)
103
- def with_page_images(self): return self._update(generate_page_images=True)
108
+ def with_output_type_raw(self): return self._update_engine(output_type=OutputType.RAW)
109
+ def with_output_type_plain(self): return self._update_engine(output_type=OutputType.PLAIN)
110
+ def with_ocr(self): return self._update_engine(do_ocr=True)
111
+ def with_tables(self): return self._update_engine(do_table_structure=True, do_cell_matching=True)
112
+ def with_images(self): return self._update_engine(do_image_extraction=True)
113
+ def with_annotations(self): return self._update_engine(do_annotations=True)
114
+ def with_force_ocr_enabled(self): return self._update_engine(force_ocr=ForceOCR.ENABLED)
115
+ def with_force_ocr_disabled(self): return self._update_engine(force_ocr=ForceOCR.DISABLED)
116
+ def with_page_images(self): return self._update_engine(generate_page_images=True)
117
+
104
118
  def with_page_range(self, page_range: PageRange): return self._update(page_range=page_range)
105
119
  # fmt: on
106
120
 
@@ -130,7 +144,7 @@ class ExtractedPageImage(ExtractedImage):
130
144
 
131
145
 
132
146
  class DocExtractionOutput(BaseModel):
133
- config: DocExtractionSpec
147
+ spec: DocExtractionSpec
134
148
  text: str = ""
135
149
  tables: list[ExtractedTable] = []
136
150
  pictures: list[ExtractedPicture] = []
@@ -138,6 +152,6 @@ class DocExtractionOutput(BaseModel):
138
152
 
139
153
 
140
154
  class DocumentExtractorAPI:
141
- def extract(self, *, resource_id: ResourceId, spec: DocExtractionSpec) -> ResourceId:
155
+ def extract(self, *, resource_id: ResourceId, spec: DocExtractionSpec) -> ResourceId | None:
142
156
  """returns file with DocExtractionOutput"""
143
157
  raise NotImplementedError
mmar_mapi/file_storage.py CHANGED
@@ -96,6 +96,9 @@ class FileStorage:
96
96
  res = self.download_text(resource_id).split("\n")
97
97
  return res
98
98
 
99
+ def get_path(self, resource_id: ResourceId | None) -> Path | None:
100
+ return self._get_path(resource_id)
101
+
99
102
  def _get_path(self, resource_id: ResourceId | None) -> Path | None:
100
103
  if not resource_id:
101
104
  return None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mmar-mapi
3
- Version: 1.0.19
3
+ Version: 1.0.21
4
4
  Summary: Common pure/IO utilities for multi-modal architectures team
5
5
  Keywords:
6
6
  Author: Eugene Tagin
@@ -1,7 +1,7 @@
1
1
  mmar_mapi/__init__.py,sha256=9Q5xsrj26uUnn7ZWvvJUvdVIuzC2oCIeNB4dEoqjF-o,1256
2
- mmar_mapi/api.py,sha256=R9v-1QQWocj5OjNk70T4XnEUTBYGujlwBFurbodiBZA,4373
2
+ mmar_mapi/api.py,sha256=8e_C3sfzX67bLDhgFJlUovIlhYN1Lw9ip5qCRpCCVVM,4869
3
3
  mmar_mapi/decorators_maybe_lru_cache.py,sha256=eO2I6t1fHLUNRABClK1c8EZzHAmCeSK6O-hbJGb2c9E,444
4
- mmar_mapi/file_storage.py,sha256=kxh2DcKY1M9MMb-U03doDYmowHH9VoGYetqBubIJhLI,4937
4
+ mmar_mapi/file_storage.py,sha256=xJU59HmXFsfc53XALdx53IwyqV_k4218AzzXq1Q65Js,5052
5
5
  mmar_mapi/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  mmar_mapi/models/base.py,sha256=mKtXV2x51XVj7W-et9tjGcPMDUUUMelW-BywMgFc2p0,411
7
7
  mmar_mapi/models/chat.py,sha256=-XilkiderIOFG1oSKRDG9NDOEN21sBpbTPHUrqVPjc4,15400
@@ -13,7 +13,7 @@ mmar_mapi/type_union.py,sha256=diwmzcnbqkpGFckPHNw9o8zyQ955mOGNvhTlcBJ0RMI,1905
13
13
  mmar_mapi/utils.py,sha256=FlW9n-84xz2zSHsahHzJ3Y4Wu5mjpFer6t9z6PF6lS0,488
14
14
  mmar_mapi/utils_import.py,sha256=pUyMFd8SItTxBKI-GO9JhRmy43jG_OQlUPr8QCBOSwg,1682
15
15
  mmar_mapi/xml_parser.py,sha256=VvLIX_XCZao9i0qqpTVx8nx0vbFXSe8pEbdJdXnj97g,568
16
- mmar_mapi-1.0.19.dist-info/licenses/LICENSE,sha256=2A90w8WjhOgQXnFuUijKJYazaqZ4_NTokYb9Po4y-9k,1061
17
- mmar_mapi-1.0.19.dist-info/WHEEL,sha256=-neZj6nU9KAMg2CnCY6T3w8J53nx1kFGw_9HfoSzM60,79
18
- mmar_mapi-1.0.19.dist-info/METADATA,sha256=iFhcn0K4RZtc0xJSmsQM1fBbrbG1XvmMTeuC0XSdAHc,944
19
- mmar_mapi-1.0.19.dist-info/RECORD,,
16
+ mmar_mapi-1.0.21.dist-info/licenses/LICENSE,sha256=2A90w8WjhOgQXnFuUijKJYazaqZ4_NTokYb9Po4y-9k,1061
17
+ mmar_mapi-1.0.21.dist-info/WHEEL,sha256=-neZj6nU9KAMg2CnCY6T3w8J53nx1kFGw_9HfoSzM60,79
18
+ mmar_mapi-1.0.21.dist-info/METADATA,sha256=EFNEhMdQS1WL5gCxZFSqaqBGdQQRriWcWh9aPFkCm2k,944
19
+ mmar_mapi-1.0.21.dist-info/RECORD,,