mmar-mapi 1.0.18__py3-none-any.whl → 1.0.19__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mmar-mapi might be problematic. Click here for more details.

mmar_mapi/api.py CHANGED
@@ -1,11 +1,14 @@
1
+ from enum import StrEnum
2
+ from typing import Annotated
3
+
4
+ from pydantic import AfterValidator, BaseModel
5
+
6
+ from mmar_mapi.file_storage import ResourceId
1
7
  from mmar_mapi.models.chat import Chat, ChatMessage
2
8
  from mmar_mapi.models.tracks import DomainInfo, TrackInfo
3
- from pydantic import BaseModel
4
-
5
9
 
6
10
  Value = str
7
11
  Interpretation = str
8
- ResourceId = str
9
12
 
10
13
 
11
14
  class ChatManagerAPI:
@@ -60,3 +63,81 @@ class ContentInterpreterAPI:
60
63
  class TextProcessorAPI:
61
64
  def process(self, *, text: str, chat: Chat | None = None) -> str:
62
65
  raise NotImplementedError
66
+
67
+
68
+ class TextExtractorAPI:
69
+ def extract(self, *, resource_id: ResourceId) -> ResourceId:
70
+ """returns file with text"""
71
+ raise NotImplementedError
72
+
73
+
74
+ PageRange = Annotated[tuple[int, int], AfterValidator(lambda rng: rng[0] <= rng[1])]
75
+ ForceOCR = StrEnum("ForceOCR", ["ENABLED", "DISABLED", "AUTO"])
76
+ OutputType = StrEnum("OutputType", ["RAW", "PLAIN", "MARKDOWN"])
77
+
78
+
79
+ class DocExtractionSpec(BaseModel):
80
+ page_range: PageRange | None = None
81
+ output_type: OutputType = OutputType.MARKDOWN
82
+ force_ocr: ForceOCR = ForceOCR.AUTO
83
+ do_ocr: bool = False
84
+ do_table_structure: bool = False
85
+ do_cell_matching: bool = False
86
+ do_annotations: bool = False
87
+ do_image_extraction: bool = False
88
+ generate_page_images: bool = False
89
+ images_scale: float = 2.0
90
+
91
+ def _update(self, **update):
92
+ return self.model_copy(update=update)
93
+
94
+ # fmt: off
95
+ def with_output_type_raw(self): return self._update(output_type=OutputType.RAW)
96
+ def with_output_type_plain(self): return self._update(output_type=OutputType.PLAIN)
97
+ def with_ocr(self): return self._update(do_ocr=True)
98
+ def with_tables(self): return self._update(do_table_structure=True, do_cell_matching=True)
99
+ def with_images(self): return self._update(do_image_extraction=True)
100
+ def with_annotations(self): return self._update(do_annotations=True)
101
+ def with_force_ocr_enabled(self): return self._update(force_ocr=ForceOCR.ENABLED)
102
+ def with_force_ocr_disabled(self): return self._update(force_ocr=ForceOCR.DISABLED)
103
+ def with_page_images(self): return self._update(generate_page_images=True)
104
+ def with_page_range(self, page_range: PageRange): return self._update(page_range=page_range)
105
+ # fmt: on
106
+
107
+
108
+ class ExtractedImage(BaseModel):
109
+ page: int
110
+ image_resource_id: ResourceId | None = None
111
+
112
+
113
+ class ExtractedImageMetadata(BaseModel):
114
+ annotation: str = ""
115
+ caption: str = ""
116
+ width: int | None = None
117
+ height: int | None = None
118
+
119
+
120
+ class ExtractedPicture(ExtractedImage, ExtractedImageMetadata):
121
+ pass
122
+
123
+
124
+ class ExtractedTable(ExtractedImage, ExtractedImageMetadata):
125
+ formatted_str: str
126
+
127
+
128
+ class ExtractedPageImage(ExtractedImage):
129
+ pass
130
+
131
+
132
+ class DocExtractionOutput(BaseModel):
133
+ config: DocExtractionSpec
134
+ text: str = ""
135
+ tables: list[ExtractedTable] = []
136
+ pictures: list[ExtractedPicture] = []
137
+ page_images: list[ExtractedPageImage] = []
138
+
139
+
140
+ class DocumentExtractorAPI:
141
+ def extract(self, *, resource_id: ResourceId, spec: DocExtractionSpec) -> ResourceId:
142
+ """returns file with DocExtractionOutput"""
143
+ raise NotImplementedError
mmar_mapi/file_storage.py CHANGED
@@ -45,7 +45,7 @@ class FileStorage:
45
45
  resource_id = self.upload(content, fname)
46
46
  return resource_id
47
47
 
48
- def upload(self, content: bytes | str, fname: str) -> ResourceId:
48
+ def upload(self, content: bytes | str, fname: str, origin: str | None = None) -> ResourceId:
49
49
  if isinstance(content, str):
50
50
  content = content.encode()
51
51
 
@@ -56,7 +56,7 @@ class FileStorage:
56
56
 
57
57
  fpath_md = fpath.with_suffix(SUFFIX_METADATA)
58
58
  update_date = f"{datetime.now():%Y-%m-%d--%H-%M-%S}"
59
- metadata = {"fname": fname, "update_date": update_date, "size": len(content)}
59
+ metadata = {"fname": fname, "update_date": update_date, "size": len(content), "origin": origin}
60
60
  fpath_md.write_text(json.dumps(metadata, ensure_ascii=False))
61
61
 
62
62
  return str(fpath)
@@ -67,6 +67,12 @@ class FileStorage:
67
67
  return None
68
68
  return json.loads(metadata_path.read_text())
69
69
 
70
+ def get_fname(self, resource_id: ResourceId) -> str | None:
71
+ metadata = self.get_metadata(resource_id)
72
+ if metadata is None:
73
+ return None
74
+ return metadata.get("fname")
75
+
70
76
  async def upload_async(self, content: bytes | str, fname: str) -> ResourceId:
71
77
  return self.upload(content, fname)
72
78
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mmar-mapi
3
- Version: 1.0.18
3
+ Version: 1.0.19
4
4
  Summary: Common pure/IO utilities for multi-modal architectures team
5
5
  Keywords:
6
6
  Author: Eugene Tagin
@@ -1,7 +1,7 @@
1
1
  mmar_mapi/__init__.py,sha256=9Q5xsrj26uUnn7ZWvvJUvdVIuzC2oCIeNB4dEoqjF-o,1256
2
- mmar_mapi/api.py,sha256=C9Sr8dISvf51xfEznPjccI_odaG4coQE3HI_0jVpjMQ,1677
2
+ mmar_mapi/api.py,sha256=R9v-1QQWocj5OjNk70T4XnEUTBYGujlwBFurbodiBZA,4373
3
3
  mmar_mapi/decorators_maybe_lru_cache.py,sha256=eO2I6t1fHLUNRABClK1c8EZzHAmCeSK6O-hbJGb2c9E,444
4
- mmar_mapi/file_storage.py,sha256=RNPHKDV7JIo2ZlSOyi7UfE8q7kpMvv7ZzfpTdoVg1vM,4687
4
+ mmar_mapi/file_storage.py,sha256=kxh2DcKY1M9MMb-U03doDYmowHH9VoGYetqBubIJhLI,4937
5
5
  mmar_mapi/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  mmar_mapi/models/base.py,sha256=mKtXV2x51XVj7W-et9tjGcPMDUUUMelW-BywMgFc2p0,411
7
7
  mmar_mapi/models/chat.py,sha256=-XilkiderIOFG1oSKRDG9NDOEN21sBpbTPHUrqVPjc4,15400
@@ -13,7 +13,7 @@ mmar_mapi/type_union.py,sha256=diwmzcnbqkpGFckPHNw9o8zyQ955mOGNvhTlcBJ0RMI,1905
13
13
  mmar_mapi/utils.py,sha256=FlW9n-84xz2zSHsahHzJ3Y4Wu5mjpFer6t9z6PF6lS0,488
14
14
  mmar_mapi/utils_import.py,sha256=pUyMFd8SItTxBKI-GO9JhRmy43jG_OQlUPr8QCBOSwg,1682
15
15
  mmar_mapi/xml_parser.py,sha256=VvLIX_XCZao9i0qqpTVx8nx0vbFXSe8pEbdJdXnj97g,568
16
- mmar_mapi-1.0.18.dist-info/licenses/LICENSE,sha256=2A90w8WjhOgQXnFuUijKJYazaqZ4_NTokYb9Po4y-9k,1061
17
- mmar_mapi-1.0.18.dist-info/WHEEL,sha256=-neZj6nU9KAMg2CnCY6T3w8J53nx1kFGw_9HfoSzM60,79
18
- mmar_mapi-1.0.18.dist-info/METADATA,sha256=EIQ57TyO_0Ur_a_04YCmNztCgtNjGmTlGfxXWWe9v2g,944
19
- mmar_mapi-1.0.18.dist-info/RECORD,,
16
+ mmar_mapi-1.0.19.dist-info/licenses/LICENSE,sha256=2A90w8WjhOgQXnFuUijKJYazaqZ4_NTokYb9Po4y-9k,1061
17
+ mmar_mapi-1.0.19.dist-info/WHEEL,sha256=-neZj6nU9KAMg2CnCY6T3w8J53nx1kFGw_9HfoSzM60,79
18
+ mmar_mapi-1.0.19.dist-info/METADATA,sha256=iFhcn0K4RZtc0xJSmsQM1fBbrbG1XvmMTeuC0XSdAHc,944
19
+ mmar_mapi-1.0.19.dist-info/RECORD,,