dd-core 1.2.2__tar.gz → 1.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dd_core-1.2.2 → dd_core-1.2.4}/PKG-INFO +1 -1
- {dd_core-1.2.2 → dd_core-1.2.4}/pyproject.toml +1 -1
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/__init__.py +1 -1
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/doc.py +44 -13
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/object_types.py +283 -176
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/pdf_utils.py +7 -4
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core.egg-info/PKG-INFO +1 -1
- {dd_core-1.2.2 → dd_core-1.2.4}/tests/test_session_id_removal_integration.py +3 -3
- {dd_core-1.2.2 → dd_core-1.2.4}/README.md +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/setup.cfg +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/__init__.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/base.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/common.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/custom.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/custom_serialize.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/parallel_map.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/serialize.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/stats.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/__init__.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/annotation.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/box.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/convert.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/image.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/view.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/__init__.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/cats.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/cocostruct.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/d2struct.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/hfstruct.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/laylmstruct.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/maputils.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/match.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/misc.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/nms.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/pascalstruct.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/prodigystruct.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/pubstruct.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/wandbstruct.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/xfundstruct.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/py.typed +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/__init__.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/concurrency.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/context.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/develop.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/env_info.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/error.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/file_utils.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/fs.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/identifier.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/logger.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/metacfg.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/ptutils.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/tqdm.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/transform.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/types.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/utils.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/viz.py +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core.egg-info/SOURCES.txt +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core.egg-info/dependency_links.txt +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core.egg-info/requires.txt +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core.egg-info/top_level.txt +0 -0
- {dd_core-1.2.2 → dd_core-1.2.4}/tests/test_doc.py +0 -0
|
@@ -705,23 +705,30 @@ class Document:
|
|
|
705
705
|
return path_json
|
|
706
706
|
|
|
707
707
|
@classmethod
|
|
708
|
-
def
|
|
708
|
+
def from_dict(cls, inputs: dict[str, Any]) -> Document:
|
|
709
709
|
"""
|
|
710
|
-
Create
|
|
710
|
+
Create a ``Document`` instance from a dict that has the same shape as
|
|
711
|
+
``as_dict()`` / ``from_json()``.
|
|
711
712
|
|
|
712
|
-
|
|
713
|
-
|
|
713
|
+
The dict is **not** mutated. Private fields (``_summary``,
|
|
714
|
+
``_images``, ``_page_references``) and composite fields
|
|
715
|
+
(``pipeline_jobs``) that cannot be passed directly to the dataclass
|
|
716
|
+
constructor are extracted and restored after construction.
|
|
717
|
+
|
|
718
|
+
Args:
|
|
719
|
+
inputs: Dict with the same keys as produced by :meth:`as_dict`.
|
|
720
|
+
|
|
721
|
+
Returns:
|
|
722
|
+
Document: Fully restored ``Document`` instance.
|
|
714
723
|
"""
|
|
715
|
-
|
|
716
|
-
raw: dict[str, Any] = json.load(f)
|
|
724
|
+
raw: dict[str, Any] = dict(inputs) # shallow copy – do not mutate caller's dict
|
|
717
725
|
|
|
718
726
|
summary_raw = raw.pop("_summary", None)
|
|
719
727
|
images_raw = raw.pop("_images", None)
|
|
720
728
|
page_refs_raw = raw.pop("_page_references", None)
|
|
721
729
|
pipeline_jobs = raw.pop("pipeline_jobs", {})
|
|
722
730
|
|
|
723
|
-
|
|
724
|
-
raw.pop("_processing_state")
|
|
731
|
+
raw.pop("_processing_state", None)
|
|
725
732
|
|
|
726
733
|
raw["compute_metadata"] = False
|
|
727
734
|
|
|
@@ -729,7 +736,11 @@ class Document:
|
|
|
729
736
|
|
|
730
737
|
if pipeline_jobs:
|
|
731
738
|
doc.pipeline_jobs = {
|
|
732
|
-
key:
|
|
739
|
+
key: (
|
|
740
|
+
val
|
|
741
|
+
if isinstance(val, PipelineJobs)
|
|
742
|
+
else PipelineJobs(**{k: v for k, v in val.items() if k != "session_id"})
|
|
743
|
+
)
|
|
733
744
|
for key, val in pipeline_jobs.items()
|
|
734
745
|
}
|
|
735
746
|
|
|
@@ -737,14 +748,21 @@ class Document:
|
|
|
737
748
|
doc.location = Path(doc.location)
|
|
738
749
|
|
|
739
750
|
if summary_raw is not None:
|
|
740
|
-
doc._summary =
|
|
751
|
+
doc._summary = (
|
|
752
|
+
summary_raw
|
|
753
|
+
if isinstance(summary_raw, CategoryAnnotation)
|
|
754
|
+
else CategoryAnnotation.from_dict(**summary_raw)
|
|
755
|
+
)
|
|
741
756
|
|
|
742
757
|
if images_raw is not None:
|
|
743
758
|
restored_images: dict[str, Image] = {}
|
|
744
759
|
for image_id, img in images_raw.items():
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
760
|
+
if isinstance(img, Image):
|
|
761
|
+
restored_images[image_id] = img
|
|
762
|
+
else:
|
|
763
|
+
restored_images[image_id] = Image(**img)
|
|
764
|
+
if img.get("_image"):
|
|
765
|
+
restored_images[image_id].image = img["_image"]
|
|
748
766
|
doc._images = restored_images
|
|
749
767
|
|
|
750
768
|
if page_refs_raw is not None:
|
|
@@ -755,6 +773,19 @@ class Document:
|
|
|
755
773
|
|
|
756
774
|
return doc
|
|
757
775
|
|
|
776
|
+
@classmethod
|
|
777
|
+
def from_json(cls, file_path: PathLikeOrStr) -> Document:
|
|
778
|
+
"""
|
|
779
|
+
Create `Document` instance from `.json` file.
|
|
780
|
+
|
|
781
|
+
Restores private attrs (e.g. `_images`, `_page_references`, `_summary`, `_processing_state`)
|
|
782
|
+
that are not populated automatically by pydantic from input data.
|
|
783
|
+
"""
|
|
784
|
+
with open(file_path, "r", encoding="UTF-8") as f:
|
|
785
|
+
raw: dict[str, Any] = json.load(f)
|
|
786
|
+
|
|
787
|
+
return cls.from_dict(raw)
|
|
788
|
+
|
|
758
789
|
def viz_entities( # type
|
|
759
790
|
self,
|
|
760
791
|
scaled_width: int = 900,
|
|
@@ -21,6 +21,7 @@ Module for funcs and constants that maintain general settings
|
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
23
|
import itertools
|
|
24
|
+
import re
|
|
24
25
|
import threading
|
|
25
26
|
from enum import Enum
|
|
26
27
|
from typing import Any, Callable, Iterable, Optional, Sequence, Type, Union
|
|
@@ -62,38 +63,301 @@ def _iter_registered_enums() -> Iterable[Type[ObjectTypes]]:
|
|
|
62
63
|
return object_types_registry.get_all().values()
|
|
63
64
|
|
|
64
65
|
|
|
65
|
-
def
|
|
66
|
+
def _wrapped_register(name: str, func: Optional[Any] = None) -> Callable[[Type[ObjectTypes]], Type[ObjectTypes]]:
|
|
67
|
+
def _decorator(cls: Type[ObjectTypes]) -> Type[ObjectTypes]:
|
|
68
|
+
with _TYPES_INDEX_LOCK:
|
|
69
|
+
registered_cls = _orig_register(name, func=func)(cls)
|
|
70
|
+
_rebuild_types_index_locked()
|
|
71
|
+
return registered_cls
|
|
72
|
+
|
|
73
|
+
return _decorator
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _upsert_dynamic_enum(name: str, members: Sequence[tuple[str, str]]) -> Type[ObjectTypes] | None:
|
|
66
77
|
"""
|
|
67
|
-
|
|
78
|
+
Idempotently register or extend a dynamic ObjectTypes enum under `name`.
|
|
79
|
+
|
|
80
|
+
Rules:
|
|
81
|
+
- existing values under the same enum name are preserved
|
|
82
|
+
- new values are appended
|
|
83
|
+
- repeated registration of the same values is a no-op
|
|
84
|
+
- values already owned by a different enum raise DuplicateObjectTypeError
|
|
68
85
|
"""
|
|
69
86
|
with _TYPES_INDEX_LOCK:
|
|
87
|
+
registered = object_types_registry.get_all()
|
|
88
|
+
existing_enum = registered.get(name)
|
|
89
|
+
|
|
90
|
+
existing_by_value: dict[str, str] = {}
|
|
91
|
+
if existing_enum is not None:
|
|
92
|
+
for member in existing_enum:
|
|
93
|
+
existing_by_value[str(member.value)] = member.name
|
|
94
|
+
|
|
95
|
+
merged_by_value = dict(existing_by_value)
|
|
96
|
+
|
|
97
|
+
for proposed_member_name, raw_value in members:
|
|
98
|
+
value = _normalize_object_type_value(raw_value)
|
|
99
|
+
|
|
100
|
+
existing_member = _ALL_TYPES_DICT.get(value)
|
|
101
|
+
|
|
102
|
+
if existing_member is not None and existing_member.__class__.__name__ != name:
|
|
103
|
+
continue
|
|
104
|
+
|
|
105
|
+
if value not in merged_by_value:
|
|
106
|
+
merged_by_value[value] = proposed_member_name
|
|
107
|
+
|
|
108
|
+
if merged_by_value == existing_by_value:
|
|
109
|
+
return existing_enum
|
|
110
|
+
|
|
111
|
+
if not merged_by_value:
|
|
112
|
+
return None
|
|
113
|
+
|
|
114
|
+
merged_members = [(member_name, value) for value, member_name in merged_by_value.items()]
|
|
115
|
+
merged_enum = ObjectTypes(name, merged_members) # type: ignore
|
|
116
|
+
|
|
117
|
+
registered_cls = _orig_register(name)(merged_enum)
|
|
118
|
+
_rebuild_types_index_locked()
|
|
119
|
+
return registered_cls
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _get_black_list() -> list[str]:
|
|
123
|
+
return _BLACK_LIST
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def update_black_list(item: str) -> None:
|
|
127
|
+
"""Updates the black list, i.e. set of elements that must not be lowered"""
|
|
128
|
+
_BLACK_LIST.append(item)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _normalize_object_type_value(obj_type: str) -> str:
|
|
132
|
+
"""
|
|
133
|
+
Canonical normalization for lookup and dynamic registration.
|
|
134
|
+
This must match get_type() semantics.
|
|
135
|
+
"""
|
|
136
|
+
obj_type = _get_new_obj_type_str(obj_type)
|
|
137
|
+
|
|
138
|
+
if obj_type.startswith(("B-", "E-", "I-", "S-")):
|
|
139
|
+
return obj_type[:2] + obj_type[2:].lower()
|
|
140
|
+
|
|
141
|
+
if obj_type not in _get_black_list():
|
|
142
|
+
return obj_type.lower()
|
|
143
|
+
|
|
144
|
+
return obj_type
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _flatten_categories(categories_list: Sequence[str]) -> list[str]:
|
|
148
|
+
"""
|
|
149
|
+
Flatten a possibly nested category sequence into a plain list[str].
|
|
150
|
+
"""
|
|
151
|
+
if categories_list and isinstance(categories_list[0], (list, tuple)):
|
|
152
|
+
return [str(item) for item in itertools.chain.from_iterable(categories_list)]
|
|
153
|
+
return [str(item) for item in categories_list]
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _dedupe_preserve_order(values: Sequence[str]) -> list[str]:
|
|
157
|
+
"""
|
|
158
|
+
Stable de-duplication preserving the first occurrence order.
|
|
159
|
+
"""
|
|
160
|
+
seen: set[str] = set()
|
|
161
|
+
out: list[str] = []
|
|
162
|
+
for value in values:
|
|
163
|
+
if value not in seen:
|
|
164
|
+
seen.add(value)
|
|
165
|
+
out.append(value)
|
|
166
|
+
return out
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _sanitize_enum_member_name(value: str, used_names: set[str]) -> str:
|
|
170
|
+
"""
|
|
171
|
+
Convert a value string into a safe enum member name.
|
|
172
|
+
"""
|
|
173
|
+
candidate = re.sub(r"[^A-Z0-9_]", "_", value.upper())
|
|
174
|
+
if not candidate:
|
|
175
|
+
candidate = "TYPE"
|
|
176
|
+
if candidate[0].isdigit():
|
|
177
|
+
candidate = f"TYPE_{candidate}"
|
|
178
|
+
|
|
179
|
+
name = candidate
|
|
180
|
+
suffix = 2
|
|
181
|
+
while name in used_names:
|
|
182
|
+
name = f"{candidate}_{suffix}"
|
|
183
|
+
suffix += 1
|
|
184
|
+
|
|
185
|
+
used_names.add(name)
|
|
186
|
+
return name
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _build_types_index_from_registry() -> dict[str, ObjectTypes]:
|
|
190
|
+
"""
|
|
191
|
+
Build a fresh value->member index from the current registry contents.
|
|
192
|
+
|
|
193
|
+
This is the only place where duplicate value detection happens.
|
|
194
|
+
"""
|
|
195
|
+
new_index: dict[str, ObjectTypes] = {}
|
|
196
|
+
|
|
197
|
+
for enum_cls in _iter_registered_enums():
|
|
70
198
|
for member in enum_cls:
|
|
71
|
-
|
|
72
|
-
existing =
|
|
199
|
+
value = str(member.value)
|
|
200
|
+
existing = new_index.get(value)
|
|
73
201
|
if existing is not None and existing is not member:
|
|
74
202
|
raise DuplicateObjectTypeError(
|
|
75
|
-
f"Object type value '{
|
|
203
|
+
f"Object type value '{value}' already taken by {existing!r}; cannot register {member!r}"
|
|
76
204
|
)
|
|
77
|
-
|
|
205
|
+
new_index[value] = member
|
|
206
|
+
|
|
207
|
+
return new_index
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _rebuild_types_index_locked() -> None:
|
|
211
|
+
"""
|
|
212
|
+
Rebuild the global lookup index from the registry.
|
|
213
|
+
Caller must hold _TYPES_INDEX_LOCK.
|
|
214
|
+
"""
|
|
215
|
+
global _ALL_TYPES_DICT # pylint: disable=W0603
|
|
216
|
+
_ALL_TYPES_DICT = _build_types_index_from_registry()
|
|
78
217
|
|
|
79
218
|
|
|
80
219
|
def _rebuild_types_index() -> None:
|
|
81
220
|
"""
|
|
82
|
-
|
|
221
|
+
Public/internal helper to rebuild the lookup index from the registry.
|
|
83
222
|
"""
|
|
84
223
|
with _TYPES_INDEX_LOCK:
|
|
85
|
-
|
|
86
|
-
for enum_cls in _iter_registered_enums():
|
|
87
|
-
_index_enum(enum_cls)
|
|
224
|
+
_rebuild_types_index_locked()
|
|
88
225
|
|
|
89
226
|
|
|
90
|
-
def
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
return registered_cls
|
|
227
|
+
def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes) -> ObjectTypes:
|
|
228
|
+
"""
|
|
229
|
+
Maps a `TokenClassWithTagLabel` enum member from a token class and tag, e.g. `TokenClassLabel.HEADER` and
|
|
230
|
+
`BioTag.INSIDE` maps to `TTokenClassWithTagLabel.I_HEADER`.
|
|
95
231
|
|
|
96
|
-
|
|
232
|
+
Args:
|
|
233
|
+
token: TokenClasses member.
|
|
234
|
+
tag: BioTag member.
|
|
235
|
+
|
|
236
|
+
Returns:
|
|
237
|
+
TokenClassWithTag member.
|
|
238
|
+
|
|
239
|
+
Raises:
|
|
240
|
+
TypeError: If token is not of type TokenClasses or tag is not of type BioTag.
|
|
241
|
+
"""
|
|
242
|
+
if isinstance(token, TokenClassLabel) and isinstance(tag, BioTagLabel):
|
|
243
|
+
return _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG[(token, tag)]
|
|
244
|
+
raise TypeError(
|
|
245
|
+
f"Token must be of type TokenClasses, is of {type(token)} and tag " f"{type(tag)} must be of type BioTag"
|
|
246
|
+
)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def token_class_with_tag_to_token_class_and_tag(
|
|
250
|
+
token_class_with_tag: ObjectTypes,
|
|
251
|
+
) -> Optional[tuple[ObjectTypes, ObjectTypes]]:
|
|
252
|
+
"""
|
|
253
|
+
This is the reverse mapping from TokenClassWithTag members to TokenClasses and BioTag
|
|
254
|
+
|
|
255
|
+
Args:
|
|
256
|
+
token_class_with_tag: `TokenClassWithTag` member
|
|
257
|
+
|
|
258
|
+
Returns:
|
|
259
|
+
Tuple of `TokenClasses` member and `BioTag` member
|
|
260
|
+
"""
|
|
261
|
+
return {val: key for key, val in _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG.items()}.get(token_class_with_tag)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def register_custom_token_tag(custom_object_types: ObjectTypes, suffix: str) -> str:
|
|
265
|
+
"""
|
|
266
|
+
Registers custom token tags for a given ObjectType with a specified suffix. The tags are created by combining
|
|
267
|
+
BIO tags (B, I, E) with the custom object types.
|
|
268
|
+
|
|
269
|
+
Args:
|
|
270
|
+
custom_object_types: An instance of ObjectTypes containing the custom object types to be registered.
|
|
271
|
+
suffix: A string suffix to be appended to the name of the registered object type.
|
|
272
|
+
|
|
273
|
+
Returns:
|
|
274
|
+
The name of the registered object type.
|
|
275
|
+
|
|
276
|
+
Example:
|
|
277
|
+
|
|
278
|
+
```python
|
|
279
|
+
from deepdoctection.utils.settings import ObjectTypes
|
|
280
|
+
|
|
281
|
+
class CustomObjectTypesLabel(ObjectTypes):
|
|
282
|
+
TOKEN_A = "token_a"
|
|
283
|
+
TOKEN_B = "token_b"
|
|
284
|
+
|
|
285
|
+
custom_object_types = CustomObjectTypes()
|
|
286
|
+
register_custom_token_tag(custom_object_types, "custom_type")
|
|
287
|
+
# This will register tags like "B-TOKEN_A", "I-TOKEN_A", "E-TOKEN_A", "B-TOKEN_B", "I-TOKEN_B", "E-TOKEN_B"
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
"""
|
|
291
|
+
tag_list = [i for i in object_types_registry.get("BioTagLabel") if i in ("B", "I", "E")]
|
|
292
|
+
name = f"{custom_object_types.__name__.lower()}_{suffix}" # type: ignore
|
|
293
|
+
|
|
294
|
+
product = [
|
|
295
|
+
(
|
|
296
|
+
a[0].value + "_" + a[1].value.upper(), # type: ignore
|
|
297
|
+
a[0].value + "-" + a[1].value, # type: ignore
|
|
298
|
+
)
|
|
299
|
+
for a in list(itertools.product(tag_list, custom_object_types))
|
|
300
|
+
]
|
|
301
|
+
|
|
302
|
+
_upsert_dynamic_enum(name, product)
|
|
303
|
+
return name
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def register_string_categories_from_list(categories_list: Sequence[str], object_type_name: str) -> None:
|
|
307
|
+
"""
|
|
308
|
+
Idempotently register or extend string categories under a dynamic ObjectTypes enum.
|
|
309
|
+
|
|
310
|
+
Repeated calls with the same values are a no-op.
|
|
311
|
+
Repeated calls with new values extend the enum under the same name.
|
|
312
|
+
"""
|
|
313
|
+
flattened_categories = _flatten_categories(categories_list)
|
|
314
|
+
normalized_values = _dedupe_preserve_order([_normalize_object_type_value(cat) for cat in flattened_categories])
|
|
315
|
+
|
|
316
|
+
if not normalized_values:
|
|
317
|
+
return
|
|
318
|
+
|
|
319
|
+
used_names: set[str] = set()
|
|
320
|
+
members = [(_sanitize_enum_member_name(value, used_names), value) for value in normalized_values]
|
|
321
|
+
|
|
322
|
+
_upsert_dynamic_enum(object_type_name, members)
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def update_all_types_dict() -> None:
|
|
326
|
+
"""
|
|
327
|
+
Compatibility helper retained for older code/tests that call it explicitly.
|
|
328
|
+
Rebuilds the global index from the registry.
|
|
329
|
+
"""
|
|
330
|
+
_rebuild_types_index()
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def get_type(obj_type: Union[str, ObjectTypes]) -> ObjectTypes:
|
|
334
|
+
"""
|
|
335
|
+
Get an object type property from a given string. Does nothing if an `ObjectType` is passed
|
|
336
|
+
|
|
337
|
+
Args:
|
|
338
|
+
obj_type: String or ObjectTypes
|
|
339
|
+
Returns:
|
|
340
|
+
`ObjectType`
|
|
341
|
+
"""
|
|
342
|
+
if isinstance(obj_type, ObjectTypes):
|
|
343
|
+
return obj_type
|
|
344
|
+
|
|
345
|
+
if not isinstance(obj_type, str):
|
|
346
|
+
raise TypeError(f"get_type expects str or ObjectTypes, got {type(obj_type)}")
|
|
347
|
+
|
|
348
|
+
normalized = _normalize_object_type_value(obj_type)
|
|
349
|
+
|
|
350
|
+
with _TYPES_INDEX_LOCK:
|
|
351
|
+
member = _ALL_TYPES_DICT.get(normalized)
|
|
352
|
+
|
|
353
|
+
if member is None:
|
|
354
|
+
raise KeyError(f"String {normalized} does not correspond to a registered ObjectType")
|
|
355
|
+
|
|
356
|
+
return member
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _get_new_obj_type_str(obj_type: str) -> str:
|
|
360
|
+
return _OLD_TO_NEW_OBJ_TYPE.get(obj_type, obj_type)
|
|
97
361
|
|
|
98
362
|
|
|
99
363
|
# Monkey-patch the registry to enforce duplicate detection for all modules.
|
|
@@ -103,6 +367,9 @@ _TYPES_INDEX_LOCK = threading.RLock()
|
|
|
103
367
|
_ALL_TYPES_DICT: dict[str, ObjectTypes] = {}
|
|
104
368
|
|
|
105
369
|
|
|
370
|
+
_rebuild_types_index()
|
|
371
|
+
|
|
372
|
+
|
|
106
373
|
@object_types_registry.register("DefaultType")
|
|
107
374
|
class DefaultType(ObjectTypes):
|
|
108
375
|
"""Type for default member"""
|
|
@@ -412,121 +679,6 @@ _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG = {
|
|
|
412
679
|
}
|
|
413
680
|
|
|
414
681
|
|
|
415
|
-
def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes) -> ObjectTypes:
|
|
416
|
-
"""
|
|
417
|
-
Maps a `TokenClassWithTagLabel` enum member from a token class and tag, e.g. `TokenClassLabel.HEADER` and
|
|
418
|
-
`BioTag.INSIDE` maps to `TTokenClassWithTagLabel.I_HEADER`.
|
|
419
|
-
|
|
420
|
-
Args:
|
|
421
|
-
token: TokenClasses member.
|
|
422
|
-
tag: BioTag member.
|
|
423
|
-
|
|
424
|
-
Returns:
|
|
425
|
-
TokenClassWithTag member.
|
|
426
|
-
|
|
427
|
-
Raises:
|
|
428
|
-
TypeError: If token is not of type TokenClasses or tag is not of type BioTag.
|
|
429
|
-
"""
|
|
430
|
-
if isinstance(token, TokenClassLabel) and isinstance(tag, BioTagLabel):
|
|
431
|
-
return _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG[(token, tag)]
|
|
432
|
-
raise TypeError(
|
|
433
|
-
f"Token must be of type TokenClasses, is of {type(token)} and tag " f"{type(tag)} must be of type BioTag"
|
|
434
|
-
)
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
def token_class_with_tag_to_token_class_and_tag(
|
|
438
|
-
token_class_with_tag: ObjectTypes,
|
|
439
|
-
) -> Optional[tuple[ObjectTypes, ObjectTypes]]:
|
|
440
|
-
"""
|
|
441
|
-
This is the reverse mapping from TokenClassWithTag members to TokenClasses and BioTag
|
|
442
|
-
|
|
443
|
-
Args:
|
|
444
|
-
token_class_with_tag: `TokenClassWithTag` member
|
|
445
|
-
|
|
446
|
-
Returns:
|
|
447
|
-
Tuple of `TokenClasses` member and `BioTag` member
|
|
448
|
-
"""
|
|
449
|
-
return {val: key for key, val in _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG.items()}.get(token_class_with_tag)
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
def register_custom_token_tag(custom_object_types: ObjectTypes, suffix: str) -> str:
|
|
453
|
-
"""
|
|
454
|
-
Registers custom token tags for a given ObjectType with a specified suffix. The tags are created by combining
|
|
455
|
-
BIO tags (B, I, E) with the custom object types.
|
|
456
|
-
|
|
457
|
-
Args:
|
|
458
|
-
custom_object_types: An instance of ObjectTypes containing the custom object types to be registered.
|
|
459
|
-
suffix: A string suffix to be appended to the name of the registered object type.
|
|
460
|
-
|
|
461
|
-
Returns:
|
|
462
|
-
The name of the registered object type.
|
|
463
|
-
|
|
464
|
-
Example:
|
|
465
|
-
|
|
466
|
-
```python
|
|
467
|
-
from deepdoctection.utils.settings import ObjectTypes
|
|
468
|
-
|
|
469
|
-
class CustomObjectTypesLabel(ObjectTypes):
|
|
470
|
-
TOKEN_A = "token_a"
|
|
471
|
-
TOKEN_B = "token_b"
|
|
472
|
-
|
|
473
|
-
custom_object_types = CustomObjectTypes()
|
|
474
|
-
register_custom_token_tag(custom_object_types, "custom_type")
|
|
475
|
-
# This will register tags like "B-TOKEN_A", "I-TOKEN_A", "E-TOKEN_A", "B-TOKEN_B", "I-TOKEN_B", "E-TOKEN_B"
|
|
476
|
-
```
|
|
477
|
-
|
|
478
|
-
"""
|
|
479
|
-
tag_list = [i for i in object_types_registry.get("BioTagLabel") if i in ("B", "I", "E")]
|
|
480
|
-
name = f"{custom_object_types.__name__.lower()}_{suffix}" # type: ignore
|
|
481
|
-
product = [
|
|
482
|
-
(
|
|
483
|
-
a[0].value + "_" + a[1].value.upper(), # type: ignore
|
|
484
|
-
a[0].value + "-" + a[1].value, # type: ignore
|
|
485
|
-
)
|
|
486
|
-
for a in list(itertools.product(tag_list, custom_object_types))
|
|
487
|
-
]
|
|
488
|
-
|
|
489
|
-
object_types_registry.register(name)(ObjectTypes(name, product)) # type: ignore
|
|
490
|
-
return name
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
def register_string_categories_from_list(categories_list: Sequence[str], object_type_name: str) -> None:
|
|
494
|
-
"""
|
|
495
|
-
Registers string categories from a given list into the object types registry. If a category from the list is not
|
|
496
|
-
already registered, it will be added with the specified object type name.
|
|
497
|
-
|
|
498
|
-
Args:
|
|
499
|
-
categories_list: A sequence of strings representing the categories to be registered.
|
|
500
|
-
object_type_name: The name of the object type under which the categories will be registered.
|
|
501
|
-
|
|
502
|
-
Example:
|
|
503
|
-
```python
|
|
504
|
-
categories = ["category1", "category2", "category3"]
|
|
505
|
-
register_string_categories_from_list(categories, "custom_object_type")
|
|
506
|
-
# This will register "CATEGORY1", "CATEGORY2", "CATEGORY3" under the object type "custom_object_type"
|
|
507
|
-
```
|
|
508
|
-
"""
|
|
509
|
-
|
|
510
|
-
all_types = {cat.value for object_type in set(object_types_registry.get_all().values()) for cat in object_type}
|
|
511
|
-
|
|
512
|
-
if categories_list and isinstance(categories_list[0], (list, tuple)):
|
|
513
|
-
flattened_categories = list(itertools.chain.from_iterable(categories_list))
|
|
514
|
-
else:
|
|
515
|
-
flattened_categories = list(categories_list)
|
|
516
|
-
|
|
517
|
-
categories_to_register = [cat for cat in flattened_categories if cat not in all_types]
|
|
518
|
-
categories_tuple = list({cat.upper(): cat for cat in categories_to_register}.items())
|
|
519
|
-
object_types_registry.register(object_type_name)(ObjectTypes(object_type_name, categories_tuple)) # type: ignore
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
def update_all_types_dict() -> None:
|
|
523
|
-
"""
|
|
524
|
-
Compatibility helper retained for older code/tests that call it explicitly.
|
|
525
|
-
Rebuilds the global index from the registry.
|
|
526
|
-
"""
|
|
527
|
-
_rebuild_types_index()
|
|
528
|
-
|
|
529
|
-
|
|
530
682
|
_OLD_TO_NEW_OBJ_TYPE: dict[str, str] = {
|
|
531
683
|
"DOC_CLASS": "document_type",
|
|
532
684
|
"CHARS": "characters",
|
|
@@ -546,49 +698,4 @@ _OLD_TO_NEW_OBJ_TYPE: dict[str, str] = {
|
|
|
546
698
|
}
|
|
547
699
|
|
|
548
700
|
|
|
549
|
-
def _get_new_obj_type_str(obj_type: str) -> str:
|
|
550
|
-
return _OLD_TO_NEW_OBJ_TYPE.get(obj_type, obj_type)
|
|
551
|
-
|
|
552
|
-
|
|
553
701
|
_BLACK_LIST: list[str] = ["B", "I", "O", "E", "S"]
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
def _get_black_list() -> list[str]:
|
|
557
|
-
return _BLACK_LIST
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
def update_black_list(item: str) -> None:
|
|
561
|
-
"""Updates the black list, i.e. set of elements that must not be lowered"""
|
|
562
|
-
_BLACK_LIST.append(item)
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
def get_type(obj_type: Union[str, ObjectTypes]) -> ObjectTypes:
|
|
566
|
-
"""
|
|
567
|
-
Get an object type property from a given string. Does nothing if an `ObjectType` is passed
|
|
568
|
-
|
|
569
|
-
Args:
|
|
570
|
-
obj_type: String or ObjectTypes
|
|
571
|
-
Returns:
|
|
572
|
-
`ObjectType`
|
|
573
|
-
"""
|
|
574
|
-
if isinstance(obj_type, ObjectTypes):
|
|
575
|
-
return obj_type
|
|
576
|
-
if not isinstance(obj_type, str):
|
|
577
|
-
raise TypeError(f"get_type expects str or ObjectTypes, got {type(obj_type)}")
|
|
578
|
-
|
|
579
|
-
obj_type = _get_new_obj_type_str(obj_type)
|
|
580
|
-
if obj_type.startswith(("B-", "E-", "I-", "S-")):
|
|
581
|
-
obj_type = obj_type[:2] + obj_type[2:].lower()
|
|
582
|
-
elif obj_type not in _get_black_list():
|
|
583
|
-
obj_type = obj_type.lower()
|
|
584
|
-
|
|
585
|
-
with _TYPES_INDEX_LOCK:
|
|
586
|
-
member = _ALL_TYPES_DICT.get(obj_type)
|
|
587
|
-
|
|
588
|
-
if member is None:
|
|
589
|
-
raise KeyError(f"String {obj_type} does not correspond to a registered ObjectType")
|
|
590
|
-
|
|
591
|
-
return member
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
_rebuild_types_index()
|
|
@@ -127,7 +127,7 @@ def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
|
|
|
127
127
|
sys.exit()
|
|
128
128
|
|
|
129
129
|
|
|
130
|
-
def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader:
|
|
130
|
+
def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes], check_file_extension: bool = True) -> PdfReader:
|
|
131
131
|
"""
|
|
132
132
|
Create a file reader object from a PDF document.
|
|
133
133
|
|
|
@@ -136,6 +136,7 @@ def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader
|
|
|
136
136
|
|
|
137
137
|
Args:
|
|
138
138
|
path_or_bytes: A path to a PDF document or bytes.
|
|
139
|
+
check_file_extension: If True, and file suffix is not .pdf, it will raise a FileExtensionError
|
|
139
140
|
|
|
140
141
|
Returns:
|
|
141
142
|
A file reader object from which you can iterate through the document.
|
|
@@ -154,7 +155,7 @@ def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader
|
|
|
154
155
|
if not os.path.isfile(path_or_bytes):
|
|
155
156
|
raise FileNotFoundError(str(path_or_bytes))
|
|
156
157
|
file_name = os.path.split(path_or_bytes)[1]
|
|
157
|
-
if not is_file_extension(file_name, ".pdf"):
|
|
158
|
+
if not is_file_extension(file_name, ".pdf") and check_file_extension:
|
|
158
159
|
raise FileExtensionError(f"must be a pdf file: {file_name}")
|
|
159
160
|
|
|
160
161
|
with open(path_or_bytes, "rb") as file:
|
|
@@ -218,15 +219,17 @@ class PDFStreamer:
|
|
|
218
219
|
you open many files.
|
|
219
220
|
"""
|
|
220
221
|
|
|
221
|
-
def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
|
|
222
|
+
def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes], check_file_extension: bool = True) -> None:
|
|
222
223
|
"""
|
|
223
224
|
Args:
|
|
224
225
|
path_or_bytes: Path to a PDF.
|
|
226
|
+
check_file_extension: If True, and file suffix is not .pdf, it will raise a FileExtensionError
|
|
227
|
+
|
|
225
228
|
|
|
226
229
|
Returns:
|
|
227
230
|
None.
|
|
228
231
|
"""
|
|
229
|
-
self.file_reader = get_pdf_file_reader(path_or_bytes)
|
|
232
|
+
self.file_reader = get_pdf_file_reader(path_or_bytes, check_file_extension=check_file_extension)
|
|
230
233
|
self.file_writer = PdfWriter()
|
|
231
234
|
|
|
232
235
|
def __len__(self) -> int:
|
|
@@ -62,7 +62,7 @@ class TestSessionIdRemovalIntegration:
|
|
|
62
62
|
assert loaded_ann.model_id == "model_v1"
|
|
63
63
|
# The important thing is that session_id was removed from model_fields, not that hasattr returns False
|
|
64
64
|
# (hasattr may still return True due to Pydantic internals, but the field is not in the model schema)
|
|
65
|
-
assert "session_id" not in
|
|
65
|
+
assert "session_id" not in set(ImageAnnotation.model_fields)
|
|
66
66
|
|
|
67
67
|
def test_serialization_roundtrip_without_session_id(self) -> None:
|
|
68
68
|
"""Test that serialization/deserialization roundtrip works without session_id"""
|
|
@@ -130,5 +130,5 @@ class TestSessionIdRemovalIntegration:
|
|
|
130
130
|
assert image.annotations[1].service_id == "new_detector"
|
|
131
131
|
|
|
132
132
|
# Both should have session_id removed from model fields
|
|
133
|
-
for
|
|
134
|
-
assert "session_id" not in
|
|
133
|
+
for _ in image.annotations:
|
|
134
|
+
assert "session_id" not in set(ImageAnnotation.model_fields)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|