dd-core 1.2.2__tar.gz → 1.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {dd_core-1.2.2 → dd_core-1.2.4}/PKG-INFO +1 -1
  2. {dd_core-1.2.2 → dd_core-1.2.4}/pyproject.toml +1 -1
  3. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/__init__.py +1 -1
  4. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/doc.py +44 -13
  5. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/object_types.py +283 -176
  6. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/pdf_utils.py +7 -4
  7. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core.egg-info/PKG-INFO +1 -1
  8. {dd_core-1.2.2 → dd_core-1.2.4}/tests/test_session_id_removal_integration.py +3 -3
  9. {dd_core-1.2.2 → dd_core-1.2.4}/README.md +0 -0
  10. {dd_core-1.2.2 → dd_core-1.2.4}/setup.cfg +0 -0
  11. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/__init__.py +0 -0
  12. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/base.py +0 -0
  13. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/common.py +0 -0
  14. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/custom.py +0 -0
  15. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/custom_serialize.py +0 -0
  16. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/parallel_map.py +0 -0
  17. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/serialize.py +0 -0
  18. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/dataflow/stats.py +0 -0
  19. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/__init__.py +0 -0
  20. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/annotation.py +0 -0
  21. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/box.py +0 -0
  22. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/convert.py +0 -0
  23. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/image.py +0 -0
  24. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/datapoint/view.py +0 -0
  25. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/__init__.py +0 -0
  26. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/cats.py +0 -0
  27. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/cocostruct.py +0 -0
  28. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/d2struct.py +0 -0
  29. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/hfstruct.py +0 -0
  30. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/laylmstruct.py +0 -0
  31. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/maputils.py +0 -0
  32. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/match.py +0 -0
  33. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/misc.py +0 -0
  34. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/nms.py +0 -0
  35. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/pascalstruct.py +0 -0
  36. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/prodigystruct.py +0 -0
  37. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/pubstruct.py +0 -0
  38. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/wandbstruct.py +0 -0
  39. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/mapper/xfundstruct.py +0 -0
  40. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/py.typed +0 -0
  41. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/__init__.py +0 -0
  42. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/concurrency.py +0 -0
  43. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/context.py +0 -0
  44. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/develop.py +0 -0
  45. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/env_info.py +0 -0
  46. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/error.py +0 -0
  47. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/file_utils.py +0 -0
  48. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/fs.py +0 -0
  49. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/identifier.py +0 -0
  50. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/logger.py +0 -0
  51. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/metacfg.py +0 -0
  52. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/ptutils.py +0 -0
  53. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/tqdm.py +0 -0
  54. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/transform.py +0 -0
  55. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/types.py +0 -0
  56. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/utils.py +0 -0
  57. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core/utils/viz.py +0 -0
  58. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core.egg-info/SOURCES.txt +0 -0
  59. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core.egg-info/dependency_links.txt +0 -0
  60. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core.egg-info/requires.txt +0 -0
  61. {dd_core-1.2.2 → dd_core-1.2.4}/src/dd_core.egg-info/top_level.txt +0 -0
  62. {dd_core-1.2.2 → dd_core-1.2.4}/tests/test_doc.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dd-core
3
- Version: 1.2.2
3
+ Version: 1.2.4
4
4
  Summary: Core data structures, dataflows, mappers and utilities for deepdoctection
5
5
  Author: Dr. Janis Meyer
6
6
  License: Apache License 2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dd-core"
7
- version = "1.2.2"
7
+ version = "1.2.4"
8
8
  authors = [
9
9
  {name = "Dr. Janis Meyer"}
10
10
  ]
@@ -32,7 +32,7 @@ from typing import TYPE_CHECKING
32
32
 
33
33
  from .utils.file_utils import _LazyModule
34
34
 
35
- __version__ = "1.2.2"
35
+ __version__ = "1.2.4"
36
36
  _IMPORT_STRUCTURE = {
37
37
  "dataflow": [
38
38
  "DataFlowReentrantGuard",
@@ -705,23 +705,30 @@ class Document:
705
705
  return path_json
706
706
 
707
707
  @classmethod
708
- def from_json(cls, file_path: PathLikeOrStr) -> Document:
708
+ def from_dict(cls, inputs: dict[str, Any]) -> Document:
709
709
  """
710
- Create `Document` instance from `.json` file.
710
+ Create a ``Document`` instance from a dict that has the same shape as
711
+ ``as_dict()`` / ``from_json()``.
711
712
 
712
- Restores private attrs (e.g. `_images`, `_page_references`, `_summary`, `_processing_state`)
713
- that are not populated automatically by pydantic from input data.
713
+ The dict is **not** mutated. Private fields (``_summary``,
714
+ ``_images``, ``_page_references``) and composite fields
715
+ (``pipeline_jobs``) that cannot be passed directly to the dataclass
716
+ constructor are extracted and restored after construction.
717
+
718
+ Args:
719
+ inputs: Dict with the same keys as produced by :meth:`as_dict`.
720
+
721
+ Returns:
722
+ Document: Fully restored ``Document`` instance.
714
723
  """
715
- with open(file_path, "r", encoding="UTF-8") as f:
716
- raw: dict[str, Any] = json.load(f)
724
+ raw: dict[str, Any] = dict(inputs) # shallow copy – do not mutate caller's dict
717
725
 
718
726
  summary_raw = raw.pop("_summary", None)
719
727
  images_raw = raw.pop("_images", None)
720
728
  page_refs_raw = raw.pop("_page_references", None)
721
729
  pipeline_jobs = raw.pop("pipeline_jobs", {})
722
730
 
723
- if "_processing_state" in raw:
724
- raw.pop("_processing_state")
731
+ raw.pop("_processing_state", None)
725
732
 
726
733
  raw["compute_metadata"] = False
727
734
 
@@ -729,7 +736,11 @@ class Document:
729
736
 
730
737
  if pipeline_jobs:
731
738
  doc.pipeline_jobs = {
732
- key: PipelineJobs(**{k: v for k, v in val.items() if k != "session_id"})
739
+ key: (
740
+ val
741
+ if isinstance(val, PipelineJobs)
742
+ else PipelineJobs(**{k: v for k, v in val.items() if k != "session_id"})
743
+ )
733
744
  for key, val in pipeline_jobs.items()
734
745
  }
735
746
 
@@ -737,14 +748,21 @@ class Document:
737
748
  doc.location = Path(doc.location)
738
749
 
739
750
  if summary_raw is not None:
740
- doc._summary = CategoryAnnotation.from_dict(**summary_raw)
751
+ doc._summary = (
752
+ summary_raw
753
+ if isinstance(summary_raw, CategoryAnnotation)
754
+ else CategoryAnnotation.from_dict(**summary_raw)
755
+ )
741
756
 
742
757
  if images_raw is not None:
743
758
  restored_images: dict[str, Image] = {}
744
759
  for image_id, img in images_raw.items():
745
- restored_images[image_id] = img if isinstance(img, Image) else Image(**img)
746
- if img["_image"]:
747
- restored_images[image_id].image = img["_image"]
760
+ if isinstance(img, Image):
761
+ restored_images[image_id] = img
762
+ else:
763
+ restored_images[image_id] = Image(**img)
764
+ if img.get("_image"):
765
+ restored_images[image_id].image = img["_image"]
748
766
  doc._images = restored_images
749
767
 
750
768
  if page_refs_raw is not None:
@@ -755,6 +773,19 @@ class Document:
755
773
 
756
774
  return doc
757
775
 
776
+ @classmethod
777
+ def from_json(cls, file_path: PathLikeOrStr) -> Document:
778
+ """
779
+ Create `Document` instance from `.json` file.
780
+
781
+ Restores private attrs (e.g. `_images`, `_page_references`, `_summary`, `_processing_state`)
782
+ that are not populated automatically by pydantic from input data.
783
+ """
784
+ with open(file_path, "r", encoding="UTF-8") as f:
785
+ raw: dict[str, Any] = json.load(f)
786
+
787
+ return cls.from_dict(raw)
788
+
758
789
  def viz_entities( # type
759
790
  self,
760
791
  scaled_width: int = 900,
@@ -21,6 +21,7 @@ Module for funcs and constants that maintain general settings
21
21
  from __future__ import annotations
22
22
 
23
23
  import itertools
24
+ import re
24
25
  import threading
25
26
  from enum import Enum
26
27
  from typing import Any, Callable, Iterable, Optional, Sequence, Type, Union
@@ -62,38 +63,301 @@ def _iter_registered_enums() -> Iterable[Type[ObjectTypes]]:
62
63
  return object_types_registry.get_all().values()
63
64
 
64
65
 
65
- def _index_enum(enum_cls: Type[ObjectTypes]) -> None:
66
+ def _wrapped_register(name: str, func: Optional[Any] = None) -> Callable[[Type[ObjectTypes]], Type[ObjectTypes]]:
67
+ def _decorator(cls: Type[ObjectTypes]) -> Type[ObjectTypes]:
68
+ with _TYPES_INDEX_LOCK:
69
+ registered_cls = _orig_register(name, func=func)(cls)
70
+ _rebuild_types_index_locked()
71
+ return registered_cls
72
+
73
+ return _decorator
74
+
75
+
76
+ def _upsert_dynamic_enum(name: str, members: Sequence[tuple[str, str]]) -> Type[ObjectTypes] | None:
66
77
  """
67
- Merge a newly-registered enum class into the global index, enforcing unique string values.
78
+ Idempotently register or extend a dynamic ObjectTypes enum under `name`.
79
+
80
+ Rules:
81
+ - existing values under the same enum name are preserved
82
+ - new values are appended
83
+ - repeated registration of the same values is a no-op
84
+ - values already owned by a different enum raise DuplicateObjectTypeError
68
85
  """
69
86
  with _TYPES_INDEX_LOCK:
87
+ registered = object_types_registry.get_all()
88
+ existing_enum = registered.get(name)
89
+
90
+ existing_by_value: dict[str, str] = {}
91
+ if existing_enum is not None:
92
+ for member in existing_enum:
93
+ existing_by_value[str(member.value)] = member.name
94
+
95
+ merged_by_value = dict(existing_by_value)
96
+
97
+ for proposed_member_name, raw_value in members:
98
+ value = _normalize_object_type_value(raw_value)
99
+
100
+ existing_member = _ALL_TYPES_DICT.get(value)
101
+
102
+ if existing_member is not None and existing_member.__class__.__name__ != name:
103
+ continue
104
+
105
+ if value not in merged_by_value:
106
+ merged_by_value[value] = proposed_member_name
107
+
108
+ if merged_by_value == existing_by_value:
109
+ return existing_enum
110
+
111
+ if not merged_by_value:
112
+ return None
113
+
114
+ merged_members = [(member_name, value) for value, member_name in merged_by_value.items()]
115
+ merged_enum = ObjectTypes(name, merged_members) # type: ignore
116
+
117
+ registered_cls = _orig_register(name)(merged_enum)
118
+ _rebuild_types_index_locked()
119
+ return registered_cls
120
+
121
+
122
+ def _get_black_list() -> list[str]:
123
+ return _BLACK_LIST
124
+
125
+
126
+ def update_black_list(item: str) -> None:
127
+ """Updates the black list, i.e. set of elements that must not be lowered"""
128
+ _BLACK_LIST.append(item)
129
+
130
+
131
+ def _normalize_object_type_value(obj_type: str) -> str:
132
+ """
133
+ Canonical normalization for lookup and dynamic registration.
134
+ This must match get_type() semantics.
135
+ """
136
+ obj_type = _get_new_obj_type_str(obj_type)
137
+
138
+ if obj_type.startswith(("B-", "E-", "I-", "S-")):
139
+ return obj_type[:2] + obj_type[2:].lower()
140
+
141
+ if obj_type not in _get_black_list():
142
+ return obj_type.lower()
143
+
144
+ return obj_type
145
+
146
+
147
+ def _flatten_categories(categories_list: Sequence[str]) -> list[str]:
148
+ """
149
+ Flatten a possibly nested category sequence into a plain list[str].
150
+ """
151
+ if categories_list and isinstance(categories_list[0], (list, tuple)):
152
+ return [str(item) for item in itertools.chain.from_iterable(categories_list)]
153
+ return [str(item) for item in categories_list]
154
+
155
+
156
+ def _dedupe_preserve_order(values: Sequence[str]) -> list[str]:
157
+ """
158
+ Stable de-duplication preserving the first occurrence order.
159
+ """
160
+ seen: set[str] = set()
161
+ out: list[str] = []
162
+ for value in values:
163
+ if value not in seen:
164
+ seen.add(value)
165
+ out.append(value)
166
+ return out
167
+
168
+
169
+ def _sanitize_enum_member_name(value: str, used_names: set[str]) -> str:
170
+ """
171
+ Convert a value string into a safe enum member name.
172
+ """
173
+ candidate = re.sub(r"[^A-Z0-9_]", "_", value.upper())
174
+ if not candidate:
175
+ candidate = "TYPE"
176
+ if candidate[0].isdigit():
177
+ candidate = f"TYPE_{candidate}"
178
+
179
+ name = candidate
180
+ suffix = 2
181
+ while name in used_names:
182
+ name = f"{candidate}_{suffix}"
183
+ suffix += 1
184
+
185
+ used_names.add(name)
186
+ return name
187
+
188
+
189
+ def _build_types_index_from_registry() -> dict[str, ObjectTypes]:
190
+ """
191
+ Build a fresh value->member index from the current registry contents.
192
+
193
+ This is the only place where duplicate value detection happens.
194
+ """
195
+ new_index: dict[str, ObjectTypes] = {}
196
+
197
+ for enum_cls in _iter_registered_enums():
70
198
  for member in enum_cls:
71
- val = str(member.value)
72
- existing = _ALL_TYPES_DICT.get(val)
199
+ value = str(member.value)
200
+ existing = new_index.get(value)
73
201
  if existing is not None and existing is not member:
74
202
  raise DuplicateObjectTypeError(
75
- f"Object type value '{val}' already taken by {existing!r}; cannot register {member!r}"
203
+ f"Object type value '{value}' already taken by {existing!r}; cannot register {member!r}"
76
204
  )
77
- _ALL_TYPES_DICT[val] = member
205
+ new_index[value] = member
206
+
207
+ return new_index
208
+
209
+
210
+ def _rebuild_types_index_locked() -> None:
211
+ """
212
+ Rebuild the global lookup index from the registry.
213
+ Caller must hold _TYPES_INDEX_LOCK.
214
+ """
215
+ global _ALL_TYPES_DICT # pylint: disable=W0603
216
+ _ALL_TYPES_DICT = _build_types_index_from_registry()
78
217
 
79
218
 
80
219
  def _rebuild_types_index() -> None:
81
220
  """
82
- Full rebuild from the registry. Useful at process start or in tests.
221
+ Public/internal helper to rebuild the lookup index from the registry.
83
222
  """
84
223
  with _TYPES_INDEX_LOCK:
85
- _ALL_TYPES_DICT.clear()
86
- for enum_cls in _iter_registered_enums():
87
- _index_enum(enum_cls)
224
+ _rebuild_types_index_locked()
88
225
 
89
226
 
90
- def _wrapped_register(name: str, func: Optional[Any] = None) -> Callable[[Type[ObjectTypes]], Type[ObjectTypes]]:
91
- def _decorator(cls: Type[ObjectTypes]) -> Type[ObjectTypes]:
92
- registered_cls = _orig_register(name, func=func)(cls)
93
- _index_enum(registered_cls)
94
- return registered_cls
227
+ def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes) -> ObjectTypes:
228
+ """
229
+ Maps a `TokenClassWithTagLabel` enum member from a token class and tag, e.g. `TokenClassLabel.HEADER` and
230
+ `BioTag.INSIDE` maps to `TTokenClassWithTagLabel.I_HEADER`.
95
231
 
96
- return _decorator
232
+ Args:
233
+ token: TokenClasses member.
234
+ tag: BioTag member.
235
+
236
+ Returns:
237
+ TokenClassWithTag member.
238
+
239
+ Raises:
240
+ TypeError: If token is not of type TokenClasses or tag is not of type BioTag.
241
+ """
242
+ if isinstance(token, TokenClassLabel) and isinstance(tag, BioTagLabel):
243
+ return _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG[(token, tag)]
244
+ raise TypeError(
245
+ f"Token must be of type TokenClasses, is of {type(token)} and tag " f"{type(tag)} must be of type BioTag"
246
+ )
247
+
248
+
249
+ def token_class_with_tag_to_token_class_and_tag(
250
+ token_class_with_tag: ObjectTypes,
251
+ ) -> Optional[tuple[ObjectTypes, ObjectTypes]]:
252
+ """
253
+ This is the reverse mapping from TokenClassWithTag members to TokenClasses and BioTag
254
+
255
+ Args:
256
+ token_class_with_tag: `TokenClassWithTag` member
257
+
258
+ Returns:
259
+ Tuple of `TokenClasses` member and `BioTag` member
260
+ """
261
+ return {val: key for key, val in _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG.items()}.get(token_class_with_tag)
262
+
263
+
264
+ def register_custom_token_tag(custom_object_types: ObjectTypes, suffix: str) -> str:
265
+ """
266
+ Registers custom token tags for a given ObjectType with a specified suffix. The tags are created by combining
267
+ BIO tags (B, I, E) with the custom object types.
268
+
269
+ Args:
270
+ custom_object_types: An instance of ObjectTypes containing the custom object types to be registered.
271
+ suffix: A string suffix to be appended to the name of the registered object type.
272
+
273
+ Returns:
274
+ The name of the registered object type.
275
+
276
+ Example:
277
+
278
+ ```python
279
+ from deepdoctection.utils.settings import ObjectTypes
280
+
281
+ class CustomObjectTypesLabel(ObjectTypes):
282
+ TOKEN_A = "token_a"
283
+ TOKEN_B = "token_b"
284
+
285
+ custom_object_types = CustomObjectTypes()
286
+ register_custom_token_tag(custom_object_types, "custom_type")
287
+ # This will register tags like "B-TOKEN_A", "I-TOKEN_A", "E-TOKEN_A", "B-TOKEN_B", "I-TOKEN_B", "E-TOKEN_B"
288
+ ```
289
+
290
+ """
291
+ tag_list = [i for i in object_types_registry.get("BioTagLabel") if i in ("B", "I", "E")]
292
+ name = f"{custom_object_types.__name__.lower()}_{suffix}" # type: ignore
293
+
294
+ product = [
295
+ (
296
+ a[0].value + "_" + a[1].value.upper(), # type: ignore
297
+ a[0].value + "-" + a[1].value, # type: ignore
298
+ )
299
+ for a in list(itertools.product(tag_list, custom_object_types))
300
+ ]
301
+
302
+ _upsert_dynamic_enum(name, product)
303
+ return name
304
+
305
+
306
+ def register_string_categories_from_list(categories_list: Sequence[str], object_type_name: str) -> None:
307
+ """
308
+ Idempotently register or extend string categories under a dynamic ObjectTypes enum.
309
+
310
+ Repeated calls with the same values are a no-op.
311
+ Repeated calls with new values extend the enum under the same name.
312
+ """
313
+ flattened_categories = _flatten_categories(categories_list)
314
+ normalized_values = _dedupe_preserve_order([_normalize_object_type_value(cat) for cat in flattened_categories])
315
+
316
+ if not normalized_values:
317
+ return
318
+
319
+ used_names: set[str] = set()
320
+ members = [(_sanitize_enum_member_name(value, used_names), value) for value in normalized_values]
321
+
322
+ _upsert_dynamic_enum(object_type_name, members)
323
+
324
+
325
+ def update_all_types_dict() -> None:
326
+ """
327
+ Compatibility helper retained for older code/tests that call it explicitly.
328
+ Rebuilds the global index from the registry.
329
+ """
330
+ _rebuild_types_index()
331
+
332
+
333
+ def get_type(obj_type: Union[str, ObjectTypes]) -> ObjectTypes:
334
+ """
335
+ Get an object type property from a given string. Does nothing if an `ObjectType` is passed
336
+
337
+ Args:
338
+ obj_type: String or ObjectTypes
339
+ Returns:
340
+ `ObjectType`
341
+ """
342
+ if isinstance(obj_type, ObjectTypes):
343
+ return obj_type
344
+
345
+ if not isinstance(obj_type, str):
346
+ raise TypeError(f"get_type expects str or ObjectTypes, got {type(obj_type)}")
347
+
348
+ normalized = _normalize_object_type_value(obj_type)
349
+
350
+ with _TYPES_INDEX_LOCK:
351
+ member = _ALL_TYPES_DICT.get(normalized)
352
+
353
+ if member is None:
354
+ raise KeyError(f"String {normalized} does not correspond to a registered ObjectType")
355
+
356
+ return member
357
+
358
+
359
+ def _get_new_obj_type_str(obj_type: str) -> str:
360
+ return _OLD_TO_NEW_OBJ_TYPE.get(obj_type, obj_type)
97
361
 
98
362
 
99
363
  # Monkey-patch the registry to enforce duplicate detection for all modules.
@@ -103,6 +367,9 @@ _TYPES_INDEX_LOCK = threading.RLock()
103
367
  _ALL_TYPES_DICT: dict[str, ObjectTypes] = {}
104
368
 
105
369
 
370
+ _rebuild_types_index()
371
+
372
+
106
373
  @object_types_registry.register("DefaultType")
107
374
  class DefaultType(ObjectTypes):
108
375
  """Type for default member"""
@@ -412,121 +679,6 @@ _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG = {
412
679
  }
413
680
 
414
681
 
415
- def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes) -> ObjectTypes:
416
- """
417
- Maps a `TokenClassWithTagLabel` enum member from a token class and tag, e.g. `TokenClassLabel.HEADER` and
418
- `BioTag.INSIDE` maps to `TTokenClassWithTagLabel.I_HEADER`.
419
-
420
- Args:
421
- token: TokenClasses member.
422
- tag: BioTag member.
423
-
424
- Returns:
425
- TokenClassWithTag member.
426
-
427
- Raises:
428
- TypeError: If token is not of type TokenClasses or tag is not of type BioTag.
429
- """
430
- if isinstance(token, TokenClassLabel) and isinstance(tag, BioTagLabel):
431
- return _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG[(token, tag)]
432
- raise TypeError(
433
- f"Token must be of type TokenClasses, is of {type(token)} and tag " f"{type(tag)} must be of type BioTag"
434
- )
435
-
436
-
437
- def token_class_with_tag_to_token_class_and_tag(
438
- token_class_with_tag: ObjectTypes,
439
- ) -> Optional[tuple[ObjectTypes, ObjectTypes]]:
440
- """
441
- This is the reverse mapping from TokenClassWithTag members to TokenClasses and BioTag
442
-
443
- Args:
444
- token_class_with_tag: `TokenClassWithTag` member
445
-
446
- Returns:
447
- Tuple of `TokenClasses` member and `BioTag` member
448
- """
449
- return {val: key for key, val in _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG.items()}.get(token_class_with_tag)
450
-
451
-
452
- def register_custom_token_tag(custom_object_types: ObjectTypes, suffix: str) -> str:
453
- """
454
- Registers custom token tags for a given ObjectType with a specified suffix. The tags are created by combining
455
- BIO tags (B, I, E) with the custom object types.
456
-
457
- Args:
458
- custom_object_types: An instance of ObjectTypes containing the custom object types to be registered.
459
- suffix: A string suffix to be appended to the name of the registered object type.
460
-
461
- Returns:
462
- The name of the registered object type.
463
-
464
- Example:
465
-
466
- ```python
467
- from deepdoctection.utils.settings import ObjectTypes
468
-
469
- class CustomObjectTypesLabel(ObjectTypes):
470
- TOKEN_A = "token_a"
471
- TOKEN_B = "token_b"
472
-
473
- custom_object_types = CustomObjectTypes()
474
- register_custom_token_tag(custom_object_types, "custom_type")
475
- # This will register tags like "B-TOKEN_A", "I-TOKEN_A", "E-TOKEN_A", "B-TOKEN_B", "I-TOKEN_B", "E-TOKEN_B"
476
- ```
477
-
478
- """
479
- tag_list = [i for i in object_types_registry.get("BioTagLabel") if i in ("B", "I", "E")]
480
- name = f"{custom_object_types.__name__.lower()}_{suffix}" # type: ignore
481
- product = [
482
- (
483
- a[0].value + "_" + a[1].value.upper(), # type: ignore
484
- a[0].value + "-" + a[1].value, # type: ignore
485
- )
486
- for a in list(itertools.product(tag_list, custom_object_types))
487
- ]
488
-
489
- object_types_registry.register(name)(ObjectTypes(name, product)) # type: ignore
490
- return name
491
-
492
-
493
- def register_string_categories_from_list(categories_list: Sequence[str], object_type_name: str) -> None:
494
- """
495
- Registers string categories from a given list into the object types registry. If a category from the list is not
496
- already registered, it will be added with the specified object type name.
497
-
498
- Args:
499
- categories_list: A sequence of strings representing the categories to be registered.
500
- object_type_name: The name of the object type under which the categories will be registered.
501
-
502
- Example:
503
- ```python
504
- categories = ["category1", "category2", "category3"]
505
- register_string_categories_from_list(categories, "custom_object_type")
506
- # This will register "CATEGORY1", "CATEGORY2", "CATEGORY3" under the object type "custom_object_type"
507
- ```
508
- """
509
-
510
- all_types = {cat.value for object_type in set(object_types_registry.get_all().values()) for cat in object_type}
511
-
512
- if categories_list and isinstance(categories_list[0], (list, tuple)):
513
- flattened_categories = list(itertools.chain.from_iterable(categories_list))
514
- else:
515
- flattened_categories = list(categories_list)
516
-
517
- categories_to_register = [cat for cat in flattened_categories if cat not in all_types]
518
- categories_tuple = list({cat.upper(): cat for cat in categories_to_register}.items())
519
- object_types_registry.register(object_type_name)(ObjectTypes(object_type_name, categories_tuple)) # type: ignore
520
-
521
-
522
- def update_all_types_dict() -> None:
523
- """
524
- Compatibility helper retained for older code/tests that call it explicitly.
525
- Rebuilds the global index from the registry.
526
- """
527
- _rebuild_types_index()
528
-
529
-
530
682
  _OLD_TO_NEW_OBJ_TYPE: dict[str, str] = {
531
683
  "DOC_CLASS": "document_type",
532
684
  "CHARS": "characters",
@@ -546,49 +698,4 @@ _OLD_TO_NEW_OBJ_TYPE: dict[str, str] = {
546
698
  }
547
699
 
548
700
 
549
- def _get_new_obj_type_str(obj_type: str) -> str:
550
- return _OLD_TO_NEW_OBJ_TYPE.get(obj_type, obj_type)
551
-
552
-
553
701
  _BLACK_LIST: list[str] = ["B", "I", "O", "E", "S"]
554
-
555
-
556
- def _get_black_list() -> list[str]:
557
- return _BLACK_LIST
558
-
559
-
560
- def update_black_list(item: str) -> None:
561
- """Updates the black list, i.e. set of elements that must not be lowered"""
562
- _BLACK_LIST.append(item)
563
-
564
-
565
- def get_type(obj_type: Union[str, ObjectTypes]) -> ObjectTypes:
566
- """
567
- Get an object type property from a given string. Does nothing if an `ObjectType` is passed
568
-
569
- Args:
570
- obj_type: String or ObjectTypes
571
- Returns:
572
- `ObjectType`
573
- """
574
- if isinstance(obj_type, ObjectTypes):
575
- return obj_type
576
- if not isinstance(obj_type, str):
577
- raise TypeError(f"get_type expects str or ObjectTypes, got {type(obj_type)}")
578
-
579
- obj_type = _get_new_obj_type_str(obj_type)
580
- if obj_type.startswith(("B-", "E-", "I-", "S-")):
581
- obj_type = obj_type[:2] + obj_type[2:].lower()
582
- elif obj_type not in _get_black_list():
583
- obj_type = obj_type.lower()
584
-
585
- with _TYPES_INDEX_LOCK:
586
- member = _ALL_TYPES_DICT.get(obj_type)
587
-
588
- if member is None:
589
- raise KeyError(f"String {obj_type} does not correspond to a registered ObjectType")
590
-
591
- return member
592
-
593
-
594
- _rebuild_types_index()
@@ -127,7 +127,7 @@ def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
127
127
  sys.exit()
128
128
 
129
129
 
130
- def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader:
130
+ def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes], check_file_extension: bool = True) -> PdfReader:
131
131
  """
132
132
  Create a file reader object from a PDF document.
133
133
 
@@ -136,6 +136,7 @@ def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader
136
136
 
137
137
  Args:
138
138
  path_or_bytes: A path to a PDF document or bytes.
139
+ check_file_extension: If True, and file suffix is not .pdf, it will raise a FileExtensionError
139
140
 
140
141
  Returns:
141
142
  A file reader object from which you can iterate through the document.
@@ -154,7 +155,7 @@ def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader
154
155
  if not os.path.isfile(path_or_bytes):
155
156
  raise FileNotFoundError(str(path_or_bytes))
156
157
  file_name = os.path.split(path_or_bytes)[1]
157
- if not is_file_extension(file_name, ".pdf"):
158
+ if not is_file_extension(file_name, ".pdf") and check_file_extension:
158
159
  raise FileExtensionError(f"must be a pdf file: {file_name}")
159
160
 
160
161
  with open(path_or_bytes, "rb") as file:
@@ -218,15 +219,17 @@ class PDFStreamer:
218
219
  you open many files.
219
220
  """
220
221
 
221
- def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
222
+ def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes], check_file_extension: bool = True) -> None:
222
223
  """
223
224
  Args:
224
225
  path_or_bytes: Path to a PDF.
226
+ check_file_extension: If True, and file suffix is not .pdf, it will raise a FileExtensionError
227
+
225
228
 
226
229
  Returns:
227
230
  None.
228
231
  """
229
- self.file_reader = get_pdf_file_reader(path_or_bytes)
232
+ self.file_reader = get_pdf_file_reader(path_or_bytes, check_file_extension=check_file_extension)
230
233
  self.file_writer = PdfWriter()
231
234
 
232
235
  def __len__(self) -> int:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dd-core
3
- Version: 1.2.2
3
+ Version: 1.2.4
4
4
  Summary: Core data structures, dataflows, mappers and utilities for deepdoctection
5
5
  Author: Dr. Janis Meyer
6
6
  License: Apache License 2.0
@@ -62,7 +62,7 @@ class TestSessionIdRemovalIntegration:
62
62
  assert loaded_ann.model_id == "model_v1"
63
63
  # The important thing is that session_id was removed from model_fields, not that hasattr returns False
64
64
  # (hasattr may still return True due to Pydantic internals, but the field is not in the model schema)
65
- assert "session_id" not in loaded_ann.model_fields
65
+ assert "session_id" not in set(ImageAnnotation.model_fields)
66
66
 
67
67
  def test_serialization_roundtrip_without_session_id(self) -> None:
68
68
  """Test that serialization/deserialization roundtrip works without session_id"""
@@ -130,5 +130,5 @@ class TestSessionIdRemovalIntegration:
130
130
  assert image.annotations[1].service_id == "new_detector"
131
131
 
132
132
  # Both should have session_id removed from model fields
133
- for ann in image.annotations:
134
- assert "session_id" not in ann.model_fields
133
+ for _ in image.annotations:
134
+ assert "session_id" not in set(ImageAnnotation.model_fields)
File without changes
File without changes
File without changes
File without changes
File without changes