extract-python 0.4.0__tar.gz → 0.4.2__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (26)
  1. {extract_python-0.4.0 → extract_python-0.4.2}/PKG-INFO +1 -1
  2. {extract_python-0.4.0 → extract_python-0.4.2}/extract_python/docling_.py +37 -6
  3. {extract_python-0.4.0 → extract_python-0.4.2}/extract_python/objects.py +1 -0
  4. {extract_python-0.4.0 → extract_python-0.4.2}/.dockerignore +0 -0
  5. {extract_python-0.4.0 → extract_python-0.4.2}/.github/workflows/publish.yml +0 -0
  6. {extract_python-0.4.0 → extract_python-0.4.2}/.github/workflows/tests.yml +0 -0
  7. {extract_python-0.4.0 → extract_python-0.4.2}/.gitignore +0 -0
  8. {extract_python-0.4.0 → extract_python-0.4.2}/.python-version +0 -0
  9. {extract_python-0.4.0 → extract_python-0.4.2}/Dockerfile +0 -0
  10. {extract_python-0.4.0 → extract_python-0.4.2}/README.md +0 -0
  11. {extract_python-0.4.0 → extract_python-0.4.2}/benches/__init__.py +0 -0
  12. {extract_python-0.4.0 → extract_python-0.4.2}/benches/compare.ipynb +0 -0
  13. {extract_python-0.4.0 → extract_python-0.4.2}/benches/compare.py +0 -0
  14. {extract_python-0.4.0 → extract_python-0.4.2}/benches/constants.py +0 -0
  15. {extract_python-0.4.0 → extract_python-0.4.2}/data/.gitignore +0 -0
  16. {extract_python-0.4.0 → extract_python-0.4.2}/docker-compose.yml +0 -0
  17. {extract_python-0.4.0 → extract_python-0.4.2}/extract +0 -0
  18. {extract_python-0.4.0 → extract_python-0.4.2}/extract_python/__init__.py +0 -0
  19. {extract_python-0.4.0 → extract_python-0.4.2}/extract_python/constants.py +0 -0
  20. {extract_python-0.4.0 → extract_python-0.4.2}/extract_python/marker_.py +0 -0
  21. {extract_python-0.4.0 → extract_python-0.4.2}/extract_python/miner_u.py +0 -0
  22. {extract_python-0.4.0 → extract_python-0.4.2}/extract_python/pipeline.py +0 -0
  23. {extract_python-0.4.0 → extract_python-0.4.2}/extract_python/utils.py +0 -0
  24. {extract_python-0.4.0 → extract_python-0.4.2}/pyproject.toml +0 -0
  25. {extract_python-0.4.0 → extract_python-0.4.2}/qa/ruff.toml +0 -0
  26. {extract_python-0.4.0 → extract_python-0.4.2}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -7,7 +7,7 @@ from pathlib import Path
7
7
  from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
8
8
 
9
9
  from docling.backend.abstract_backend import AbstractDocumentBackend
10
- from docling.datamodel.backend_options import BackendOptions
10
+ from docling.datamodel.backend_options import BackendOptions, BaseBackendOptions
11
11
 
12
12
  # Data model import are quick it's ok to leave it there
13
13
  from docling.datamodel.base_models import FormatToExtensions, InputFormat
@@ -26,7 +26,15 @@ from docling_core.types.doc import ImageRefMode
26
26
  from docling_core.types.io import DocumentStream
27
27
  from icij_common.pydantic_utils import to_lower_snake_case
28
28
  from icij_common.registrable import FromConfig
29
- from pydantic import AfterValidator, BeforeValidator, Field, model_validator
29
+ from pydantic import (
30
+ AfterValidator,
31
+ BeforeValidator,
32
+ Field,
33
+ PlainSerializer,
34
+ WrapSerializer,
35
+ model_validator,
36
+ )
37
+ from pydantic_core.core_schema import SerializerFunctionWrapHandler
30
38
 
31
39
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
32
40
  from .objects import (
@@ -73,7 +81,7 @@ def _find_subcls(cls: type[T], name: str) -> type[T]:
73
81
 
74
82
  def _find_init_arg_type(cls: type[Any], arg: str) -> type:
75
83
  hints = get_type_hints(cls.__init__)
76
- return hints[arg].__class__
84
+ return hints[arg]
77
85
 
78
86
 
79
87
  def _resolve_pipeline_cls(v: Any) -> Any:
@@ -82,6 +90,23 @@ def _resolve_pipeline_cls(v: Any) -> Any:
82
90
  return v
83
91
 
84
92
 
93
+ def _ser_class_as_str(v: Any) -> Any:
94
+ if isinstance(v, type):
95
+ return v.__name__
96
+ return v
97
+
98
+
99
+ def _ser_with_backend_option_kind(
100
+ v: Any, handler: SerializerFunctionWrapHandler
101
+ ) -> Any:
102
+ serialized = handler(v)
103
+ if isinstance(v, BaseBackendOptions):
104
+ kind = getattr(v, "kind", None)
105
+ if kind is not None:
106
+ serialized["kind"] = kind
107
+ return serialized
108
+
109
+
85
110
  def _resolve_backend(v: Any) -> Any:
86
111
  if isinstance(v, str):
87
112
  return _find_subcls(AbstractDocumentBackend, v)
@@ -90,15 +115,21 @@ def _resolve_backend(v: Any) -> Any:
90
115
 
91
116
  class DoclingFormatOption(FormatOption):
92
117
  pipeline_cls: Annotated[
93
- str | type[BasePipeline], BeforeValidator(_resolve_pipeline_cls)
118
+ str | type[BasePipeline],
119
+ BeforeValidator(_resolve_pipeline_cls),
120
+ PlainSerializer(_ser_class_as_str),
94
121
  ]
95
122
  pipeline_options: Annotated[
96
123
  dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
97
124
  ] = None
98
125
  backend: Annotated[
99
- str | type[AbstractDocumentBackend], BeforeValidator(_resolve_backend)
126
+ str | type[AbstractDocumentBackend],
127
+ BeforeValidator(_resolve_backend),
128
+ PlainSerializer(_ser_class_as_str),
100
129
  ]
101
- backend_options: BackendOptions | None = None
130
+ backend_options: Annotated[
131
+ BackendOptions | None, WrapSerializer(_ser_with_backend_option_kind)
132
+ ] = None
102
133
 
103
134
  @model_validator(mode="after")
104
135
  def _resolve_pipeline_options(self) -> Self:
@@ -71,6 +71,7 @@ class SupportedExt(StrEnum):
71
71
  DOTM = ".dotm"
72
72
  DOCM = ".docm"
73
73
  EPUB = ".epub"
74
+ EML = ".eml"
74
75
  GIF = ".gif"
75
76
  HTLM = ".html"
76
77
  HTM = ".htm"
File without changes
File without changes
File without changes