extract-python 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {extract_python-0.3.1 → extract_python-0.3.2}/PKG-INFO +1 -1
  2. {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/docling_.py +16 -16
  3. {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/marker_.py +5 -5
  4. {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/miner_u.py +9 -8
  5. {extract_python-0.3.1 → extract_python-0.3.2}/.dockerignore +0 -0
  6. {extract_python-0.3.1 → extract_python-0.3.2}/.github/workflows/publish.yml +0 -0
  7. {extract_python-0.3.1 → extract_python-0.3.2}/.github/workflows/tests.yml +0 -0
  8. {extract_python-0.3.1 → extract_python-0.3.2}/.gitignore +0 -0
  9. {extract_python-0.3.1 → extract_python-0.3.2}/.python-version +0 -0
  10. {extract_python-0.3.1 → extract_python-0.3.2}/Dockerfile +0 -0
  11. {extract_python-0.3.1 → extract_python-0.3.2}/README.md +0 -0
  12. {extract_python-0.3.1 → extract_python-0.3.2}/benches/__init__.py +0 -0
  13. {extract_python-0.3.1 → extract_python-0.3.2}/benches/compare.ipynb +0 -0
  14. {extract_python-0.3.1 → extract_python-0.3.2}/benches/compare.py +0 -0
  15. {extract_python-0.3.1 → extract_python-0.3.2}/benches/constants.py +0 -0
  16. {extract_python-0.3.1 → extract_python-0.3.2}/data/.gitignore +0 -0
  17. {extract_python-0.3.1 → extract_python-0.3.2}/docker-compose.yml +0 -0
  18. {extract_python-0.3.1 → extract_python-0.3.2}/extract +0 -0
  19. {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/__init__.py +0 -0
  20. {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/constants.py +0 -0
  21. {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/objects.py +0 -0
  22. {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/pipeline.py +0 -0
  23. {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/utils.py +0 -0
  24. {extract_python-0.3.1 → extract_python-0.3.2}/pyproject.toml +0 -0
  25. {extract_python-0.3.1 → extract_python-0.3.2}/qa/ruff.toml +0 -0
  26. {extract_python-0.3.1 → extract_python-0.3.2}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
32
32
 
33
33
 
34
34
  def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
35
- from docling.datamodel.pipeline_options import PdfPipelineOptions
35
+ from docling.datamodel.pipeline_options import PdfPipelineOptions # noqa: PLC0415
36
36
 
37
37
  if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
38
38
  msg = "generate_picture_images should be set to true"
@@ -49,11 +49,11 @@ def _validate_options(
49
49
 
50
50
  @cache
51
51
  def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
52
- from docling.datamodel.pipeline_options import (
52
+ from docling.datamodel.pipeline_options import ( # noqa: PLC0415
53
53
  EasyOcrOptions,
54
54
  PdfPipelineOptions,
55
55
  )
56
- from docling.document_converter import PdfFormatOption
56
+ from docling.document_converter import PdfFormatOption # noqa: PLC0415
57
57
 
58
58
  return {
59
59
  InputFormat.PDF: PdfFormatOption(
@@ -80,23 +80,21 @@ class DoclingPipelineConfig(PipelineConfig):
80
80
  task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
81
81
 
82
82
  format_options: Annotated[
83
- dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
83
+ dict["InputFormat", "FormatOption"] | None, AfterValidator(_validate_options)
84
84
  ] = Field(default_factory=_default_format_opts)
85
85
 
86
- _unsupported_input_formats: ClassVar[set[InputFormat]] = {
87
- InputFormat.AUDIO,
88
- InputFormat.METS_GBS,
89
- InputFormat.VTT,
90
- }
91
-
92
86
  @classmethod
93
87
  @cache
94
88
  def supported_exts(cls) -> set[SupportedExt]:
95
- from docling.datamodel.base_models import FormatToExtensions, InputFormat
89
+ from docling.datamodel.base_models import ( # noqa: PLC0415
90
+ FormatToExtensions,
91
+ InputFormat,
92
+ )
96
93
 
94
+ unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
97
95
  supported = set()
98
96
  for f in InputFormat:
99
- if f in cls._unsupported_input_formats:
97
+ if f in unsupported:
100
98
  continue
101
99
  for ext in FormatToExtensions[f]:
102
100
  supported.add(SupportedExt(f".{ext.lower()}"))
@@ -105,8 +103,10 @@ class DoclingPipelineConfig(PipelineConfig):
105
103
 
106
104
  @Pipeline.register(PipelineType.DOCLING)
107
105
  class DoclingPipeline(Pipeline):
108
- def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
109
- from docling.document_converter import DocumentConverter
106
+ def __init__(
107
+ self, format_options: dict["InputFormat", "FormatOption"] | None = None
108
+ ):
109
+ from docling.document_converter import DocumentConverter # noqa: PLC0415
110
110
 
111
111
  allowed_format = [
112
112
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
@@ -128,7 +128,7 @@ class DoclingPipeline(Pipeline):
128
128
  return cls(config.format_options)
129
129
 
130
130
 
131
- def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | "DocumentStream"]:
131
+ def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
132
132
  for d in docs:
133
133
  yield d.to_docling()
134
134
 
@@ -160,7 +160,7 @@ def _to_markdown_doc(
160
160
  page_sep: str = DEFAULT_MD_PAGE_SEP,
161
161
  **kwargs,
162
162
  ) -> MarkdownDoc:
163
- from docling_core.types.doc import ImageRefMode
163
+ from docling_core.types.doc import ImageRefMode # noqa: PLC0415
164
164
 
165
165
  # TODO: Should we add a hash to avoid collision between files with same names
166
166
  # nested in the tree structured
@@ -73,9 +73,9 @@ class MarkerPipeline(Pipeline):
73
73
  async def extract_content(
74
74
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
75
75
  ) -> AsyncGenerator[Result, None]:
76
- from marker.config.parser import ConfigParser
77
- from marker.converters.pdf import PdfConverter
78
- from marker.models import create_model_dict
76
+ from marker.config.parser import ConfigParser # noqa: PLC0415
77
+ from marker.converters.pdf import PdfConverter # noqa: PLC0415
78
+ from marker.models import create_model_dict # noqa: PLC0415
79
79
 
80
80
  config = deepcopy(self._marker_config)
81
81
  config["output_format"] = output_format.to_marker()
@@ -102,7 +102,7 @@ def _process_doc(
102
102
  output_format: OutputFormat,
103
103
  output_path: Path,
104
104
  ) -> Result:
105
- from marker.output import text_from_rendered
105
+ from marker.output import text_from_rendered # noqa: PLC0415
106
106
 
107
107
  rendered = converter(str(doc.path))
108
108
  content, _, images = text_from_rendered(rendered)
@@ -118,7 +118,7 @@ def _process_doc(
118
118
  def _to_markdown_doc(
119
119
  input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
120
120
  ) -> MarkdownDoc:
121
- from marker.renderers.markdown import MarkdownRenderer
121
+ from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
122
122
 
123
123
  # TODO: Should we add a hash to avoid collision between files with same names
124
124
  # nested in the tree structured
@@ -52,13 +52,12 @@ class MinerUConfig(BaseModel):
52
52
  @classmethod
53
53
  @cache
54
54
  def _get_default_kwargs(cls) -> dict[str, Any]:
55
-
56
- from mineru.utils.enum_class import MakeMode
55
+ from mineru.utils.enum_class import MakeMode # noqa: PLC0415
57
56
 
58
57
  return {
59
58
  "server_url": None,
60
- # We don't dump md directly we process, we dump the middle json in order to be
61
- # able to get page indexes
59
+ # We don't dump md directly we process, we dump the middle json in order
60
+ # to be able to get page indexes
62
61
  "parse_method": "auto",
63
62
  "dump_md": False,
64
63
  "dump_middle_json": True,
@@ -104,7 +103,7 @@ class MinerUPipeline(Pipeline):
104
103
  async def extract_content(
105
104
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
106
105
  ) -> AsyncGenerator[Result, None]:
107
- from mineru.cli.common import aio_do_parse
106
+ from mineru.cli.common import aio_do_parse # noqa: PLC0415
108
107
 
109
108
  docs = list(docs)
110
109
  # TODO: exclude files which are not pdf and return an error
@@ -154,13 +153,15 @@ def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
154
153
 
155
154
  match backend:
156
155
  case MinerUBackend.PIPELINE:
157
- from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
156
+ from mineru.backend.pipeline.pipeline_middle_json_mkcontent import ( # noqa: PLC0415
158
157
  union_make,
159
158
  )
160
159
 
161
160
  return union_make
162
161
  case MinerUBackend.VLM:
163
- from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
162
+ from mineru.backend.vlm.vlm_middle_json_mkcontent import ( # noqa: PLC0415
163
+ union_make,
164
+ )
164
165
 
165
166
  return union_make
166
167
  case _:
@@ -212,7 +213,7 @@ def _dump_md_content(
212
213
  im_dir: Path,
213
214
  md_make_mode: str | None = None,
214
215
  ) -> ConversionOutput:
215
- from mineru.utils.enum_class import MakeMode
216
+ from mineru.utils.enum_class import MakeMode # noqa: PLC0415
216
217
 
217
218
  if md_make_mode is None:
218
219
  md_make_mode = MakeMode.MM_MD
File without changes
File without changes
File without changes