docling_core-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling-core might be problematic.
- docling_core/__init__.py +6 -0
- docling_core/py.typed +0 -0
- docling_core/resources/schemas/doc/ANN.json +171 -0
- docling_core/resources/schemas/doc/DOC.json +300 -0
- docling_core/resources/schemas/doc/OCR-output.json +166 -0
- docling_core/resources/schemas/doc/RAW.json +158 -0
- docling_core/resources/schemas/generated/ccs_document_schema.json +1071 -0
- docling_core/resources/schemas/generated/minimal_document_schema_flat.json +1129 -0
- docling_core/resources/schemas/search/search_doc_mapping.json +104 -0
- docling_core/resources/schemas/search/search_doc_mapping_v2.json +256 -0
- docling_core/search/__init__.py +6 -0
- docling_core/search/json_schema_to_search_mapper.py +406 -0
- docling_core/search/mapping.py +29 -0
- docling_core/search/meta.py +93 -0
- docling_core/search/package.py +56 -0
- docling_core/types/__init__.py +25 -0
- docling_core/types/base.py +248 -0
- docling_core/types/doc/__init__.py +6 -0
- docling_core/types/doc/base.py +199 -0
- docling_core/types/doc/doc_ann.py +76 -0
- docling_core/types/doc/doc_ocr.py +83 -0
- docling_core/types/doc/doc_raw.py +187 -0
- docling_core/types/doc/document.py +393 -0
- docling_core/types/gen/__init__.py +6 -0
- docling_core/types/gen/generic.py +33 -0
- docling_core/types/nlp/__init__.py +6 -0
- docling_core/types/nlp/qa.py +74 -0
- docling_core/types/nlp/qa_labels.py +118 -0
- docling_core/types/rec/__init__.py +6 -0
- docling_core/types/rec/attribute.py +55 -0
- docling_core/types/rec/base.py +90 -0
- docling_core/types/rec/predicate.py +133 -0
- docling_core/types/rec/record.py +95 -0
- docling_core/types/rec/statement.py +41 -0
- docling_core/types/rec/subject.py +77 -0
- docling_core/utils/__init__.py +6 -0
- docling_core/utils/alias.py +27 -0
- docling_core/utils/ds_generate_docs.py +144 -0
- docling_core/utils/ds_generate_jsonschema.py +62 -0
- docling_core/utils/validate.py +86 -0
- docling_core/utils/validators.py +100 -0
- docling_core-0.0.1.dist-info/LICENSE +21 -0
- docling_core-0.0.1.dist-info/METADATA +133 -0
- docling_core-0.0.1.dist-info/RECORD +46 -0
- docling_core-0.0.1.dist-info/WHEEL +4 -0
- docling_core-0.0.1.dist-info/entry_points.txt +5 -0
docling_core/search/json_schema_to_search_mapper.py
@@ -0,0 +1,406 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""Methods to convert a JSON Schema into a search database schema."""
import re
from copy import deepcopy
from typing import Any, Optional, Pattern, Tuple, TypedDict

from jsonref import JsonRef


class SearchIndexDefinition(TypedDict):
    """Data type for a basic index definition (settings and mappings)."""

    settings: dict
    mappings: dict


class JsonSchemaToSearchMapper:
    """Map a JSON Schema to a search database schema.

    The generated database schema is a mapping describing the fields from the
    JSON Schema and how they should be indexed in a Lucene index of a search
    database.

    Potential issues:
    - Tuples may not be converted properly (e.g., Tuple[float,float,float,str,str]).
    - Method `_remove_keys` may lead to wrong results if a field is named
      `properties`.
    """

    def __init__(
        self,
        settings_extra: Optional[dict] = None,
        mappings_extra: Optional[dict] = None,
    ):
        """Create an instance of the mapper with default settings."""
        self.settings = {
            "analysis": {
                # Create a normalizer for lowercase ASCII folding;
                # it is used in keyword fields.
                "normalizer": {
                    "lowercase_asciifolding": {
                        "type": "custom",
                        "filter": ["lowercase", "asciifolding"],
                    }
                }
            }
        }

        self.settings_extra = settings_extra
        self.mappings_extra = mappings_extra

        self._re_es_flag = re.compile(r"^(?:x-es-)(.*)")

        self._rm_keys = (
            "description",
            "required",
            "title",
            "additionalProperties",
            "format",
            "enum",
            "pattern",
            "$comment",
            "default",
            "minItems",
            "maxItems",
            "minimum",
            "maximum",
            "minLength",
            "maxLength",
            "exclusiveMinimum",
            "exclusiveMaximum",
            "$defs",
            "const",
        )

        self._suppress_key = "x-es-suppress"

        self._type_format_mappings: dict[tuple[str, str], str] = {
            ("string", "date-time"): "date",
        }

        self._type_mappings = {
            "number": "double",
            "string": "text",
        }

        self._types_to_remove = ("object",)

    def get_index_definition(self, schema: dict) -> SearchIndexDefinition:
        """Generate a search database schema from a JSON Schema.

        The search database schema consists of the sections `settings` and
        `mappings`, which define the fields, their data types, and other
        specifications to index JSON documents into a Lucene index.
        """
        mapping = JsonRef.replace_refs(schema)

        mapping = self._merge_unions(mapping)

        mapping = self._clean_types(mapping)

        mapping = self._collapse_arrays(mapping)

        mapping = self._remove_keys(mapping, self._rm_keys)

        mapping = self._suppress(mapping, self._suppress_key)

        mapping = self._translate_keys_re(mapping)

        mapping = self._clean(mapping)

        mapping.pop("definitions", None)

        result = SearchIndexDefinition(
            settings=self.settings,
            mappings=mapping,
        )

        if self.mappings_extra:
            result["mappings"] = {**result["mappings"], **self.mappings_extra}

        if self.settings_extra:
            result["settings"] = {**result["settings"], **self.settings_extra}

        return result

    def _merge_unions(self, doc: dict) -> dict:
        """Merge objects of type anyOf, allOf, or oneOf (options).

        Args:
            doc: A JSON schema or a transformation towards search database mappings.

        Returns:
            A transformation of a JSON schema by merging option fields.
        """

        def _clean(value: Any) -> Any:
            if isinstance(value, list):
                return [_clean(v) for v in value]

            if isinstance(value, dict):
                union: list = []
                merged_union: dict = {}

                for k, v in value.items():
                    if k in ("oneOf", "allOf", "anyOf"):
                        union.extend(v)
                    else:
                        merged_union[k] = v

                if not union:
                    return {k: _clean(v) for k, v in value.items()}

                for u in union:
                    if not isinstance(u, dict):
                        continue

                    for k, v in u.items():
                        if k == "type" and v == "null":  # null values are irrelevant
                            continue
                        elif not isinstance(v, dict) or k not in merged_union:
                            merged_union[k] = _clean(v)
                        elif isinstance(v, dict) and k in merged_union:
                            merged_union[k] = _clean({**merged_union[k], **v})

                return merged_union

            return value

        return _clean(doc)

    def _clean_types(self, doc: dict) -> dict:
        """Clean field types originating from a JSON schema to obtain search mappings.

        Args:
            doc: A JSON schema or a transformation towards search database mappings.

        Returns:
            A transformation of a JSON schema by cleaning field types.
        """

        def _clean(value: Any) -> Any:
            if isinstance(value, list):
                return [_clean(v) for v in value]

            if isinstance(value, dict):
                if isinstance(value.get("type"), str):
                    t: str = value["type"]

                    # Tuples
                    if t == "array" and isinstance(value.get("items"), list):
                        items: list = value["items"]

                        if items:
                            value["items"] = value["items"][0]
                        else:
                            value["items"] = {}

                    # Unwanted types, such as 'object'
                    if t in self._types_to_remove:
                        value.pop("type", None)

                    # Map formats
                    f: str = value.get("format", "")
                    if (t, f) in self._type_format_mappings:
                        value["type"] = self._type_format_mappings[(t, f)]
                        value.pop("format", None)

                    # Map types, such as 'string' to 'text'
                    elif t in self._type_mappings:
                        value["type"] = self._type_mappings[t]

                return {k: _clean(v) for k, v in value.items()}

            return value

        return _clean(doc)

    @staticmethod
    def _collapse_arrays(doc: dict) -> dict:
        """Collapse arrays from a JSON schema to match search database mappings.

        Args:
            doc: A JSON schema or a transformation towards search database mappings.

        Returns:
            A transformation of a JSON schema by collapsing arrays.
        """

        def __collapse(d_: Any) -> Any:
            if isinstance(d_, list):
                return [v for v in (__collapse(v) for v in d_)]

            if isinstance(d_, dict):
                if "type" in d_ and d_["type"] == "array" and "items" in d_:
                    collapsed = __collapse(d_["items"])

                    d_ = deepcopy(d_)
                    d_.pop("items", None)
                    d_.pop("type", None)

                    merged = {**d_, **collapsed}

                    return merged

                return {k: __collapse(v) for k, v in d_.items()}

            return d_

        return __collapse(doc)

    @staticmethod
    def _suppress(doc: dict, suppress_key: str) -> dict:
        """Remove a key from a JSON schema to match search database mappings.

        Args:
            doc: A JSON schema or a transformation towards search database mappings.
            suppress_key: The name of a field to be removed from the `doc`.

        Returns:
            A transformation of a JSON schema by removing the field `suppress_key`.
        """

        def __suppress(d_: Any) -> Any:
            if isinstance(d_, list):
                return [v for v in (__suppress(v) for v in d_)]

            if isinstance(d_, dict):
                if suppress_key in d_ and d_[suppress_key] is True:
                    return {}
                else:
                    return {
                        k: v for k, v in ((k, __suppress(v)) for k, v in d_.items())
                    }
            return d_

        return __suppress(doc)

    @staticmethod
    def _remove_keys(doc: dict, keys: Tuple[str, ...]) -> dict:
        """Remove keys from a JSON schema to match search database mappings.

        Args:
            doc: A JSON schema or a transformation towards search database mappings.
            keys: Fields to be removed from the `doc`.

        Returns:
            A transformation of a JSON schema by removing the fields in `keys`.
        """

        def __remove(d_: Any) -> Any:
            if isinstance(d_, list):
                return [v for v in (__remove(v) for v in d_)]

            if isinstance(d_, dict):
                result = {}
                for k, v in d_.items():
                    if k == "properties" and isinstance(v, dict):
                        # All properties must be included; they are not to be
                        # removed, even if their name matches a key that is to
                        # be removed.
                        result[k] = {p_k: __remove(p_v) for p_k, p_v in v.items()}
                    elif k not in keys:
                        result[k] = __remove(v)

                return result

            return d_

        return __remove(doc)

    @staticmethod
    def _remove_keys_re(doc: dict, regx: Pattern) -> dict:
        """Remove keys from a JSON schema to match search database mappings.

        Args:
            doc: A JSON schema or a transformation towards search database mappings.
            regx: A pattern defining the fields to be removed from the `doc`.

        Returns:
            A transformation of a JSON schema by removing fields with a name pattern.
        """

        def __remove(d_: Any) -> Any:
            if isinstance(d_, list):
                return [v for v in (__remove(v) for v in d_)]

            if isinstance(d_, dict):
                return {
                    k: v
                    for k, v in (
                        (k, __remove(v)) for k, v in d_.items() if not regx.match(k)
                    )
                }

            return d_

        return __remove(doc)

    def _translate_keys_re(self, doc: dict) -> dict:
        """Translate marked keys from a JSON schema to match search database mappings.

        The keys to be translated should have a name that matches the pattern
        defined by this class, for instance, a name starting with `x-es-`.

        Args:
            doc: A JSON schema or a transformation towards search database mappings.

        Returns:
            A transformation of a JSON schema towards search database mappings.
        """

        def __translate(d_: Any) -> Any:
            if isinstance(d_, list):
                return [v for v in (__translate(v) for v in d_)]

            if isinstance(d_, dict):
                new_dict = {}
                for k, v in d_.items():
                    new_dict[k] = __translate(v)

                delkeys = []
                for k in list(new_dict.keys()):
                    k_ = self._re_es_flag.sub(r"\1", k)
                    if k_ != k:
                        new_dict[k_] = new_dict[k]
                        delkeys.append(k)

                for k in delkeys:
                    new_dict.pop(k, None)

                return new_dict

            return d_

        return __translate(doc)

    @staticmethod
    def _clean(doc: dict) -> dict:
        """Recursively remove empty lists, dicts, strings, or None elements from a dict.

        Args:
            doc: A JSON schema or a transformation towards search database mappings.

        Returns:
            A transformation of a JSON schema by removing empty objects.
        """

        def _empty(x) -> bool:
            return x is None or x == {} or x == [] or x == ""

        def _clean(d_: Any) -> Any:
            if isinstance(d_, list):
                return [v for v in (_clean(v) for v in d_) if not _empty(v)]

            if isinstance(d_, dict):
                return {
                    k: v
                    for k, v in ((k, _clean(v)) for k, v in d_.items())
                    if not _empty(v)
                }

            return d_

        return _clean(doc)
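For illustration, a minimal sketch of driving the mapper from a Pydantic model; the `Snippet` model below is hypothetical and not part of the package:

from datetime import datetime

from pydantic import BaseModel

from docling_core.search.json_schema_to_search_mapper import JsonSchemaToSearchMapper


class Snippet(BaseModel):
    # Hypothetical model, for illustration only.
    text: str
    score: float
    created: datetime  # emitted as {"type": "string", "format": "date-time"}


mapper = JsonSchemaToSearchMapper()
index = mapper.get_index_definition(Snippet.model_json_schema())

# index["settings"] carries the lowercase_asciifolding normalizer, while
# index["mappings"]["properties"] becomes:
#   {"text": {"type": "text"}, "score": {"type": "double"},
#    "created": {"type": "date"}}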
docling_core/search/mapping.py
@@ -0,0 +1,29 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""Methods to define fields in an index mapping of a search database."""
from typing import Any, Optional


def es_field(
    *,
    type: Optional[str] = None,
    ignore_above: Optional[int] = None,
    term_vector: Optional[str] = None,
    **kwargs: Any,
):
    """Create x-es kwargs to be passed to a `pydantic.Field` via unpacking."""
    all_kwargs = {**kwargs}

    if type is not None:
        all_kwargs["type"] = type

    if ignore_above is not None:
        all_kwargs["ignore_above"] = ignore_above

    if term_vector is not None:
        all_kwargs["term_vector"] = term_vector

    return {f"x-es-{k}": v for k, v in all_kwargs.items()}
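A sketch of attaching the `es_field` output to a model field. The docstring suggests unpacking the result into `pydantic.Field`; under Pydantic v2 the same keys can also be supplied via `json_schema_extra`, which is the assumption made here:

from pydantic import BaseModel, Field

from docling_core.search.mapping import es_field


class Tagged(BaseModel):
    # Hypothetical model: es_field(...) returns
    # {"x-es-type": "keyword", "x-es-ignore_above": 256}.
    label: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=256))


schema = Tagged.model_json_schema()
assert schema["properties"]["label"]["x-es-type"] == "keyword"
# JsonSchemaToSearchMapper._translate_keys_re later strips the "x-es-" prefix,
# so the field is indexed as {"type": "keyword", "ignore_above": 256}.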
docling_core/search/meta.py
@@ -0,0 +1,93 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""Models and methods to define the metadata fields in database index mappings."""
from pathlib import Path
from typing import Generic, Optional, TypeVar

from pydantic import BaseModel, Field, StrictStr, ValidationInfo, field_validator

from docling_core.search.package import Package
from docling_core.types.base import CollectionTypeEnum, StrictDateTime, UniqueList
from docling_core.utils.alias import AliasModel

ClassificationT = TypeVar("ClassificationT", bound=str)
DomainT = TypeVar("DomainT", bound=str)


class S3Path(BaseModel, extra="forbid"):
    """The path details within a cloud object storage for CCS-parsed files."""

    bucket: StrictStr
    prefix: StrictStr
    infix: StrictStr

    def __hash__(self):
        """Return the hash value for this S3Path object."""
        return hash((type(self),) + tuple(self.__dict__.values()))


class S3CcsData(BaseModel, extra="forbid"):
    """The access details to a cloud object storage for CCS-parsed files."""

    endpoint: StrictStr
    paths: UniqueList[S3Path] = Field(min_length=1)


class DocumentLicense(BaseModel, extra="forbid"):
    """Document license for a search database index within the index mappings."""

    code: Optional[list[StrictStr]] = None
    text: Optional[list[StrictStr]] = None


class Meta(AliasModel, Generic[ClassificationT, DomainT], extra="forbid"):
    """Metadata of a search database index within the index mappings."""

    aliases: Optional[list[StrictStr]] = None
    created: StrictDateTime
    description: Optional[StrictStr] = None
    source: StrictStr
    storage: Optional[StrictStr] = None
    display_name: Optional[StrictStr] = None
    type: CollectionTypeEnum
    classification: Optional[list[ClassificationT]] = None
    version: UniqueList[Package] = Field(min_length=1)
    license: Optional[StrictStr] = None
    filename: Optional[Path] = None
    domain: Optional[list[DomainT]] = None
    reference: Optional[StrictStr] = Field(default=None, alias="$ref")
    ccs_s3_data: Optional[S3CcsData] = None
    document_license: Optional[DocumentLicense] = None
    index_key: Optional[StrictStr] = None
    project_key: Optional[StrictStr] = None

    @field_validator("reference")
    @classmethod
    def reference_for_document(cls, v, info: ValidationInfo):
        """Validate the reference field for indexes of type Document."""
        if "type" in info.data and info.data["type"] == "Document":
            if v and v != "ccs:schemas#/Document":
                raise ValueError("wrong reference value for Document type")
            else:
                return "ccs:schemas#/Document"
        else:
            return v

    @field_validator("version")
    @classmethod
    def version_has_schema(cls, v):
        """Validate that the docling-core library is always set in the version field."""
        docling_core = [item for item in v if item.name == "docling-core"]
        if not docling_core:
            raise ValueError(
                "the version should include at least a valid docling-core package"
            )
        elif len(docling_core) > 1:
            raise ValueError(
                "the version must not include more than 1 docling-core package"
            )
        else:
            return v
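A rough sketch of the two validators in action, assuming that "Document" is a valid `CollectionTypeEnum` member and that `StrictDateTime` accepts a timezone-aware `datetime`; all field values are illustrative:

from datetime import datetime, timezone

from docling_core.search.meta import Meta
from docling_core.search.package import Package

meta = Meta(
    created=datetime(2024, 1, 1, tzinfo=timezone.utc),  # illustrative value
    source="internal-s3",  # illustrative value
    type="Document",  # assumed CollectionTypeEnum member
    version=[Package(name="docling-core", version="0.0.1")],
)

# An empty `version` list, or one with a second docling-core entry, would be
# rejected by `version_has_schema` with a ValidationError.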
docling_core/search/package.py
@@ -0,0 +1,56 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""Models and methods to define a package model."""

import importlib.metadata
import re
from typing import Final

from pydantic import BaseModel, StrictStr, StringConstraints
from typing_extensions import Annotated

VERSION_PATTERN: Final = (
    r"^(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)\.(?P<patch>0|[1-9]\d*)"
    r"(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)"
    r"(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+"
    r"(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$"
)


class Package(BaseModel, extra="forbid"):
    """Representation of a software package.

    The version needs to comply with Semantic Versioning 2.0.0.
    """

    name: StrictStr
    version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = (
        importlib.metadata.version("docling-core")
    )

    def __hash__(self):
        """Return the hash value for this Package object."""
        return hash((type(self),) + tuple(self.__dict__.values()))

    def get_major(self):
        """Get the major version of this package."""
        return re.match(VERSION_PATTERN, self.version)["major"]

    def get_minor(self):
        """Get the minor version of this package."""
        return re.match(VERSION_PATTERN, self.version)["minor"]

    def get_patch(self):
        """Get the patch version of this package."""
        return re.match(VERSION_PATTERN, self.version)["patch"]

    def get_pre_release(self):
        """Get the pre-release version of this package."""
        return re.match(VERSION_PATTERN, self.version)["prerelease"]

    def get_build_metadata(self):
        """Get the build metadata of this package."""
        return re.match(VERSION_PATTERN, self.version)["buildmetadata"]
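The accessors simply re-apply `VERSION_PATTERN` and read the named groups, which are returned as strings, e.g.:

from docling_core.search.package import Package

pkg = Package(name="docling-core", version="1.2.3-rc.1+build.5")
assert pkg.get_major() == "1"  # groups are strings, not ints
assert pkg.get_minor() == "2"
assert pkg.get_patch() == "3"
assert pkg.get_pre_release() == "rc.1"
assert pkg.get_build_metadata() == "build.5"  # None when the part is absent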
docling_core/types/__init__.py
@@ -0,0 +1,25 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""Define the main types."""

from docling_core.types.doc.base import BoundingBox  # noqa
from docling_core.types.doc.base import Table  # noqa
from docling_core.types.doc.base import TableCell  # noqa
from docling_core.types.doc.base import (  # noqa
    BaseCell,
    BaseText,
    PageDimensions,
    PageReference,
    Prov,
    Ref,
)
from docling_core.types.doc.document import (  # noqa
    CCSDocumentDescription as DocumentDescription,
)
from docling_core.types.doc.document import CCSFileInfoObject as FileInfoObject  # noqa
from docling_core.types.doc.document import ExportedCCSDocument as Document  # noqa
from docling_core.types.gen.generic import Generic  # noqa
from docling_core.types.rec.record import Record  # noqa
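The effect of these re-exports is that downstream code can import the short public names, e.g.:

from docling_core.types import BoundingBox, Document, Record

# `Document` resolves to `ExportedCCSDocument`, as aliased above.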