PyPI - docling-core - Versions diffs - 0.0.1__py3-none-any.whl - Mend

docling-core 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docling-core might be problematic. Click here for more details.

Files changed (46)

docling_core/__init__.py +6 -0
docling_core/py.typed +0 -0
docling_core/resources/schemas/doc/ANN.json +171 -0
docling_core/resources/schemas/doc/DOC.json +300 -0
docling_core/resources/schemas/doc/OCR-output.json +166 -0
docling_core/resources/schemas/doc/RAW.json +158 -0
docling_core/resources/schemas/generated/ccs_document_schema.json +1071 -0
docling_core/resources/schemas/generated/minimal_document_schema_flat.json +1129 -0
docling_core/resources/schemas/search/search_doc_mapping.json +104 -0
docling_core/resources/schemas/search/search_doc_mapping_v2.json +256 -0
docling_core/search/__init__.py +6 -0
docling_core/search/json_schema_to_search_mapper.py +406 -0
docling_core/search/mapping.py +29 -0
docling_core/search/meta.py +93 -0
docling_core/search/package.py +56 -0
docling_core/types/__init__.py +25 -0
docling_core/types/base.py +248 -0
docling_core/types/doc/__init__.py +6 -0
docling_core/types/doc/base.py +199 -0
docling_core/types/doc/doc_ann.py +76 -0
docling_core/types/doc/doc_ocr.py +83 -0
docling_core/types/doc/doc_raw.py +187 -0
docling_core/types/doc/document.py +393 -0
docling_core/types/gen/__init__.py +6 -0
docling_core/types/gen/generic.py +33 -0
docling_core/types/nlp/__init__.py +6 -0
docling_core/types/nlp/qa.py +74 -0
docling_core/types/nlp/qa_labels.py +118 -0
docling_core/types/rec/__init__.py +6 -0
docling_core/types/rec/attribute.py +55 -0
docling_core/types/rec/base.py +90 -0
docling_core/types/rec/predicate.py +133 -0
docling_core/types/rec/record.py +95 -0
docling_core/types/rec/statement.py +41 -0
docling_core/types/rec/subject.py +77 -0
docling_core/utils/__init__.py +6 -0
docling_core/utils/alias.py +27 -0
docling_core/utils/ds_generate_docs.py +144 -0
docling_core/utils/ds_generate_jsonschema.py +62 -0
docling_core/utils/validate.py +86 -0
docling_core/utils/validators.py +100 -0
docling_core-0.0.1.dist-info/LICENSE +21 -0
docling_core-0.0.1.dist-info/METADATA +133 -0
docling_core-0.0.1.dist-info/RECORD +46 -0
docling_core-0.0.1.dist-info/WHEEL +4 -0
docling_core-0.0.1.dist-info/entry_points.txt +5 -0

docling_core/types/base.py ADDED Viewed

@@ -0,0 +1,248 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+"""Define common models across types."""
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Generic, Hashable, List, Literal, Optional, TypeVar
+from pydantic import (
+    AfterValidator,
+    AnyUrl,
+    BaseModel,
+    Field,
+    PlainSerializer,
+    StrictStr,
+    StringConstraints,
+    ValidationInfo,
+    WrapValidator,
+    field_validator,
+)
+from pydantic.types import NonNegativeInt
+from typing_extensions import Annotated
+from docling_core.search.mapping import es_field
+from docling_core.search.package import VERSION_PATTERN
+from docling_core.utils.alias import AliasModel
+from docling_core.utils.validators import validate_datetime, validate_unique_list
+# Type variables that parametrize the generic models of this package. Each one
+# is bound to ``str`` or to a pydantic ``BaseModel``, as declared per line.
+LanguageT = TypeVar("LanguageT", bound=str)
+IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
+DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)
+DescriptionAnalyticsT = TypeVar("DescriptionAnalyticsT", bound=BaseModel)
+SubjectTypeT = TypeVar("SubjectTypeT", bound=str)
+SubjectNameTypeT = TypeVar("SubjectNameTypeT", bound=str)
+PredicateValueTypeT = TypeVar("PredicateValueTypeT", bound=str)
+PredicateKeyNameT = TypeVar("PredicateKeyNameT", bound=str)
+PredicateKeyTypeT = TypeVar("PredicateKeyTypeT", bound=str)
+ProvenanceTypeT = TypeVar("ProvenanceTypeT", bound=str)
+CollectionNameTypeT = TypeVar("CollectionNameTypeT", bound=str)
+# A list of exactly two floats (min_length == max_length == 2), tagged with
+# es_field(type="geo_point") in its JSON schema extras.
+# NOTE(review): the order of the two values (lat/lon vs. lon/lat) is not
+# stated here -- confirm against the search mapping.
+Coordinates = Annotated[
+    list[float],
+    Field(min_length=2, max_length=2, json_schema_extra=es_field(type="geo_point")),
+]
+# Element type of UniqueList; bound to Hashable.
+T = TypeVar("T", bound=Hashable)
+# A list that is handed to validate_unique_list after standard validation and
+# that advertises "uniqueItems": True in its JSON schema extras.
+UniqueList = Annotated[
+    List[T],
+    AfterValidator(validate_unique_list),
+    Field(json_schema_extra={"uniqueItems": True}),
+]
+# A datetime validated through validate_datetime (wrap validator) and
+# serialized as an ISO 8601 string after conversion to UTC.
+StrictDateTime = Annotated[
+    datetime,
+    WrapValidator(validate_datetime),
+    PlainSerializer(
+        lambda x: x.astimezone(tz=timezone.utc).isoformat(), return_type=str
+    ),
+]
+# The closed set of values accepted for the method used to obtain data.
+ACQUISITION_TYPE = Literal[
+    "API", "FTP", "Download", "Link", "Web scraping/Crawling", "Other"
+]
+class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"):
+    """Unique identifier of a Docling data object.
+
+    The model is generic over the type of ``type_`` and rejects unknown fields
+    (``extra="forbid"``). ``type_`` and ``name`` are declared with the aliases
+    ``type`` and ``_name``.
+    """
+    type_: IdentifierTypeT = Field(
+        alias="type",
+        description=(
+            "A string representing a collection or database that contains this "
+            "data object."
+        ),
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    value: StrictStr = Field(
+        description=(
+            "The identifier value of the data object within a collection or database."
+        ),
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    # Declared after type_ and value: the validator below reads both of them
+    # from info.data, so this field must keep its position in the class.
+    name: str = Field(
+        alias="_name",
+        title="_Name",
+        description=(
+            "A unique identifier of the data object across Docling, consisting of "
+            "the concatenation of type and value in lower case, separated by hash "
+            "(#)."
+        ),
+        pattern=r"^.+#.+$",
+        strict=True,
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    @field_validator("name")
+    @classmethod
+    def name_from_type_value(cls, v, info: ValidationInfo):
+        """Check that ``name`` equals ``<type>#<value>`` in lower case.
+
+        The comparison only runs when both ``type_`` and ``value`` are present
+        in ``info.data``; otherwise ``v`` is returned unchecked.
+
+        Raises:
+            ValueError: If ``v`` differs from the lower-cased ``type_`` and
+                ``value`` joined by a hash (#).
+        """
+        if (
+            "type_" in info.data
+            and "value" in info.data
+            and v != f"{info.data['type_'].lower()}#{info.data['value'].lower()}"
+        ):
+            raise ValueError(
+                "the _name field must be the concatenation of type and value in lower "
+                "case, separated by hash (#)"
+            )
+        return v
+class Log(AliasModel, extra="forbid"):
+    """Log entry to describe an ETL task on a document.
+
+    ``agent``, ``type_`` (alias ``type``) and ``date`` are required; ``task``
+    and ``comment`` default to None. Unknown fields are rejected
+    (``extra="forbid"``).
+    """
+    task: Optional[StrictStr] = Field(
+        default=None,
+        description=(
+            "An identifier of this task. It may be used to identify this task from "
+            "other tasks of the same agent and type."
+        ),
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    agent: StrictStr = Field(
+        description="The Docling agent that performed the task, e.g., CCS or CXS.",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    type_: StrictStr = Field(
+        alias="type",
+        description="A task category.",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    # Free text; the only field of this model declared without es_field extras.
+    comment: Optional[StrictStr] = Field(
+        default=None,
+        description="A description of the task or any comments in natural language.",
+    )
+    # Typed as StrictDateTime, the annotated datetime alias of this module.
+    date: StrictDateTime = Field(
+        description=(
+            "A string representation of the task execution datetime in ISO 8601 format."
+        )
+    )
+class FileInfoObject(AliasModel):
+    """Filing information for any data object to be stored in a Docling database.
+
+    ``filename`` and ``document_hash`` are required; ``fileprov`` defaults to
+    None. ``fileprov`` and ``document_hash`` are declared with the hyphenated
+    aliases ``filename-prov`` and ``document-hash``.
+    """
+    filename: StrictStr = Field(
+        description="The name of a persistent object that created this data object",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    fileprov: Optional[StrictStr] = Field(
+        default=None,
+        description=(
+            "The provenance of this data object, e.g. an archive file, a URL, or any"
+            " other repository."
+        ),
+        alias="filename-prov",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    document_hash: StrictStr = Field(
+        description=(
+            "A unique identifier of this data object within a collection of a "
+            "Docling database"
+        ),
+        alias="document-hash",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+class CollectionTypeEnum(str, Enum):
+    """Enumeration of valid Docling collection types.
+
+    Members also subclass ``str``; their values are the capitalized names
+    "Generic", "Document" and "Record".
+    """
+    generic = "Generic"
+    document = "Document"
+    record = "Record"
+# Type variable for the ``type`` field of CollectionInfo, bound to
+# CollectionTypeEnum.
+CollectionTypeT = TypeVar("CollectionTypeT", bound=CollectionTypeEnum)
+class CollectionInfo(
+    BaseModel, Generic[CollectionNameTypeT, CollectionTypeT], extra="forbid"
+):
+    """Information of a collection.
+
+    Generic over the type of the collection name and of the collection type.
+    Only ``type`` is required; ``name``, ``version`` and ``alias`` default to
+    None. Unknown fields are rejected (``extra="forbid"``).
+    """
+    name: Optional[CollectionNameTypeT] = Field(
+        default=None,
+        description="Name of the collection.",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    type: CollectionTypeT = Field(
+        ...,
+        description="The collection type.",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    # When set, must be a string (strict mode) matching VERSION_PATTERN, which
+    # is imported from docling_core.search.package.
+    version: Optional[
+        Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)]
+    ] = Field(
+        default=None,
+        description="The version of this collection model.",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    alias: Optional[list[StrictStr]] = Field(
+        default=None,
+        description="A list of tags (aliases) for the collection.",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+class CollectionDocumentInfo(
+    CollectionInfo[CollectionNameTypeT, Literal[CollectionTypeEnum.document]],
+    Generic[CollectionNameTypeT],
+    extra="forbid",
+):
+    """Information of a collection of type Document.
+
+    Specializes CollectionInfo by fixing its type parameter to
+    ``Literal[CollectionTypeEnum.document]``; the model stays generic over the
+    collection name type and adds no fields of its own.
+    """
+class CollectionRecordInfo(
+    CollectionInfo[CollectionNameTypeT, Literal[CollectionTypeEnum.record]],
+    Generic[CollectionNameTypeT],
+    extra="forbid",
+):
+    """Information of a collection of type Record.
+
+    Specializes CollectionInfo by fixing its type parameter to
+    ``Literal[CollectionTypeEnum.record]``; the model stays generic over the
+    collection name type and adds no fields of its own.
+    """
+class Acquisition(BaseModel, extra="forbid"):
+    """Information on how the data was obtained.
+
+    Only ``type`` is required; ``date``, ``link`` and ``size`` default to None.
+    Unknown fields are rejected (``extra="forbid"``).
+    """
+    # Restricted to the literal values listed in ACQUISITION_TYPE.
+    type: ACQUISITION_TYPE = Field(
+        description="The method to obtain the data.",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    date: Optional[StrictDateTime] = Field(
+        default=None,
+        description=(
+            "A string representation of the acquisition datetime in ISO 8601 format."
+        ),
+    )
+    # Validated as a URL (AnyUrl) when provided.
+    link: Optional[AnyUrl] = Field(
+        default=None,
+        description="Link to the data source of this document.",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+    # Typed NonNegativeInt, so negative sizes are not accepted.
+    size: Optional[NonNegativeInt] = Field(
+        default=None,
+        description="Size in bytes of the raw document from the data source.",
+        json_schema_extra=es_field(type="long"),
+    )

docling_core/types/doc/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+"""Package for models defined by the Document type."""

docling_core/types/doc/base.py ADDED Viewed

@@ -0,0 +1,199 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+"""Define common models across CCS objects."""
+from typing import Annotated, Literal, Optional, Union
+from pydantic import BaseModel, Field, StrictStr
+from docling_core.search.mapping import es_field
+from docling_core.utils.alias import AliasModel
+# One entry of cell data: four floats followed by two strings.
+# NOTE(review): presumably ordered like CellHeader (x0, y0, x1, y1, font,
+# text) -- confirm.
+CellData = tuple[float, float, float, float, str, str]
+# The fixed tuple of column labels that accompanies cell data.
+CellHeader = tuple[
+    Literal["x0"],
+    Literal["y0"],
+    Literal["x1"],
+    Literal["y1"],
+    Literal["font"],
+    Literal["text"],
+]
+# A list of exactly four floats.
+# NOTE(review): coordinate order and origin are not stated here -- confirm.
+BoundingBox = Annotated[list[float], Field(min_length=4, max_length=4)]
+# A list of exactly two integers.
+# NOTE(review): presumably [start, end] offsets -- confirm.
+Span = Annotated[list[int], Field(min_length=2, max_length=2)]
+class CellsContainer(BaseModel):
+    """Cell container."""
+    # Optional list of CellData tuples; defaults to None.
+    data: Optional[list[CellData]] = None
+    # Column labels; defaults to the only value the CellHeader type admits.
+    header: CellHeader = ("x0", "y0", "x1", "y1", "font", "text")
+class S3Resource(BaseModel):
+    """Resource in a cloud object storage."""
+    # NOTE(review): presumably the MIME type of the resource -- confirm.
+    mime: str
+    # NOTE(review): presumably the object path/key in the storage -- confirm.
+    path: str
+    # Optional page number; defaults to None.
+    page: Optional[int] = None
+class S3Data(AliasModel):
+    """Data object in a cloud object storage.
+
+    Every field is optional and defaults to None. All fields except
+    ``figures`` are declared with a hyphenated alias (e.g. ``pdf-document``).
+    """
+    # List-valued resources.
+    pdf_document: Optional[list[S3Resource]] = Field(default=None, alias="pdf-document")
+    pdf_pages: Optional[list[S3Resource]] = Field(default=None, alias="pdf-pages")
+    pdf_images: Optional[list[S3Resource]] = Field(default=None, alias="pdf-images")
+    # Single-valued resources.
+    json_document: Optional[S3Resource] = Field(default=None, alias="json-document")
+    json_meta: Optional[S3Resource] = Field(default=None, alias="json-meta")
+    glm_json_document: Optional[S3Resource] = Field(
+        default=None, alias="glm-json-document"
+    )
+    figures: Optional[list[S3Resource]] = None
+class S3Reference(AliasModel):
+    """Reference to an S3 resource."""
+    # Required strict string declared with the alias "__ref_s3_data"; the
+    # schema example shows a pointer-like value ("#/_s3_data/figures/0").
+    ref_s3_data: StrictStr = Field(
+        alias="__ref_s3_data", examples=["#/_s3_data/figures/0"]
+    )
+class Prov(AliasModel):
+    """Provenance.
+
+    ``bbox``, ``page`` and ``span`` are required; ``ref_s3_data`` (alias
+    ``__ref_s3_data``) defaults to None.
+    """
+    bbox: BoundingBox
+    page: int
+    span: Span
+    # Marked with es_field(suppress=True).
+    # NOTE(review): presumably excludes the field from the search mapping --
+    # confirm in docling_core.search.mapping.
+    ref_s3_data: Optional[StrictStr] = Field(
+        default=None, alias="__ref_s3_data", json_schema_extra=es_field(suppress=True)
+    )
+class BoundingBoxContainer(BaseModel):
+    """Bounding box container holding two required BoundingBox values."""
+    # NOTE(review): the meaning of min/max (e.g. tightest vs. loosest box) is
+    # not stated here -- confirm.
+    min: BoundingBox
+    max: BoundingBox
+class BitmapObject(AliasModel):
+    """Bitmap object. All three fields are required."""
+    # Declared with the alias "type".
+    obj_type: str = Field(alias="type")
+    # NOTE(review): unlike Table and BaseCell in this module, this field has
+    # no "bounding-box" alias and is not optional -- confirm this is intended.
+    bounding_box: BoundingBoxContainer = Field(
+        json_schema_extra=es_field(suppress=True)
+    )
+    prov: Prov
+class PageDimensions(BaseModel):
+    """Page dimensions: a height and a width associated with a page number."""
+    # NOTE(review): the unit of height/width is not stated here -- confirm.
+    height: float
+    page: int
+    width: float
+class TableCell(AliasModel):
+    """Table cell.
+
+    ``text`` and ``obj_type`` (alias ``type``) are required; ``bbox`` and
+    ``spans`` default to None.
+    """
+    bbox: Optional[BoundingBox] = None
+    spans: Optional[list[Span]] = None
+    text: str = Field(json_schema_extra=es_field(term_vector="with_positions_offsets"))
+    obj_type: str = Field(alias="type")
+class GlmTableCell(TableCell):
+    """Glm Table cell.
+
+    Extends TableCell with column/row position, header flags and spans. All
+    added fields are optional (None or False by default), carry
+    es_field(suppress=True), and the multi-word ones use hyphenated aliases
+    (e.g. ``col-header``).
+    """
+    # Column-related fields.
+    col: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
+    col_header: bool = Field(
+        default=False, alias="col-header", json_schema_extra=es_field(suppress=True)
+    )
+    col_span: Optional[Span] = Field(
+        default=None, alias="col-span", json_schema_extra=es_field(suppress=True)
+    )
+    # Row-related fields, mirroring the column ones.
+    row: Optional[int] = Field(default=None, json_schema_extra=es_field(suppress=True))
+    row_header: bool = Field(
+        default=False, alias="row-header", json_schema_extra=es_field(suppress=True)
+    )
+    row_span: Optional[Span] = Field(
+        default=None, alias="row-span", json_schema_extra=es_field(suppress=True)
+    )
+class Table(AliasModel):
+    """Table.
+
+    ``num_cols``, ``num_rows`` and ``obj_type`` are required and declared with
+    the aliases ``#-cols``, ``#-rows`` and ``type``; the remaining fields
+    default to None.
+    """
+    num_cols: int = Field(alias="#-cols")
+    num_rows: int = Field(alias="#-rows")
+    bounding_box: Optional[BoundingBoxContainer] = Field(
+        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
+    )
+    # Nested list of cells; each cell is a GlmTableCell or a TableCell.
+    # NOTE(review): presumably one inner list per table row -- confirm.
+    data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
+    model: Optional[str] = None
+    prov: Optional[list[Prov]] = None
+    text: Optional[str] = Field(
+        default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
+    )
+    obj_type: str = Field(
+        alias="type",
+        json_schema_extra=es_field(type="keyword", ignore_above=8191),
+    )
+class BaseCell(AliasModel):
+    """Base cell.
+
+    Only ``obj_type`` (alias ``type``) is required; ``bounding_box`` (alias
+    ``bounding-box``), ``prov`` and ``text`` default to None.
+    """
+    bounding_box: Optional[BoundingBoxContainer] = Field(
+        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
+    )
+    prov: Optional[list[Prov]] = None
+    text: Optional[str] = None
+    obj_type: str = Field(
+        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+class BaseText(AliasModel):
+    """Base model for text objects.
+
+    ``text`` and ``obj_type`` (alias ``type``) are required strict strings;
+    ``name``, ``font`` and ``prov`` default to None.
+    """
+    text: StrictStr = Field(
+        json_schema_extra=es_field(term_vector="with_positions_offsets")
+    )
+    obj_type: StrictStr = Field(
+        alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    name: Optional[StrictStr] = Field(
+        default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
+    )
+    font: Optional[str] = None
+    prov: Optional[list[Prov]] = None
+class ListItem(BaseText):
+    """List item: a BaseText with an additional required ``identifier``."""
+    # NOTE(review): presumably the item marker (e.g. a bullet or number) --
+    # confirm.
+    identifier: str
+class Ref(AliasModel):
+    """Reference made of three required strings."""
+    name: str
+    # Declared with the alias "type".
+    obj_type: str = Field(alias="type")
+    # Declared with the alias "$ref".
+    ref: str = Field(alias="$ref")
+class PageReference(BaseModel):
+    """Page reference. All three fields are required."""
+    hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
+    # Marked with es_field(suppress=True).
+    model: str = Field(json_schema_extra=es_field(suppress=True))
+    # Tagged as a "short" field via es_field.
+    page: int = Field(json_schema_extra=es_field(type="short"))

docling_core/types/doc/doc_ann.py ADDED Viewed

@@ -0,0 +1,76 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+"""Models for annotations and predictions in CCS."""
+from typing import Any
+from pydantic import BaseModel
+from docling_core.types.doc.base import BoundingBox
+# Placeholder alias: annotation reports currently accept any value. The TODO
+# is left unspecified in the original.
+AnnotationReport = Any  # TODO
+class Cell(BaseModel):
+    """Annotated cell. All three fields are required."""
+    id: int
+    # NOTE(review): presumably the id of the corresponding cell in the raw
+    # document -- confirm.
+    rawcell_id: int
+    label: str
+class Cluster(BaseModel):
+    """Cluster. All fields are required."""
+    model: str
+    type: str
+    bbox: BoundingBox
+    # NOTE(review): presumably the ids of the cells grouped in this cluster --
+    # confirm.
+    cell_ids: list[int]
+    merged: bool
+    id: int
+class Table(BaseModel):
+    """Table annotation attached to a cell. All fields are required."""
+    cell_id: int
+    label: str
+    # NOTE(review): presumably the row and column indices covered by the
+    # cell -- confirm.
+    rows: list[int]
+    cols: list[int]
+class Info(BaseModel):
+    """Info: five required strings describing a model."""
+    display_name: str
+    # NOTE(review): field names starting with "model_" fall into pydantic v2's
+    # protected namespace in some releases and may trigger a warning at class
+    # creation -- confirm, and consider protected_namespaces=() if so.
+    model_name: str
+    model_class: str
+    model_version: str
+    model_id: str
+class Source(BaseModel):
+    """Source of an annotation or prediction item. All fields are required."""
+    type: str
+    # NOTE(review): a float; presumably seconds since the epoch -- confirm.
+    timestamp: float
+    info: Info
+class AnnotPredItem(BaseModel):
+    """Annotation or prediction item.
+
+    Groups the cells, clusters and tables of one item together with the
+    source that produced it. All fields are required.
+    """
+    cells: list[Cell]
+    clusters: list[Cluster]
+    tables: list[Table]
+    source: Source
+class Annotation(BaseModel):
+    """Annotations, predictions and reports. All fields are required."""
+    # Annotations and predictions share the same item model.
+    annotations: list[AnnotPredItem]
+    predictions: list[AnnotPredItem]
+    # AnnotationReport is currently an alias of Any, so items are unchecked.
+    reports: list[AnnotationReport]

docling_core/types/doc/doc_ocr.py ADDED Viewed

@@ -0,0 +1,83 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+"""Models for CCS objects with OCR."""
+from typing import Any, Dict, List, Literal
+from pydantic import BaseModel, Field
+from docling_core.types.doc.base import BoundingBox
+from docling_core.utils.alias import AliasModel
+# Labels allowed as items of Meta.coords_order.
+CoordsOrder = Literal["x1", "y1", "x2", "y2"]
+# The only coordinate origin accepted so far; the TODO is left unspecified in
+# the original.
+CoordsOrigin = Literal["top-left"]  # TODO
+# Untyped placeholder for OcrOutput.info: any dict with string keys.
+Info = Dict[str, Any]  # TODO
+class Page(BaseModel):
+    """Page with a required width and height."""
+    # NOTE(review): the unit of width/height is not stated here -- confirm.
+    width: float
+    height: float
+class Meta(AliasModel):
+    """Meta information of an OCR output. All fields are required."""
+    page: Page
+    # Declared with the hyphenated aliases "coords-order" and "coords-origin".
+    coords_order: List[CoordsOrder] = Field(..., alias="coords-order")
+    coords_origin: CoordsOrigin = Field(..., alias="coords-origin")
+class Dimension(BaseModel):
+    """Dimension with a required width and height."""
+    # Same two fields as Page in this module.
+    width: float
+    height: float
+class Word(BaseModel):
+    """Word: a required confidence, bounding box and content string."""
+    # NOTE(review): the value range of confidence is not stated here --
+    # confirm.
+    confidence: float
+    bbox: BoundingBox
+    content: str
+class Cell(BaseModel):
+    """Cell: a required confidence, bounding box and content string."""
+    # Same three fields as Word and Box in this module.
+    confidence: float
+    bbox: BoundingBox
+    content: str
+class Box(BaseModel):
+    """Box: a required confidence, bounding box and content string."""
+    # Same three fields as Word and Cell in this module.
+    confidence: float
+    bbox: BoundingBox
+    content: str
+class Path(BaseModel):
+    """Path described by two required lists of floats."""
+    # NOTE(review): presumably parallel lists of x and y coordinates of the
+    # path points; equal length is not enforced here -- confirm.
+    x: List[float]
+    y: List[float]
+class OcrOutput(AliasModel):
+    """OCR output. All fields are required."""
+    # Declared with the alias "_meta".
+    meta: Meta = Field(..., alias="_meta")
+    # Info is an alias of Dict[str, Any] in this module, so content is untyped.
+    info: Info
+    dimension: Dimension
+    words: List[Word]
+    cells: List[Cell]
+    boxes: List[Box]
+    paths: List[Path]