docling-core 1.1.2__tar.gz → 1.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (45) hide show
  1. {docling_core-1.1.2 → docling_core-1.1.4}/PKG-INFO +11 -10
  2. {docling_core-1.1.2 → docling_core-1.1.4}/README.md +10 -8
  3. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/doc/base.py +9 -18
  4. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/doc/document.py +22 -4
  5. {docling_core-1.1.2 → docling_core-1.1.4}/pyproject.toml +1 -2
  6. {docling_core-1.1.2 → docling_core-1.1.4}/LICENSE +0 -0
  7. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/__init__.py +0 -0
  8. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/py.typed +0 -0
  9. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/resources/schemas/doc/ANN.json +0 -0
  10. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/resources/schemas/doc/DOC.json +0 -0
  11. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  12. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/resources/schemas/doc/RAW.json +0 -0
  13. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  14. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  15. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  16. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  17. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/search/__init__.py +0 -0
  18. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  19. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/search/mapping.py +0 -0
  20. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/search/meta.py +0 -0
  21. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/search/package.py +0 -0
  22. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/__init__.py +0 -0
  23. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/base.py +0 -0
  24. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/doc/__init__.py +0 -0
  25. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/doc/doc_ann.py +0 -0
  26. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/doc/doc_ocr.py +0 -0
  27. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/doc/doc_raw.py +0 -0
  28. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/gen/__init__.py +0 -0
  29. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/gen/generic.py +0 -0
  30. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/nlp/__init__.py +0 -0
  31. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/nlp/qa.py +0 -0
  32. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/nlp/qa_labels.py +0 -0
  33. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/rec/__init__.py +0 -0
  34. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/rec/attribute.py +0 -0
  35. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/rec/base.py +0 -0
  36. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/rec/predicate.py +0 -0
  37. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/rec/record.py +0 -0
  38. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/rec/statement.py +0 -0
  39. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/types/rec/subject.py +0 -0
  40. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/utils/__init__.py +0 -0
  41. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/utils/alias.py +0 -0
  42. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/utils/ds_generate_docs.py +0 -0
  43. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/utils/ds_generate_jsonschema.py +0 -0
  44. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/utils/validate.py +0 -0
  45. {docling_core-1.1.2 → docling_core-1.1.4}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.1.2
3
+ Version: 1.1.4
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -28,7 +28,6 @@ Classifier: Typing :: Typed
28
28
  Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
29
29
  Requires-Dist: jsonref (>=1.1.0,<2.0.0)
30
30
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
- Requires-Dist: poetry (>=1.8.3,<2.0.0)
32
31
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
33
32
  Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
34
33
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
@@ -47,7 +46,7 @@ Description-Content-Type: text/markdown
47
46
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
48
47
  [![License MIT](https://img.shields.io/github/license/ds4sd/docling-core)](https://opensource.org/licenses/MIT)
49
48
 
50
- Docling Core is a library that defines the data types in [Docling](https://ds4sd.github.io), leveraging pydantic models.
49
+ Docling Core is a library that defines the data types in [Docling](https://github.com/DS4SD/docling), leveraging pydantic models.
51
50
 
52
51
  ## Installation
53
52
 
@@ -116,13 +115,15 @@ Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
116
115
  If you use Docling Core in your projects, please consider citing the following:
117
116
 
118
117
  ```bib
119
- @software{Docling,
120
- author = {Deep Search Team},
121
- month = {7},
122
- title = {{Docling}},
123
- url = {https://github.com/DS4SD/docling},
124
- version = {main},
125
- year = {2024}
118
+ @techreport{Docling,
119
+ author = "Deep Search Team",
120
+ month = 8,
121
+ title = "Docling Technical Report",
122
+ url = "https://arxiv.org/abs/2408.09869",
123
+ eprint = "2408.09869",
124
+ doi = "10.48550/arXiv.2408.09869",
125
+ version = "1.0.0",
126
+ year = 2024
126
127
  }
127
128
  ```
128
129
 
@@ -10,7 +10,7 @@
10
10
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
11
11
  [![License MIT](https://img.shields.io/github/license/ds4sd/docling-core)](https://opensource.org/licenses/MIT)
12
12
 
13
- Docling Core is a library that defines the data types in [Docling](https://ds4sd.github.io), leveraging pydantic models.
13
+ Docling Core is a library that defines the data types in [Docling](https://github.com/DS4SD/docling), leveraging pydantic models.
14
14
 
15
15
  ## Installation
16
16
 
@@ -79,13 +79,15 @@ Please read [Contributing to Docling Core](./CONTRIBUTING.md) for details.
79
79
  If you use Docling Core in your projects, please consider citing the following:
80
80
 
81
81
  ```bib
82
- @software{Docling,
83
- author = {Deep Search Team},
84
- month = {7},
85
- title = {{Docling}},
86
- url = {https://github.com/DS4SD/docling},
87
- version = {main},
88
- year = {2024}
82
+ @techreport{Docling,
83
+ author = "Deep Search Team",
84
+ month = 8,
85
+ title = "Docling Technical Report",
86
+ url = "https://arxiv.org/abs/2408.09869",
87
+ eprint = "2408.09869",
88
+ doi = "10.48550/arXiv.2408.09869",
89
+ version = "1.0.0",
90
+ year = 2024
89
91
  }
90
92
  ```
91
93
 
@@ -128,37 +128,28 @@ class GlmTableCell(TableCell):
128
128
  )
129
129
 
130
130
 
131
- class Table(AliasModel):
132
- """Table."""
131
+ class BaseCell(AliasModel):
132
+ """Base cell."""
133
133
 
134
- num_cols: int = Field(alias="#-cols")
135
- num_rows: int = Field(alias="#-rows")
136
134
  bounding_box: Optional[BoundingBoxContainer] = Field(
137
135
  default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
138
136
  )
139
- data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
140
- model: Optional[str] = None
141
137
  prov: Optional[list[Prov]] = None
142
138
  text: Optional[str] = Field(
143
139
  default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
144
140
  )
145
141
  obj_type: str = Field(
146
- alias="type",
147
- json_schema_extra=es_field(type="keyword", ignore_above=8191),
142
+ alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
148
143
  )
149
144
 
150
145
 
151
- class BaseCell(AliasModel):
152
- """Base cell."""
146
+ class Table(BaseCell):
147
+ """Table."""
153
148
 
154
- bounding_box: Optional[BoundingBoxContainer] = Field(
155
- default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
156
- )
157
- prov: Optional[list[Prov]] = None
158
- text: Optional[str] = None
159
- obj_type: str = Field(
160
- alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
161
- )
149
+ num_cols: int = Field(alias="#-cols")
150
+ num_rows: int = Field(alias="#-rows")
151
+ data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
152
+ model: Optional[str] = None
162
153
 
163
154
 
164
155
  class BaseText(AliasModel):
@@ -311,6 +311,8 @@ class CCSDocument(
311
311
  @classmethod
312
312
  def from_dict(cls, data):
313
313
  """Validates and fixes the input data."""
314
+ if not isinstance(data, dict):
315
+ return data
314
316
  description_collection = data["description"].get("collection")
315
317
  if not description_collection:
316
318
  data["description"].setdefault("collection", {})
@@ -386,6 +388,8 @@ class ExportedCCSDocument(
386
388
  @classmethod
387
389
  def from_dict(cls, data):
388
390
  """Fix ref in main-text."""
391
+ if not isinstance(data, dict):
392
+ return data
389
393
  if data.get("main-text"):
390
394
  for item in data["main-text"]:
391
395
  if ref := item.pop("__ref", None):
@@ -393,14 +397,28 @@ class ExportedCCSDocument(
393
397
 
394
398
  return data
395
399
 
396
- def _resolve_ref(self, item: Ref) -> Optional[Table]:
397
- """Return the resolved reference in case of table reference, otherwise None."""
398
- result: Optional[Table] = None
400
+ def _resolve_ref(self, item: Ref) -> Optional[Union[BaseCell, BaseText]]:
401
+ """Return the resolved reference.
399
402
 
400
- # NOTE: currently only resolves table refs & makes assumptions on ref parts
403
+ Resolves the Ref object within the document.
404
+ If the object is not found, None is returned.
405
+ """
406
+ result: Optional[Union[BaseCell, BaseText]] = None
407
+
408
+ # NOTE: currently only resolves refs explicitly, such that we can make
409
+ # assumptions on ref parts
401
410
  if item.obj_type == "table" and self.tables:
402
411
  parts = item.ref.split("/")
403
412
  result = self.tables[int(parts[2])]
413
+ elif item.obj_type == "figure" and self.figures:
414
+ parts = item.ref.split("/")
415
+ result = self.figures[int(parts[2])]
416
+ elif item.obj_type == "equation" and self.equations:
417
+ parts = item.ref.split("/")
418
+ result = self.equations[int(parts[2])]
419
+ elif item.obj_type == "footnote" and self.footnotes:
420
+ parts = item.ref.split("/")
421
+ result = self.footnotes[int(parts[2])]
404
422
 
405
423
  return result
406
424
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "1.1.2"
3
+ version = "1.1.4"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
@@ -51,7 +51,6 @@ jsonschema = "^4.16.0"
51
51
  pydantic = "^2.6.0"
52
52
  jsonref = "^1.1.0"
53
53
  json-schema-for-humans = "^1.0.0"
54
- poetry = "^1.8.3"
55
54
  pyproject-toml = "^0.0.10"
56
55
  tabulate = "^0.9.0"
57
56
 
File without changes