docling-core 1.0.0__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (45)
  1. {docling_core-1.0.0 → docling_core-1.1.0}/PKG-INFO +2 -1
  2. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/document.py +109 -0
  3. {docling_core-1.0.0 → docling_core-1.1.0}/pyproject.toml +3 -2
  4. {docling_core-1.0.0 → docling_core-1.1.0}/LICENSE +0 -0
  5. {docling_core-1.0.0 → docling_core-1.1.0}/README.md +0 -0
  6. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/__init__.py +0 -0
  7. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/py.typed +0 -0
  8. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  9. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  10. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  11. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  12. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  13. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  14. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  15. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  16. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/__init__.py +0 -0
  17. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  18. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/mapping.py +0 -0
  19. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/meta.py +0 -0
  20. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/search/package.py +0 -0
  21. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/__init__.py +0 -0
  22. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/base.py +0 -0
  23. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/__init__.py +0 -0
  24. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/base.py +0 -0
  25. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/doc_ann.py +0 -0
  26. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/doc_ocr.py +0 -0
  27. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/doc/doc_raw.py +0 -0
  28. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/gen/__init__.py +0 -0
  29. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/gen/generic.py +0 -0
  30. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/nlp/__init__.py +0 -0
  31. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/nlp/qa.py +0 -0
  32. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/nlp/qa_labels.py +0 -0
  33. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/__init__.py +0 -0
  34. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/attribute.py +0 -0
  35. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/base.py +0 -0
  36. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/predicate.py +0 -0
  37. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/record.py +0 -0
  38. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/statement.py +0 -0
  39. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/types/rec/subject.py +0 -0
  40. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/__init__.py +0 -0
  41. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/alias.py +0 -0
  42. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/ds_generate_docs.py +0 -0
  43. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/ds_generate_jsonschema.py +0 -0
  44. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/validate.py +0 -0
  45. {docling_core-1.0.0 → docling_core-1.1.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -31,6 +31,7 @@ Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
31
  Requires-Dist: poetry (>=1.8.3,<2.0.0)
32
32
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
33
33
  Requires-Dist: pyproject-toml (>=0.0.10,<0.0.11)
34
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
34
35
  Project-URL: Repository, https://github.com/DS4SD/docling-core
35
36
  Description-Content-Type: text/markdown
36
37
 
@@ -16,6 +16,7 @@ from pydantic import (
16
16
  StrictStr,
17
17
  model_validator,
18
18
  )
19
+ from tabulate import tabulate
19
20
 
20
21
  from docling_core.search.mapping import es_field
21
22
  from docling_core.types.base import (
@@ -391,3 +392,111 @@ class ExportedCCSDocument(
391
392
  item["$ref"] = ref
392
393
 
393
394
  return data
395
+
396
+ def _resolve_ref(self, item: Ref) -> Optional[Table]:
397
+ """Return the resolved reference in case of table reference, otherwise None."""
398
+ result: Optional[Table] = None
399
+
400
+ # NOTE: currently only resolves table refs & makes assumptions on ref parts
401
+ if item.obj_type == "table" and self.tables:
402
+ parts = item.ref.split("/")
403
+ result = self.tables[int(parts[2])]
404
+
405
+ return result
406
+
407
+ def export_to_markdown(
408
+ self,
409
+ delim: str = "\n\n",
410
+ main_text_start: int = 0,
411
+ main_text_stop: Optional[int] = None,
412
+ ) -> str:
413
+ r"""Serialize to Markdown.
414
+
415
+ Operates on a slice of the document's main_text as defined through arguments
416
+ main_text_start and main_text_stop; defaulting to the whole main_text.
417
+
418
+ Args:
419
+ delim (str, optional): Delimiter to use when concatenating the various
420
+ Markdown parts. Defaults to "\n\n".
421
+ main_text_start (int, optional): Main-text slicing start index (inclusive).
422
+ Defaults to 0.
423
+ main_text_end (Optional[int], optional): Main-text slicing stop index
424
+ (exclusive). Defaults to None.
425
+
426
+ Returns:
427
+ str: The exported Markdown representation.
428
+ """
429
+ has_title = False
430
+ prev_text = ""
431
+ md_texts: list[str] = []
432
+
433
+ if self.main_text is not None:
434
+ for orig_item in self.main_text[main_text_start:main_text_stop]:
435
+ markdown_text = ""
436
+
437
+ item = (
438
+ self._resolve_ref(orig_item)
439
+ if isinstance(orig_item, Ref)
440
+ else orig_item
441
+ )
442
+ if item is None:
443
+ continue
444
+
445
+ item_type = item.obj_type
446
+ if isinstance(item, BaseText) and item_type in {
447
+ "title",
448
+ "subtitle-level-1",
449
+ "paragraph",
450
+ "caption",
451
+ }:
452
+ text = item.text
453
+
454
+ # ignore repeated text
455
+ if prev_text == text:
456
+ continue
457
+ else:
458
+ prev_text = text
459
+
460
+ # first title match
461
+ if item_type == "title" and not has_title:
462
+ markdown_text = f"# {text}"
463
+ has_title = True
464
+
465
+ # secondary titles
466
+ elif item_type in {"title", "subtitle-level-1"} or (
467
+ has_title and item_type == "title"
468
+ ):
469
+ markdown_text = f"## {text}"
470
+
471
+ # normal text
472
+ else:
473
+ markdown_text = text
474
+
475
+ elif isinstance(item, Table) and item.data:
476
+ table = []
477
+ for row in item.data:
478
+ tmp = []
479
+ for col in row:
480
+ tmp.append(col.text)
481
+ table.append(tmp)
482
+
483
+ if len(table) > 1 and len(table[0]) > 0:
484
+ try:
485
+ md_table = tabulate(
486
+ table[1:], headers=table[0], tablefmt="github"
487
+ )
488
+ except ValueError:
489
+ md_table = tabulate(
490
+ table[1:],
491
+ headers=table[0],
492
+ tablefmt="github",
493
+ disable_numparse=True,
494
+ )
495
+
496
+ markdown_text = md_table
497
+
498
+ if markdown_text:
499
+ md_texts.append(markdown_text)
500
+
501
+ result = delim.join(md_texts)
502
+ return result
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "1.0.0"
3
+ version = "1.1.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
@@ -53,6 +53,7 @@ jsonref = "^1.1.0"
53
53
  json-schema-for-humans = "^1.0.0"
54
54
  poetry = "^1.8.3"
55
55
  pyproject-toml = "^0.0.10"
56
+ tabulate = "^0.9.0"
56
57
 
57
58
  [tool.poetry.group.dev.dependencies]
58
59
  black = "^24.4.2"
@@ -111,7 +112,7 @@ python_version = "3.9"
111
112
  plugins = ["pydantic.mypy"]
112
113
 
113
114
  [[tool.mypy.overrides]]
114
- module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*"]
115
+ module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*", "tabulate.*"]
115
116
  ignore_missing_imports = true
116
117
 
117
118
  [tool.semantic_release]
File without changes
File without changes