docling-core 1.6.1__tar.gz → 1.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (51) hide show
  1. {docling_core-1.6.1 → docling_core-1.6.3}/PKG-INFO +1 -1
  2. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/search/json_schema_to_search_mapper.py +6 -4
  3. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/doc/document.py +35 -5
  4. {docling_core-1.6.1 → docling_core-1.6.3}/pyproject.toml +1 -1
  5. {docling_core-1.6.1 → docling_core-1.6.3}/LICENSE +0 -0
  6. {docling_core-1.6.1 → docling_core-1.6.3}/README.md +0 -0
  7. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/__init__.py +0 -0
  8. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/py.typed +0 -0
  9. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/resources/schemas/doc/ANN.json +0 -0
  10. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/resources/schemas/doc/DOC.json +0 -0
  11. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  12. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/resources/schemas/doc/RAW.json +0 -0
  13. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  14. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  15. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  16. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  17. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/search/__init__.py +0 -0
  18. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/search/mapping.py +0 -0
  19. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/search/meta.py +0 -0
  20. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/search/package.py +0 -0
  21. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/transforms/__init__.py +0 -0
  22. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/transforms/chunker/__init__.py +0 -0
  23. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/transforms/chunker/base.py +0 -0
  24. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  25. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/__init__.py +0 -0
  26. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/base.py +0 -0
  27. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/doc/__init__.py +0 -0
  28. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/doc/base.py +0 -0
  29. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/doc/doc_ann.py +0 -0
  30. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/doc/doc_ocr.py +0 -0
  31. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/doc/doc_raw.py +0 -0
  32. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/doc/tokens.py +0 -0
  33. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/gen/__init__.py +0 -0
  34. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/gen/generic.py +0 -0
  35. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/nlp/__init__.py +0 -0
  36. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/nlp/qa.py +0 -0
  37. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/nlp/qa_labels.py +0 -0
  38. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/rec/__init__.py +0 -0
  39. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/rec/attribute.py +0 -0
  40. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/rec/base.py +0 -0
  41. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/rec/predicate.py +0 -0
  42. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/rec/record.py +0 -0
  43. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/rec/statement.py +0 -0
  44. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/types/rec/subject.py +0 -0
  45. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/utils/__init__.py +0 -0
  46. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/utils/alias.py +0 -0
  47. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/utils/ds_generate_docs.py +0 -0
  48. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/utils/ds_generate_jsonschema.py +0 -0
  49. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/utils/file.py +0 -0
  50. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/utils/validate.py +0 -0
  51. {docling_core-1.6.1 → docling_core-1.6.3}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.6.1
3
+ Version: 1.6.3
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -8,7 +8,7 @@ import re
8
8
  from copy import deepcopy
9
9
  from typing import Any, Optional, Pattern, Tuple, TypedDict
10
10
 
11
- from jsonref import JsonRef
11
+ from jsonref import replace_refs
12
12
 
13
13
 
14
14
  class SearchIndexDefinition(TypedDict):
@@ -95,7 +95,11 @@ class JsonSchemaToSearchMapper:
95
95
  which define the fields, their data types, and other specifications to index
96
96
  JSON documents into a Lucene index.
97
97
  """
98
- mapping = JsonRef.replace_refs(schema)
98
+ mapping = deepcopy(schema)
99
+
100
+ mapping = self._suppress(mapping, self._suppress_key)
101
+
102
+ mapping = replace_refs(mapping)
99
103
 
100
104
  mapping = self._merge_unions(mapping)
101
105
 
@@ -105,8 +109,6 @@ class JsonSchemaToSearchMapper:
105
109
 
106
110
  mapping = self._remove_keys(mapping, self._rm_keys)
107
111
 
108
- mapping = self._suppress(mapping, self._suppress_key)
109
-
110
112
  mapping = self._translate_keys_re(mapping)
111
113
 
112
114
  mapping = self._clean(mapping)
@@ -477,6 +477,26 @@ class ExportedCCSDocument(
477
477
  md_texts: list[str] = []
478
478
 
479
479
  if self.main_text is not None:
480
+ # collect all captions embedded in table and figure objects
481
+ # to avoid repeating them
482
+ embedded_captions = set()
483
+ for orig_item in self.main_text[main_text_start:main_text_stop]:
484
+ item = (
485
+ self._resolve_ref(orig_item)
486
+ if isinstance(orig_item, Ref)
487
+ else orig_item
488
+ )
489
+ if item is None:
490
+ continue
491
+
492
+ if (
493
+ isinstance(item, (Table, Figure))
494
+ and item.text
495
+ and item.obj_type in main_text_labels
496
+ ):
497
+ embedded_captions.add(item.text)
498
+
499
+ # serialize document to markdown
480
500
  for orig_item in self.main_text[main_text_start:main_text_stop]:
481
501
  markdown_text = ""
482
502
 
@@ -492,6 +512,11 @@ class ExportedCCSDocument(
492
512
  if isinstance(item, BaseText) and item_type in main_text_labels:
493
513
  text = item.text
494
514
 
515
+ # skip captions if they are embedded in the actual
516
+ # floating object
517
+ if item_type == "caption" and text in embedded_captions:
518
+ continue
519
+
495
520
  # ignore repeated text
496
521
  if prev_text == text or text is None:
497
522
  continue
@@ -523,8 +548,9 @@ class ExportedCCSDocument(
523
548
  isinstance(item, Table)
524
549
  and item.data
525
550
  and item_type in main_text_labels
526
- and not strict_text
527
551
  ):
552
+
553
+ md_table = ""
528
554
  table = []
529
555
  for row in item.data:
530
556
  tmp = []
@@ -545,15 +571,19 @@ class ExportedCCSDocument(
545
571
  disable_numparse=True,
546
572
  )
547
573
 
548
- markdown_text = md_table
574
+ markdown_text = ""
575
+ if item.text:
576
+ markdown_text = item.text
577
+ if not strict_text:
578
+ markdown_text += "\n" + md_table
549
579
 
550
580
  elif isinstance(item, Figure) and item_type in main_text_labels:
551
581
 
552
582
  markdown_text = ""
553
- if not strict_text:
554
- markdown_text = f"{image_placeholder}"
555
583
  if item.text:
556
- markdown_text += "\n" + item.text
584
+ markdown_text = item.text
585
+ if not strict_text:
586
+ markdown_text += f"\n{image_placeholder}"
557
587
 
558
588
  if markdown_text:
559
589
  md_texts.append(markdown_text)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "1.6.1"
3
+ version = "1.6.3"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes