docling-core 2.1.0__tar.gz → 2.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (55)
  1. {docling_core-2.1.0 → docling_core-2.2.1}/PKG-INFO +1 -1
  2. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/doc/document.py +19 -1
  3. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/utils/file.py +17 -4
  4. {docling_core-2.1.0 → docling_core-2.2.1}/pyproject.toml +1 -1
  5. {docling_core-2.1.0 → docling_core-2.2.1}/LICENSE +0 -0
  6. {docling_core-2.1.0 → docling_core-2.2.1}/README.md +0 -0
  7. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/__init__.py +0 -0
  8. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/py.typed +0 -0
  9. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  10. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  11. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  12. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  13. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  14. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  15. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  16. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  17. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/search/__init__.py +0 -0
  18. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  19. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/search/mapping.py +0 -0
  20. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/search/meta.py +0 -0
  21. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/search/package.py +0 -0
  22. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/transforms/__init__.py +0 -0
  23. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/transforms/chunker/__init__.py +0 -0
  24. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/transforms/chunker/base.py +0 -0
  25. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  26. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/__init__.py +0 -0
  27. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/base.py +0 -0
  28. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/doc/__init__.py +0 -0
  29. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/doc/base.py +0 -0
  30. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/doc/labels.py +0 -0
  31. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/gen/__init__.py +0 -0
  32. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/gen/generic.py +0 -0
  33. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/legacy_doc/__init__.py +0 -0
  34. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/legacy_doc/base.py +0 -0
  35. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  36. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  37. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  38. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/legacy_doc/document.py +0 -0
  39. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/legacy_doc/tokens.py +0 -0
  40. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/nlp/__init__.py +0 -0
  41. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/nlp/qa.py +0 -0
  42. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/nlp/qa_labels.py +0 -0
  43. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/rec/__init__.py +0 -0
  44. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/rec/attribute.py +0 -0
  45. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/rec/base.py +0 -0
  46. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/rec/predicate.py +0 -0
  47. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/rec/record.py +0 -0
  48. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/rec/statement.py +0 -0
  49. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/types/rec/subject.py +0 -0
  50. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/utils/__init__.py +0 -0
  51. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/utils/alias.py +0 -0
  52. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/utils/generate_docs.py +0 -0
  53. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/utils/generate_jsonschema.py +0 -0
  54. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/utils/validate.py +0 -0
  55. {docling_core-2.1.0 → docling_core-2.2.1}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.1.0
3
+ Version: 2.2.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -591,7 +591,13 @@ class TableItem(FloatingItem):
591
591
  for row in self.data.grid:
592
592
  tmp = []
593
593
  for col in row:
594
- tmp.append(col.text)
594
+
595
+ # make sure that md tables are not broken
596
+ # due to newline chars in the text
597
+ text = col.text
598
+ text = text.replace("\n", " ")
599
+ tmp.append(text)
600
+
595
601
  table.append(tmp)
596
602
 
597
603
  md_table = ""
@@ -1285,6 +1291,18 @@ class DoclingDocument(BaseModel):
1285
1291
  mdtext = re.sub(
1286
1292
  r"\n\n\n+", "\n\n", mdtext
1287
1293
  ) # remove cases of double or more empty lines.
1294
+
1295
+ # Our export markdown doesn't contain any emphasis styling:
1296
+ # Bold, Italic, or Bold-Italic
1297
+ # Hence, any underscore that we print into Markdown is coming from document text
1298
+ # That means we need to escape it, to properly reflect content in the markdown
1299
+ def escape_underscores(text):
1300
+ # Replace "_" with "\_" only if it's not already escaped
1301
+ escaped_text = re.sub(r"(?<!\\)_", r"\_", text)
1302
+ return escaped_text
1303
+
1304
+ mdtext = escape_underscores(mdtext)
1305
+
1288
1306
  return mdtext
1289
1307
 
1290
1308
  def export_to_text( # noqa: C901
@@ -5,15 +5,18 @@
5
5
 
6
6
  """File-related utilities."""
7
7
 
8
+ import importlib
8
9
  import tempfile
9
10
  from pathlib import Path
10
- from typing import Union
11
+ from typing import Dict, Optional, Union
11
12
 
12
13
  import requests
13
14
  from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
14
15
 
15
16
 
16
- def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
17
+ def resolve_file_source(
18
+ source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
19
+ ) -> Path:
17
20
  """Resolves the source (URL, path) of a file to a local file path.
18
21
 
19
22
  If a URL is provided, the content is first downloaded to a temporary local file.
@@ -29,7 +32,17 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
29
32
  """
30
33
  try:
31
34
  http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
32
- res = requests.get(http_url, stream=True)
35
+
36
+ # make all header keys lower case
37
+ _headers = headers or {}
38
+ req_headers = {k.lower(): v for k, v in _headers.items()}
39
+ # add user-agent if not set
40
+ if "user-agent" not in req_headers:
41
+ agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
42
+ req_headers["user-agent"] = agent_name
43
+
44
+ # fetch the page
45
+ res = requests.get(http_url, stream=True, headers=req_headers)
33
46
  res.raise_for_status()
34
47
  fname = None
35
48
  # try to get filename from response header
@@ -41,7 +54,7 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
41
54
  break
42
55
  # otherwise, use name from URL:
43
56
  if fname is None:
44
- fname = Path(http_url.path or "file").name
57
+ fname = Path(http_url.path or "").name or "file"
45
58
  local_path = Path(tempfile.mkdtemp()) / fname
46
59
  with open(local_path, "wb") as f:
47
60
  for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.1.0"
3
+ version = "2.2.1"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes