docling-core 2.1.0__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/types/doc/document.py +19 -1
- docling_core/utils/file.py +17 -4
- {docling_core-2.1.0.dist-info → docling_core-2.2.1.dist-info}/METADATA +1 -1
- {docling_core-2.1.0.dist-info → docling_core-2.2.1.dist-info}/RECORD +7 -7
- {docling_core-2.1.0.dist-info → docling_core-2.2.1.dist-info}/LICENSE +0 -0
- {docling_core-2.1.0.dist-info → docling_core-2.2.1.dist-info}/WHEEL +0 -0
- {docling_core-2.1.0.dist-info → docling_core-2.2.1.dist-info}/entry_points.txt +0 -0
|
@@ -591,7 +591,13 @@ class TableItem(FloatingItem):
|
|
|
591
591
|
for row in self.data.grid:
|
|
592
592
|
tmp = []
|
|
593
593
|
for col in row:
|
|
594
|
-
|
|
594
|
+
|
|
595
|
+
# make sure that md tables are not broken
|
|
596
|
+
# due to newline chars in the text
|
|
597
|
+
text = col.text
|
|
598
|
+
text = text.replace("\n", " ")
|
|
599
|
+
tmp.append(text)
|
|
600
|
+
|
|
595
601
|
table.append(tmp)
|
|
596
602
|
|
|
597
603
|
md_table = ""
|
|
@@ -1285,6 +1291,18 @@ class DoclingDocument(BaseModel):
|
|
|
1285
1291
|
mdtext = re.sub(
|
|
1286
1292
|
r"\n\n\n+", "\n\n", mdtext
|
|
1287
1293
|
) # remove cases of double or more empty lines.
|
|
1294
|
+
|
|
1295
|
+
# Our export markdown doesn't contain any emphasis styling:
|
|
1296
|
+
# Bold, Italic, or Bold-Italic
|
|
1297
|
+
# Hence, any underscore that we print into Markdown is coming from document text
|
|
1298
|
+
# That means we need to escape it, to properly reflect content in the markdown
|
|
1299
|
+
def escape_underscores(text):
|
|
1300
|
+
# Replace "_" with "\_" only if it's not already escaped
|
|
1301
|
+
escaped_text = re.sub(r"(?<!\\)_", r"\_", text)
|
|
1302
|
+
return escaped_text
|
|
1303
|
+
|
|
1304
|
+
mdtext = escape_underscores(mdtext)
|
|
1305
|
+
|
|
1288
1306
|
return mdtext
|
|
1289
1307
|
|
|
1290
1308
|
def export_to_text( # noqa: C901
|
docling_core/utils/file.py
CHANGED
|
@@ -5,15 +5,18 @@
|
|
|
5
5
|
|
|
6
6
|
"""File-related utilities."""
|
|
7
7
|
|
|
8
|
+
import importlib
|
|
8
9
|
import tempfile
|
|
9
10
|
from pathlib import Path
|
|
10
|
-
from typing import Union
|
|
11
|
+
from typing import Dict, Optional, Union
|
|
11
12
|
|
|
12
13
|
import requests
|
|
13
14
|
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
14
15
|
|
|
15
16
|
|
|
16
|
-
def resolve_file_source(
|
|
17
|
+
def resolve_file_source(
|
|
18
|
+
source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
|
|
19
|
+
) -> Path:
|
|
17
20
|
"""Resolves the source (URL, path) of a file to a local file path.
|
|
18
21
|
|
|
19
22
|
If a URL is provided, the content is first downloaded to a temporary local file.
|
|
@@ -29,7 +32,17 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
|
|
|
29
32
|
"""
|
|
30
33
|
try:
|
|
31
34
|
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
|
|
32
|
-
|
|
35
|
+
|
|
36
|
+
# make all header keys lower case
|
|
37
|
+
_headers = headers or {}
|
|
38
|
+
req_headers = {k.lower(): v for k, v in _headers.items()}
|
|
39
|
+
# add user-agent if not set
|
|
40
|
+
if "user-agent" not in req_headers:
|
|
41
|
+
agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
|
|
42
|
+
req_headers["user-agent"] = agent_name
|
|
43
|
+
|
|
44
|
+
# fetch the page
|
|
45
|
+
res = requests.get(http_url, stream=True, headers=req_headers)
|
|
33
46
|
res.raise_for_status()
|
|
34
47
|
fname = None
|
|
35
48
|
# try to get filename from response header
|
|
@@ -41,7 +54,7 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
|
|
|
41
54
|
break
|
|
42
55
|
# otherwise, use name from URL:
|
|
43
56
|
if fname is None:
|
|
44
|
-
fname = Path(http_url.path or "
|
|
57
|
+
fname = Path(http_url.path or "").name or "file"
|
|
45
58
|
local_path = Path(tempfile.mkdtemp()) / fname
|
|
46
59
|
with open(local_path, "wb") as f:
|
|
47
60
|
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
|
|
@@ -21,7 +21,7 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
|
|
|
21
21
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
22
22
|
docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
|
|
23
23
|
docling_core/types/doc/base.py,sha256=zvx631U_yQCcJam83hNdDanXEYnO3eN-CCw9vDr6S-I,4442
|
|
24
|
-
docling_core/types/doc/document.py,sha256=
|
|
24
|
+
docling_core/types/doc/document.py,sha256=SrOXpO6iCIYFkhWW-pksd4C4PeJ2jubKI5m34K_lTac,51902
|
|
25
25
|
docling_core/types/doc/labels.py,sha256=mzmSd072A-qW3IThswHxwIHV8IoyTCbHHlNOrisinRA,1335
|
|
26
26
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
27
27
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
@@ -44,13 +44,13 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
|
|
|
44
44
|
docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
|
|
45
45
|
docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
|
|
46
46
|
docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
|
|
47
|
-
docling_core/utils/file.py,sha256=
|
|
47
|
+
docling_core/utils/file.py,sha256=rZ3kaIpX2ZGxtaSXtqjcrivtXvsbeUolLXT-nntQ5yE,2388
|
|
48
48
|
docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
|
|
49
49
|
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
50
50
|
docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
|
|
51
51
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
52
|
-
docling_core-2.1.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
53
|
-
docling_core-2.1.0.dist-info/METADATA,sha256=
|
|
54
|
-
docling_core-2.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
55
|
-
docling_core-2.1.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
|
|
56
|
-
docling_core-2.1.0.dist-info/RECORD,,
|
|
52
|
+
docling_core-2.2.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
53
|
+
docling_core-2.2.1.dist-info/METADATA,sha256=k6jXQiT3A4us_DXcTvnIi2Bk1UMF6GiJ23Zj9MLbcNg,5432
|
|
54
|
+
docling_core-2.2.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
55
|
+
docling_core-2.2.1.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
|
|
56
|
+
docling_core-2.2.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|