kodexa 7.0.11920845564__py3-none-any.whl → 7.0.12200160150__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kodexa/model/utils.py +92 -0
- {kodexa-7.0.11920845564.dist-info → kodexa-7.0.12200160150.dist-info}/METADATA +1 -1
- {kodexa-7.0.11920845564.dist-info → kodexa-7.0.12200160150.dist-info}/RECORD +5 -4
- {kodexa-7.0.11920845564.dist-info → kodexa-7.0.12200160150.dist-info}/LICENSE +0 -0
- {kodexa-7.0.11920845564.dist-info → kodexa-7.0.12200160150.dist-info}/WHEEL +0 -0
kodexa/model/utils.py
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
from kodexa import ContentNode
|
4
|
+
|
5
|
+
logger = logging.getLogger(__name__)
|
6
|
+
|
7
|
+
def get_pretty_text_from_lines(lines: list[ContentNode], scale, include_line_uuid=False) -> str:
    """
    Render lines as plain text, using spaces to approximate word positions

    The left coordinate of each word's bounding box is multiplied by ``scale``
    and truncated to an integer column. The word is then padded with spaces up
    to that column, always emitting at least one space, so consecutive words
    never run together even when their scaled positions overlap.

    :param lines: The line nodes to render; each must support ``select('//word')``
    :param scale: Factor applied to a word's left x coordinate to obtain its text column
    :param include_line_uuid: Prefix every rendered line with ``('<uuid>')``

    :return: The rendered lines, each terminated by a newline
    """
    rendered_lines = []
    for line in lines:
        parts = [f"('{line.uuid}')"] if include_line_uuid else []

        # Column reached so far on this line. The optional UUID prefix is not
        # counted, so word columns are the same with or without the prefix.
        current_x = 0
        for word in line.select('//word'):
            # Fetch the text once; it is used for both output and width tracking.
            text = f"{word.get_all_content()}"
            x = int(word.get_bbox()[0] * scale)

            parts.append(" " * max(1, x - current_x))  # Ensure at least one space
            parts.append(text)
            current_x = x + len(text)

        rendered_lines.append("".join(parts) + "\n")

    return "".join(rendered_lines)
|
22
|
+
|
23
|
+
|
24
|
+
def get_max_width(lines: list[ContentNode], max_width=None) -> int:
    """
    Determine the text width (in characters) to use when rendering lines

    When ``max_width`` is not supplied it is estimated from the line whose
    words contain the most characters: that character count plus four columns
    per word, minus one. The result is never smaller than 250. An empty
    ``lines`` list yields the 250 minimum instead of raising.

    :param lines: The line nodes to measure; each must support ``select('//word')``
    :param max_width: An explicit width; when given, only the 250 minimum is applied

    :return: The width to use, at least 250
    """
    minimum_width = 250

    if max_width is None:
        # Measure every line once: (total characters in its words, number of words).
        line_stats = []
        for line in lines:
            words = line.select('//word')
            char_count = sum(len(word.get_all_content()) for word in words)
            line_stats.append((char_count, len(words)))

        if line_stats:
            # The line with the most characters wins; max() keeps the first on ties.
            char_count, word_count = max(line_stats, key=lambda stats: stats[0])

            # Width is the length of all words plus spacing between them
            max_width = char_count + (word_count * 4) - 1
        else:
            # Nothing to measure, so fall through to the minimum width.
            max_width = minimum_width

    return max(max_width, minimum_width)
|
36
|
+
|
37
|
+
|
38
|
+
def get_scale_from_words(words: list[ContentNode], max_width) -> float:
    """
    Calculate the factor that maps word x coordinates onto ``max_width`` columns

    The horizontal extent runs from the smallest left edge (``bbox[0]``) to the
    largest right edge (``bbox[2]``) across all words. Only the x axis is used.

    :param words: The word nodes to measure; each must support ``get_bbox()``
    :param max_width: The number of text columns the extent should be scaled to

    :return: ``max_width`` divided by the horizontal extent, or 1.0 when there
             are no words or the extent is zero
    """
    if not words:
        # No geometry to measure; a neutral scale avoids min()/max() on empty input.
        return 1.0

    # Get the bboxes
    bboxes = [word.get_bbox() for word in words]

    # Find the overall horizontal extent
    min_x = min(bbox[0] for bbox in bboxes)
    max_x = max(bbox[2] for bbox in bboxes)
    extent = max_x - min_x

    if extent == 0:
        # Zero extent would divide by zero; fall back to a neutral scale.
        return 1.0

    # Calculate scale factor to fit within max_width
    return max_width / extent
|
55
|
+
|
56
|
+
|
57
|
+
def get_pretty_page(page: ContentNode, max_width=None, include_line_uuid=False) -> str:
    """
    Get a pretty representation of the page

    Words are laid out with spaces according to their scaled bounding-box
    positions. Lines are rendered one content area at a time, with the areas
    separated by two newlines. A page without any words returns its raw
    content instead.

    :param page: The page to get the pretty representation for
    :param max_width: The maximum width of the page; estimated from the page's
                      lines when not given
    :param include_line_uuid: Include the line UUID in the pretty representation

    :return: A pretty representation of the page
    """
    logger.info("Getting pretty page %s", page.index)

    # Check for words first: with nothing to lay out there is no width or scale
    # to compute, and measuring a page that has no lines must not be attempted.
    words = page.select('//word')
    if not words:
        return page.get_all_content()

    max_width = get_max_width(page.select('//line'), max_width)
    logger.info("Max width: %s", max_width)

    scale = get_scale_from_words(words, max_width)

    # NOTE(review): only lines inside a content-area are rendered; lines outside
    # any content-area appear to be dropped. Confirm this is intended.
    rendered_areas = [
        get_pretty_text_from_lines(area.select('//line'), scale, include_line_uuid)
        for area in page.select('//content-area')
    ]

    # Two newlines between content areas
    pretty_text = "\n\n".join(rendered_areas)

    logger.debug("Pretty Page: %s: \n%s", page.index, pretty_text)

    return pretty_text
|
@@ -14,6 +14,7 @@ kodexa/model/entities/product_subscription.py,sha256=UcmWR-qgLfdV7VCtJNwzgkanoS8
|
|
14
14
|
kodexa/model/model.py,sha256=wY5HnpsAnKlH_aDEHWNf-ZrhdrBg-DtqGFszjkdZtPU,118340
|
15
15
|
kodexa/model/objects.py,sha256=CE76KwQwIT6FdWJuac8aIumX_Ok6-9oq1JXz0K_gdwo,185117
|
16
16
|
kodexa/model/persistence.py,sha256=PTh9jmqYCDuWfiuCssLttFaYWiMA_fCiwjgsYDW4AhE,68281
|
17
|
+
kodexa/model/utils.py,sha256=6R-3rFiW9irBwj0Mq5yhp7EDXkNUFaeFhr3bWmnlW4g,2961
|
17
18
|
kodexa/pipeline/__init__.py,sha256=sA7f5D6qkdMrpp2xTIeefnrUBI6xxEEWostvxfX_1Cs,236
|
18
19
|
kodexa/pipeline/pipeline.py,sha256=ZYpJAWcwV4YRK589DUhU0vXGQlkNSj4J2TsGbYqTLjo,25221
|
19
20
|
kodexa/platform/__init__.py,sha256=1O3oiWMg292NPL_NacKDnK1T3_R6cMorrPRue_9e-O4,216
|
@@ -42,7 +43,7 @@ kodexa/testing/test_utils.py,sha256=DrLCkHxdb6AbZ-X3WmTMbQmnVIm55VEBL8MjtUK9POs,
|
|
42
43
|
kodexa/training/__init__.py,sha256=xs2L62YpRkIRfslQwtQZ5Yxjhm7sLzX2TrVX6EuBnZQ,52
|
43
44
|
kodexa/training/train_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
45
|
kodexa/utils/__init__.py,sha256=Pnim1o9_db5YEnNvDTxpM7HG-qTlL6n8JwFwOafU9wo,5928
|
45
|
-
kodexa-7.0.
|
46
|
-
kodexa-7.0.
|
47
|
-
kodexa-7.0.
|
48
|
-
kodexa-7.0.
|
46
|
+
kodexa-7.0.12200160150.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
|
47
|
+
kodexa-7.0.12200160150.dist-info/METADATA,sha256=YeHgvKBNAQbHkYNNR3xHuf4LJNbuJkCClUbWs1C85nk,3527
|
48
|
+
kodexa-7.0.12200160150.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
|
49
|
+
kodexa-7.0.12200160150.dist-info/RECORD,,
|
File without changes
|
File without changes
|