kodexa 7.0.11920845564__py3-none-any.whl → 7.0.12260797471__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kodexa/model/objects.py +1 -1
- kodexa/model/utils.py +92 -0
- {kodexa-7.0.11920845564.dist-info → kodexa-7.0.12260797471.dist-info}/METADATA +1 -1
- {kodexa-7.0.11920845564.dist-info → kodexa-7.0.12260797471.dist-info}/RECORD +6 -5
- {kodexa-7.0.11920845564.dist-info → kodexa-7.0.12260797471.dist-info}/LICENSE +0 -0
- {kodexa-7.0.11920845564.dist-info → kodexa-7.0.12260797471.dist-info}/WHEEL +0 -0
kodexa/model/objects.py
CHANGED
@@ -5468,7 +5468,6 @@ class DataForm(ExtensionPackProvided):
|
|
5468
5468
|
|
5469
5469
|
"""
|
5470
5470
|
entrypoints: Optional[List[str]] = None
|
5471
|
-
entrypoint: Optional[str] = None
|
5472
5471
|
cards: Optional[List[Card]] = None
|
5473
5472
|
filters: Optional[str] = None
|
5474
5473
|
|
@@ -5488,6 +5487,7 @@ class ProjectDataForm(BaseModel):
|
|
5488
5487
|
description: Optional[str] = None
|
5489
5488
|
|
5490
5489
|
cards: Optional[List[Card]] = None
|
5490
|
+
entrypoints: Optional[List[str]] = None
|
5491
5491
|
|
5492
5492
|
ref: Optional[str] = None
|
5493
5493
|
|
kodexa/model/utils.py
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
from kodexa import ContentNode
|
4
|
+
|
5
|
+
logger = logging.getLogger(__name__)
|
6
|
+
|
7
|
+
def get_pretty_text_from_lines(lines: list[ContentNode], scale, include_line_uuid=False) -> str:
    """
    Render lines of words as plain text that approximates their horizontal layout

    Each word is placed at the column given by the left edge of its bounding
    box multiplied by ``scale``. Consecutive words are always separated by at
    least one space, even when their scaled positions overlap.

    :param lines: The line nodes to render; each is queried for its words with ``//word``
    :param scale: The factor that converts a bounding-box x coordinate into a character column
    :param include_line_uuid: Prefix each rendered line with the line UUID in the form ``('uuid')``

    :return: The rendered lines, each terminated by a newline
    """
    rendered_lines = []
    for line in lines:
        parts = [f"('{line.uuid}')"] if include_line_uuid else []

        # Column reached so far. The optional UUID prefix is deliberately not
        # counted: it shifts the whole line right by a constant amount.
        current_x = 0
        for word in line.select('//word'):
            text = f"{word.get_all_content()}"
            x = int(word.get_bbox()[0] * scale)
            spaces_needed = max(1, x - current_x)  # Ensure at least one space
            parts.append(" " * spaces_needed)
            parts.append(text)

            # Advance by what was actually written. Using ``x + len(text)``
            # would under-count whenever the one-space minimum pushed this
            # word past ``x``, and every later word would drift to the right.
            current_x += spaces_needed + len(text)

        rendered_lines.append("".join(parts) + "\n")

    return "".join(rendered_lines)
|
22
|
+
|
23
|
+
|
24
|
+
def get_max_width(lines: list[ContentNode], max_width=None) -> int:
    """
    Work out the character width to use when rendering lines as text

    An explicitly supplied ``max_width`` is returned unchanged. Otherwise the
    width is estimated from the line holding the most word characters: the
    number of those characters plus four columns per word, minus one, with a
    floor of 250. With no lines at all the floor is returned.

    :param lines: The line nodes to measure; each is queried for its words with ``//word``
    :param max_width: A width chosen by the caller, or None to estimate one

    :return: The width to render with
    """
    if max_width is not None:
        return max_width

    min_width = 250

    def measure(line):
        # (total word characters, number of words) for a single line
        lengths = [len(word.get_all_content()) for word in line.select('//word')]
        return sum(lengths), len(lengths)

    # Pick the line with the most word characters (not the most words); on a
    # tie the first such line wins. The default keeps an empty ``lines`` from
    # raising ValueError inside max().
    char_count, word_count = max(
        (measure(line) for line in lines),
        key=lambda measured: measured[0],
        default=(0, 0),
    )

    # Length of all the words plus room for spacing between them
    estimated_width = char_count + (word_count * 4) - 1

    return max(estimated_width, min_width)
|
36
|
+
|
37
|
+
|
38
|
+
def get_scale_from_words(words: list[ContentNode], max_width) -> float:
    """
    Calculate the factor that maps the horizontal extent of the words onto ``max_width`` columns

    The extent runs from the smallest left edge (``bbox[0]``) to the largest
    right edge (``bbox[2]``) across all the words. Only the x axis is used.

    :param words: The word nodes whose bounding boxes define the extent
    :param max_width: The number of character columns the extent should span

    :return: ``max_width`` divided by the extent, or 1.0 when there are no
             words or the extent is not positive (so no scale can be derived)
    """
    left_edges = []
    right_edges = []
    for word in words:
        bbox = word.get_bbox()
        left_edges.append(bbox[0])
        right_edges.append(bbox[2])

    if not left_edges:
        return 1.0

    extent = max(right_edges) - min(left_edges)
    if extent <= 0:
        # A zero extent would divide by zero and a negative one would flip
        # every column negative; fall back to leaving coordinates unscaled.
        return 1.0

    return max_width / extent
|
55
|
+
|
56
|
+
|
57
|
+
def get_pretty_page(page: ContentNode, max_width=None, include_line_uuid=False) -> str:
    """
    Get a pretty representation of the page

    The words of each content area are laid out as plain text so that their
    columns approximate their horizontal positions on the page. Content areas
    are separated from each other by two extra newlines.

    :param page: The page to get the pretty representation for
    :param max_width: The maximum width of the page, or None to estimate it from the page's lines
    :param include_line_uuid: Include the line UUID in the pretty representation

    :return: A pretty representation of the page, or the page's plain content
             when the page contains no words
    """

    logger.info("Getting pretty page %s", page.index)

    # Bail out before any layout work: a page without words has nothing to
    # measure or scale, so fall back to its plain content.
    words = page.select('//word')
    if not words:
        return page.get_all_content()

    max_width = get_max_width(page.select('//line'), max_width)
    logger.info("Max width: %s", max_width)

    # One scale for the whole page keeps columns consistent across areas
    scale = get_scale_from_words(words, max_width)

    area_texts = [
        get_pretty_text_from_lines(area.select('//line'), scale, include_line_uuid)
        for area in page.select('//content-area')
    ]

    # Add extra newlines between content areas
    pretty_text = "\n\n".join(area_texts)

    # Lazy %-formatting: the page text is only interpolated when debug
    # logging is actually enabled
    logger.debug("Pretty Page: %s: \n%s", page.index, pretty_text)

    return pretty_text
|
@@ -12,8 +12,9 @@ kodexa/model/entities/check_response.py,sha256=eqBHxO6G2OAziL3p9bHGI-oiPkAG82H6C
|
|
12
12
|
kodexa/model/entities/product.py,sha256=ZDpHuBE_9FJ-klnkyBvTfPwYOqBkM1wraZMtHqNA8FQ,3526
|
13
13
|
kodexa/model/entities/product_subscription.py,sha256=UcmWR-qgLfdV7VCtJNwzgkanoS8nBSL6ngVuxQUK1M8,3810
|
14
14
|
kodexa/model/model.py,sha256=wY5HnpsAnKlH_aDEHWNf-ZrhdrBg-DtqGFszjkdZtPU,118340
|
15
|
-
kodexa/model/objects.py,sha256=
|
15
|
+
kodexa/model/objects.py,sha256=4Oyjs6omlHfwziAK1m2tFk4jSnzN7lFdXACog07ed1c,185124
|
16
16
|
kodexa/model/persistence.py,sha256=PTh9jmqYCDuWfiuCssLttFaYWiMA_fCiwjgsYDW4AhE,68281
|
17
|
+
kodexa/model/utils.py,sha256=6R-3rFiW9irBwj0Mq5yhp7EDXkNUFaeFhr3bWmnlW4g,2961
|
17
18
|
kodexa/pipeline/__init__.py,sha256=sA7f5D6qkdMrpp2xTIeefnrUBI6xxEEWostvxfX_1Cs,236
|
18
19
|
kodexa/pipeline/pipeline.py,sha256=ZYpJAWcwV4YRK589DUhU0vXGQlkNSj4J2TsGbYqTLjo,25221
|
19
20
|
kodexa/platform/__init__.py,sha256=1O3oiWMg292NPL_NacKDnK1T3_R6cMorrPRue_9e-O4,216
|
@@ -42,7 +43,7 @@ kodexa/testing/test_utils.py,sha256=DrLCkHxdb6AbZ-X3WmTMbQmnVIm55VEBL8MjtUK9POs,
|
|
42
43
|
kodexa/training/__init__.py,sha256=xs2L62YpRkIRfslQwtQZ5Yxjhm7sLzX2TrVX6EuBnZQ,52
|
43
44
|
kodexa/training/train_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
45
|
kodexa/utils/__init__.py,sha256=Pnim1o9_db5YEnNvDTxpM7HG-qTlL6n8JwFwOafU9wo,5928
|
45
|
-
kodexa-7.0.
|
46
|
-
kodexa-7.0.
|
47
|
-
kodexa-7.0.
|
48
|
-
kodexa-7.0.
|
46
|
+
kodexa-7.0.12260797471.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
|
47
|
+
kodexa-7.0.12260797471.dist-info/METADATA,sha256=05A83UZeGYAMX48VXMOh-geCtXObL4lfIEx6QxCIE5s,3527
|
48
|
+
kodexa-7.0.12260797471.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
|
49
|
+
kodexa-7.0.12260797471.dist-info/RECORD,,
|
File without changes
|
File without changes
|