kodexa 7.0.11920845564__tar.gz → 7.0.12200160150__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/PKG-INFO +1 -1
  2. kodexa-7.0.12200160150/kodexa/model/utils.py +92 -0
  3. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/pyproject.toml +1 -1
  4. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/LICENSE +0 -0
  5. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/README.md +0 -0
  6. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/__init__.py +0 -0
  7. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/assistant/__init__.py +0 -0
  8. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/assistant/assistant.py +0 -0
  9. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/connectors/__init__.py +0 -0
  10. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/connectors/connectors.py +0 -0
  11. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/dataclasses/__init__.py +0 -0
  12. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/dataclasses/templates/llm_data_class.j2 +0 -0
  13. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/model/__init__.py +0 -0
  14. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/model/base.py +0 -0
  15. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/model/entities/__init__.py +0 -0
  16. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/model/entities/check_response.py +0 -0
  17. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/model/entities/product.py +0 -0
  18. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/model/entities/product_subscription.py +0 -0
  19. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/model/model.py +0 -0
  20. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/model/objects.py +0 -0
  21. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/model/persistence.py +0 -0
  22. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/pipeline/__init__.py +0 -0
  23. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/pipeline/pipeline.py +0 -0
  24. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/platform/__init__.py +0 -0
  25. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/platform/client.py +0 -0
  26. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/platform/interaction.py +0 -0
  27. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/platform/kodexa.py +0 -0
  28. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/selectors/__init__.py +0 -0
  29. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/selectors/ast.py +0 -0
  30. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/selectors/core.py +0 -0
  31. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/selectors/lexrules.py +0 -0
  32. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/selectors/lextab.py +0 -0
  33. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/selectors/lextab.pyi +0 -0
  34. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/selectors/parserules.py +0 -0
  35. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/selectors/parserules.pyi +0 -0
  36. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/selectors/parsetab.py +0 -0
  37. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/selectors/parsetab.pyi +0 -0
  38. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/spatial/__init__.py +0 -0
  39. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/spatial/azure_models.py +0 -0
  40. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/spatial/bbox_common.py +0 -0
  41. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/spatial/table_form_common.py +0 -0
  42. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/steps/__init__.py +0 -0
  43. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/steps/common.py +0 -0
  44. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/testing/__init__.py +0 -0
  45. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/testing/test_components.py +0 -0
  46. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/testing/test_utils.py +0 -0
  47. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/training/__init__.py +0 -0
  48. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/training/train_utils.py +0 -0
  49. {kodexa-7.0.11920845564 → kodexa-7.0.12200160150}/kodexa/utils/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: kodexa
3
- Version: 7.0.11920845564
3
+ Version: 7.0.12200160150
4
4
  Summary: Python SDK for the Kodexa Platform
5
5
  Author: Austin Redenbaugh
6
6
  Author-email: austin@kodexa.com
@@ -0,0 +1,92 @@
1
+ import logging
2
+
3
+ from kodexa import ContentNode
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
def get_pretty_text_from_lines(lines: list["ContentNode"], scale, include_line_uuid=False) -> str:
    """
    Render lines as plain text, positioning each word by its bounding box

    A word's target column is ``int(word.get_bbox()[0] * scale)``. The word is
    preceded by enough spaces to reach that column from the end of the previous
    word, and always by at least one space.

    :param lines: The line nodes to render; each must support ``select('//word')``
    :param scale: The factor applied to a word's left x coordinate to get its column
    :param include_line_uuid: Prefix each rendered line with ``('<line uuid>')``

    :return: The rendered lines, each terminated by a newline
    """
    rendered_lines = []
    for line in lines:
        parts = [f"('{line.uuid}')"] if include_line_uuid else []

        # Column just past the previously placed word. The optional UUID prefix
        # is not counted here, so it shifts the line's words right by its length.
        current_x = 0
        for word in line.select('//word'):
            x = int(word.get_bbox()[0] * scale)
            content = word.get_all_content()

            # Ensure at least one space so adjacent or overlapping words stay separated
            parts.append(" " * max(1, x - current_x))
            parts.append(f"{content}")
            current_x = x + len(content)

        rendered_lines.append("".join(parts) + "\n")

    return "".join(rendered_lines)
22
+
23
+
24
def get_max_width(lines: list["ContentNode"], max_width=None) -> int:
    """
    Determine the character width to use when rendering lines as text

    When ``max_width`` is not supplied it is estimated from the line whose words
    hold the most characters: the total characters in that line's words, plus
    four columns per word, minus one, with a floor of 250. An empty ``lines``
    list yields the floor instead of raising.

    :param lines: The line nodes to measure; each must support ``select('//word')``
    :param max_width: An explicit width to use instead of the estimate

    :return: The width to use
    """
    if max_width is not None:
        # NOTE(review): assumes the 250 floor applies only to the estimated width
        # and that an explicit max_width is honoured as given - confirm against
        # the original indentation of the floor check.
        return max_width

    min_width = 250
    most_chars = -1
    estimate = 0
    for line in lines:
        # Select once per line and reuse the result for both the character and word counts
        words = line.select('//word')
        char_count = sum(len(word.get_all_content()) for word in words)

        # Strict '>' keeps the first line on ties, matching max() with a key
        if char_count > most_chars:
            most_chars = char_count
            estimate = char_count + len(words) * 4 - 1

    return max(estimate, min_width)
36
+
37
+
38
def get_scale_from_words(words: list["ContentNode"], max_width) -> float:
    """
    Calculate the factor that maps the horizontal extent of the words onto max_width

    The extent is the distance between the smallest ``bbox[0]`` and the largest
    ``bbox[2]`` across all words (presumably the left and right x coordinates
    of each bounding box - confirm against ``ContentNode.get_bbox``).

    :param words: The word nodes to measure; each must support ``get_bbox()``
    :param max_width: The width the horizontal extent should be scaled to

    :return: ``max_width`` divided by the extent, or 1.0 when there are no words
             or the extent is not positive
    """
    if not words:
        return 1.0

    bboxes = [word.get_bbox() for word in words]

    # Only the horizontal extent affects the scale, so the y coordinates are not needed
    min_x = min(bbox[0] for bbox in bboxes)
    max_x = max(bbox[2] for bbox in bboxes)
    extent = max_x - min_x

    # A degenerate extent cannot be scaled; use a neutral scale rather than divide by zero
    if extent <= 0:
        return 1.0

    return max_width / extent
55
+
56
+
57
def get_pretty_page(page: ContentNode, max_width=None, include_line_uuid=False) -> str:
    """
    Get a pretty representation of the page

    The page's words are laid out as text, one block per content area, with
    each word positioned horizontally according to its bounding box. A page
    without any words is returned as its plain content.

    :param page: The page to get the pretty representation for
    :param max_width: The maximum width of the page; estimated from the page's lines when None
    :param include_line_uuid: Include the line UUID in the pretty representation

    :return: A pretty representation of the page
    """

    logger.info("Getting pretty page %s", page.index)

    # Check for words first: with nothing to lay out there is no width or scale
    # to compute, and estimating a width from a page with no lines would fail.
    words = page.select('//word')
    if not words:
        return page.get_all_content()

    max_width = get_max_width(page.select('//line'), max_width)
    logger.info("Max width: %s", max_width)

    scale = get_scale_from_words(words, max_width)

    # NOTE(review): only lines found under a content-area are rendered; a page
    # that has words but no content-area nodes yields an empty string - confirm
    # that this is intended.
    area_texts = [
        get_pretty_text_from_lines(area.select('//line'), scale, include_line_uuid)
        for area in page.select('//content-area')
    ]

    # Content areas are separated by an extra blank line
    pretty_text = "\n\n".join(area_texts)

    logger.debug("Pretty Page: %s: \n%s", page.index, pretty_text)

    return pretty_text
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "kodexa"
3
- version = "7.0.011920845564"
3
+ version = "7.0.012200160150"
4
4
  description = "Python SDK for the Kodexa Platform"
5
5
  authors = ["Austin Redenbaugh <austin@kodexa.com>", "Philip Dodds <philip@kodexa.com>", "Romar Cablao <rcablao@kodexa.com>", "Amadea Paula Dodds <amadeapaula@kodexa.com>"]
6
6
  readme = "README.md"