kodexa 7.0.10217227753__tar.gz → 7.0.10402571165__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/PKG-INFO +1 -1
  2. kodexa-7.0.10402571165/kodexa/dataclasses/__init__.py +339 -0
  3. kodexa-7.0.10402571165/kodexa/dataclasses/llm_data_class.j2 +21 -0
  4. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/model/model.py +62 -1
  5. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/model/persistence.py +113 -1
  6. kodexa-7.0.10402571165/kodexa/utils/__init__.py +178 -0
  7. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/pyproject.toml +1 -1
  8. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/LICENSE +0 -0
  9. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/README.md +0 -0
  10. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/__init__.py +0 -0
  11. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/assistant/__init__.py +0 -0
  12. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/assistant/assistant.py +0 -0
  13. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/connectors/__init__.py +0 -0
  14. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/connectors/connectors.py +0 -0
  15. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/model/__init__.py +0 -0
  16. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/model/base.py +0 -0
  17. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/model/entities/__init__.py +0 -0
  18. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/model/entities/check_response.py +0 -0
  19. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/model/entities/product.py +0 -0
  20. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/model/entities/product_subscription.py +0 -0
  21. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/model/objects.py +0 -0
  22. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/pipeline/__init__.py +0 -0
  23. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/pipeline/pipeline.py +0 -0
  24. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/platform/__init__.py +0 -0
  25. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/platform/client.py +0 -0
  26. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/platform/interaction.py +0 -0
  27. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/platform/kodexa.py +0 -0
  28. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/selectors/__init__.py +0 -0
  29. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/selectors/ast.py +0 -0
  30. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/selectors/core.py +0 -0
  31. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/selectors/lexrules.py +0 -0
  32. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/selectors/lextab.py +0 -0
  33. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/selectors/lextab.pyi +0 -0
  34. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/selectors/parserules.py +0 -0
  35. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/selectors/parserules.pyi +0 -0
  36. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/selectors/parsetab.py +0 -0
  37. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/selectors/parsetab.pyi +0 -0
  38. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/spatial/__init__.py +0 -0
  39. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/spatial/azure_models.py +0 -0
  40. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/spatial/bbox_common.py +0 -0
  41. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/spatial/table_form_common.py +0 -0
  42. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/steps/__init__.py +0 -0
  43. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/steps/common.py +0 -0
  44. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/testing/__init__.py +0 -0
  45. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/testing/test_components.py +0 -0
  46. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/testing/test_utils.py +0 -0
  47. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/training/__init__.py +0 -0
  48. {kodexa-7.0.10217227753 → kodexa-7.0.10402571165}/kodexa/training/train_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: kodexa
3
- Version: 7.0.10217227753
3
+ Version: 7.0.10402571165
4
4
  Summary: Python SDK for the Kodexa Platform
5
5
  Author: Austin Redenbaugh
6
6
  Author-email: austin@kodexa.com
@@ -0,0 +1,339 @@
1
+ import logging
2
+ import os
3
+ import uuid
4
+ from typing import Optional, List
5
+
6
+ import jinja2
7
+ from pydantic import BaseModel
8
+
9
+ from kodexa import ContentNode
10
+ from kodexa.model.model import Tag
11
+ from kodexa.model.objects import ContentException, Taxon, Taxonomy, Assistant
12
+ from kodexa.utils import taxon_to_property_name, taxon_to_class_name, taxon_to_group_path, snake_to_camel, \
13
+ to_snake
14
+
15
+ logger = logging.getLogger()
16
+
17
+
18
class LLMDataAttribute(BaseModel):
    """
    Holds a single attribute value returned by the LLM so it can be mapped
    back onto the Kodexa document. Generated taxonomy data classes use this
    as the value type for non-group taxons.
    """

    value: Optional[str] = None
    line_ids: Optional[list[str]] = None
    taxon_path: Optional[str] = None
    data_type: Optional[str] = None
    value_path: Optional[str] = None
    normalized_text: Optional[str] = None
    node_uuid_list: Optional[List[int]] = None
    tag_uuid: Optional[str] = None
    exceptions: Optional[list[ContentException]] = None

    def create_exception(
        self,
        exception_type_id: str,
        exception_type: str,
        normalized_text: str,
        message: str,
        exception_detail: str,
    ):
        """
        Record a content exception against this attribute.

        Args:
            exception_type_id: identifier of the exception type (currently
                unused; kept for interface compatibility)
            exception_type: the type of the exception
            normalized_text: the normalized text (currently unused; kept for
                interface compatibility)
            message: a human readable message
            exception_detail: the detail of the exception
        """
        # NOTE(review): LLMDataObject.create_exception passes
        # exception_details= (plural) — confirm which keyword ContentException
        # actually defines.
        content_exception = ContentException(
            exception_type=exception_type,
            exception_detail=exception_detail,
            message=message,
            tag_uuid=self.tag_uuid,
        )
        # BUG FIX: exceptions defaults to None, so appending unconditionally
        # raised AttributeError on the first call; initialize lazily, matching
        # LLMDataObject.create_exception.
        if self.exceptions is None:
            self.exceptions = []
        self.exceptions.append(content_exception)
50
+
51
+
52
class LLMDataObject(BaseModel):
    """
    A class to represent a LLM (Large Language Model) data object.

    Instances mirror a group taxon: attribute fields are LLMDataAttribute
    values and nested groups are LLMDataObject (or lists of them).

    Attributes
    ----------
    group_uuid : str, optional
        A unique identifier for the group; generated when not supplied
    cell_index : int, optional
        The index of the cell (effectively the row), by default 0
    exceptions : list[ContentException], optional
        Content exceptions recorded against this data object
    """

    group_uuid: Optional[str] = None
    cell_index: int = 0
    exceptions: Optional[list[ContentException]] = None

    class Config:
        arbitrary_types_allowed = True

    def __init__(self, group_uuid: str = None, cell_index: int = 0):
        """Initialize the data object, generating a group UUID when needed."""
        super().__init__()
        self.cell_index = cell_index
        self.group_uuid = str(uuid.uuid4()) if group_uuid is None else group_uuid

    def create_exception(
        self,
        exception_type_id: str,
        exception_type: str,
        message: str,
        exception_detail: str,
        severity: str = "ERROR",
    ):
        """
        Record a content exception against this data object.

        Args:
            exception_type_id: identifier of the exception type (currently
                unused; kept for interface compatibility)
            exception_type: the type of the exception
            message: a human readable message
            exception_detail: the detail of the exception
            severity: the severity of the exception, by default "ERROR"
        """
        content_exception = ContentException(
            exception_type=exception_type,
            exception_details=exception_detail,
            message=message,
            group_uuid=self.group_uuid,
            severity=severity,
        )
        if self.exceptions is None:
            self.exceptions = []

        self.exceptions.append(content_exception)

    def apply_labels(
        self, document: "KodexaDocumentLLMWrapper", parent_group_uuid: str = None,
        assistant: Optional["Assistant"] = None
    ):
        """
        Applies labels to the document if it exists.

        Copies any recorded content exceptions onto the document, then walks
        every field of this data object: LLMDataAttribute fields are tagged
        onto the document's nodes and nested LLMDataObject fields are labeled
        recursively.

        Parameters
        ----------
        document : KodexaDocumentLLMWrapper
            The Kodexa document LLM wrapper
        parent_group_uuid : str, optional
            A unique identifier for the parent group, by default None
        assistant : Assistant, optional
            The assistant that produced the data; used for the tag owner URI
        """

        # Lets make sure we add all the content exceptions
        if self.exceptions is not None:
            for exception in self.exceptions:
                # We have two types of exception, one in the API and one in the
                # document
                from kodexa.model import ContentException as KodexaContentException
                internal_exception = KodexaContentException(
                    exception_type=exception.exception_type,
                    message=exception.message,
                    exception_details=exception.exception_details,
                    severity=exception.severity,
                    group_uuid=exception.group_uuid,
                    tag_uuid=exception.tag_uuid,
                )
                document.doc.add_exception(internal_exception)

        # Let's go through this data object and find all the attributes that have a value
        # then we will apply the labels to the document
        for field in self.__fields__:
            logger.info(f"Processing field {field}")
            value = getattr(self, field)

            if isinstance(value, list):
                logger.info(f"Processing as a list {value}")
                for item in value:
                    self.process_child(item, document, parent_group_uuid, assistant)
            else:
                logger.info(f"Processing as a single value {value}")
                self.process_child(value, document, parent_group_uuid, assistant)

    def process_child(self, value, document, parent_group_uuid, assistant):
        """
        Apply the labeling for a single field value: tag the document's nodes
        for an LLMDataAttribute, or recurse for a nested LLMDataObject.
        """
        logger.info(f"Processing child {value}")
        if isinstance(value, LLMDataAttribute):
            # We need to add the label to the document for this attribute
            tag = value.taxon_path

            # TODO need to work out why we are missing them?
            logger.info(f"Value: {value.normalized_text}, node_uuid_list: {value.node_uuid_list}")
            if value.node_uuid_list is None:
                value.node_uuid_list = value.line_ids
            logger.info(f"Applying label {tag} to node UUIDs {value.node_uuid_list}")

            if isinstance(value.node_uuid_list, int):
                value.node_uuid_list = [value.node_uuid_list]

            # '0'/0 are sentinel "no node" ids and are filtered out
            nodes_to_label: list[ContentNode] = (
                [
                    document.doc.get_persistence().get_node(node_uuid)
                    for node_uuid in value.node_uuid_list if (node_uuid != '0' and node_uuid != 0)
                ]
                if value.node_uuid_list
                else []
            )

            tag_uuid = str(uuid.uuid4())
            for node in nodes_to_label:
                if node:
                    if not node.has_tag(tag):
                        try:
                            # DERIVED values are flagged with a negative confidence
                            confidence = -1 if value.value_path == 'DERIVED' else 1
                            node.tag(
                                tag_to_apply=tag,
                                value=value.normalized_text,
                                tag_uuid=tag_uuid,
                                cell_index=self.cell_index,
                                selector="//word",
                                confidence=confidence,
                                group_uuid=self.group_uuid,
                                parent_group_uuid=parent_group_uuid,
                                owner_uri=f"assistant://{assistant.id}" if assistant else "model://taxonomy-llm",
                            )
                        except Exception:
                            # BUG FIX: was a bare `except:` which also swallowed
                            # SystemExit/KeyboardInterrupt; keep the best-effort
                            # behavior but record the traceback
                            logger.exception(f"Error tagging node {node.uuid} with tag {tag}")
                    else:
                        # Node already carries this tag: append another Tag value
                        # to the existing feature rather than re-tagging
                        current_value = node.get_feature_values("tag", tag)
                        new_tag = Tag(cell_index=self.cell_index,
                                      uuid=tag_uuid,
                                      value=value.normalized_text,
                                      confidence=-1,
                                      group_uuid=self.group_uuid,
                                      parent_group_uuid=parent_group_uuid,
                                      owner_uri=f"assistant://{assistant.id}" if assistant else "model://taxonomy-llm")
                        current_value.append(new_tag)
                        node.remove_feature("tag", tag)
                        node.add_feature("tag", tag, current_value, single=False)

            logger.info(f"Applied label {tag} to {len(nodes_to_label)} nodes")
        if isinstance(value, LLMDataObject):
            # We need to apply the labels to the document for this object.
            # BUG FIX: the assistant was previously dropped here, so nested
            # data objects were always tagged with the model:// owner URI.
            value.apply_labels(document, parent_group_uuid=self.group_uuid,
                               assistant=assistant)
224
+
225
+
226
def find_nearby_word_to_tag(node, tag):
    """
    Find the closest untagged word to *node* on the same page.

    Lines on the page are ordered by their distance from the node's line and
    the first word not already carrying *tag* is returned, or None when every
    word on the page is tagged.
    """
    logger.info(f"find_nearby_word_to_tag: {tag}")
    # Create an ordered list of the lines on the page, sorted by distance from the target node
    target_line_index = node.index if node.node_type == 'line' else node.select('parent::line')[0].index
    all_lines_on_page = node.select('parent::page')[0].select('//line')

    # BUG FIX: removed leftover debug print(); use debug-level logging instead
    logger.debug(f"target_line_index={target_line_index}, lines_on_page={len(all_lines_on_page)}")
    sorted_lines = sorted(all_lines_on_page, key=lambda line: abs(target_line_index - line.index))
    # Find the first word that isn't yet tagged by this tag
    for line in sorted_lines:
        for word in line.select('//word'):
            if not word.has_tag(tag):
                return word
    return None
240
+
241
+
242
def get_template_env():
    """
    Build the Jinja2 template environment.

    Templates are resolved first from the current working directory and then
    from this package's directory. The taxonomy naming helpers are exposed as
    globals so templates can call them directly.

    Returns:
        jinja2.Environment: the configured, auto-escaping template environment
    """
    package_location = os.path.dirname(os.path.abspath(__file__))
    template_loader = jinja2.FileSystemLoader([os.getcwd(), package_location])
    env = jinja2.Environment(loader=template_loader, autoescape=True)
    env.globals["snake_to_camel"] = snake_to_camel
    env.globals["to_snake"] = to_snake
    env.globals['taxon_to_property_name'] = taxon_to_property_name
    env.globals['taxon_to_class_name'] = taxon_to_class_name
    env.globals['taxon_to_group_path'] = taxon_to_group_path
    return env
261
+
262
+
263
def write_template(template, output_location, output_filename, context):
    """
    Render the given template and write it out to a file.

    Args:
        template: the name of the template
        output_location: the directory to write the output to (created if missing)
        output_filename: the name of the output file
        context: the context passed to the template render
    """
    template_obj = get_template_env().get_template(template)
    processed_template = template_obj.render(context)

    from pathlib import Path

    output_dir = Path(output_location)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Use pathlib joining rather than raw string concatenation
    with open(output_dir / output_filename, "w") as text_file:
        text_file.write(processed_template)
281
+
282
+
283
def build_llm_data_classes_for_taxonomy(
    taxonomy: Taxonomy, output_dir: str, output_file: str, use_labels: bool = False
):
    """
    Generate LLMData classes for a taxonomy via the Jinja template.

    The generated classes extend LLMDataObject/LLMDataAttribute and can map an
    LLM response back onto a Kodexa document, identifying and labeling nodes.

    :param taxonomy: the taxonomy to generate classes for
    :param output_dir: directory for the generated module
    :param output_file: filename of the generated module
    :param use_labels: whether label-based generation is enabled
    :return: None
    """

    def _assign_paths(taxon: Taxon, parent_path: Optional[str] = None):
        # Paths are parent/child chains built from taxon names
        taxon.path = taxon.name if parent_path is None else parent_path + "/" + taxon.name
        if taxon.children:
            for child in taxon.children:
                _assign_paths(child, taxon.path)

    for root_taxon in taxonomy.taxons:
        _assign_paths(root_taxon, None)

    def _gather_group_taxons(taxons: list[Taxon]) -> list[Taxon]:
        """
        Depth-first collection of every group taxon in the tree.

        Args:
            taxons (list[Taxon]): The list of taxons to collect group taxons from.

        Returns:
            list[Taxon]: A list of group taxons.
        """
        found = []
        for taxon in taxons:
            if taxon.group:
                found.append(taxon)
            if taxon.children:
                found = found + _gather_group_taxons(taxon.children)
        return found

    group_taxons = _gather_group_taxons(taxonomy.taxons)
    # Reverse so child group classes are emitted before the classes that use them
    group_taxons.reverse()
    write_template(
        "llm_data_class.j2",
        output_dir,
        output_file,
        {"taxons": group_taxons, "use_labels": use_labels},
    )

    # Log the generated source for visibility
    logger.info(f"Created the following classes in {output_dir}/{output_file}")
    with open(f"{output_dir}/{output_file}", "r") as file:
        logger.info(file.read())
@@ -0,0 +1,21 @@
1
+ from typing import Optional, List
2
+ from kodexa_langchain.data_class import LLMDataAttribute, LLMDataObject
3
+ from kodexa_langchain.llm import deserialize_llm_data
4
+ from kodexa_langchain.document import KodexaDocumentLLMWrapper
5
+
6
+ {%- for taxon in taxons %}
7
+
8
+ class {{ taxon_to_class_name(taxon) }}(LLMDataObject):
9
+
10
+ {%- for child_taxon in taxon.children %}{%- if child_taxon.group %}
11
+ {{ taxon_to_property_name(child_taxon) }}: Optional[List[{{ taxon_to_class_name(child_taxon) }}]] = None
12
+ {%- else %}
13
+ {{ taxon_to_property_name(child_taxon) }}: Optional[LLMDataAttribute] = LLMDataAttribute(taxon_path='{{ child_taxon.path }}', data_type='{{ child_taxon.taxon_type.title() }}', value_path='{{ child_taxon.value_path.title() }}')
14
+ {%- endif %}
15
+ {%- endfor %}
16
+
17
+ def __init__(self, data: dict, document: Optional[KodexaDocumentLLMWrapper] = None, group_uuid=None, cell_index: int = 0, taxon=None, extraction_context=None):
18
+ super().__init__(group_uuid, cell_index)
19
+ deserialize_llm_data(self, data, document, f'{{ taxon_to_group_path(taxon) }}', group_uuid, taxon, extraction_context)
20
+
21
+ {%- endfor %}
@@ -12,7 +12,7 @@ from typing import Any, List, Optional
12
12
  from addict import Dict
13
13
  import deepdiff
14
14
  import msgpack
15
- from pydantic import BaseModel, ConfigDict
15
+ from pydantic import BaseModel, ConfigDict, Field
16
16
 
17
17
  from kodexa.model.objects import ContentObject, FeatureSet
18
18
 
@@ -2369,6 +2369,55 @@ class FeatureSetDiff:
2369
2369
  return node
2370
2370
 
2371
2371
 
2372
class ProcessingStep(BaseModel):
    """
    A step in a document processing graph.

    Steps form a DAG: each step keeps references both to its children and to
    its parents, and serialization is cycle-safe via to_dict().
    """

    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    name: str
    metadata: dict = Field(default_factory=dict)
    children: List['ProcessingStep'] = Field(default_factory=list)
    parents: List['ProcessingStep'] = Field(default_factory=list)

    def add_child(self, child_step: 'ProcessingStep'):
        """Attach *child_step* below this step, wiring both directions."""
        self.children.append(child_step)
        child_step.parents.append(self)

    @staticmethod
    def merge_with(*other_steps: 'ProcessingStep') -> 'ProcessingStep':
        """Create a step that merges the given steps; each becomes a parent."""
        merged_step = ProcessingStep(name=f"Merged({', '.join(step.name for step in other_steps)})")
        for step in other_steps:
            # Reuse add_child so the parent/child wiring lives in one place
            step.add_child(merged_step)
        return merged_step

    class Config:
        arbitrary_types_allowed = True
        # NOTE(review): pydantic expects *type* keys in json_encoders; this
        # string key is likely ignored — confirm the intended behavior
        json_encoders = {
            'ProcessingStep': lambda step: step.to_dict()
        }

    def to_dict(self, seen=None):
        """
        Serialize to a plain dict, guarding against cycles.

        Args:
            seen: set of step ids already emitted on this path (internal)

        Returns:
            dict: id/name plus recursively serialized children; parents are
            emitted as shallow {id, name} stubs to avoid infinite recursion.
        """
        if seen is None:
            seen = set()

        # Avoid circular references by skipping already seen objects
        if self.id in seen:
            return {'id': self.id, 'name': self.name}

        seen.add(self.id)

        return {
            'id': self.id,
            'name': self.name,
            'children': [child.to_dict(seen) for child in self.children],
            'parents': [{'id': parent.id, 'name': parent.name} for parent in self.parents],
        }

    def to_json(self):
        """Return the cycle-safe JSON representation of this step."""
        return json.dumps(self.to_dict())

    def __repr__(self):
        return f"Step(id={self.id}, name={self.name})"
2419
+
2420
+
2372
2421
  class Document(object):
2373
2422
  """A Document is a collection of metadata and a set of content nodes."""
2374
2423
 
@@ -2384,6 +2433,18 @@ class Document(object):
2384
2433
  def get_exceptions(self) -> List[ContentException]:
2385
2434
  return self._persistence_layer.get_exceptions()
2386
2435
 
2436
+ def get_external_data(self) -> dict:
2437
+ return self._persistence_layer.get_external_data()
2438
+
2439
+ def set_external_data(self, external_data:dict):
2440
+ return self._persistence_layer.set_external_data(external_data)
2441
+
2442
+ def get_steps(self) -> list[ProcessingStep]:
2443
+ return self._persistence_layer.get_steps()
2444
+
2445
+ def set_steps(self, steps: list[ProcessingStep]):
2446
+ self._persistence_layer.set_steps(steps)
2447
+
2387
2448
  def replace_exceptions(self, exceptions: List[ContentException]):
2388
2449
  self._persistence_layer.replace_exceptions(exceptions)
2389
2450
 
@@ -13,7 +13,7 @@ from kodexa.model.model import (
13
13
  DocumentMetadata,
14
14
  ContentFeature,
15
15
  ContentException,
16
- ModelInsight,
16
+ ModelInsight, ProcessingStep,
17
17
  )
18
18
 
19
19
  logger = logging.getLogger()
@@ -1122,6 +1122,90 @@ class SqliteDocumentPersistence(object):
1122
1122
 
1123
1123
  return content_nodes
1124
1124
 
1125
+ def __ensure_ed_table_exists(self):
1126
+ """
1127
+ Ensure the 'ed' table exists in the database.
1128
+ Creates the table if it does not exist.
1129
+ """
1130
+ self.cursor.execute("""
1131
+ CREATE TABLE IF NOT EXISTS ed (
1132
+ obj BLOB
1133
+ )
1134
+ """)
1135
+
1136
+ # Check if the table has any rows, if not, insert an initial empty row
1137
+ result = self.cursor.execute("SELECT COUNT(*) FROM ed").fetchone()
1138
+ if result[0] == 0:
1139
+ self.cursor.execute("INSERT INTO ed (obj) VALUES (?)", [sqlite3.Binary(msgpack.packb({}))])
1140
+
1141
+ def set_external_data(self, external_data: dict):
1142
+ """
1143
+ Sets the external data for the document.
1144
+
1145
+ Args:
1146
+ external_data (dict): The external data to store, must be JSON serializable.
1147
+ """
1148
+ self.__ensure_ed_table_exists()
1149
+ serialized_data = sqlite3.Binary(msgpack.packb(external_data))
1150
+ self.cursor.execute("UPDATE ed SET obj = ? WHERE rowid = 1", [serialized_data])
1151
+ self.connection.commit()
1152
+
1153
+ def get_external_data(self) -> dict:
1154
+ """
1155
+ Gets the external data associated with this document.
1156
+
1157
+ Returns:
1158
+ dict: The external data stored in the ed table.
1159
+ """
1160
+ self.__ensure_ed_table_exists()
1161
+ result = self.cursor.execute("SELECT obj FROM ed WHERE rowid = 1").fetchone()
1162
+ if result and result[0]:
1163
+ return msgpack.unpackb(result[0])
1164
+ return {}
1165
+
1166
+ def __ensure_steps_table_exists(self):
1167
+ """
1168
+ Ensure the 'steps' table exists in the database.
1169
+ Creates the table if it does not exist.
1170
+ """
1171
+ self.cursor.execute("""
1172
+ CREATE TABLE IF NOT EXISTS steps (
1173
+ obj BLOB
1174
+ )
1175
+ """)
1176
+
1177
+ # Check if the table has any rows, if not, insert an initial empty row
1178
+ result = self.cursor.execute("SELECT COUNT(*) FROM steps").fetchone()
1179
+ if result[0] == 0:
1180
+ self.cursor.execute("INSERT INTO steps (obj) VALUES (?)", [sqlite3.Binary(msgpack.packb([]))])
1181
+
1182
+ def set_steps(self, steps: List[ProcessingStep]):
1183
+ """
1184
+ Sets the processing steps for the document.
1185
+
1186
+ Args:
1187
+ steps (List[ProcessingStep]): A list of ProcessingStep objects to store.
1188
+ """
1189
+ self.__ensure_steps_table_exists()
1190
+ serialized_steps = [step.to_dict() for step in steps]
1191
+ packed_data = sqlite3.Binary(msgpack.packb(serialized_steps))
1192
+ self.cursor.execute("UPDATE steps SET obj = ? WHERE rowid = 1", [packed_data])
1193
+ self.connection.commit()
1194
+
1195
+ def get_steps(self) -> List[ProcessingStep]:
1196
+ """
1197
+ Gets the processing steps associated with this document.
1198
+
1199
+ Returns:
1200
+ List[ProcessingStep]: A list of ProcessingStep objects.
1201
+ """
1202
+ self.__ensure_steps_table_exists()
1203
+ result = self.cursor.execute("SELECT obj FROM steps WHERE rowid = 1").fetchone()
1204
+ if result and result[0]:
1205
+ unpacked_data = msgpack.unpackb(result[0])
1206
+ return [ProcessingStep(**step) for step in unpacked_data]
1207
+ return []
1208
+
1125
1209
 
1126
1210
  class SimpleObjectCache(object):
1127
1211
  """
@@ -1260,6 +1344,34 @@ class PersistenceManager(object):
1260
1344
  document, filename, delete_on_close, inmemory=inmemory
1261
1345
  )
1262
1346
 
1347
+ def get_steps(self) -> list[ProcessingStep]:
1348
+ """
1349
+ Gets the processing steps for this document
1350
+
1351
+ :return:
1352
+ """
1353
+ return self._underlying_persistence.get_steps()
1354
+
1355
+ def set_steps(self, steps: list[ProcessingStep]):
1356
+ self._underlying_persistence.set_steps(steps)
1357
+
1358
+ def get_external_data(self) -> dict:
1359
+ """
1360
+ Gets the external data object associated with this document
1361
+
1362
+ :return: dict of the external data
1363
+ """
1364
+ return self._underlying_persistence.get_external_data()
1365
+
1366
+ def set_external_data(self, external_data:dict):
1367
+ """
1368
+ Sets the external data for this document
1369
+
1370
+ :param external_data: dict representing the external data, must be JSON serializable
1371
+ :return:
1372
+ """
1373
+ self._underlying_persistence.set_external_data(external_data)
1374
+
1263
1375
  def get_nodes_by_type(self, node_type: str) -> List[ContentNode]:
1264
1376
  """
1265
1377
  Retrieves all nodes of a given type from the underlying persistence layer.
@@ -0,0 +1,178 @@
1
+ """
2
+ This module provides a set of functions to manipulate and convert taxonomy objects for use within a data model.
3
+ It includes functions to convert taxonomy names to various naming conventions such as property names, class names,
4
+ and group paths. Additionally, it offers utility functions for string manipulation, like converting snake case strings
5
+ to camel case or title case, making string names safe for use as attribute names, converting strings to hexadecimal
6
+ color codes, estimating the token count of a text, and recursively finding all non-abstract subclasses of a given class.
7
+ """
8
+
9
+ import keyword
10
+ import logging
11
+ import re
12
+ from inspect import isabstract
13
+
14
+ from kodexa.model.objects import Taxon
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def taxon_to_property_name(taxon: Taxon):
    """Derive a snake_case property name from the taxon label and record it
    on the taxon as its external name."""
    property_name = to_snake(safe_name(taxon.label))
    taxon.external_name = property_name
    return property_name
26
+
27
+
28
def taxon_to_class_name(taxon: Taxon):
    """Derive a CamelCase class name from the taxon label and record it on
    the taxon as its external name."""
    class_name = snake_to_camel(safe_name(taxon.label))
    taxon.external_name = class_name
    return class_name
35
+
36
+
37
def taxon_to_group_path(taxon: Taxon):
    """Return the group name, i.e. the leading segment of the taxon path."""
    return taxon.path.split('/')[0]
41
+
42
+
43
def snake_to_camel(snake_str):
    """Convert a snake_case (or space-separated) string to CamelCase."""
    words = snake_str.replace(" ", "_").split("_")
    # Title-case every component and join them back together
    return "".join(word.strip().title() for word in words)
49
+
50
+
51
def to_snake(base_str):
    """Convert a string (spaces, hyphens, or underscores) to snake_case."""
    parts = base_str.replace(" ", "_").replace("-", "_").split("_")

    # A leading all-digit part gets an "n_" prefix so the result can serve
    # as an identifier
    if parts[0].isdigit():
        parts[0] = "n_" + parts[0]

    return "_".join(part.strip().lower() for part in parts)
60
+
61
+
62
def make_safe_attribute_name(name):
    """
    Transform *name* into a valid Python attribute name.

    Invalid characters become underscores, a leading digit is prefixed with
    an underscore, and Python keywords receive a trailing underscore.

    :param name: the candidate attribute name
    :return: a safe attribute name ("_" for empty input)
    """
    # BUG FIX: an empty input previously raised IndexError below
    if not name:
        return "_"

    # Replace invalid characters (anything not a letter, digit, or underscore)
    # with an underscore. (Renamed the local so it no longer shadows the
    # module-level safe_name() function.)
    candidate = ''.join(char if char.isalnum() or char == '_' else '_' for char in name)

    # If the name starts with a digit, prepend an underscore
    if candidate[0].isdigit():
        candidate = '_' + candidate

    # Append an underscore if the name is a Python keyword
    if keyword.iskeyword(candidate):
        candidate += '_'

    return candidate
75
+
76
+
77
def safe_name(string):
    """
    Removes invalid characters from a string, replaces spaces with underscores, removes leading/trailing underscores and hyphens, and makes the string lowercase. If the resulting string
    * starts with a number, it prefixes it with "n_".

    :param string: The string to be transformed.
    :return: The transformed string ("_" when everything is stripped away).
    """
    # trim the string
    string = string.strip()

    # Remove invalid characters
    string = re.sub(r"[^\w\s-]", "", string)

    # Replace spaces with underscores
    string = re.sub(r"\s+", "_", string)

    # Remove leading/trailing underscores and hyphens
    string = string.strip("_-")

    # Make it lowercase
    string = string.lower()

    # BUG FIX: everything may have been stripped away, which previously
    # raised IndexError on string[0]
    if not string:
        return "_"

    if string[0].isdigit():
        # can't have things starting with a number
        string = "n_" + string

    # make sure we don't collide with a python keyword
    return make_safe_attribute_name(string)
107
+
108
+
109
def snake_case_to_title_case(snake_case_string):
    """Convert a snake_case string to space-separated Title Case."""
    return " ".join(part.capitalize() for part in snake_case_string.split("_"))
113
+
114
+
115
def string_to_hex_color(string):
    """
    Deterministically map a string to a 24-bit hex color code (e.g. "#a1b2c3").

    BUG FIX: previously used the builtin hash(), which is randomized per
    process for strings (PYTHONHASHSEED), so the "same" string produced a
    different color on every run. An MD5 digest is stable across runs.

    :param string: the input string (leading/trailing whitespace is ignored)
    :return: a "#rrggbb" hex color string
    """
    import hashlib

    # Remove any leading or trailing whitespace from the string
    string = string.strip()

    # Stable digest of the string
    digest = int(hashlib.md5(string.encode("utf-8")).hexdigest(), 16)

    # Fold the digest down to a 24-bit hexadecimal color code
    return "#{:06x}".format(digest & 0xFFFFFF)
126
+
127
+
128
def get_is_square_bracket_first(string):
    """
    Determine which of "[" or "{" appears first in *string*.

    Returns True when "[" comes first, False when "{" comes first, and
    None when neither character is present.
    """
    square_idx = string.find("[")
    curly_idx = string.find("{")

    # Neither bracket present
    if square_idx == -1 and curly_idx == -1:
        return None
    # Only one of them present: report which one
    if square_idx == -1:
        return False
    if curly_idx == -1:
        return True
    # Both present: compare positions
    return square_idx < curly_idx
145
+
146
+
147
def cosine_similarity(v1, v2):
    """
    Compute the cosine similarity between two vectors.

    BUG FIX: a zero-magnitude vector previously raised ZeroDivisionError;
    0.0 is now returned for that degenerate case.

    :param v1: first vector (sequence of numbers)
    :param v2: second vector (sequence of numbers)
    :return: cosine similarity in [-1.0, 1.0]
    """
    dot_product = sum(a * b for a, b in zip(v1, v2))
    norm_a = sum(a * a for a in v1) ** 0.5
    norm_b = sum(b * b for b in v2) ** 0.5
    if norm_a == 0 or norm_b == 0:
        # Similarity to a zero vector is undefined; return the conventional,
        # non-crashing value
        return 0.0
    return dot_product / (norm_a * norm_b)
153
+
154
+
155
def estimate_token_count(text, avg_token_length=1):
    """Roughly estimate the token count of *text* from its non-space characters."""
    # Spaces don't contribute to tokens
    significant_chars = len(text.replace(" ", ""))
    return round(significant_chars / avg_token_length)
161
+
162
+
163
def get_all_concrete_subclasses(cls):
    """
    Recursively find all non-abstract subclasses of a given class.

    Parameters:
        cls (class): The parent class to find subclasses for.

    Returns:
        list: A list of all non-abstract subclasses of cls.
    """
    result = []
    for subclass in cls.__subclasses__():
        if isabstract(subclass):
            # Abstract subclasses (and, as in the original, their
            # descendants) are skipped
            continue
        result.append(subclass)
        result.extend(get_all_concrete_subclasses(subclass))
    return result
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "kodexa"
3
- version = "7.0.010217227753"
3
+ version = "7.0.010402571165"
4
4
  description = "Python SDK for the Kodexa Platform"
5
5
  authors = ["Austin Redenbaugh <austin@kodexa.com>", "Philip Dodds <philip@kodexa.com>", "Romar Cablao <rcablao@kodexa.com>", "Amadea Paula Dodds <amadeapaula@kodexa.com>"]
6
6
  readme = "README.md"