kodexa 7.0.1a10177063353__py3-none-any.whl → 7.0.1a11918232720__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,464 @@
1
+ import logging
2
+ import os
3
+ import uuid
4
+ from typing import Optional, List
5
+
6
+ import jinja2
7
+ from kodexa import ContentNode
8
+ from kodexa.model.model import Tag, Document
9
+ from kodexa.model.objects import ContentException, Taxon, Taxonomy, Assistant
10
+ from pydantic import BaseModel
11
+
12
+ from kodexa.utils import snake_to_camel, to_snake, taxon_to_property_name, taxon_to_class_name, taxon_to_group_path
13
+
14
+ logger = logging.getLogger()
15
+
16
+
17
class KodexaDocumentLLMWrapper:
    """Thin wrapper holding a Kodexa Document so LLM data objects can label it.

    Passed into LLMDataObject.apply_labels, which reaches the underlying
    document via ``document.doc``.
    """

    # Class-level defaults; ``doc`` is set per-instance in __init__.
    # ``entities`` is not assigned anywhere in this file — presumably set by
    # callers elsewhere; TODO confirm.
    doc = None
    entities = None

    def __init__(self, doc: Document):
        """Wrap the given Kodexa document.

        :param doc: the Kodexa Document to wrap
        """
        self.doc = doc

    def get_doc(self):
        """Return the wrapped Kodexa Document."""
        return self.doc
28
+
29
+
30
class LLMDataAttribute(BaseModel):
    """
    This is the data structure that is used take the results from the LLM so
    we can use it. We use this as a base class for building classes that align
    with a taxonomy
    """

    value: Optional[str] = None
    line_ids: Optional[list[str]] = None
    taxon_path: Optional[str] = None
    data_type: Optional[str] = None
    value_path: Optional[str] = None
    normalized_text: Optional[str] = None
    node_uuid_list: Optional[List[int]] = None
    tag_uuid: Optional[str] = None
    page_number: Optional[int] = None
    exceptions: Optional[list[ContentException]] = None

    def copy_from(self, source: "LLMDataAttribute"):
        """Copy the review-relevant state from another attribute instance.

        :param source: the attribute to copy values from
        """
        self.tag_uuid = source.tag_uuid
        self.value = source.value
        self.normalized_text = source.normalized_text
        self.line_ids = source.line_ids
        self.exceptions = source.exceptions
        self.node_uuid_list = source.node_uuid_list
        self.page_number = source.page_number

    def create_exception(
        self,
        exception_type_id: str,
        exception_type: str,
        normalized_text: str,
        message: str,
        exception_detail: str,
    ):
        """Create a ContentException tied to this attribute's tag and record it.

        :param exception_type_id: identifier of the exception type (currently unused,
            kept for interface compatibility)
        :param exception_type: the type of the exception
        :param normalized_text: the normalized text associated with the exception
            (currently unused, kept for interface compatibility)
        :param message: the exception message
        :param exception_detail: detailed description of the exception
        """
        content_exception = ContentException(
            # Field is named exception_details on ContentException — the sibling
            # LLMDataObject.create_exception and apply_labels both use that name.
            exception_type=exception_type,
            exception_details=exception_detail,
            message=message,
            tag_uuid=self.tag_uuid,
        )
        # exceptions defaults to None; initialize lazily so the first call
        # does not raise AttributeError.
        if self.exceptions is None:
            self.exceptions = []
        self.exceptions.append(content_exception)
72
+
73
+
74
class LLMDataObject(BaseModel):
    """
    A class to represent a LLM (Large Language Model) data object.

    ...

    Attributes
    ----------
    group_uuid : str, optional
        A unique identifier for the group, by default None
    cell_index : int, optional
        The index of the cell which is really the row, by default 0
    exceptions : list[ContentException], optional
        Content exceptions recorded against this data object, by default None

    Methods
    -------
    apply_labels(self, document: KodexaDocumentLLMWrapper, parent_group_uuid: str = None, assistant: Assistant = None):
        Applies labels to the document if it exists.
    """

    group_uuid: Optional[str] = None
    cell_index: int = 0
    exceptions: Optional[list[ContentException]] = None

    class Config:
        arbitrary_types_allowed = True

    def get_all_review_pages(self):
        """
        Returns a list of unique page numbers that would be included in the review.

        :return: sorted list of unique page numbers
        """
        pages = set()
        for field in self.__fields__:
            pages.update(self._get_field_pages(field))
        return sorted(list(pages))

    def _get_field_pages(self, field):
        """Collect the review page numbers contributed by a single field."""
        field_value = getattr(self, field)
        if isinstance(field_value, list):
            pages = set()
            for item in field_value:
                if isinstance(item, LLMDataObject):
                    pages.update(item.get_all_review_pages())
            return pages
        elif isinstance(field_value, LLMDataAttribute):
            # Only attributes whose raw value disagrees with the normalized
            # text need review.
            if field_value.value != field_value.normalized_text:
                return {field_value.page_number}
        elif isinstance(field_value, LLMDataObject):
            return field_value.get_all_review_pages()
        return set()

    def update_from_review(self, review_dict):
        """
        Update the node UUIDs and value based on the provided review dictionary.

        :param review_dict: A dictionary containing the updated review information
        """
        for field, field_data in review_dict.items():
            self._update_field_review(field, field_data)

    def _update_field_review(self, field, field_data):
        """Apply one field's review data, recursing into lists and child objects."""
        field_value = getattr(self, field)
        if isinstance(field_data, list):
            for i, item_data in enumerate(field_data):
                # Ignore review entries beyond the current list length.
                if i < len(field_value):
                    field_value[i].update_from_review(item_data)
        elif isinstance(field_data, dict):
            if isinstance(field_value, LLMDataAttribute):
                self._update_data_attribute(field, field_data)
            elif isinstance(field_value, LLMDataObject):
                field_value.update_from_review(field_data)

    def _update_data_attribute(self, field, field_data):
        """Copy reviewed value / node UUIDs / normalized text onto an attribute."""
        attr = getattr(self, field)
        if 'value' in field_data:
            attr.value = field_data['value']
        if 'node_uuids' in field_data:
            attr.node_uuid_list = field_data['node_uuids']
        if 'normalized_text' in field_data:
            attr.normalized_text = field_data['normalized_text']

    def to_review(self, page_number=None):
        """
        Build a representation of the data object and its data attributes that is a dict that includes the
        value, normalized text and node UUIDs so we can use this to review mismatched value/normalized
        with the LLM for a specific page number.

        :param page_number: Optional page number to filter the review items
        :return: dict of this data object and children for the specified page
        """
        review = {}
        for field in self.__fields__:
            review_field = self._build_review(field, page_number)
            if review_field:
                review[field] = review_field
        return review

    def _build_review(self, field, page_number=None):
        """Build the review dict fragment for one field (or None when nothing to review)."""
        field_value = getattr(self, field)
        if isinstance(field_value, list):
            review_field = []
            for item in field_value:
                if isinstance(item, LLMDataObject):
                    new_review = item.to_review(page_number)
                    if new_review:
                        review_field.append(new_review)
            return review_field if review_field else None
        elif isinstance(field_value, LLMDataAttribute):
            # Only mismatched value/normalized pairs are candidates for review.
            if field_value.value != field_value.normalized_text:
                if page_number is None or field_value.page_number == page_number:
                    return {
                        "value": field_value.value,
                        "normalized_text": field_value.normalized_text,
                        "node_uuids": field_value.node_uuid_list,
                        "page_number": field_value.page_number,
                    }
        elif isinstance(field_value, LLMDataObject):
            return field_value.to_review(page_number)

        return None

    def create_exception(
        self,
        exception_type_id: str,
        exception_type: str,
        message: str,
        exception_detail: str,
        severity: str = "ERROR",
    ):
        """Create a ContentException tied to this data object's group and record it.

        :param exception_type_id: identifier of the exception type (currently unused,
            kept for interface compatibility)
        :param exception_type: the type of the exception
        :param message: the exception message
        :param exception_detail: detailed description of the exception
        :param severity: the exception severity, by default "ERROR"
        """
        content_exception = ContentException(
            exception_type=exception_type,
            exception_details=exception_detail,
            message=message,
            group_uuid=self.group_uuid,
            severity=severity,
        )
        if self.exceptions is None:
            self.exceptions = []

        self.exceptions.append(content_exception)

    def apply_labels(
        self, document: "KodexaDocumentLLMWrapper", parent_group_uuid: str = None,
        assistant: Optional["Assistant"] = None
    ):
        """
        Applies labels to the document if it exists.

        If a document has been assigned to the LLMDataObject, it calls the
        apply_labels method of the document with the current LLMDataObject and
        the parent group uuid.

        Parameters
        ----------
        document : KodexaDocumentLLMWrapper
            The Kodexa document LLM wrapper
        parent_group_uuid : str, optional
            A unique identifier for the parent group, by default None
        assistant : Assistant, optional
            The assistant that owns the applied tags, by default None
        """

        # Lets make sure we add all the content exceptions
        if self.exceptions is not None:
            # We have two types of exception, one in the API and one in the
            # document — convert before attaching to the document. The import
            # is hoisted out of the loop (it was previously re-run per item).
            from kodexa.model import ContentException as KodexaContentException
            for exception in self.exceptions:
                internal_exception = KodexaContentException(
                    exception_type=exception.exception_type,
                    message=exception.message,
                    exception_details=exception.exception_details,
                    severity=exception.severity,
                    group_uuid=exception.group_uuid,
                    tag_uuid=exception.tag_uuid,
                )
                document.doc.add_exception(internal_exception)

        # Let's go through this data object and find all the attributes that have a value
        # then we will apply the labels to the document
        for field in self.__fields__:
            logger.info(f"Processing field {field}")
            value = getattr(self, field)

            if isinstance(value, list):
                logger.info(f"Processing as a list {value}")
                for item in value:
                    self.process_child(item, document, parent_group_uuid, assistant)
            else:
                logger.info(f"Processing as a single value {value}")
                self.process_child(value, document, parent_group_uuid, assistant)

    def process_child(self, value, document, parent_group_uuid, assistant):
        """Label the document nodes referenced by one attribute, or recurse into a child object.

        :param value: an LLMDataAttribute or LLMDataObject field value
        :param document: the KodexaDocumentLLMWrapper being labeled
        :param parent_group_uuid: the group UUID of the parent data object
        :param assistant: optional Assistant used to build the tag owner URI
        """
        logger.info(f"Processing child {value}")
        if isinstance(value, LLMDataAttribute):
            # We need to add the label to the document for this attribute
            tag = value.taxon_path

            # TODO need to work out why we are missing them?
            logger.info(f"Value: {value.normalized_text}, node_uuid_list: {value.node_uuid_list}")
            if value.node_uuid_list is None:
                value.node_uuid_list = value.line_ids
            logger.info(f"Applying label {tag} to node UUIDs {value.node_uuid_list}")

            if isinstance(value.node_uuid_list, int):
                value.node_uuid_list = [value.node_uuid_list]

            # UUID "0" (or 0) is a sentinel for "no node" and is skipped.
            nodes_to_label: list[ContentNode] = (
                [
                    document.doc.get_persistence().get_node(node_uuid)
                    for node_uuid in value.node_uuid_list
                    if node_uuid != '0' and node_uuid != 0
                ]
                if value.node_uuid_list
                else []
            )

            # Compute the owner URI once; it is identical for every node.
            owner_uri = f"assistant://{assistant.id}" if assistant else "model://taxonomy-llm"
            tag_uuid = str(uuid.uuid4())
            for node in nodes_to_label:
                if node:
                    if not node.has_tag(tag):
                        try:
                            # DERIVED values were not read directly off the page,
                            # so they get a negative confidence.
                            confidence = -1 if value.value_path == 'DERIVED' else 1
                            node.tag(
                                tag_to_apply=tag,
                                value=value.normalized_text,
                                tag_uuid=tag_uuid,
                                cell_index=self.cell_index,
                                selector="//word",
                                confidence=confidence,
                                group_uuid=self.group_uuid,
                                parent_group_uuid=parent_group_uuid,
                                owner_uri=owner_uri,
                            )
                        except Exception:
                            # Narrowed from a bare except (which also swallowed
                            # KeyboardInterrupt/SystemExit); keep best-effort
                            # behavior but log the traceback.
                            logger.exception(f"Error tagging node {node.uuid} with tag {tag}")
                    else:
                        # Node already carries this tag: append another Tag
                        # feature value rather than re-tagging.
                        current_value = node.get_feature_values("tag", tag)
                        new_tag = Tag(
                            cell_index=self.cell_index,
                            uuid=tag_uuid,
                            value=value.normalized_text,
                            confidence=-1,
                            group_uuid=self.group_uuid,
                            parent_group_uuid=parent_group_uuid,
                            owner_uri=owner_uri,
                        )
                        current_value.append(new_tag)
                        node.remove_feature("tag", tag)
                        node.add_feature("tag", tag, current_value, single=False)

            logger.info(f"Applied label {tag} to {len(nodes_to_label)} nodes")
        if isinstance(value, LLMDataObject):
            # We need to apply the labels to the document for this object.
            # Propagate the assistant so nested objects' tags carry the same
            # owner URI (it was previously dropped on recursion).
            value.apply_labels(document, parent_group_uuid=self.group_uuid, assistant=assistant)
348
+
349
+
350
def find_nearby_word_to_tag(node, tag):
    """Find the closest word to *node* (by line distance) that does not yet carry *tag*.

    :param node: the content node to search around (a line, or a node inside a line)
    :param tag: the tag name to avoid
    :return: the first untagged word found, or None if every word is tagged
    """
    logger.info(f"find_nearby_word_to_tag: {tag}")
    # Create an ordered list of the lines on the page, sorted by distance from the target node.
    # NOTE(review): assumes non-line nodes always have a parent::line and a
    # parent::page — an empty selection would raise IndexError; confirm with callers.
    target_line_index = node.index if node.node_type == 'line' else node.select('parent::line')[0].index
    all_lines_on_page = node.select('parent::page')[0].select('//line')

    # Debug output demoted from a stray print() to the module logger.
    logger.debug(f"target_line_index={target_line_index}, lines_on_page={len(all_lines_on_page)}")
    sorted_lines = sorted(all_lines_on_page, key=lambda line: abs(target_line_index - line.index))
    # Find the first word that isn't yet tagged by this tag
    for line in sorted_lines:
        for word in line.select('//word'):
            if not word.has_tag(tag):
                return word
    return None
364
+
365
+
366
def get_template_env():
    """Build the Jinja2 template environment used for code generation.

    Templates are resolved from the current working directory first, then from
    the ``templates`` directory shipped alongside this module. The taxonomy
    naming helpers are exposed as template globals.

    Returns:
        jinja2.Environment: the configured template environment
    """
    cli_path = os.path.dirname(os.path.abspath(__file__))
    package_location = os.path.join(cli_path, "templates")
    template_loader = jinja2.FileSystemLoader([os.getcwd(), package_location])
    # NOTE(review): autoescape=True HTML-escapes rendered values, which looks
    # surprising for generating Python source — confirm this is intentional.
    env = jinja2.Environment(loader=template_loader, autoescape=True)
    env.globals["snake_to_camel"] = snake_to_camel
    env.globals["to_snake"] = to_snake
    env.globals["taxon_to_property_name"] = taxon_to_property_name
    env.globals["taxon_to_class_name"] = taxon_to_class_name
    env.globals["taxon_to_group_path"] = taxon_to_group_path
    return env
386
+
387
+
388
def write_template(template, output_location, output_filename, context):
    """
    Write the given template out to a file

    Args:
        template: the name of the template
        output_location: the location to write the output
        output_filename: the name of the output file
        context: the context
    """
    template = get_template_env().get_template(template)
    processed_template = template.render(context)

    from pathlib import Path

    # Use pathlib instead of string concatenation, and pin UTF-8 so the
    # generated source is not subject to the platform default encoding.
    output_dir = Path(output_location)
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / output_filename).write_text(processed_template, encoding="utf-8")
406
+
407
+
408
def build_llm_data_classes_for_taxonomy(
    taxonomy: Taxonomy, output_dir: str, output_file: str, use_labels: bool = False
):
    """
    Generate, via the Jinja templates, a module of classes mirroring a taxonomy.

    Every group taxon becomes a class extending LLMDataObject, giving the
    generated classes the ability to take an LLM response and map it back onto
    the Kodexa Document, identifying and labeling nodes as needed.

    :param taxonomy: the taxonomy to generate classes for
    :param output_dir: directory the generated module is written to
    :param output_file: filename of the generated module
    :param use_labels: whether the templates should use labels
    :return:
    """

    def _assign_paths(taxon: Taxon, parent_path: Optional[str] = None):
        # Paths are built root-to-leaf, "/"-separated.
        taxon.path = taxon.name if parent_path is None else f"{parent_path}/{taxon.name}"
        for child in taxon.children or []:
            _assign_paths(child, taxon.path)

    for root_taxon in taxonomy.taxons:
        _assign_paths(root_taxon, None)

    def _collect_groups(taxons: list[Taxon]) -> list[Taxon]:
        """Depth-first collection of every taxon flagged as a group."""
        found: list[Taxon] = []
        for candidate in taxons:
            if candidate.group:
                found.append(candidate)
            if candidate.children:
                found.extend(_collect_groups(candidate.children))
        return found

    # Reverse so leaf-most groups are emitted first (classes are defined
    # before they are referenced in the generated module).
    group_taxons = _collect_groups(taxonomy.taxons)
    group_taxons.reverse()

    write_template(
        "llm_data_class.j2",
        output_dir,
        output_file,
        {"taxons": group_taxons, "use_labels": use_labels},
    )

    # Log the generated module so the result is easy to inspect
    logger.info(f"Created the following classes in {output_dir}/{output_file}")
    with open(f"{output_dir}/{output_file}", "r") as file:
        logger.info(file.read())
@@ -0,0 +1,15 @@
1
+ from typing import Optional, List
2
+ from kodexa.dataclasses import LLMDataAttribute, LLMDataObject
3
+
4
+ {%- for taxon in taxons %}
5
+
6
+ class {{ taxon_to_class_name(taxon) }}(LLMDataObject):
7
+
8
+ {%- for child_taxon in taxon.children %}{%- if child_taxon.group %}
9
+ {{ taxon_to_property_name(child_taxon) }}: Optional[List[{{ taxon_to_class_name(child_taxon) }}]] = None
10
+ {%- else %}
11
+ {{ taxon_to_property_name(child_taxon) }}: Optional[LLMDataAttribute] = LLMDataAttribute(taxon_path='{{ child_taxon.path }}', data_type='{{ child_taxon.taxon_type.title() }}', value_path='{{ child_taxon.value_path.title() }}')
12
+ {%- endif %}
13
+ {%- endfor %}
14
+
15
+ {%- endfor %}
kodexa/model/model.py CHANGED
@@ -12,9 +12,9 @@ from typing import Any, List, Optional
12
12
  from addict import Dict
13
13
  import deepdiff
14
14
  import msgpack
15
- from pydantic import BaseModel, ConfigDict
15
+ from pydantic import BaseModel, ConfigDict, Field
16
16
 
17
- from kodexa.model.objects import ContentObject, FeatureSet
17
+ from kodexa.model.objects import ContentObject, FeatureSet, DocumentTaxonValidation
18
18
 
19
19
 
20
20
  class Ref:
@@ -2369,6 +2369,58 @@ class FeatureSetDiff:
2369
2369
  return node
2370
2370
 
2371
2371
 
2372
class ProcessingStep(BaseModel):
    """A node in a directed graph of document-processing steps.

    Steps hold arbitrary metadata and link to parent and child steps, so a
    pipeline can be recorded as a DAG (including fan-in via ``merge_with``).
    """

    # Auto-generated unique identifier for the step.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    name: str
    # Free-form metadata attached to the step.
    metadata: dict = Field(default_factory=lambda: {})
    # Presentation-only metadata; serialized under the camelCase alias.
    presentation_metadata: dict = Field(default_factory=lambda: {}, alias='presentationMetadata')
    # Graph edges; add_child/merge_with maintain both directions.
    children: List['ProcessingStep'] = Field(default_factory=list)
    parents: List['ProcessingStep'] = Field(default_factory=list)

    def add_child(self, child_step: 'ProcessingStep'):
        """Link ``child_step`` under this step, wiring both edge directions."""
        self.children.append(child_step)
        child_step.parents.append(self)

    @staticmethod
    def merge_with(*other_steps: 'ProcessingStep') -> 'ProcessingStep':
        """Create a new step that is a child of all ``other_steps`` (fan-in merge)."""
        merged_step = ProcessingStep(name=f"Merged({', '.join(step.name for step in other_steps)})")
        for step in other_steps:
            step.children.append(merged_step)
            merged_step.parents.append(step)
        return merged_step

    class Config:
        arbitrary_types_allowed = True
        # NOTE(review): json_encoders is keyed by a string here, but pydantic
        # documents type keys — confirm this encoder is ever actually applied.
        json_encoders = {
            'ProcessingStep': lambda step: step.to_dict()
        }

    def to_dict(self, seen=None):
        """Serialize the step (and its children) to a plain dict.

        :param seen: set of step ids already serialized, used to break cycles
        :return: dict representation; a previously-seen step collapses to id/name
        """
        if seen is None:
            seen = set()

        # Avoid circular references by skipping already seen objects
        if self.id in seen:
            return {'id': self.id, 'name': self.name}

        seen.add(self.id)

        return {
            'id': self.id,
            'name': self.name,
            'metadata': self.metadata,
            'presentationMetadata': self.presentation_metadata,
            'children': [child.to_dict(seen) for child in self.children],
            'parents': [{'id': parent.id, 'name': parent.name} for parent in self.parents],  # or parent.to_dict(seen) if full structure is needed
        }

    def to_json(self):
        """Serialize the step graph (children fully, parents shallowly) to JSON."""
        return json.dumps(self.to_dict())

    def __repr__(self):
        return f"Step(id={self.id}, name={self.name})"
2372
2424
  class Document(object):
2373
2425
  """A Document is a collection of metadata and a set of content nodes."""
2374
2426
 
@@ -2378,12 +2430,30 @@ class Document(object):
2378
2430
  def __str__(self):
2379
2431
  return f"kodexa://{self.uuid}"
2380
2432
 
2433
    def get_validations(self) -> list[DocumentTaxonValidation]:
        """Return the taxon validations stored in the document's persistence layer."""
        return self.get_persistence().get_validations()

    def set_validations(self, validations: list[DocumentTaxonValidation]):
        """Store the given taxon validations in the document's persistence layer.

        :param validations: the validations to persist
        """
        self.get_persistence().set_validations(validations)
2438
+
2381
2439
  def add_exception(self, exception: ContentException):
2382
2440
  self._persistence_layer.add_exception(exception)
2383
2441
 
2384
2442
  def get_exceptions(self) -> List[ContentException]:
2385
2443
  return self._persistence_layer.get_exceptions()
2386
2444
 
2445
    def get_external_data(self) -> dict:
        """Return the document's external data dictionary from the persistence layer."""
        return self._persistence_layer.get_external_data()

    def set_external_data(self, external_data: dict):
        """Store the external data dictionary via the persistence layer.

        :param external_data: the external data to persist
        :return: whatever the persistence layer returns
        """
        return self._persistence_layer.set_external_data(external_data)
2450
+
2451
    def get_steps(self) -> list[ProcessingStep]:
        """Return the document's processing steps from the persistence layer."""
        return self._persistence_layer.get_steps()

    def set_steps(self, steps: list[ProcessingStep]):
        """Store the given processing steps via the persistence layer.

        :param steps: the processing steps to persist
        """
        self._persistence_layer.set_steps(steps)
2456
+
2387
2457
  def replace_exceptions(self, exceptions: List[ContentException]):
2388
2458
  self._persistence_layer.replace_exceptions(exceptions)
2389
2459