kodexa 7.0.10599563193__py3-none-any.whl → 7.0.10667738312__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,16 @@
1
1
  import logging
2
2
  import os
3
3
  import uuid
4
+ from importlib.machinery import SourceFileLoader
4
5
  from typing import Optional, List
5
6
 
6
7
  import jinja2
7
- from pydantic import BaseModel
8
-
9
8
  from kodexa import ContentNode
10
- from kodexa.model.model import Tag
9
+ from kodexa.model.model import Tag, Document
11
10
  from kodexa.model.objects import ContentException, Taxon, Taxonomy, Assistant
12
- from kodexa.utils import taxon_to_property_name, taxon_to_class_name, taxon_to_group_path, snake_to_camel, \
13
- to_snake
11
+ from pydantic import BaseModel
12
+
13
+ from kodexa.utils import snake_to_camel, to_snake, taxon_to_property_name, taxon_to_class_name, taxon_to_group_path
14
14
 
15
15
  logger = logging.getLogger()
16
16
 
@@ -30,6 +30,7 @@ class LLMDataAttribute(BaseModel):
30
30
  normalized_text: Optional[str] = None
31
31
  node_uuid_list: Optional[List[int]] = None
32
32
  tag_uuid: Optional[str] = None
33
+ page_number: Optional[int] = None
33
34
  exceptions: Optional[list[ContentException]] = None
34
35
 
35
36
  def create_exception(
@@ -88,6 +89,100 @@ class LLMDataObject(BaseModel):
88
89
  else:
89
90
  self.group_uuid = group_uuid
90
91
 
92
def get_all_review_pages(self):
    """
    Returns a list of unique page numbers that would be included in the review.

    The page numbers are gathered from every field of this data object through
    ``_get_field_pages``. An attribute that needs review but has no page number
    contributes ``None``; that entry is kept in the result and placed first, so
    that mixing it with integer page numbers does not make the sort raise
    ``TypeError``.

    :return: sorted list of unique page numbers (``None``, if present, comes first)
    """
    pages = set()
    for field in self.__fields__:
        pages.update(self._get_field_pages(field))
    # None cannot be ordered against an int, so sort on (is-known, number)
    # rather than on the raw value
    return sorted(pages, key=lambda page: (page is not None, 0 if page is None else page))
102
+
103
def _get_field_pages(self, field):
    """
    Collect the page numbers that a single field contributes to the review.

    :param field: name of the field on this data object to inspect
    :return: the page numbers found for the field (empty when nothing needs review)
    """
    current = getattr(self, field)

    # A list field: gather the pages of every nested data object it holds
    if isinstance(current, list):
        found = set()
        for entry in current:
            if isinstance(entry, LLMDataObject):
                found.update(entry.get_all_review_pages())
        return found

    # A single attribute only counts when its value and normalized text disagree
    if isinstance(current, LLMDataAttribute):
        if current.value != current.normalized_text:
            return {current.page_number}
        return set()

    # A nested data object reports its own pages
    if isinstance(current, LLMDataObject):
        return current.get_all_review_pages()

    return set()
117
+
118
def update_from_review(self, review_dict):
    """
    Apply a reviewed set of values back onto this data object.

    Every entry of the review dictionary is handed, together with its field
    name, to ``_update_field_review``.

    :param review_dict: A dictionary, keyed by field name, containing the updated review information
    """
    for name in review_dict:
        self._update_field_review(name, review_dict[name])
126
+
127
def _update_field_review(self, field, field_data):
    """
    Apply the review data for a single field of this data object.

    List data is applied item by item to the nested data objects held in the
    field; dict data is applied either to the field's data attribute or to the
    nested data object it holds. Review data that does not line up with this
    object (unknown or unset field, mismatching container type, items that are
    not data objects) is skipped instead of raising.

    :param field: name of the field the review data belongs to
    :param field_data: review data for the field, a list (one entry per item) or a dict
    """
    # The review may name a field this object does not have, or one that is
    # unset; there is nothing to update then, so skip it rather than raising
    current = getattr(self, field, None)
    if current is None:
        return

    if isinstance(field_data, list):
        if not isinstance(current, list):
            return
        # NOTE(review): items are matched by position, but to_review leaves out
        # list items that have nothing to review, so the positions in the review
        # may not line up with the positions in the field - confirm with callers
        for item, item_data in zip(current, field_data):
            if isinstance(item_data, dict) and isinstance(item, LLMDataObject):
                item.update_from_review(item_data)
    elif isinstance(field_data, dict):
        if isinstance(current, LLMDataAttribute):
            self._update_data_attribute(field, field_data)
        elif isinstance(current, LLMDataObject):
            current.update_from_review(field_data)
137
+
138
def _update_data_attribute(self, field, field_data):
    """
    Copy reviewed values from a review entry onto the data attribute held in a field.

    Only the keys present in the review entry are applied; anything else on the
    attribute is left as it is.

    :param field: name of the field holding the data attribute
    :param field_data: review entry that may carry 'value', 'node_uuids' and 'normalized_text'
    """
    target = getattr(self, field)
    # review key -> attribute name on the data attribute, in the order they are applied
    key_to_attribute = (
        ('value', 'value'),
        ('node_uuids', 'node_uuid_list'),
        ('normalized_text', 'normalized_text'),
    )
    for review_key, attribute_name in key_to_attribute:
        if review_key in field_data:
            setattr(target, attribute_name, field_data[review_key])
146
+
147
def to_review(self, page_number=None):
    """
    Describe this data object, and the data objects nested in it, as a dict for review.

    Each field is passed to ``_build_review``; only the fields for which that
    returns something non-empty end up in the result, keyed by field name. The
    result is meant for reviewing mismatched value/normalized text with the LLM,
    optionally limited to a specific page number.

    :param page_number: Optional page number to filter the review items
    :return: dict of this data object and children for the specified page
    """
    return {
        name: entry
        for name in self.__fields__
        if (entry := self._build_review(name, page_number))
    }
162
+
163
def _build_review(self, field, page_number=None):
    """
    Build the review entry for a single field of this data object.

    :param field: name of the field to build the review entry for
    :param page_number: Optional page number; when given, attributes on other pages are left out
    :return: a list of child reviews for a list field, a dict for a data attribute that
             needs review, the nested object's own review for a data object field,
             otherwise None
    """
    current = getattr(self, field)

    # List field: keep the non-empty reviews of the data objects it holds
    if isinstance(current, list):
        entries = []
        for child in current:
            if not isinstance(child, LLMDataObject):
                continue
            child_review = child.to_review(page_number)
            if child_review:
                entries.append(child_review)
        return entries or None

    # Data attribute: reviewed only when value and normalized text disagree
    # and, if a page was requested, the attribute sits on that page
    if isinstance(current, LLMDataAttribute):
        mismatch = current.value != current.normalized_text
        if mismatch and (page_number is None or current.page_number == page_number):
            return {
                "value": current.value,
                "normalized_text": current.normalized_text,
                "node_uuids": current.node_uuid_list,
                "page_number": current.page_number,
            }
        return None

    # Nested data object: delegate to its own review
    if isinstance(current, LLMDataObject):
        return current.to_review(page_number)

    return None
185
+
91
186
  def create_exception(
92
187
  self,
93
188
  exception_type_id: str,
@@ -215,6 +310,25 @@ class LLMDataObject(BaseModel):
215
310
  current_value.append(new_tag)
216
311
  node.remove_feature("tag", tag)
217
312
  node.add_feature("tag", tag, current_value, single=False)
313
+ # try:
314
+ # if value.data_type == 'Derived':
315
+ # logger.info(f"Node already has tag {tag} - Tagging something nearby {node.get_all_content()}")
316
+ # nearby_node = find_nearby_word_to_tag(node, tag)
317
+ # nearby_node.tag(
318
+ # tag_to_apply=tag,
319
+ # value=value.normalized_text,
320
+ # tag_uuid=tag_uuid,
321
+ # cell_index=self.cell_index,
322
+ # selector="//word",
323
+ # confidence=-1,
324
+ # group_uuid=self.group_uuid,
325
+ # parent_group_uuid=parent_group_uuid,
326
+ # owner_uri=f"assistant://{assistant.id}" if assistant else f"model://taxonomy-llm",
327
+ # )
328
+ # else:
329
+ # logger.info(f"Node already has tag {tag} - Skipping.")
330
+ # except:
331
+ # logger.error(f"Error tagging nearby node with tag {tag}")
218
332
 
219
333
  logger.info(f"Applied label {tag} to {len(nodes_to_label)} nodes")
220
334
  if isinstance(value, LLMDataObject):
@@ -249,7 +363,8 @@ def get_template_env():
249
363
  Returns:
250
364
 
251
365
  """
252
- package_location = os.path.dirname(os.path.abspath(__file__))
366
+ cli_path = os.path.dirname(os.path.abspath(__file__))
367
+ package_location = os.path.join(cli_path, "templates")
253
368
  template_loader = jinja2.FileSystemLoader([os.getcwd(), package_location])
254
369
  env = jinja2.Environment(loader=template_loader, autoescape=True)
255
370
  env.globals["snake_to_camel"] = snake_to_camel
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: kodexa
3
- Version: 7.0.10599563193
3
+ Version: 7.0.10667738312
4
4
  Summary: Python SDK for the Kodexa Platform
5
5
  Author: Austin Redenbaugh
6
6
  Author-email: austin@kodexa.com
@@ -3,8 +3,8 @@ kodexa/assistant/__init__.py,sha256=nlXm_YnV_50hgn0TIT2Fkc2fQ-86OjmctY_j8My9nc4,
3
3
  kodexa/assistant/assistant.py,sha256=5KFdbqFSLIZJyDRyZdpcfr448fT-CW4JhYu9A6B9DGY,14663
4
4
  kodexa/connectors/__init__.py,sha256=WF6G_MUeU32TlKSUKkpNoNX7dq8iBPliFMep4E8BmZc,328
5
5
  kodexa/connectors/connectors.py,sha256=FpUZDkSyHld2b9eYRuVOWzaFtuGoaRuPXXicJB7THbc,10413
6
- kodexa/dataclasses/__init__.py,sha256=gM1meK2rltv3OO9oJGtuLG7It0L-JS8rMmSAg44Wbp8,12815
7
- kodexa/dataclasses/llm_data_class.j2,sha256=1l30_Men0_cPEd6FCzbwsrWUi1QZidNEFXR06WudYlk,1127
6
+ kodexa/dataclasses/__init__.py,sha256=hxx1Z2vKKxxFH7NUqKTyi9reiwNkrWCwndoZgu_k5p8,18394
7
+ kodexa/dataclasses/templates/llm_data_class.j2,sha256=1l30_Men0_cPEd6FCzbwsrWUi1QZidNEFXR06WudYlk,1127
8
8
  kodexa/model/__init__.py,sha256=rtLXYJBxB-rnukhslN9rlqoB3--1H3253HyHGbD_Gc8,796
9
9
  kodexa/model/base.py,sha256=CaZK8nMhT1LdCpt4aLhebJGcorjq9qRID1FjnXnP14M,521
10
10
  kodexa/model/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -42,7 +42,7 @@ kodexa/testing/test_utils.py,sha256=DrLCkHxdb6AbZ-X3WmTMbQmnVIm55VEBL8MjtUK9POs,
42
42
  kodexa/training/__init__.py,sha256=xs2L62YpRkIRfslQwtQZ5Yxjhm7sLzX2TrVX6EuBnZQ,52
43
43
  kodexa/training/train_utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  kodexa/utils/__init__.py,sha256=Pnim1o9_db5YEnNvDTxpM7HG-qTlL6n8JwFwOafU9wo,5928
45
- kodexa-7.0.10599563193.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
46
- kodexa-7.0.10599563193.dist-info/METADATA,sha256=dvrbwiDKwbdmvc9H2sFbhROxErxf5RNj_B-u4xueB-U,3533
47
- kodexa-7.0.10599563193.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
48
- kodexa-7.0.10599563193.dist-info/RECORD,,
45
+ kodexa-7.0.10667738312.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
46
+ kodexa-7.0.10667738312.dist-info/METADATA,sha256=aoe6uSDyFD3ti1sZOt9Dpiw0Z2qYc3Ftsdm3pyIIE_4,3533
47
+ kodexa-7.0.10667738312.dist-info/WHEEL,sha256=Zb28QaM1gQi8f4VCBhsUklF61CTlNYfs9YAZn-TOGFk,88
48
+ kodexa-7.0.10667738312.dist-info/RECORD,,