kodexa 6.1.15059968382__tar.gz → 6.1.15059982168__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/PKG-INFO +1 -1
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/model/model.py +162 -6
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/pyproject.toml +1 -1
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/setup.py +1 -1
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/LICENSE +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/README.md +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/assistant/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/assistant/assistant.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/connectors/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/connectors/connectors.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/model/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/model/base.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/model/objects.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/model/persistence.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/pipeline/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/pipeline/pipeline.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/platform/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/platform/client.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/platform/kodexa.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/selectors/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/selectors/ast.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/selectors/core.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/selectors/lexrules.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/selectors/lextab.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/selectors/lextab.pyi +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/selectors/parserules.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/selectors/parserules.pyi +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/selectors/parsetab.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/selectors/parsetab.pyi +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/spatial/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/spatial/azure_models.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/spatial/bbox_common.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/spatial/table_form_common.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/steps/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/steps/common.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/testing/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/testing/test_components.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/testing/test_utils.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/training/__init__.py +0 -0
- {kodexa-6.1.15059968382 → kodexa-6.1.15059982168}/kodexa/training/train_utils.py +0 -0
@@ -15,6 +15,7 @@ from addict import Dict
|
|
15
15
|
|
16
16
|
from kodexa.model.base import KodexaBaseModel
|
17
17
|
from kodexa.model.objects import ContentObject, FeatureSet
|
18
|
+
import deepdiff
|
18
19
|
|
19
20
|
|
20
21
|
class Ref:
|
@@ -81,7 +82,7 @@ class Tag(Dict):
|
|
81
82
|
"""A string representing the value that was labelled in the node"""
|
82
83
|
self.data: Optional[Any] = data
|
83
84
|
"""Any data object (JSON serializable) that you wish to associate with the label"""
|
84
|
-
self.uuid: Optional[str] = uuid
|
85
|
+
self.uuid: Optional[str] = uuid or str(uuid.uuid4())
|
85
86
|
"""The UUID for this tag instance, this allows tags that are on different content nodes to be related through the same UUID"""
|
86
87
|
self.confidence: Optional[float] = confidence
|
87
88
|
"""The confidence of the tag in a range of 0-1"""
|
@@ -1717,13 +1718,124 @@ class FeatureSetDiff:
|
|
1717
1718
|
"""
|
1718
1719
|
|
1719
1720
|
def __init__(self, first_feature_set: FeatureSet, second_feature_set: FeatureSet):
|
1720
|
-
self.
|
1721
|
-
self.
|
1721
|
+
self.first_feature_map = self.parse_feature_set(first_feature_set)
|
1722
|
+
self.second_feature_map = self.parse_feature_set(second_feature_set)
|
1723
|
+
self._differences = deepdiff.DeepDiff(self.first_feature_map, self.second_feature_map,
|
1724
|
+
exclude_obj_callback=self.exclude_callback).to_dict()
|
1725
|
+
self._changed_nodes = self.get_changed_nodes()
|
1722
1726
|
|
1723
|
-
def
|
1724
|
-
|
1727
|
+
def get_differences(self):
|
1728
|
+
"""
|
1729
|
+
:return: Data dictionaries that contains the differences of two feature sets
|
1730
|
+
"""
|
1731
|
+
if 'type_changes' in self._differences:
|
1732
|
+
self._differences.pop('type_changes')
|
1733
|
+
|
1734
|
+
return self._differences
|
1735
|
+
|
1736
|
+
def get_changed_nodes(self):
|
1737
|
+
"""
|
1738
|
+
:return: Data dictionary of added and removed nodes
|
1739
|
+
"""
|
1740
|
+
return self._changed_nodes
|
1741
|
+
|
1742
|
+
def get_exclude_paths(self):
|
1743
|
+
"""
|
1744
|
+
:return: List of paths to exclude
|
1745
|
+
"""
|
1746
|
+
return ['shape', 'group_uuid', 'uuid', 'parent_group_uuid', 'single']
|
1747
|
+
|
1748
|
+
def exclude_callback(self, path, key):
|
1749
|
+
"""
|
1750
|
+
Checks if the key is to be exluceded from the diff
|
1751
|
+
:param path: contains the values of that key
|
1752
|
+
:param key: The key of the data dictionary to compare
|
1753
|
+
:return: boolean
|
1754
|
+
"""
|
1755
|
+
if any(re.search(exclude_key, key) for exclude_key in self.get_exclude_paths()):
|
1756
|
+
return True
|
1757
|
+
else:
|
1758
|
+
return False
|
1759
|
+
|
1760
|
+
def parse_feature_set(self, feature_set: FeatureSet):
|
1761
|
+
"""
|
1762
|
+
:param feature_set: The feature set to be parsed
|
1763
|
+
:return: Dictionary of feature with the key as the nodeUuid
|
1764
|
+
"""
|
1765
|
+
return {feature.get('nodeUuid'): feature for feature in feature_set.node_features}
|
1766
|
+
|
1767
|
+
def parsed_values_changed(self):
|
1768
|
+
for key, value in self._differences.get('values_changed').items():
|
1769
|
+
# Check if the old_value is stil in the second_feature_map. If it is remove the key
|
1770
|
+
if key in self.second_feature_map.node_features:
|
1771
|
+
self._differences.get('values_changed').remove(key)
|
1772
|
+
|
1773
|
+
def is_equal(self) -> bool:
|
1774
|
+
"""
|
1775
|
+
Checks if the two feature set is equal to each other
|
1776
|
+
:return: This returns a bool
|
1777
|
+
"""
|
1778
|
+
return self._differences == {}
|
1779
|
+
|
1780
|
+
def get_changed_nodes(self):
|
1781
|
+
"""
|
1782
|
+
:return: A list of nodes that were changed
|
1783
|
+
"""
|
1784
|
+
if self.is_equal():
|
1785
|
+
return []
|
1786
|
+
|
1787
|
+
# Check for new nodes added in the second_feature_map
|
1788
|
+
new_added_nodes = []
|
1725
1789
|
|
1726
|
-
|
1790
|
+
# Checked for removed nodes in the first_feature_map
|
1791
|
+
removed_nodes = []
|
1792
|
+
|
1793
|
+
# Checked for modified nodes
|
1794
|
+
modified_nodes = []
|
1795
|
+
for key, value in self._differences.get('values_changed').items():
|
1796
|
+
modified_nodes.append(self.parsed_node_uuid(key))
|
1797
|
+
|
1798
|
+
# Merge unique nodeUuid of first_feature_map and second_feature_map
|
1799
|
+
merged_node_uuids = set(self.first_feature_map.keys()).union(set(self.second_feature_map.keys()))
|
1800
|
+
for node_uuid in merged_node_uuids:
|
1801
|
+
if node_uuid not in self.first_feature_map:
|
1802
|
+
new_added_nodes.append(node_uuid)
|
1803
|
+
elif node_uuid not in self.second_feature_map:
|
1804
|
+
removed_nodes.append(node_uuid)
|
1805
|
+
|
1806
|
+
return {
|
1807
|
+
'new_added_nodes': new_added_nodes,
|
1808
|
+
'removed_nodes': removed_nodes,
|
1809
|
+
'existing_modified_nodes': modified_nodes
|
1810
|
+
}
|
1811
|
+
|
1812
|
+
def get_difference_count(self):
|
1813
|
+
"""
|
1814
|
+
:return: The total number of differences between the feature sets
|
1815
|
+
"""
|
1816
|
+
return len(self._differences().keys())
|
1817
|
+
|
1818
|
+
def parsed_item_added(self):
|
1819
|
+
item_added: Dict = self._differences.get('iterable_item_added')
|
1820
|
+
if item_added:
|
1821
|
+
return {}
|
1822
|
+
|
1823
|
+
for key, value in item_added.items():
|
1824
|
+
node = self.parsed_node_uuid(key)
|
1825
|
+
if node in self._changed_nodes['new_added_nodes']:
|
1826
|
+
self._differences['iterable_item_added'][key]['details'] = f'Node: {node} was added'
|
1827
|
+
continue
|
1828
|
+
|
1829
|
+
# if node in
|
1830
|
+
return self.get_difference_count()
|
1831
|
+
|
1832
|
+
def parsed_node_uuid(self, key):
|
1833
|
+
"""
|
1834
|
+
:param key: Key of data dictionary
|
1835
|
+
:return: node uuid from the key
|
1836
|
+
"""
|
1837
|
+
node = key.split("['")[1].split("']")[0]
|
1838
|
+
return node
|
1727
1839
|
|
1728
1840
|
|
1729
1841
|
class Document(object):
|
@@ -1781,6 +1893,9 @@ class Document(object):
|
|
1781
1893
|
self.classes: List[ContentClassification] = []
|
1782
1894
|
"""A list of the content classifications associated at the document level"""
|
1783
1895
|
|
1896
|
+
self.tag_instances: List[TagInstance] = []
|
1897
|
+
"""A list of tag instances that contains a set of tag that has a set of nodes"""
|
1898
|
+
|
1784
1899
|
# Start persistence layer
|
1785
1900
|
from kodexa.model import PersistenceManager
|
1786
1901
|
|
@@ -1789,6 +1904,36 @@ class Document(object):
|
|
1789
1904
|
delete_on_close=delete_on_close)
|
1790
1905
|
self._persistence_layer.initialize()
|
1791
1906
|
|
1907
|
+
def add_tag_instance(self, tag_to_apply, node_list: List[ContentNode]):
|
1908
|
+
"""
|
1909
|
+
This will create a group of a tag with indexes
|
1910
|
+
:param tag: name of the tag
|
1911
|
+
:param node_indices: contains the list of index of a node
|
1912
|
+
:return:
|
1913
|
+
"""
|
1914
|
+
# For each node in the list create/update a feature
|
1915
|
+
tag = Tag()
|
1916
|
+
for node in node_list:
|
1917
|
+
node.add_feature('tag', tag_to_apply, Tag)
|
1918
|
+
# Tag Object
|
1919
|
+
tag_instance = TagInstance(tag, node_list)
|
1920
|
+
self.tag_instances.append(tag_instance)
|
1921
|
+
|
1922
|
+
def update_tag_instance(self, tag_uuid):
|
1923
|
+
for tag_instance in self.tag_instances:
|
1924
|
+
if tag_instance.tag.uuid == tag_uuid:
|
1925
|
+
# Update attributes of a Tag
|
1926
|
+
for node in tag_instance.nodes:
|
1927
|
+
node.get_tag(tag_instance.tag.value, tag_uuid=tag_instance.tag.uuid)
|
1928
|
+
|
1929
|
+
def get_tag_instance(self, tag):
|
1930
|
+
"""
|
1931
|
+
Get the tag instance based on the tag itself
|
1932
|
+
:param tag: name of the tag
|
1933
|
+
:return: a list of tag instance
|
1934
|
+
"""
|
1935
|
+
return [tag_instance for tag_instance in self.tag_instances if tag_instance.tag == tag]
|
1936
|
+
|
1792
1937
|
def get_persistence(self):
|
1793
1938
|
return self._persistence_layer
|
1794
1939
|
|
@@ -2321,6 +2466,8 @@ class Document(object):
|
|
2321
2466
|
feature_dict['name'] = feature.name
|
2322
2467
|
node_feature['features'].append(feature_dict)
|
2323
2468
|
|
2469
|
+
return feature_set
|
2470
|
+
|
2324
2471
|
def get_all_tagged_nodes(self) -> List[ContentNode]:
|
2325
2472
|
"""
|
2326
2473
|
Get all the tagged nodes in the document
|
@@ -2330,6 +2477,15 @@ class Document(object):
|
|
2330
2477
|
return self._persistence_layer.get_all_tagged_nodes()
|
2331
2478
|
|
2332
2479
|
|
2480
|
+
class TagInstance:
|
2481
|
+
def __init__(self, tag: Tag, nodes):
|
2482
|
+
self.tag = tag
|
2483
|
+
self.nodes = nodes
|
2484
|
+
|
2485
|
+
def add_node(self, nodes: List[ContentNode]):
|
2486
|
+
self.nodes.extend(nodes)
|
2487
|
+
|
2488
|
+
|
2333
2489
|
class ContentObjectReference:
|
2334
2490
|
""" """
|
2335
2491
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "kodexa"
|
3
|
-
version = "6.1.
|
3
|
+
version = "6.1.15059982168"
|
4
4
|
description = "Python SDK for the Kodexa Platform"
|
5
5
|
authors = ["Austin Redenbaugh <austin@kodexa.com>", "Philip Dodds <philip@kodexa.com>", "Romar Cablao <rcablao@kodexa.com>", "Amadea Paula Dodds <amadeapaula@kodexa.com>"]
|
6
6
|
readme = "README.md"
|
@@ -37,7 +37,7 @@ install_requires = \
|
|
37
37
|
|
38
38
|
setup_kwargs = {
|
39
39
|
'name': 'kodexa',
|
40
|
-
'version': '6.1.
|
40
|
+
'version': '6.1.15059982168',
|
41
41
|
'description': 'Python SDK for the Kodexa Platform',
|
42
42
|
'long_description': '# Kodexa\n\n[](https://github.com/kodexa-ai/kodexa/actions/workflows/main.yml)\n\n\n\nKodexa is a platform for building intelligent document processing pipelines. It is a set of tools and services that\nallow you to build a pipeline that can take a document, extract the content, and then process it to extract the\ninformation you need.\n\nIt is built on a set of core principles:\n\n* **Document Centric** - Kodexa is built around the idea of a document. A document is a collection of content\n nodes that are connected together. This is a powerful model that allows you to build pipelines that can\n extract content from a wide range of sources.\n\n* **Pipeline Oriented** - Kodexa is built around the idea of a pipeline. A pipeline is a series of steps that\n can be executed on a document. This allows you to build a pipeline that can extract content from a wide range\n of sources.\n\n* **Extensible** - Kodexa is built around the idea of a pipeline. A pipeline is a series of steps that can be executed\n on a document. This allows you to build a pipeline that can extract content from a wide range of sources.\n\n* **Label Driven** - Kodexa focuses on the idea of labels. Labels are a way to identify content within a document\n and then use that content to drive the processing of the document.\n\n# Python SDK\n\nThis repository contains the Python SDK for Kodexa. The SDK is the primary way to interact with Kodexa. It allows you to\ndefine actions, models, and pipelines that can be executed on Kodexa. It also includes a complete SDK client for\nworking with a Kodexa platform instance.\n\n## Documentation & Examples\n\nDocumentation is available at the [Kodexa Documentation Portal](https://docs.kodexa.com)\n\n## Current Development\n\n[//]: # (Replace it with the diagrams and descriptions for build releases)\n**BUILD VERSION FLOW**\n\nBuild version will differ based on the branches that are published to pypi.\n\n**GITHUB PROCESS**\n\nChanges that contain bugs, features, and fixes should first be pushed to the test branch. \nOnce these changes are thoroughly tested, they can be submitted as a pull request to the main branch. The pull request should be reviewed and approved by an appropriate person before the changes can be merged.\n\n## Set-up\n\nWe use poetry to manage our dependencies, so you can install them with:\n\n poetry install\n\nYou can then run the tests with:\n\n poetry run pytest\n\n# Contributing\n\nWe welcome contributions to the Kodexa platform. Please see our [contributing guide](CONTRIBUTING.md) for more details.\n\n# License\n\nApache 2.0\n\n',
|
43
43
|
'author': 'Austin Redenbaugh',
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|