deriva 1.7.0__py3-none-any.whl → 1.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva/config/annotation_config.py +2 -2
- deriva/core/__init__.py +1 -1
- deriva/core/catalog_cli.py +83 -39
- deriva/core/datapath.py +519 -26
- deriva/core/ermrest_catalog.py +103 -23
- deriva/core/ermrest_model.py +387 -7
- deriva/core/hatrac_cli.py +5 -3
- deriva/core/utils/globus_auth_utils.py +3 -1
- deriva/transfer/__init__.py +4 -2
- deriva/transfer/download/__init__.py +4 -0
- deriva/transfer/download/deriva_download.py +33 -13
- deriva/transfer/download/deriva_download_cli.py +3 -2
- deriva/transfer/download/processors/query/base_query_processor.py +9 -4
- deriva/transfer/upload/__init__.py +4 -0
- deriva/transfer/upload/deriva_upload.py +9 -2
- deriva/transfer/upload/deriva_upload_cli.py +2 -2
- {deriva-1.7.0.dist-info → deriva-1.7.3.dist-info}/METADATA +2 -2
- {deriva-1.7.0.dist-info → deriva-1.7.3.dist-info}/RECORD +23 -23
- tests/deriva/core/test_datapath.py +24 -2
- {deriva-1.7.0.dist-info → deriva-1.7.3.dist-info}/LICENSE +0 -0
- {deriva-1.7.0.dist-info → deriva-1.7.3.dist-info}/WHEEL +0 -0
- {deriva-1.7.0.dist-info → deriva-1.7.3.dist-info}/entry_points.txt +0 -0
- {deriva-1.7.0.dist-info → deriva-1.7.3.dist-info}/top_level.txt +0 -0
deriva/core/datapath.py
CHANGED
|
@@ -5,6 +5,7 @@ import copy
|
|
|
5
5
|
from datetime import date
|
|
6
6
|
import itertools
|
|
7
7
|
import logging
|
|
8
|
+
import time
|
|
8
9
|
import re
|
|
9
10
|
from requests import HTTPError
|
|
10
11
|
import warnings
|
|
@@ -395,7 +396,8 @@ class DataPath (object):
|
|
|
395
396
|
By default links use inner join semantics on the foreign key / key equality comparison. The `join_type`
|
|
396
397
|
parameter can be used to specify `left`, `right`, or `full` outer join semantics.
|
|
397
398
|
|
|
398
|
-
:param right: the right hand table of the link expression
|
|
399
|
+
:param right: the right hand table of the link expression; if the table or alias name is in use, an incremental
|
|
400
|
+
number will be used to disambiguate tables instances of the same original name.
|
|
399
401
|
:param on: an equality comparison between key and foreign key columns, a conjunction of such comparisons, or a foreign key object
|
|
400
402
|
:param join_type: the join type of this link which may be 'left', 'right', 'full' outer joins or '' for inner
|
|
401
403
|
join link by default.
|
|
@@ -413,7 +415,7 @@ class DataPath (object):
|
|
|
413
415
|
raise ValueError("'on' must be specified for outer joins")
|
|
414
416
|
if right._schema._catalog != self._root._schema._catalog:
|
|
415
417
|
raise ValueError("'right' is from a different catalog. Cannot link across catalogs.")
|
|
416
|
-
if isinstance(right, _TableAlias) and right.
|
|
418
|
+
if isinstance(right, _TableAlias) and right._parent == self:
|
|
417
419
|
raise ValueError("'right' is a table alias that has already been used.")
|
|
418
420
|
else:
|
|
419
421
|
# Generate an unused alias name for the table
|
|
@@ -606,6 +608,18 @@ class DataPath (object):
|
|
|
606
608
|
|
|
607
609
|
return self
|
|
608
610
|
|
|
611
|
+
def denormalize(self, context_name=None, heuristic=None, groupkey_name='RID'):
    """Produce a denormalized result set for this path.

    The denormalization is driven either by a named visible-columns annotation
    context or, when no context name is given, by a heuristic. This method does
    not mutate this object; all work is delegated to `_datapath_denormalize`.

    :param context_name: name of the visible-columns context; if none given, a heuristic is applied instead
    :param heuristic: heuristic to apply if no context name specified
    :param groupkey_name: column name for the group by key of the generated query expression (default: 'RID')
    :return: a results set.
    """
    options = dict(
        context_name=context_name,
        heuristic=heuristic,
        groupkey_name=groupkey_name,
    )
    return _datapath_denormalize(self, **options)
|
|
622
|
+
|
|
609
623
|
|
|
610
624
|
class _ResultSet (object):
|
|
611
625
|
"""A set of results for various queries or data manipulations.
|
|
@@ -623,6 +637,7 @@ class _ResultSet (object):
|
|
|
623
637
|
self._fetcher_fn = fetcher_fn
|
|
624
638
|
self._results_doc = None
|
|
625
639
|
self._sort_keys = None
|
|
640
|
+
self._limit = None
|
|
626
641
|
self.uri = uri
|
|
627
642
|
|
|
628
643
|
@property
|
|
@@ -656,6 +671,19 @@ class _ResultSet (object):
|
|
|
656
671
|
self._results_doc = None
|
|
657
672
|
return self
|
|
658
673
|
|
|
674
|
+
def limit(self, n):
    """Set a limit on the number of results to be returned.

    Any previously fetched results are discarded so that the next access
    re-fetches with the new limit.

    :param n: integer (or value convertible with `int()`) or None to clear the limit.
    :return: self
    :raises ValueError: if `n` is neither None nor convertible to an integer.
    """
    # Only the conversion can fail, so keep the try body minimal. int() raises
    # TypeError (not ValueError) for inputs such as lists or dicts; both are
    # reported with the same message.
    try:
        new_limit = None if n is None else int(n)
    except (TypeError, ValueError):
        raise ValueError('limit argument "n" must be an integer or None')

    self._limit = new_limit
    self._results_doc = None  # invalidate any cached results
    return self
|
|
686
|
+
|
|
659
687
|
def fetch(self, limit=None, headers=DEFAULT_HEADERS):
|
|
660
688
|
"""Fetches the results from the catalog.
|
|
661
689
|
|
|
@@ -663,11 +691,107 @@ class _ResultSet (object):
|
|
|
663
691
|
:param headers: headers to send in request to server
|
|
664
692
|
:return: self
|
|
665
693
|
"""
|
|
666
|
-
limit = int(limit) if limit else
|
|
694
|
+
limit = int(limit) if limit else self._limit
|
|
667
695
|
self._results_doc = self._fetcher_fn(limit, self._sort_keys, headers)
|
|
668
696
|
logger.debug("Fetched %d entities" % len(self._results_doc))
|
|
669
697
|
return self
|
|
670
698
|
|
|
699
|
+
def _json_size_approx(data):
|
|
700
|
+
"""Return approximate byte count for minimal JSON encoding of data
|
|
701
|
+
|
|
702
|
+
Minimal encoding has no optional whitespace/indentation.
|
|
703
|
+
"""
|
|
704
|
+
nbytes = 0
|
|
705
|
+
|
|
706
|
+
if isinstance(data, (list, tuple)):
|
|
707
|
+
nbytes += 2
|
|
708
|
+
for elem in data:
|
|
709
|
+
nbytes += _json_size_approx(elem) + 1
|
|
710
|
+
elif isinstance(data, dict):
|
|
711
|
+
nbytes += 2
|
|
712
|
+
for k, v in data.items():
|
|
713
|
+
nbytes += _json_size_approx(k) + _json_size_approx(v) + 2
|
|
714
|
+
elif isinstance(data, str):
|
|
715
|
+
nbytes += len(data.encode("utf-8")) + 2
|
|
716
|
+
else:
|
|
717
|
+
nbytes += len(str(data))
|
|
718
|
+
|
|
719
|
+
return nbytes
|
|
720
|
+
|
|
721
|
+
def _generate_batches(entities, max_batch_rows=1000, max_batch_bytes=250*1024):
    """Generate a series of entity batches as slices of the input entities.

    This is a generator: validation and batching happen lazily when the result
    is iterated.

    :param entities: list or tuple of entities (rows) to split into batches
    :param max_batch_rows: maximum number of rows per batch; a false value
        (False, None, 0) disables batching and yields all entities as one batch
    :param max_batch_bytes: approximate maximum JSON-encoded size of one batch;
        a batch always holds at least one row even if that row exceeds the limit
    :return: a generator of slices of `entities`
    :raises TypeError: if `entities` is not a list or tuple
    """
    if not isinstance(entities, (list, tuple)):
        raise TypeError('invalid type %s for entities, list or tuple expected' % (type(entities),))

    if not max_batch_rows:
        logger.debug("disabling batching due to max_batch_rows=%r" % (max_batch_rows,))
        # This function is a generator, so `return entities` would end the
        # iteration without producing anything and the caller would send no
        # request at all. Yield the whole input as a single batch instead
        # (and nothing for empty input, matching the batched path).
        if entities:
            yield entities
        return

    top = len(entities)
    lower = 0

    while lower < top:
        # to ensure progress, always use at least one row per batch regardless of nbytes
        upper = lower + 1
        batch_nbytes = _json_size_approx(entities[lower])

        # advance upper position until a batch size limit is reached
        while (upper - lower) < max_batch_rows:
            if upper >= top:
                break
            batch_nbytes += _json_size_approx(entities[upper])
            if batch_nbytes > max_batch_bytes:
                break
            upper += 1

        # generate one batch and advance for next batch
        logger.debug("yielding batch of %d/%d entities (%d:%d)" % (upper-lower, top, lower, upper))
        yield entities[lower:upper]
        lower = upper
|
|
753
|
+
|
|
754
|
+
def _request_with_retry(request_func, retry_codes={408, 429, 500, 502, 503, 504}, backoff_factor=4, max_attempts=5):
    """Perform request func with exponential backoff and retry.

    :param request_func: A function returning a requests.Response object or raising HTTPError
    :param retry_codes: HTTPError status codes on which to attempt retry
    :param backoff_factor: Base number of seconds for factor**attempt exponential backoff
    :param max_attempts: Max number of request attempts.
    :return: whatever `request_func` returns on its first successful call
    :raises DataPathException: for a 4xx HTTPError that is not retried, or that
        was the last failure when attempts ran out
    :raises ValueError: if no attempt was made at all (e.g. max_attempts <= 0)

    Retry will be attempted on HTTPError exceptions which match retry_codes and
    also on other unknown exceptions, presumed to be transport errors. Once
    attempts are exhausted, the last exception seen is raised.

    The request_func should do the equivalent of resp.raise_for_status() so that
    it only returns a response object for successful requests.
    """
    attempt = 0
    last_ex = None

    while attempt < max_attempts:
        try:
            if attempt > 0:
                # delay grows as backoff_factor**0, **1, **2, ... between attempts
                delay = backoff_factor**(attempt-1)
                logger.debug("sleeping %d seconds before retry %d..." % (delay, attempt))
                time.sleep(delay)
            attempt += 1
            return request_func()
        except HTTPError as e:
            logger.debug(e.response.text)
            last_ex = e
            if 400 <= e.response.status_code < 500:
                # client errors are surfaced as DataPathException
                last_ex = DataPathException(_http_error_message(e), e)
            if int(e.response.status_code) not in retry_codes:
                raise last_ex
        except Exception as e:
            # Transport-level errors usually have no `response` attribute, or
            # carry `response=None`. Reading `e.response.text` unconditionally
            # would raise AttributeError here and defeat the retry.
            response = getattr(e, 'response', None)
            response_text = getattr(response, 'text', None)
            logger.debug(response_text if response_text is not None else repr(e))
            last_ex = e

    # early return means we don't get here on successful requests
    logger.warning("maximum request retry limit %d exceeded" % (max_attempts,))
    if last_ex is None:
        raise ValueError('exceeded max_attempts without catching a request exception')
    raise last_ex
|
|
671
795
|
|
|
672
796
|
class _TableWrapper (object):
|
|
673
797
|
"""Wraps a Table for datapath expressions.
|
|
@@ -797,7 +921,19 @@ class _TableWrapper (object):
|
|
|
797
921
|
"""
|
|
798
922
|
return _AttributeGroup(self, self._query, keys)
|
|
799
923
|
|
|
800
|
-
def
|
|
924
|
+
def denormalize(self, context_name=None, heuristic=None, groupkey_name='RID'):
    """Produce a denormalized result set for the path rooted at this table.

    The denormalization is driven either by a named visible-columns annotation
    context or, when no context name is given, by a heuristic. This method does
    not mutate this object; it forwards to the `denormalize` method of
    `self.path`.

    :param context_name: name of the visible-columns context; if none given, a heuristic is applied instead
    :param heuristic: heuristic to apply if no context name specified
    :param groupkey_name: column name for the group by key of the generated query expression (default: 'RID')
    :return: a results set.
    """
    table_path = self.path
    return table_path.denormalize(
        context_name=context_name,
        heuristic=heuristic,
        groupkey_name=groupkey_name,
    )
|
|
935
|
+
|
|
936
|
+
def insert(self, entities, defaults=set(), nondefaults=set(), add_system_defaults=True, on_conflict_skip=False, retry_codes={408, 429, 500, 502, 503, 504}, backoff_factor=4, max_attempts=5, max_batch_rows=1000, max_batch_bytes=250*1024):
|
|
801
937
|
"""Inserts entities into the table.
|
|
802
938
|
|
|
803
939
|
:param entities: an iterable collection of entities (i.e., rows) to be inserted into the table.
|
|
@@ -805,7 +941,23 @@ class _TableWrapper (object):
|
|
|
805
941
|
:param nondefaults: optional, set of columns names to override implicit system defaults
|
|
806
942
|
:param add_system_defaults: flag to add system columns to the set of default columns.
|
|
807
943
|
:param on_conflict_skip: flag to skip entities that violate uniqueness constraints.
|
|
944
|
+
:param retry_codes: set of HTTP status codes for which retry should be considered.
|
|
945
|
+
:param backoff_factor: number of seconds for base of exponential retry backoff.
|
|
946
|
+
:param max_attempts: maximum number of requests attempts with retry.
|
|
947
|
+
:param max_batch_rows: maximum number of rows for one request, or False to disable batching.
|
|
948
|
+
:param max_batch_bytes: approximate maximum number of bytes for one request.
|
|
808
949
|
:return a collection of newly created entities.
|
|
950
|
+
|
|
951
|
+
Retry will only be attempted for idempotent insertion
|
|
952
|
+
requests, which are when a user-controlled, non-nullable key
|
|
953
|
+
is present in the table and the key's constituent column(s)
|
|
954
|
+
are not listed as defaults, and on_conflict_skip=True.
|
|
955
|
+
|
|
956
|
+
When performing retries, an exponential backoff delay is
|
|
957
|
+
introduced after each failed attempt. The delay is
|
|
958
|
+
backoff_factor**attempt_number seconds for attempts 0 through
|
|
959
|
+
max_attempts-1.
|
|
960
|
+
|
|
809
961
|
"""
|
|
810
962
|
# empty entities will be accepted but results are therefore an empty entity set
|
|
811
963
|
if not entities:
|
|
@@ -840,17 +992,55 @@ class _TableWrapper (object):
|
|
|
840
992
|
if not hasattr(entities[0], 'keys'):
|
|
841
993
|
raise TypeError('entities[0] does not look like a dictionary -- does not have a "keys()" method')
|
|
842
994
|
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
return
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
if
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
995
|
+
# perform one batch request in a helper we can hand to retry helper
|
|
996
|
+
def request_func(batch):
|
|
997
|
+
return self._schema._catalog._wrapped_catalog.post(path, json=batch, headers={'Content-Type': 'application/json'})
|
|
998
|
+
|
|
999
|
+
def _has_user_pkey(table):
|
|
1000
|
+
"""Return True if table has at least one primary key other than the system RID key"""
|
|
1001
|
+
for key in table.keys:
|
|
1002
|
+
if { c.name for c in key.unique_columns } != {'RID'}:
|
|
1003
|
+
if all([ not c.nullok for c in key.unique_columns ]) \
|
|
1004
|
+
and all([ c.name not in defaults for c in key.unique_columns ]):
|
|
1005
|
+
return True
|
|
1006
|
+
return False
|
|
1007
|
+
|
|
1008
|
+
# determine whether insert is idempotent and therefore retry safe
|
|
1009
|
+
retry_safe = on_conflict_skip and _has_user_pkey(self._wrapped_table)
|
|
1010
|
+
|
|
1011
|
+
# perform all requests in a helper we can hand to _ResultSet
|
|
1012
|
+
def results_func(ignore1, ignore2, ignore3):
|
|
1013
|
+
results = []
|
|
1014
|
+
for batch in _generate_batches(
|
|
1015
|
+
entities,
|
|
1016
|
+
max_batch_rows=max_batch_rows,
|
|
1017
|
+
max_batch_bytes=max_batch_bytes
|
|
1018
|
+
):
|
|
1019
|
+
try:
|
|
1020
|
+
if retry_safe:
|
|
1021
|
+
resp = _request_with_retry(
|
|
1022
|
+
lambda: request_func(batch),
|
|
1023
|
+
retry_codes=retry_codes,
|
|
1024
|
+
backoff_factor=backoff_factor,
|
|
1025
|
+
max_attempts=max_attempts
|
|
1026
|
+
)
|
|
1027
|
+
else:
|
|
1028
|
+
resp = request_func(batch)
|
|
1029
|
+
results.extend(resp.json())
|
|
1030
|
+
except HTTPError as e:
|
|
1031
|
+
logger.debug(e.response.text)
|
|
1032
|
+
if 400 <= e.response.status_code < 500:
|
|
1033
|
+
raise DataPathException(_http_error_message(e), e)
|
|
1034
|
+
else:
|
|
1035
|
+
raise e
|
|
1036
|
+
return results
|
|
1037
|
+
|
|
1038
|
+
result = _ResultSet(self.path.uri, results_func)
|
|
1039
|
+
result.fetch()
|
|
1040
|
+
return result
|
|
1041
|
+
|
|
1042
|
+
|
|
1043
|
+
def update(self, entities, correlation={'RID'}, targets=None, retry_codes={408, 429, 500, 502, 503, 504}, backoff_factor=4, max_attempts=5, max_batch_rows=1000, max_batch_bytes=250*1024):
|
|
854
1044
|
"""Update entities of a table.
|
|
855
1045
|
|
|
856
1046
|
For more information see the ERMrest protocol for the `attributegroup` interface. By default, this method will
|
|
@@ -862,7 +1052,17 @@ class _TableWrapper (object):
|
|
|
862
1052
|
:param correlation: an iterable collection of column names used to correlate input set to the set of rows to be
|
|
863
1053
|
updated in the catalog. E.g., `{'col name'}` or `{mytable.mycolumn}` will work if you pass a _ColumnWrapper object.
|
|
864
1054
|
:param targets: an iterable collection of column names used as the targets of the update operation.
|
|
865
|
-
:
|
|
1055
|
+
:param retry_codes: set of HTTP status codes for which retry should be considered.
|
|
1056
|
+
:param backoff_factor: number of seconds for base of exponential retry backoff.
|
|
1057
|
+
:param max_attempts: maximum number of requests attempts with retry.
|
|
1058
|
+
:param max_batch_rows: maximum number of rows for one request, or False to disable batching.
|
|
1059
|
+
:param max_batch_bytes: approximate maximum number of bytes for one request.
|
|
1060
|
+
:return a collection of newly created entities.
|
|
1061
|
+
|
|
1062
|
+
When performing retries, an exponential backoff delay is
|
|
1063
|
+
introduced after each failed attempt. The delay is
|
|
1064
|
+
backoff_factor**attempt_number seconds for attempts 0 through
|
|
1065
|
+
max_attempts-1.
|
|
866
1066
|
"""
|
|
867
1067
|
# empty entities will be accepted but results are therefore an empty entity set
|
|
868
1068
|
if not entities:
|
|
@@ -897,16 +1097,37 @@ class _TableWrapper (object):
|
|
|
897
1097
|
targets=','.join(target_cnames)
|
|
898
1098
|
)
|
|
899
1099
|
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
return
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
1100
|
+
# perform one batch request in a helper we can hand to retry helper
|
|
1101
|
+
def request_func(batch):
|
|
1102
|
+
return self._schema._catalog._wrapped_catalog.put(path, json=batch, headers={'Content-Type': 'application/json'})
|
|
1103
|
+
|
|
1104
|
+
# perform all requests in a helper we can hand to _ResultSet
|
|
1105
|
+
def results_func(ignore1, ignore2, ignore3):
|
|
1106
|
+
results = []
|
|
1107
|
+
for batch in _generate_batches(
|
|
1108
|
+
entities,
|
|
1109
|
+
max_batch_rows=max_batch_rows,
|
|
1110
|
+
max_batch_bytes=max_batch_bytes
|
|
1111
|
+
):
|
|
1112
|
+
try:
|
|
1113
|
+
resp = _request_with_retry(
|
|
1114
|
+
lambda: request_func(batch),
|
|
1115
|
+
retry_codes=retry_codes,
|
|
1116
|
+
backoff_factor=backoff_factor,
|
|
1117
|
+
max_attempts=max_attempts
|
|
1118
|
+
)
|
|
1119
|
+
results.extend(resp.json())
|
|
1120
|
+
except HTTPError as e:
|
|
1121
|
+
logger.debug(e.response.text)
|
|
1122
|
+
if 400 <= e.response.status_code < 500:
|
|
1123
|
+
raise DataPathException(_http_error_message(e), e)
|
|
1124
|
+
else:
|
|
1125
|
+
raise e
|
|
1126
|
+
return results
|
|
1127
|
+
|
|
1128
|
+
result = _ResultSet(self.path.uri, results_func)
|
|
1129
|
+
result.fetch()
|
|
1130
|
+
return result
|
|
910
1131
|
|
|
911
1132
|
class _TableAlias (_TableWrapper):
|
|
912
1133
|
"""Represents a table alias in datapath expressions.
|
|
@@ -1760,3 +1981,275 @@ class _AttributeGroup (object):
|
|
|
1760
1981
|
bin.maxval = result.get('maxval', bin.maxval)
|
|
1761
1982
|
if (bin.minval is None) or (bin.maxval is None):
|
|
1762
1983
|
raise ValueError('Automatic determination of binning bounds failed.')
|
|
1984
|
+
|
|
1985
|
+
##
|
|
1986
|
+
## UTILITIES FOR DENORMALIZATION ##############################################
|
|
1987
|
+
##
|
|
1988
|
+
|
|
1989
|
+
def _datapath_left_outer_join_by_fkey(path, fk, alias_name=None):
    """Extend `path` with a left outer join along a foreign key reference.

    The join direction is chosen by comparing the path's current context table
    with the two ends of the foreign key: if the context is the referring
    table the referenced table is joined, and vice versa.

    :param path: a DataPath object
    :param fk: an ermrest_model.ForeignKey object
    :param alias_name: an optional 'alias' name to use for the foreign table
    :raises ValueError: if the context table is neither end of the foreign key
    """
    assert isinstance(path, DataPath)
    assert isinstance(fk, _erm.ForeignKey)
    catalog = path._root._schema._catalog

    def _wrapped_column(col):
        # map a model column onto the corresponding datapath column wrapper
        owner = col.table
        return catalog.schemas[owner.schema.name].tables[owner.name].columns[col.name]

    # determine 'direction' -- inbound or outbound
    path_context_table = path.context._base_table._wrapped_table
    context_id = (path_context_table.schema.name, path_context_table.name)
    referring_id = (fk.table.schema.name, fk.table.name)
    referenced_id = (fk.pk_table.schema.name, fk.pk_table.name)

    if context_id == referring_id:
        right = catalog.schemas[referenced_id[0]].tables[referenced_id[1]]
        column_pairs = zip(fk.foreign_key_columns, fk.referenced_columns)
    elif context_id == referenced_id:
        right = catalog.schemas[referring_id[0]].tables[referring_id[1]]
        column_pairs = zip(fk.referenced_columns, fk.foreign_key_columns)
    else:
        raise ValueError('Context table "%s" not referenced by foreign key "%s"' % (path_context_table.name, fk.constraint_name))

    # compose join condition as a conjunction of column equalities
    on = None
    for left_model_col, right_model_col in column_pairs:
        lhs = _wrapped_column(left_model_col)
        rhs = _wrapped_column(right_model_col)
        on = (on & (lhs == rhs)) if on else (lhs == rhs)

    # link
    target = right.alias(alias_name) if alias_name else right
    path.link(target, on=on, join_type='left')
|
|
2023
|
+
|
|
2024
|
+
|
|
2025
|
+
def _datapath_deserialize_vizcolumn(path, vizcol, sources=None):
    """Deserializes a visual column specification.

    If the visible column specifies a foreign key path, the datapath object
    will be changed by linking the foreign keys in the path.

    Three forms of `vizcol` are handled:
      * a string: a column name of the current context table;
      * a list: a foreign key constraint name, joined and projected as an array of entities;
      * a dict: a source definition, optionally indirected through 'sourcekey'
        entries looked up in `sources`.

    :param path: a datapath object
    :param vizcol: a visible column specification
    :param sources: optional mapping of source keys to source definitions
    :return: the element to be projected from the datapath
    :raises ValueError: if the specification is malformed, names an unknown
        foreign key, or its 'sourcekey' chain is self-referencing or cyclic
    """
    assert isinstance(path, DataPath)
    sources = sources if sources else {}
    context = path.context
    table = context._wrapped_table
    model = table.schema.model

    if isinstance(vizcol, str):
        # column name specification
        return context.columns[vizcol]
    elif isinstance(vizcol, list):
        # constraint specification
        try:
            fk = model.fkey(vizcol)
            _datapath_left_outer_join_by_fkey(path, fk, alias_name='F')
            return ArrayD(path.context).alias(path.context._name)  # project all attributes
        except KeyError as e:
            raise ValueError('Invalid foreign key constraint name: %s. If this is a key constraint name, note that keys are not supported at this time.' % str(e))
    elif isinstance(vizcol, dict):
        # resolve visible column
        # Track visited source keys so an indirect cycle (A -> B -> A) ends
        # with an error instead of looping forever.
        seen_sourcekeys = set()
        while 'sourcekey' in vizcol:
            sourcekey = vizcol['sourcekey']
            temp = sources.get(sourcekey, {})
            if temp == vizcol or sourcekey in seen_sourcekeys:
                raise ValueError('Visible column self reference for sourcekey "%s"' % sourcekey)
            seen_sourcekeys.add(sourcekey)
            vizcol = temp
        # deserialize source definition
        source = vizcol.get('source')
        if not source:
            # case: none
            raise ValueError('Could not resolve source definition for visible column')
        elif isinstance(source, str):
            # case: column name
            return context.columns[source]
        elif isinstance(source, list):
            # case: path expression
            # ...validate syntax
            if not all(isinstance(obj, dict) for obj in source[:-1]):
                raise ValueError('Source path element must be a foreign key dict')
            if not isinstance(source[-1], str):
                raise ValueError('Source path must terminate in a column name string')
            # link path elements by fkey; and track whether path is outbound only fkeys
            outbound_only = True
            for path_elem in source[:-1]:
                try:
                    fk = model.fkey(path_elem.get('inbound', path_elem.get('outbound')))
                    _datapath_left_outer_join_by_fkey(path, fk, alias_name='F')
                    outbound_only = outbound_only and 'outbound' in path_elem
                except KeyError as e:
                    raise ValueError('Invalid foreign key constraint name: %s' % str(e))
            # return terminating column or entity
            # ...get terminal name
            terminal = source[-1]
            # ...get alias name
            alias = vizcol.get('markdown_name', vizcol.get('name', path.context._name + '_' + terminal))
            # ...get aggregate function (unknown or missing names fall back to ArrayD)
            aggregate = {
                'min': Min,
                'max': Max,
                'cnt': Cnt,
                'cnt_d': CntD,  # was misspelled 'cnd_d', so distinct counts never matched
                'array': Array,
                'array_d': ArrayD
            }.get(vizcol.get('aggregate'), ArrayD)
            # ...determine projection mode
            if vizcol.get('entity', True):
                # case: whole entities
                return aggregate(path.context).alias(alias)
            else:
                # case: specified attribute value(s)
                if outbound_only:
                    # for outbound only paths, we can project a single value
                    return path.context.columns[terminal].alias(alias)
                else:
                    # otherwise, we need to aggregate the values
                    return aggregate(path.context.columns[terminal]).alias(alias)
        else:
            raise ValueError('Malformed source: %s' % str(source))
    else:
        raise ValueError('Malformed visible column: %s' % str(vizcol))
|
|
2113
|
+
|
|
2114
|
+
|
|
2115
|
+
def _datapath_contextualize(path, context_name='*', context_body=None, groupkey_name='RID'):
    """Contextualizes a data path to a named visible columns context.

    The input path is deep-copied first, so the caller's path is not modified.
    Visible columns that cannot be deserialized (ValueError) are logged as
    warnings and skipped.

    :param path: a datapath object
    :param context_name: name of the context within the path's terminating table's "visible columns" annotations
    :param context_body: a list of visible column definitions, if given, the `context_name` will be ignored
    :param groupkey_name: column name for the group by key of the generated query expression (default: 'RID')
    :return: a 'contextualized' attribute group query object
    :raises ValueError: if no visible columns could be found for the context
    """
    assert isinstance(path, DataPath)
    path = copy.deepcopy(path)
    context = path.context
    table = context._wrapped_table
    sources = table.annotations.get(_erm.tag.source_definitions, {}).get('sources')
    vizcols = context_body if context_body else table.annotations.get(_erm.tag.visible_columns, {}).get(context_name, [])
    if not vizcols:
        raise ValueError('Visible columns context "%s" not found for table %s:%s' % (context_name, table.schema.name, table.name))
    groupkey = context.columns[groupkey_name]
    projection = []

    for vizcol in vizcols:
        try:
            projection.append(_datapath_deserialize_vizcolumn(path, vizcol, sources=sources))
        except ValueError as e:
            logger.warning(str(e))
        finally:
            # Deserializing may link tables and move the path's context. Reset
            # it even on failure, otherwise a column that fails after a
            # successful link leaves later columns resolving against the
            # joined table.
            path.context = context

    def not_same_as_group_key(x):
        assert isinstance(groupkey, _ColumnWrapper)
        if not isinstance(x, _ColumnWrapper):
            return True
        return groupkey._wrapped_column != x._wrapped_column

    projection = filter(not_same_as_group_key, projection)  # project groupkey only once
    query = path.groupby(groupkey).attributes(*projection)
    return query
|
|
2151
|
+
|
|
2152
|
+
|
|
2153
|
+
def _datapath_generate_simple_denormalization(path, include_whole_entities=False):
    """Build a visible-columns specification that denormalizes the path's context table.

    The specification lists, in order:
      1. every column of the table, with single-column foreign keys replaced by
         a source path into the referenced table;
      2. the remaining outbound foreign keys;
      3. tables reached through inbound foreign keys from association tables.

    :param path: a datapath object
    :param include_whole_entities: if a denormalization cannot find a 'name' like terminal, include the whole entity (i.e., all attributes), else return just the 'RID'
    :return: a generated visible columns specification based on a denormalization heuristic
    """
    assert isinstance(path, DataPath)
    table = path.context._wrapped_table

    # name columns to look for in related tables
    name_candidates = [
        'displayname',
        'preferredname',
        'fullname',
        'name',
        'title',
        'label'
    ]

    def _squash(column_name):
        # compare names case-insensitively, ignoring spaces and underscores
        return column_name.lower().replace(' ', '').replace('_', '')

    def _fkey_to_vizcol(name, fk, inbound=None):
        # determine terminal column: first 'name'-like column, else the RID
        terminal = next(
            (col.name for col in fk.pk_table.columns if _squash(col.name) in name_candidates),
            'RID'
        )

        # define source path, optionally prefixed by the inbound hop
        hops = [{'outbound': fk.names[0]}]
        if inbound:
            hops.insert(0, {'inbound': inbound.names[0]})

        # return vizcol spec
        return {
            'markdown_name': name,
            'source': hops + [terminal],
            'entity': include_whole_entities and terminal == 'RID'
        }

    remaining_fkeys = list(table.foreign_keys)
    fkey_by_column = {
        fkey.foreign_key_columns[0].name: fkey
        for fkey in table.foreign_keys if len(fkey.foreign_key_columns) == 1
    }

    vizcols = []

    # 1. column or single column fkeys
    for col in table.column_definitions:
        fkey = fkey_by_column.pop(col.name, None)
        if fkey is None:
            vizcols.append(col.name)
            continue
        vizcols.append(_fkey_to_vizcol(col.name, fkey))
        remaining_fkeys.remove(fkey)

    # 2. all other (outbound fkey) related tables
    vizcols.extend(
        _fkey_to_vizcol(outbound_fkey.constraint_name, outbound_fkey)
        for outbound_fkey in remaining_fkeys
    )

    # 3. all associated tables
    for inbound_fkey in table.referenced_by:
        association = inbound_fkey.table
        if not association.is_association():
            continue
        # follow the association's other foreign key, i.e. not the inbound one
        first_fkey = association.foreign_keys[0]
        far_fkey = first_fkey if inbound_fkey != first_fkey else association.foreign_keys[1]
        vizcols.append(_fkey_to_vizcol(association.name, far_fkey, inbound=inbound_fkey))

    return vizcols
|
|
2228
|
+
|
|
2229
|
+
def simple_denormalization(path):
    """Heuristic denormalization without whole related entities.

    :param path: a datapath object
    :return: a generated visible columns specification
    """
    vizcols = _datapath_generate_simple_denormalization(path)
    return vizcols
|
|
2232
|
+
|
|
2233
|
+
def simple_denormalization_with_whole_entities(path):
    """Heuristic denormalization that includes related and associated entities.

    :param path: a datapath object
    :return: a generated visible columns specification
    """
    vizcols = _datapath_generate_simple_denormalization(path, include_whole_entities=True)
    return vizcols
|
|
2236
|
+
|
|
2237
|
+
def _datapath_denormalize(path, context_name=None, heuristic=None, groupkey_name='RID'):
    """Denormalize a path using a visible-columns annotation context or a heuristic.

    When `context_name` is given, the named context is used. Otherwise the
    heuristic (default: `simple_denormalization`) is called with the path to
    generate the visible-columns body.

    :param path: a DataPath object
    :param context_name: name of the visible-columns context; if none given, a heuristic is applied instead
    :param heuristic: heuristic to apply if no context name specified
    :param groupkey_name: column name for the group by key of the generated query expression (default: 'RID')
    :return: the contextualized query produced by `_datapath_contextualize`
    """
    assert isinstance(path, DataPath)
    assert context_name is None or isinstance(context_name, str)
    assert isinstance(groupkey_name, str)
    if not heuristic:
        heuristic = simple_denormalization
    assert callable(heuristic)

    # the heuristic is only consulted when no context name was supplied
    if context_name:
        generated_body = None
    else:
        generated_body = heuristic(path)

    return _datapath_contextualize(
        path,
        context_name=context_name,
        context_body=generated_body,
        groupkey_name=groupkey_name
    )
|