acryl-datahub 1.3.0.1rc3__py3-none-any.whl → 1.3.0.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.3.0.1rc3.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/METADATA +2458 -2458
- {acryl_datahub-1.3.0.1rc3.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/RECORD +11 -11
- datahub/_version.py +1 -1
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/metadata/_internal_schema_classes.py +544 -544
- datahub/metadata/_urns/urn_defs.py +1728 -1728
- datahub/metadata/schema.avsc +15157 -15157
- {acryl_datahub-1.3.0.1rc3.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc3.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc3.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc3.dist-info → acryl_datahub-1.3.0.1rc4.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
acryl_datahub-1.3.0.1rc3.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
|
|
1
|
+
acryl_datahub-1.3.0.1rc4.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
|
|
2
2
|
datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
|
|
3
3
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
4
|
-
datahub/_version.py,sha256=
|
|
4
|
+
datahub/_version.py,sha256=wuuyfFhYvCgNlVjy7dlOQ6sSJuFvaUAnTlhSVJzQ-fM,323
|
|
5
5
|
datahub/entrypoints.py,sha256=VcbU6Z47b_JKW1zI-WJMYIngm05FSogKLiuvFNtyNcI,9088
|
|
6
6
|
datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
|
|
7
7
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -476,7 +476,7 @@ datahub/ingestion/source/schema_inference/avro.py,sha256=aaqCMhLU2nxMJYPSNZv0o0A
|
|
|
476
476
|
datahub/ingestion/source/schema_inference/base.py,sha256=dI98TOieCqqA1SdB6729EAReanGX2AC7UgSDkPls8Sg,379
|
|
477
477
|
datahub/ingestion/source/schema_inference/csv_tsv.py,sha256=ypuBZEAf8Hx2Efrvu1nMWDdqVH_lg4i7N68YCwi8NiU,2259
|
|
478
478
|
datahub/ingestion/source/schema_inference/json.py,sha256=p5S-3idn65V2uad5T8txs1UakA4cfllcrxfN-6qltss,2577
|
|
479
|
-
datahub/ingestion/source/schema_inference/object.py,sha256=
|
|
479
|
+
datahub/ingestion/source/schema_inference/object.py,sha256=ERR0XdiGE_qBWbNvt1oEWPYeB7ZNAsCnTZTF3ngn4F8,6582
|
|
480
480
|
datahub/ingestion/source/schema_inference/parquet.py,sha256=CdqsNuiabLLCulWbuPMssijeFmKLv3M5MKFIhlatpWA,3456
|
|
481
481
|
datahub/ingestion/source/sigma/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
482
482
|
datahub/ingestion/source/sigma/config.py,sha256=ztZf0YisGSXKgKeqP9ipDlRKLXU-Y-XABqm7HCJ8pvA,6265
|
|
@@ -646,12 +646,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
|
|
|
646
646
|
datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
|
|
647
647
|
datahub/lite/lite_util.py,sha256=G0LQHKkyEb1pc_q183g6hflShclGx7kikgMaOxtVVcs,4545
|
|
648
648
|
datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
|
|
649
|
-
datahub/metadata/_internal_schema_classes.py,sha256=
|
|
650
|
-
datahub/metadata/schema.avsc,sha256=
|
|
649
|
+
datahub/metadata/_internal_schema_classes.py,sha256=iJEKSTxDX1uH8YgUwxt0_HSd2AryK0-vUTBeN-7wG0Q,1076970
|
|
650
|
+
datahub/metadata/schema.avsc,sha256=sn-1LEnMogcM9WhP7f8PWa_I7mgLALVeN75XtuCd4hY,812831
|
|
651
651
|
datahub/metadata/schema_classes.py,sha256=tPT8iHCak4IsZi_oL0nirbPpI8ETTPTZzapqLRpeKU4,1326
|
|
652
652
|
datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
|
|
653
653
|
datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
|
|
654
|
-
datahub/metadata/_urns/urn_defs.py,sha256=
|
|
654
|
+
datahub/metadata/_urns/urn_defs.py,sha256=q1vWaLDYps2cfLh5IYsWUrbFYx_N2txLAFIl9vKdq-M,143257
|
|
655
655
|
datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
656
656
|
datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
657
657
|
datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
|
|
@@ -1128,8 +1128,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
1128
1128
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
1129
1129
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
1130
1130
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
1131
|
-
acryl_datahub-1.3.0.
|
|
1132
|
-
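acryl_datahub-1.3.0.1rc3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
1133
|
-
acryl_datahub-1.3.0.1rc3.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
|
|
1134
|
-
acryl_datahub-1.3.0.1rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
1135
|
-
acryl_datahub-1.3.0.1rc3.dist-info/RECORD,,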
|
|
1131
|
+
acryl_datahub-1.3.0.1rc4.dist-info/METADATA,sha256=cJb6Hz3UpjAA15FINI4riQLYt7C5Q-aUsoWz2pelBp0,184504
|
|
1132
|
+
acryl_datahub-1.3.0.1rc4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
1133
|
+
acryl_datahub-1.3.0.1rc4.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
|
|
1134
|
+
acryl_datahub-1.3.0.1rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
1135
|
+
acryl_datahub-1.3.0.1rc4.dist-info/RECORD,,
|
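datahub/_version.py
CHANGED
(the diff body for this file is not included in this capture)
datahub/ingestion/source/schema_inference/object.py
CHANGED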
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from collections import Counter
|
|
1
|
+
from collections import Counter, defaultdict
|
|
2
2
|
from typing import Any, Counter as CounterType, Dict, Sequence, Tuple, Union
|
|
3
3
|
|
|
4
4
|
from typing_extensions import TypedDict
|
|
@@ -84,7 +84,7 @@ def is_nullable_collection(
|
|
|
84
84
|
|
|
85
85
|
|
|
86
86
|
def construct_schema(
|
|
87
|
-
collection: Sequence[Dict[str, Any]], delimiter: str
|
|
87
|
+
collection: Sequence[Dict[str, Any]], delimiter: str = "."
|
|
88
88
|
) -> Dict[Tuple[str, ...], SchemaDescription]:
|
|
89
89
|
"""
|
|
90
90
|
Construct (infer) a schema from a collection of documents.
|
|
@@ -104,9 +104,11 @@ def construct_schema(
|
|
|
104
104
|
string to concatenate field names by
|
|
105
105
|
"""
|
|
106
106
|
|
|
107
|
-
schema: Dict[Tuple[str, ...], BasicSchemaDescription] =
|
|
107
|
+
schema: Dict[Tuple[str, ...], BasicSchemaDescription] = defaultdict(
|
|
108
|
+
lambda: {"types": Counter(), "count": 0}
|
|
109
|
+
)
|
|
108
110
|
|
|
109
|
-
def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) ->
|
|
111
|
+
def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> int:
|
|
110
112
|
"""
|
|
111
113
|
Recursively update the schema with a document, which may/may not contain nested fields.
|
|
112
114
|
|
|
@@ -118,18 +120,24 @@ def construct_schema(
|
|
|
118
120
|
prefix of fields that the document is under, pass an empty tuple when initializing
|
|
119
121
|
"""
|
|
120
122
|
|
|
123
|
+
# we want to make sure that parents of nested structures are included first, before their children, so that
|
|
124
|
+
# they are displayed properly in the UI, also in the event of trimming the list (which happens, for example,
|
|
125
|
+
# in mongodb ingestor)
|
|
126
|
+
max_count = 0
|
|
121
127
|
for key, value in doc.items():
|
|
122
128
|
new_parent_prefix = parent_prefix + (key,)
|
|
123
129
|
|
|
124
130
|
# if nested value, look at the types within
|
|
125
131
|
if isinstance(value, dict):
|
|
126
|
-
append_to_schema(value, new_parent_prefix)
|
|
132
|
+
max_count = max(append_to_schema(value, new_parent_prefix), max_count)
|
|
127
133
|
# if array of values, check what types are within
|
|
128
134
|
if isinstance(value, list):
|
|
129
135
|
for item in value:
|
|
130
136
|
# if dictionary, add it as a nested object
|
|
131
137
|
if isinstance(item, dict):
|
|
132
|
-
|
|
138
|
+
max_count = max(
|
|
139
|
+
append_to_schema(item, new_parent_prefix), max_count
|
|
140
|
+
)
|
|
133
141
|
|
|
134
142
|
# don't record None values (counted towards nullable)
|
|
135
143
|
if value is not None:
|
|
@@ -143,6 +151,14 @@ def construct_schema(
|
|
|
143
151
|
# update the type count
|
|
144
152
|
schema[new_parent_prefix]["types"].update({type(value): 1})
|
|
145
153
|
schema[new_parent_prefix]["count"] += 1
|
|
154
|
+
max_count = max(schema[new_parent_prefix]["count"], max_count)
|
|
155
|
+
|
|
156
|
+
if parent_prefix != ():
|
|
157
|
+
schema[parent_prefix]["count"] = max(
|
|
158
|
+
schema[parent_prefix]["count"], max_count
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
return max_count
|
|
146
162
|
|
|
147
163
|
for document in collection:
|
|
148
164
|
append_to_schema(document, ())
|