acryl-datahub 1.3.0.1rc3__py3-none-any.whl → 1.3.0.1rc4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,7 +1,7 @@
1
- acryl_datahub-1.3.0.1rc3.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
1
+ acryl_datahub-1.3.0.1rc4.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
2
2
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
3
3
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
4
- datahub/_version.py,sha256=em6A_AK1PEbN80kEFbwFQP_4ZhxiWZZekBVZKx4EpV4,323
4
+ datahub/_version.py,sha256=wuuyfFhYvCgNlVjy7dlOQ6sSJuFvaUAnTlhSVJzQ-fM,323
5
5
  datahub/entrypoints.py,sha256=VcbU6Z47b_JKW1zI-WJMYIngm05FSogKLiuvFNtyNcI,9088
6
6
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
7
7
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -476,7 +476,7 @@ datahub/ingestion/source/schema_inference/avro.py,sha256=aaqCMhLU2nxMJYPSNZv0o0A
476
476
  datahub/ingestion/source/schema_inference/base.py,sha256=dI98TOieCqqA1SdB6729EAReanGX2AC7UgSDkPls8Sg,379
477
477
  datahub/ingestion/source/schema_inference/csv_tsv.py,sha256=ypuBZEAf8Hx2Efrvu1nMWDdqVH_lg4i7N68YCwi8NiU,2259
478
478
  datahub/ingestion/source/schema_inference/json.py,sha256=p5S-3idn65V2uad5T8txs1UakA4cfllcrxfN-6qltss,2577
479
- datahub/ingestion/source/schema_inference/object.py,sha256=dhSOtxVJHbTDY0hWeHwdLYHnOsW07Omk7Y4DPeztie0,5847
479
+ datahub/ingestion/source/schema_inference/object.py,sha256=ERR0XdiGE_qBWbNvt1oEWPYeB7ZNAsCnTZTF3ngn4F8,6582
480
480
  datahub/ingestion/source/schema_inference/parquet.py,sha256=CdqsNuiabLLCulWbuPMssijeFmKLv3M5MKFIhlatpWA,3456
481
481
  datahub/ingestion/source/sigma/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
482
482
  datahub/ingestion/source/sigma/config.py,sha256=ztZf0YisGSXKgKeqP9ipDlRKLXU-Y-XABqm7HCJ8pvA,6265
@@ -646,12 +646,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
646
646
  datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
647
647
  datahub/lite/lite_util.py,sha256=G0LQHKkyEb1pc_q183g6hflShclGx7kikgMaOxtVVcs,4545
648
648
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
649
- datahub/metadata/_internal_schema_classes.py,sha256=1UZsNj9XmThYFXbG39BVKlaTTFywzayhVVon6svD3kM,1076970
650
- datahub/metadata/schema.avsc,sha256=P6j7fiukfv03ZW8gis3m3mVKGlSV2JhgMcmrtf5sU7Q,775491
649
+ datahub/metadata/_internal_schema_classes.py,sha256=iJEKSTxDX1uH8YgUwxt0_HSd2AryK0-vUTBeN-7wG0Q,1076970
650
+ datahub/metadata/schema.avsc,sha256=sn-1LEnMogcM9WhP7f8PWa_I7mgLALVeN75XtuCd4hY,812831
651
651
  datahub/metadata/schema_classes.py,sha256=tPT8iHCak4IsZi_oL0nirbPpI8ETTPTZzapqLRpeKU4,1326
652
652
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
653
653
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
654
- datahub/metadata/_urns/urn_defs.py,sha256=_LgqKLHrmHHxpvrP-93NMJSLEnoFI8q72lkX17mK1XA,143257
654
+ datahub/metadata/_urns/urn_defs.py,sha256=q1vWaLDYps2cfLh5IYsWUrbFYx_N2txLAFIl9vKdq-M,143257
655
655
  datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
656
656
  datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
657
657
  datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
@@ -1128,8 +1128,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
1128
1128
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
1129
1129
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
1130
1130
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
1131
- acryl_datahub-1.3.0.1rc3.dist-info/METADATA,sha256=Q32VN8kEGo-T0nge3wOkp_EmXJQQZKtZYl9SnsCu3PY,184504
1132
- acryl_datahub-1.3.0.1rc3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1133
- acryl_datahub-1.3.0.1rc3.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
1134
- acryl_datahub-1.3.0.1rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1135
- acryl_datahub-1.3.0.1rc3.dist-info/RECORD,,
1131
+ acryl_datahub-1.3.0.1rc4.dist-info/METADATA,sha256=cJb6Hz3UpjAA15FINI4riQLYt7C5Q-aUsoWz2pelBp0,184504
1132
+ acryl_datahub-1.3.0.1rc4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1133
+ acryl_datahub-1.3.0.1rc4.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
1134
+ acryl_datahub-1.3.0.1rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1135
+ acryl_datahub-1.3.0.1rc4.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # Published at https://pypi.org/project/acryl-datahub/.
2
2
  __package_name__ = "acryl-datahub"
3
- __version__ = "1.3.0.1rc3"
3
+ __version__ = "1.3.0.1rc4"
4
4
 
5
5
 
6
6
  def is_dev_mode() -> bool:
@@ -1,4 +1,4 @@
1
- from collections import Counter
1
+ from collections import Counter, defaultdict
2
2
  from typing import Any, Counter as CounterType, Dict, Sequence, Tuple, Union
3
3
 
4
4
  from typing_extensions import TypedDict
@@ -84,7 +84,7 @@ def is_nullable_collection(
84
84
 
85
85
 
86
86
  def construct_schema(
87
- collection: Sequence[Dict[str, Any]], delimiter: str
87
+ collection: Sequence[Dict[str, Any]], delimiter: str = "."
88
88
  ) -> Dict[Tuple[str, ...], SchemaDescription]:
89
89
  """
90
90
  Construct (infer) a schema from a collection of documents.
@@ -104,9 +104,11 @@ def construct_schema(
104
104
  string to concatenate field names by
105
105
  """
106
106
 
107
- schema: Dict[Tuple[str, ...], BasicSchemaDescription] = {}
107
+ schema: Dict[Tuple[str, ...], BasicSchemaDescription] = defaultdict(
108
+ lambda: {"types": Counter(), "count": 0}
109
+ )
108
110
 
109
- def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> None:
111
+ def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> int:
110
112
  """
111
113
  Recursively update the schema with a document, which may/may not contain nested fields.
112
114
 
@@ -118,18 +120,24 @@ def construct_schema(
118
120
  prefix of fields that the document is under, pass an empty tuple when initializing
119
121
  """
120
122
 
123
+ # We want the parents of nested structures to be included first, before their children, so that
124
+ # they are displayed properly in the UI, including when the list is trimmed (which happens, for example,
125
+ # in the MongoDB ingestor).
126
+ max_count = 0
121
127
  for key, value in doc.items():
122
128
  new_parent_prefix = parent_prefix + (key,)
123
129
 
124
130
  # if nested value, look at the types within
125
131
  if isinstance(value, dict):
126
- append_to_schema(value, new_parent_prefix)
132
+ max_count = max(append_to_schema(value, new_parent_prefix), max_count)
127
133
  # if array of values, check what types are within
128
134
  if isinstance(value, list):
129
135
  for item in value:
130
136
  # if dictionary, add it as a nested object
131
137
  if isinstance(item, dict):
132
- append_to_schema(item, new_parent_prefix)
138
+ max_count = max(
139
+ append_to_schema(item, new_parent_prefix), max_count
140
+ )
133
141
 
134
142
  # don't record None values (counted towards nullable)
135
143
  if value is not None:
@@ -143,6 +151,14 @@ def construct_schema(
143
151
  # update the type count
144
152
  schema[new_parent_prefix]["types"].update({type(value): 1})
145
153
  schema[new_parent_prefix]["count"] += 1
154
+ max_count = max(schema[new_parent_prefix]["count"], max_count)
155
+
156
+ if parent_prefix != ():
157
+ schema[parent_prefix]["count"] = max(
158
+ schema[parent_prefix]["count"], max_count
159
+ )
160
+
161
+ return max_count
146
162
 
147
163
  for document in collection:
148
164
  append_to_schema(document, ())