acryl-datahub 1.2.0.3rc1__py3-none-any.whl → 1.2.0.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

@@ -1,7 +1,7 @@
1
- acryl_datahub-1.2.0.3rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
1
+ acryl_datahub-1.2.0.3rc2.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
2
2
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
3
3
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
4
- datahub/_version.py,sha256=BAFY2OaLzkEm6Hs8RwoQ69XvJQdk3iPPrc9oCg1xGAE,323
4
+ datahub/_version.py,sha256=68k6koV9OpoFsqms-Y85vFl7s7-exthVNAueW6OVqNk,323
5
5
  datahub/entrypoints.py,sha256=9Qf-37rNnTzbGlx8S75OCDazIclFp6zWNcCEL1zCZto,9015
6
6
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
7
7
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -132,7 +132,7 @@ datahub/emitter/mcp_builder.py,sha256=8IwJAlolQkPpMqQJPLtGrsUqAcuFNs98nrI5iYUxga
132
132
  datahub/emitter/mcp_patch_builder.py,sha256=u7cpW6DkiN7KpLapmMaXgL_FneoN69boxiANbVgMdSI,4564
133
133
  datahub/emitter/request_helper.py,sha256=2Sij9VJqgA7xZI6I7IuxsA8ioakbz0FJ3gvazxU_z3M,5738
134
134
  datahub/emitter/response_helper.py,sha256=qGm45n43CepW7j6kP9wTXuP-U-SZnn7hQdJTdVaoqhQ,7504
135
- datahub/emitter/rest_emitter.py,sha256=lMqjtDyPOArIrNgL47kq1cbB4xiR17CHfRRxpGYriDY,38793
135
+ datahub/emitter/rest_emitter.py,sha256=za2b8C0f8Mpo8E7DVh3jNENYlNMTV0nwdC3FLm8n2DQ,39532
136
136
  datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
137
137
  datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
138
138
  datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -181,7 +181,7 @@ datahub/ingestion/glossary/classifier.py,sha256=daLxnVv_JlfB_jBOxH5LrU_xQRndrsGo
181
181
  datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
182
182
  datahub/ingestion/glossary/datahub_classifier.py,sha256=O7wm6gQT1Jf2QSKdWjJQbS5oSzJwplXzfza26Gdq5Mg,7555
183
183
  datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
184
- datahub/ingestion/graph/client.py,sha256=b23UCgszAFcW_UjNBEvY5c3oqMNGifz2445vl2IkKyo,74224
184
+ datahub/ingestion/graph/client.py,sha256=xUURT6KxwOhwuAbUznxrOzmGuXxHI-3MmDgJQHFpaGk,74671
185
185
  datahub/ingestion/graph/config.py,sha256=rmkcqAL8fJoY9QyAeS0Xm8HvwHzV3pCjY-Om-50JJTI,1015
186
186
  datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
187
187
  datahub/ingestion/graph/entity_versioning.py,sha256=nrcNz0Qm6kpE6oTu_mrYUQDx14KPspBTc6R9SyFUY6c,6901
@@ -201,7 +201,7 @@ datahub/ingestion/sink/blackhole.py,sha256=-jYcWo4i8q7312bCIoHrGr7nT9JdPvA7c4jvS
201
201
  datahub/ingestion/sink/console.py,sha256=TZfhA0Ec2eNCrMH7RRy2JOdUE-U-hkoIQrPm1CmKLQs,591
202
202
  datahub/ingestion/sink/datahub_kafka.py,sha256=_cjuXu5I6G0zJ2UK7hMbaKjMPZXeIwRMgm7CVeTiNtc,2578
203
203
  datahub/ingestion/sink/datahub_lite.py,sha256=7u2aWm7ENLshKHl-PkjJg6Mrw4bWs8sTfKIBz4mm8Ak,1879
204
- datahub/ingestion/sink/datahub_rest.py,sha256=DOhtTHqKpmqgI3rUY9ri2QZAyXYDFINWMG6ne7VYUXI,13463
204
+ datahub/ingestion/sink/datahub_rest.py,sha256=QrtR-hJ6yljN1quXcjoUHdAmJueZclrFZFrhU7c4YJM,13563
205
205
  datahub/ingestion/sink/file.py,sha256=SxXJPJpkIGoaqRjCcSmj2ZE3xE4rLlBABBGwpTj5LWI,3271
206
206
  datahub/ingestion/sink/sink_registry.py,sha256=JRBWx8qEYg0ubSTyhqwgSWctgxwyp6fva9GoN2LwBao,490
207
207
  datahub/ingestion/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -301,7 +301,7 @@ datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vB
301
301
  datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
302
302
  datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
303
303
  datahub/ingestion/source/dbt/dbt_cloud.py,sha256=4gWOFSX0YU8EAJgO4J47NBE4QbNtJ-5nUe66vry-oGc,18160
304
- datahub/ingestion/source/dbt/dbt_common.py,sha256=ByCqzjkToXgfhOyxxc6VEuD8BZbYbPsD5yrLRMMPUcI,85640
304
+ datahub/ingestion/source/dbt/dbt_common.py,sha256=3NcCYsJSDfC5j7ajC_Mr3MnA_sF9DTq1ka6ft3b0u6A,85997
305
305
  datahub/ingestion/source/dbt/dbt_core.py,sha256=WVI2ZYXOMxgFzJnJqsqmEGS-5xdfiVIDsCb78lvSeQ0,24930
306
306
  datahub/ingestion/source/dbt/dbt_tests.py,sha256=pOZJaP4VsbaE5j4qVlE_E3ifno_KQpidfGTvOi5fr6I,9839
307
307
  datahub/ingestion/source/debug/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -356,7 +356,7 @@ datahub/ingestion/source/hex/constants.py,sha256=8hUTMWyG5keTNfXoLu_Dh413Hw_mGGJ
356
356
  datahub/ingestion/source/hex/hex.py,sha256=tUYNcvwKVoQuRWv4KhcDnMeOpICh4JwhD8oF988Tjg4,13199
357
357
  datahub/ingestion/source/hex/mapper.py,sha256=N3mTlEcrOmhv9ia1dnHGFgFJD2ddyTtU3H5IUbb-UxU,13344
358
358
  datahub/ingestion/source/hex/model.py,sha256=S9bUhfFcjzuio2dBS6HzSyRVPiSJvRvMQ0qyVrjV5-E,1766
359
- datahub/ingestion/source/hex/query_fetcher.py,sha256=0VqDfviyfR14gUHvIBovCXEqwW4ftFehPSB2VzaYk14,13312
359
+ datahub/ingestion/source/hex/query_fetcher.py,sha256=r9UvF_qwswkRlNY7AI8p46eqAYSxVtjVE2e7eO4XagA,13384
360
360
  datahub/ingestion/source/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
361
361
  datahub/ingestion/source/iceberg/iceberg.py,sha256=BNDGooK9cmqpOjzkV1u4rpsduVPNWg_97Uca6aLurNU,35431
362
362
  datahub/ingestion/source/iceberg/iceberg_common.py,sha256=CD_yHQ_wEgivyLQUTRO9BZJB29S7j5fUVllki-BPwUU,12292
@@ -499,7 +499,7 @@ datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=asZW8DztIB1TcGzOoZFmK6
499
499
  datahub/ingestion/source/snowflake/stored_proc_lineage.py,sha256=rOb78iHiWiK8v8WdVs1xDwVut4Y0OHmszej6IopQfCo,5341
500
500
  datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
501
501
  datahub/ingestion/source/sql/athena.py,sha256=TPKwL9oRiZlVnqIsOSBWUEwyvoW-1ssXvY4PfjxOR6g,28175
502
- datahub/ingestion/source/sql/athena_properties_extractor.py,sha256=n2SvqeUbNWxiWWdkDs8VYlUPlLwfZzZy9AIa-V4D7AY,28531
502
+ datahub/ingestion/source/sql/athena_properties_extractor.py,sha256=OS2E2HD7xTn0MBy__pIvjKXMfGp02Zf93hQRAPMXE_Y,28533
503
503
  datahub/ingestion/source/sql/clickhouse.py,sha256=zd5qE6XPw0AXtY_71-n0yz4ua69xP3oxMuIoabAuT3Q,25987
504
504
  datahub/ingestion/source/sql/cockroachdb.py,sha256=WoOKCq7YjsuzSPm1SmKIYZ9CrvlSF8zWmP1fNHn4G3Q,1360
505
505
  datahub/ingestion/source/sql/druid.py,sha256=_tzgTa5jhPUXk6WCmS7p10feCwJm6yUFcOgMZA-OcE8,2922
@@ -1101,8 +1101,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
1101
1101
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
1102
1102
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
1103
1103
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
1104
- acryl_datahub-1.2.0.3rc1.dist-info/METADATA,sha256=zo0PhZMaumsiXe8Vq8ud1VQPSZWVspaAuUIx1FoCk9s,182014
1105
- acryl_datahub-1.2.0.3rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1106
- acryl_datahub-1.2.0.3rc1.dist-info/entry_points.txt,sha256=bnGf6eX9UhiW8yVHtt6MJCVcmLErvrVQxTJAayA-PKc,9885
1107
- acryl_datahub-1.2.0.3rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1108
- acryl_datahub-1.2.0.3rc1.dist-info/RECORD,,
1104
+ acryl_datahub-1.2.0.3rc2.dist-info/METADATA,sha256=sUemCtB9B2qW0ADg8AmidAEwuqUwCjVp-qYmFaX-dPc,182014
1105
+ acryl_datahub-1.2.0.3rc2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
1106
+ acryl_datahub-1.2.0.3rc2.dist-info/entry_points.txt,sha256=bnGf6eX9UhiW8yVHtt6MJCVcmLErvrVQxTJAayA-PKc,9885
1107
+ acryl_datahub-1.2.0.3rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
1108
+ acryl_datahub-1.2.0.3rc2.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # Published at https://pypi.org/project/acryl-datahub/.
2
2
  __package_name__ = "acryl-datahub"
3
- __version__ = "1.2.0.3rc1"
3
+ __version__ = "1.2.0.3rc2"
4
4
 
5
5
 
6
6
  def is_dev_mode() -> bool:
@@ -95,7 +95,7 @@ TRACE_INITIAL_BACKOFF = 1.0 # Start with 1 second
95
95
  TRACE_MAX_BACKOFF = 300.0 # Cap at 5 minutes
96
96
  TRACE_BACKOFF_FACTOR = 2.0 # Double the wait time each attempt
97
97
 
98
- # The limit is 16mb. We will use a max of 15mb to have some space
98
+ # The limit is 16,000,000 bytes. We will use a max of 15mb to have some space
99
99
  # for overhead like request headers.
100
100
  # This applies to pretty much all calls to GMS.
101
101
  INGEST_MAX_PAYLOAD_BYTES = int(
@@ -586,6 +586,11 @@ class DataHubRestEmitter(Closeable, Emitter):
586
586
  "systemMetadata": system_metadata_obj,
587
587
  }
588
588
  payload = json.dumps(snapshot)
589
+ if len(payload) > INGEST_MAX_PAYLOAD_BYTES:
590
+ logger.warning(
591
+ f"MCE object has size {len(payload)} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
592
+ "so this metadata will likely fail to be emitted."
593
+ )
589
594
 
590
595
  self._emit_generic(url, payload)
591
596
 
@@ -764,16 +769,24 @@ class DataHubRestEmitter(Closeable, Emitter):
764
769
  url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
765
770
 
766
771
  mcp_objs = [pre_json_transform(mcp.to_obj()) for mcp in mcps]
772
+ if len(mcp_objs) == 0:
773
+ return 0
767
774
 
768
775
  # As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
769
776
  # If we will exceed the limit, we need to break it up into chunks.
770
- mcp_obj_chunks: List[List[str]] = []
771
- current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
777
+ mcp_obj_chunks: List[List[str]] = [[]]
778
+ current_chunk_size = 0
772
779
  for mcp_obj in mcp_objs:
780
+ mcp_identifier = f"{mcp_obj.get('entityUrn')}-{mcp_obj.get('aspectName')}"
773
781
  mcp_obj_size = len(json.dumps(mcp_obj))
774
782
  if _DATAHUB_EMITTER_TRACE:
775
783
  logger.debug(
776
- f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
784
+ f"Iterating through object ({mcp_identifier}) with size {mcp_obj_size}"
785
+ )
786
+ if mcp_obj_size > INGEST_MAX_PAYLOAD_BYTES:
787
+ logger.warning(
788
+ f"MCP object {mcp_identifier} has size {mcp_obj_size} that exceeds the max payload size of {INGEST_MAX_PAYLOAD_BYTES}, "
789
+ "so this metadata will likely fail to be emitted."
777
790
  )
778
791
 
779
792
  if (
@@ -786,7 +799,7 @@ class DataHubRestEmitter(Closeable, Emitter):
786
799
  current_chunk_size = 0
787
800
  mcp_obj_chunks[-1].append(mcp_obj)
788
801
  current_chunk_size += mcp_obj_size
789
- if len(mcp_obj_chunks) > 0:
802
+ if len(mcp_obj_chunks) > 1 or _DATAHUB_EMITTER_TRACE:
790
803
  logger.debug(
791
804
  f"Decided to send {len(mcps)} MCP batch in {len(mcp_obj_chunks)} chunks"
792
805
  )
@@ -76,7 +76,15 @@ from datahub.metadata.schema_classes import (
76
76
  SystemMetadataClass,
77
77
  TelemetryClientIdClass,
78
78
  )
79
- from datahub.metadata.urns import CorpUserUrn, Urn
79
+ from datahub.metadata.urns import (
80
+ CorpUserUrn,
81
+ MlFeatureTableUrn,
82
+ MlFeatureUrn,
83
+ MlModelGroupUrn,
84
+ MlModelUrn,
85
+ MlPrimaryKeyUrn,
86
+ Urn,
87
+ )
80
88
  from datahub.telemetry.telemetry import telemetry_instance
81
89
  from datahub.utilities.perf_timer import PerfTimer
82
90
  from datahub.utilities.str_enum import StrEnum
@@ -118,8 +126,16 @@ def entity_type_to_graphql(entity_type: str) -> str:
118
126
  """Convert the entity types into GraphQL "EntityType" enum values."""
119
127
 
120
128
  # Hard-coded special cases.
121
- if entity_type == CorpUserUrn.ENTITY_TYPE:
122
- return "CORP_USER"
129
+ special_cases = {
130
+ CorpUserUrn.ENTITY_TYPE: "CORP_USER",
131
+ MlModelUrn.ENTITY_TYPE: "MLMODEL",
132
+ MlModelGroupUrn.ENTITY_TYPE: "MLMODEL_GROUP",
133
+ MlFeatureTableUrn.ENTITY_TYPE: "MLFEATURE_TABLE",
134
+ MlFeatureUrn.ENTITY_TYPE: "MLFEATURE",
135
+ MlPrimaryKeyUrn.ENTITY_TYPE: "MLPRIMARY_KEY",
136
+ }
137
+ if entity_type in special_cases:
138
+ return special_cases[entity_type]
123
139
 
124
140
  # Convert camelCase to UPPER_UNDERSCORE.
125
141
  entity_type = (
@@ -92,6 +92,7 @@ class DatahubRestSinkConfig(DatahubClientConfig):
92
92
  @dataclasses.dataclass
93
93
  class DataHubRestSinkReport(SinkReport):
94
94
  mode: Optional[RestSinkMode] = None
95
+ endpoint: Optional[RestSinkEndpoint] = None
95
96
  max_threads: Optional[int] = None
96
97
  gms_version: Optional[str] = None
97
98
  pending_requests: int = 0
@@ -142,6 +143,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
142
143
 
143
144
  self.report.gms_version = gms_config.service_version
144
145
  self.report.mode = self.config.mode
146
+ self.report.endpoint = self.config.endpoint
145
147
  self.report.max_threads = self.config.max_threads
146
148
  logger.debug("Setting env variables to override config")
147
149
  logger.debug("Setting gms config")
@@ -120,6 +120,7 @@ logger = logging.getLogger(__name__)
120
120
  DBT_PLATFORM = "dbt"
121
121
 
122
122
  _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
123
+ _DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024 # 1MB
123
124
 
124
125
 
125
126
  @dataclass
@@ -1684,6 +1685,12 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1684
1685
  def get_external_url(self, node: DBTNode) -> Optional[str]:
1685
1686
  pass
1686
1687
 
1688
+ @staticmethod
1689
+ def _truncate_code(code: str, max_length: int) -> str:
1690
+ if len(code) > max_length:
1691
+ return code[:max_length] + "..."
1692
+ return code
1693
+
1687
1694
  def _create_view_properties_aspect(
1688
1695
  self, node: DBTNode
1689
1696
  ) -> Optional[ViewPropertiesClass]:
@@ -1695,6 +1702,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
1695
1702
  compiled_code = try_format_query(
1696
1703
  node.compiled_code, platform=self.config.target_platform
1697
1704
  )
1705
+ compiled_code = self._truncate_code(
1706
+ compiled_code, _DBT_MAX_COMPILED_CODE_LENGTH
1707
+ )
1698
1708
 
1699
1709
  materialized = node.materialization in {"table", "incremental", "snapshot"}
1700
1710
  view_properties = ViewPropertiesClass(
@@ -97,7 +97,7 @@ class HexQueryFetcher:
97
97
  if not query_urns or not entities_by_urn:
98
98
  self.report.warning(
99
99
  title="No Queries found with Hex as origin",
100
- message="No lineage because of no Queries found with Hex as origin in the given time range; you may consider extending the time range to fetch more queries.",
100
+ message="No lineage because of no Queries found with Hex as origin in the given time range. You may need to set use_queries_v2: true on your warehouse ingestion or you may consider extending the time range to fetch more queries.",
101
101
  context=str(
102
102
  dict(
103
103
  workspace_name=self.workspace_name,
@@ -99,10 +99,10 @@ class AthenaPropertiesExtractor:
99
99
  """A class to extract properties from Athena CREATE TABLE statements."""
100
100
 
101
101
  CREATE_TABLE_REGEXP = re.compile(
102
- "(CREATE TABLE[\s\n]*)(.*?)(\s*\()", re.MULTILINE | re.IGNORECASE
102
+ r"(CREATE TABLE[\s\n]*)(.*?)(\s*\()", re.MULTILINE | re.IGNORECASE
103
103
  )
104
104
  PARTITIONED_BY_REGEXP = re.compile(
105
- "(PARTITIONED BY[\s\n]*\()((?:[^()]|\([^)]*\))*?)(\))",
105
+ r"(PARTITIONED BY[\s\n]*\()((?:[^()]|\([^)]*\))*?)(\))",
106
106
  re.MULTILINE | re.IGNORECASE,
107
107
  )
108
108