acryl-datahub 0.15.0rc14-py3-none-any.whl → 0.15.0rc15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub was flagged as a potentially problematic release.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=TT6lQ2hGZTB3ZKUJKer0P4dfP-zsktBs_ZiZhHSPcV8,575
+ datahub/__init__.py,sha256=buN0Z9LU7z07XbgkhbhlEfo3o-_kT-HZCi9rZTe1ovY,575
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -485,7 +485,7 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
  datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
  datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
  datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/tableau/tableau.py,sha256=AFlDng8EfvBvZL692hMf_sfzGwpHpUU6FW_ElR4uitQ,131551
+ datahub/ingestion/source/tableau/tableau.py,sha256=khC6lPXMz-t7Oqbxw0GH-O47NTthJe38clIP1XXLzsg,135453
  datahub/ingestion/source/tableau/tableau_common.py,sha256=Dy_2pvkPucZJsG_LvQZLlxNEkjh-yOXHlZ4jurq9opM,26069
  datahub/ingestion/source/tableau/tableau_constant.py,sha256=nWElhtDo5kj5mWivZFmtVF_4Ugw0-EatBYWyDVzu5hE,2501
  datahub/ingestion/source/unity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -853,7 +853,7 @@ datahub/specific/__init__.py,sha256=r5RYM5mDnskLzin3vc87HV-9GSz3P6uQw8AlsN14LaI,
  datahub/specific/chart.py,sha256=DsLA5qHBIMNc1pIZ1AC5kLvwpRDd79Q56N4SANOofps,11324
  datahub/specific/custom_properties.py,sha256=Ob8L9b9QIbUvHfzWo4L-SNY1QSRhgRy30kLRDdenGEs,1024
  datahub/specific/dashboard.py,sha256=kRfyJsm7piugxBg0IfIbLmvv6Smk3D44IGVw8THLqPE,15100
- datahub/specific/datajob.py,sha256=Yp_LSy12ogbz9KYKTkdg6J9ScaFgg-o5--VkRfC1qRo,18793
+ datahub/specific/datajob.py,sha256=5pEBrN6llpgS7jWYEfrvqpbT2vMVVpepH71jIUJUo4U,18480
  datahub/specific/dataproduct.py,sha256=Mt-QlndY4Die87XwakYTAcvyDzaB5fmyn1NpQGGcZyI,5235
  datahub/specific/dataset.py,sha256=TAI8SRhhhsv1zEi3lGv24NX6PTJDrEyt5v0Sdg-uFY8,13568
  datahub/specific/form.py,sha256=jVI0JD-o2-XkD1suW_ITnTZUF0GNbGjaNb9-PXdfdkA,4549
@@ -974,8 +974,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0rc14.dist-info/METADATA,sha256=GAon0PKaDuM17zZePUxkbcfczQk1bvUN_FVAcWCPVgI,174408
- acryl_datahub-0.15.0rc14.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.15.0rc14.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
- acryl_datahub-0.15.0rc14.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0rc14.dist-info/RECORD,,
+ acryl_datahub-0.15.0rc15.dist-info/METADATA,sha256=nwGDKdrHWd11W8TVe55NQ8bCWex4qALoCTIRsKal1T4,174408
+ acryl_datahub-0.15.0rc15.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.15.0rc15.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
+ acryl_datahub-0.15.0rc15.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0rc15.dist-info/RECORD,,
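
For reference, the hashes in RECORD use the standard wheel digest format: the urlsafe base64 of the file's raw SHA-256 digest with trailing padding stripped. A small standalone sketch (not part of the package) that reproduces the value shown above for the zero-byte files such as py.typed:

import base64
import hashlib

def record_hash(data: bytes) -> str:
    # RECORD-style hash: urlsafe base64 of the SHA-256 digest, '=' padding stripped.
    digest = hashlib.sha256(data).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

print(record_hash(b""))  # 47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU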
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0rc14"
+ __version__ = "0.15.0rc15"
 
 
  def is_dev_mode() -> bool:
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -485,6 +485,18 @@ class TableauConfig(
          description="Configuration settings for ingesting Tableau groups and their capabilities as custom properties.",
      )
 
+     ingest_hidden_assets: bool = Field(
+         True,
+         description="When enabled, hidden views and dashboards are ingested into Datahub. "
+         "If a dashboard or view is hidden in Tableau the luid is blank. Default of this config field is True.",
+     )
+
+     tags_for_hidden_assets: List[str] = Field(
+         default=[],
+         description="Tags to be added to hidden dashboards and views. If a dashboard or view is hidden in Tableau the luid is blank. "
+         "This can only be used with ingest_tags enabled as it will overwrite tags entered from the UI.",
+     )
+
      # pre = True because we want to take some decision before pydantic initialize the configuration to default values
      @root_validator(pre=True)
      def projects_backward_compatibility(cls, values: Dict) -> Dict:
@@ -510,6 +522,20 @@ class TableauConfig(
 
          return values
 
+     @root_validator()
+     def validate_config_values(cls, values: Dict) -> Dict:
+         tags_for_hidden_assets = values.get("tags_for_hidden_assets")
+         ingest_tags = values.get("ingest_tags")
+         if (
+             not ingest_tags
+             and tags_for_hidden_assets
+             and len(tags_for_hidden_assets) > 0
+         ):
+             raise ValueError(
+                 "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
+             )
+         return values
+
 
  class WorkbookKey(ContainerKey):
      workbook_id: str
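
The two new options interact with the existing ingest_tags flag: the new validate_config_values root validator rejects tags_for_hidden_assets unless ingest_tags is enabled. A minimal standalone sketch of that rule (check_hidden_asset_tags is a hypothetical helper, not part of the package):

from typing import Dict, List

def check_hidden_asset_tags(values: Dict) -> Dict:
    # Mirrors the validator: hidden-asset tags require ingest_tags to be on.
    tags_for_hidden_assets: List[str] = values.get("tags_for_hidden_assets") or []
    if not values.get("ingest_tags") and tags_for_hidden_assets:
        raise ValueError(
            "tags_for_hidden_assets is only allowed with ingest_tags enabled."
        )
    return values

# Accepted: hidden-asset tags together with ingest_tags.
check_hidden_asset_tags(
    {"ingest_tags": True, "ingest_hidden_assets": True, "tags_for_hidden_assets": ["hidden"]}
)

# Rejected: the same tags without ingest_tags raise a ValueError.
try:
    check_hidden_asset_tags({"ingest_tags": False, "tags_for_hidden_assets": ["hidden"]})
except ValueError as err:
    print(err)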
@@ -596,7 +622,16 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
      num_datasource_field_skipped_no_name: int = 0
      num_csql_field_skipped_no_name: int = 0
      num_table_field_skipped_no_name: int = 0
+     # lineage
+     num_tables_with_upstream_lineage: int = 0
+     num_upstream_table_lineage: int = 0
+     num_upstream_fine_grained_lineage: int = 0
      num_upstream_table_skipped_no_name: int = 0
+     num_upstream_table_skipped_no_columns: int = 0
+     num_upstream_table_failed_generate_reference: int = 0
+     num_upstream_table_lineage_failed_parse_sql: int = 0
+     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
+     num_hidden_assets_skipped: int = 0
 
 
  @platform_name("Tableau")
@@ -1043,6 +1078,11 @@ class TableauSiteSource:
              ),
          )
 
+     def _is_hidden_view(self, dashboard_or_view: Dict) -> bool:
+         # LUID is blank if the view is hidden in the workbook.
+         # More info here: https://help.tableau.com/current/api/metadata_api/en-us/reference/view.doc.html
+         return not dashboard_or_view.get(c.LUID)
+
      def get_connection_object_page(
          self,
          query: str,
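
The helper relies on the Tableau Metadata API behaviour referenced in the comment: a view or dashboard that is hidden in its workbook comes back with a blank luid. A standalone illustration of the same check, using the plain "luid" key in place of the package's c.LUID constant:

def is_hidden_view(dashboard_or_view: dict) -> bool:
    # Blank (or missing) luid means the asset is hidden in its workbook.
    return not dashboard_or_view.get("luid")

print(is_hidden_view({"id": "sheet-1", "luid": ""}))        # True  -> hidden
print(is_hidden_view({"id": "sheet-2", "luid": "a1b2c3"}))  # False -> visible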
@@ -1311,7 +1351,7 @@ class TableauSiteSource:
          datasource: dict,
          browse_path: Optional[str],
          is_embedded_ds: bool = False,
-     ) -> Tuple:
+     ) -> Tuple[List[Upstream], List[FineGrainedLineage]]:
          upstream_tables: List[Upstream] = []
          fine_grained_lineages: List[FineGrainedLineage] = []
          table_id_to_urn = {}
@@ -1472,6 +1512,7 @@ class TableauSiteSource:
                  c.COLUMNS_CONNECTION
              ].get("totalCount")
              if not is_custom_sql and not num_tbl_cols:
+                 self.report.num_upstream_table_skipped_no_columns += 1
                  logger.warning(
                      f"Skipping upstream table with id {table[c.ID]}, no columns: {table}"
                  )
@@ -1488,6 +1529,7 @@ class TableauSiteSource:
                      table, default_schema_map=self.config.default_schema_map
                  )
              except Exception as e:
+                 self.report.num_upstream_table_failed_generate_reference += 1
                  self.report.warning(
                      title="Potentially Missing Lineage Issue",
                      message="Failed to generate upstream reference",
@@ -1659,15 +1701,7 @@ class TableauSiteSource:
              func_overridden_info=None,  # Here we don't want to override any information from configuration
          )
 
-         if parsed_result is None:
-             logger.info(
-                 f"Failed to extract column level lineage from datasource {datasource_urn}"
-             )
-             return []
-         if parsed_result.debug_info.error:
-             logger.info(
-                 f"Failed to extract column level lineage from datasource {datasource_urn}: {parsed_result.debug_info.error}"
-             )
+         if parsed_result is None or parsed_result.debug_info.error:
              return []
 
          cll: List[ColumnLineageInfo] = (
@@ -2031,6 +2065,8 @@ class TableauSiteSource:
              aspect_name=c.UPSTREAM_LINEAGE,
              aspect=upstream_lineage,
          )
+         self.report.num_tables_with_upstream_lineage += 1
+         self.report.num_upstream_table_lineage += len(upstream_tables)
 
      @staticmethod
      def _clean_tableau_query_parameters(query: str) -> str:
@@ -2130,7 +2166,7 @@ class TableauSiteSource:
              f"Overridden info upstream_db={upstream_db}, platform_instance={platform_instance}, platform={platform}"
          )
 
-         return create_lineage_sql_parsed_result(
+         parsed_result = create_lineage_sql_parsed_result(
              query=query,
              default_db=upstream_db,
              platform=platform,
@@ -2140,6 +2176,21 @@ class TableauSiteSource:
              schema_aware=not self.config.sql_parsing_disable_schema_awareness,
          )
 
+         assert parsed_result is not None
+
+         if parsed_result.debug_info.table_error:
+             logger.warning(
+                 f"Failed to extract table lineage from datasource {datasource_urn}: {parsed_result.debug_info.table_error}"
+             )
+             self.report.num_upstream_table_lineage_failed_parse_sql += 1
+         elif parsed_result.debug_info.column_error:
+             logger.warning(
+                 f"Failed to extract column level lineage from datasource {datasource_urn}: {parsed_result.debug_info.column_error}"
+             )
+             self.report.num_upstream_fine_grained_lineage_failed_parse_sql += 1
+
+         return parsed_result
+
      def _enrich_database_tables_with_parsed_schemas(
          self, parsing_result: SqlParsingResult
      ) -> None:
@@ -2174,9 +2225,6 @@ class TableauSiteSource:
          )
 
          if parsed_result is None:
-             logger.info(
-                 f"Failed to extract table level lineage for datasource {csql_urn}"
-             )
              return
 
          self._enrich_database_tables_with_parsed_schemas(parsed_result)
@@ -2196,12 +2244,14 @@ class TableauSiteSource:
              upstreams=upstream_tables,
              fineGrainedLineages=fine_grained_lineages,
          )
-
          yield self.get_metadata_change_proposal(
              csql_urn,
              aspect_name=c.UPSTREAM_LINEAGE,
              aspect=upstream_lineage,
          )
+         self.report.num_tables_with_upstream_lineage += 1
+         self.report.num_upstream_table_lineage += len(upstream_tables)
+         self.report.num_upstream_fine_grained_lineage += len(fine_grained_lineages)
 
      def _get_schema_metadata_for_datasource(
          self, datasource_fields: List[dict]
@@ -2278,12 +2328,11 @@ class TableauSiteSource:
          )
 
          # Tags
-         if datasource_info:
+         if datasource_info and self.config.ingest_tags:
              tags = self.get_tags(datasource_info)
-             if tags:
-                 dataset_snapshot.aspects.append(
-                     builder.make_global_tag_aspect_with_tag_list(tags)
-                 )
+             dataset_snapshot.aspects.append(
+                 builder.make_global_tag_aspect_with_tag_list(tags)
+             )
 
          # Browse path
          if browse_path and is_embedded_ds and workbook and workbook.get(c.NAME):
@@ -2352,6 +2401,11 @@ class TableauSiteSource:
                  aspect_name=c.UPSTREAM_LINEAGE,
                  aspect=upstream_lineage,
              )
+             self.report.num_tables_with_upstream_lineage += 1
+             self.report.num_upstream_table_lineage += len(upstream_tables)
+             self.report.num_upstream_fine_grained_lineage += len(
+                 fine_grained_lineages
+             )
 
          # Datasource Fields
          schema_metadata = self._get_schema_metadata_for_datasource(
@@ -2669,7 +2723,13 @@ class TableauSiteSource:
              c.SHEETS_CONNECTION,
              sheets_filter,
          ):
-             yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
+             if self.config.ingest_hidden_assets or not self._is_hidden_view(sheet):
+                 yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
+             else:
+                 self.report.num_hidden_assets_skipped += 1
+                 logger.debug(
+                     f"Skip view {sheet.get(c.ID)} because it's hidden (luid is blank)."
+                 )
 
      def emit_sheets_as_charts(
          self, sheet: dict, workbook: Optional[Dict]
@@ -2760,11 +2820,17 @@ class TableauSiteSource:
              chart_snapshot.aspects.append(owner)
 
          # Tags
-         tags = self.get_tags(sheet)
-         if tags:
+         if self.config.ingest_tags:
+             tags = self.get_tags(sheet)
+             if len(self.config.tags_for_hidden_assets) > 0 and self._is_hidden_view(
+                 sheet
+             ):
+                 tags.extend(self.config.tags_for_hidden_assets)
+
              chart_snapshot.aspects.append(
                  builder.make_global_tag_aspect_with_tag_list(tags)
              )
+
          yield self.get_metadata_change_event(chart_snapshot)
          if sheet_external_url is not None and self.config.ingest_embed_url is True:
              yield self.new_work_unit(
@@ -2846,7 +2912,7 @@ class TableauSiteSource:
              else None
          )
 
-         tags = self.get_tags(workbook)
+         tags = self.get_tags(workbook) if self.config.ingest_tags else None
 
          parent_key = None
          project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
@@ -2977,17 +3043,23 @@ class TableauSiteSource:
              c.DASHBOARDS_CONNECTION,
              dashboards_filter,
          ):
-             yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
+             if self.config.ingest_hidden_assets or not self._is_hidden_view(dashboard):
+                 yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
+             else:
+                 self.report.num_hidden_assets_skipped += 1
+                 logger.debug(
+                     f"Skip dashboard {dashboard.get(c.ID)} because it's hidden (luid is blank)."
+                 )
 
-     def get_tags(self, obj: dict) -> Optional[List[str]]:
+     def get_tags(self, obj: dict) -> List[str]:
          tag_list = obj.get(c.TAGS, [])
-         if tag_list and self.config.ingest_tags:
+         if tag_list:
              tag_list_str = [
                  t[c.NAME] for t in tag_list if t is not None and t.get(c.NAME)
              ]
 
              return tag_list_str
-         return None
+         return []
 
      def emit_dashboard(
          self, dashboard: dict, workbook: Optional[Dict]
@@ -3038,8 +3110,13 @@ class TableauSiteSource:
          )
          dashboard_snapshot.aspects.append(dashboard_info_class)
 
-         tags = self.get_tags(dashboard)
-         if tags:
+         if self.config.ingest_tags:
+             tags = self.get_tags(dashboard)
+             if len(self.config.tags_for_hidden_assets) > 0 and self._is_hidden_view(
+                 dashboard
+             ):
+                 tags.extend(self.config.tags_for_hidden_assets)
+
              dashboard_snapshot.aspects.append(
                  builder.make_global_tag_aspect_with_tag_list(tags)
              )
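
Taken together, the sheet and dashboard changes give get_tags a list-based contract (an empty list instead of None), move the ingest_tags check to the call sites, and append tags_for_hidden_assets when the asset is hidden. A hypothetical standalone sketch of that flow, using plain dictionary keys in place of the package's c.TAGS / c.NAME / c.LUID constants:

from typing import Dict, List

def get_tags(obj: Dict) -> List[str]:
    # Always returns a list; an asset without tags yields [].
    tag_list = obj.get("tags", [])
    return [t["name"] for t in tag_list if t is not None and t.get("name")]

def resolve_tags(obj: Dict, ingest_tags: bool, tags_for_hidden_assets: List[str]) -> List[str]:
    if not ingest_tags:
        return []
    tags = get_tags(obj)
    if tags_for_hidden_assets and not obj.get("luid"):  # blank luid -> hidden asset
        tags.extend(tags_for_hidden_assets)
    return tags

hidden_sheet = {"tags": [{"name": "finance"}], "luid": ""}
print(resolve_tags(hidden_sheet, ingest_tags=True, tags_for_hidden_assets=["hidden"]))
# ['finance', 'hidden']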
datahub/specific/datajob.py CHANGED
@@ -102,7 +102,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
          Notes:
              If `input` is an Edge object, it is used directly. If `input` is a Urn object or string,
-             it is converted to an Edge object and added with default audit stamps.
+             it is converted to an Edge object and added without any audit stamps.
          """
          if isinstance(input, Edge):
              input_urn: str = input.destinationUrn
@@ -114,8 +114,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
          input_edge = Edge(
              destinationUrn=input_urn,
-             created=self._mint_auditstamp(),
-             lastModified=self._mint_auditstamp(),
          )
 
          self._ensure_urn_type("dataJob", [input_edge], "add_input_datajob")
@@ -185,7 +183,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
          Notes:
              If `input` is an Edge object, it is used directly. If `input` is a Urn object or string,
-             it is converted to an Edge object and added with default audit stamps.
+             it is converted to an Edge object and added without any audit stamps.
          """
          if isinstance(input, Edge):
              input_urn: str = input.destinationUrn
@@ -197,8 +195,6 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
          input_edge = Edge(
              destinationUrn=input_urn,
-             created=self._mint_auditstamp(),
-             lastModified=self._mint_auditstamp(),
          )
 
          self._ensure_urn_type("dataset", [input_edge], "add_input_dataset")
@@ -270,7 +266,7 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
          Notes:
              If `output` is an Edge object, it is used directly. If `output` is a Urn object or string,
-             it is converted to an Edge object and added with default audit stamps.
+             it is converted to an Edge object and added without any audit stamps.
          """
          if isinstance(output, Edge):
              output_urn: str = output.destinationUrn
@@ -282,15 +278,13 @@ class DataJobPatchBuilder(MetadataPatchProposal):
 
          output_edge = Edge(
              destinationUrn=output_urn,
-             created=self._mint_auditstamp(),
-             lastModified=self._mint_auditstamp(),
          )
 
          self._ensure_urn_type("dataset", [output_edge], "add_output_dataset")
          self._add_patch(
              DataJobInputOutput.ASPECT_NAME,
              "add",
-             path=f"/outputDatasetEdges/{self.quote(str(output))}",
+             path=f"/outputDatasetEdges/{self.quote(output_urn)}",
              value=output_edge,
          )
          return self
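
A hedged usage sketch (not taken from the diff) of the patched add_output_dataset behaviour: the JSON patch path is now keyed by the destination urn via self.quote(output_urn) rather than str(output), and the generated Edge no longer carries synthetic created/lastModified audit stamps. The orchestrator, flow, job, and dataset names below are placeholders, and the snippet assumes DataJobPatchBuilder and the urn helpers keep the signatures they have had in recent acryl-datahub releases:

from datahub.emitter.mce_builder import make_data_job_urn, make_dataset_urn
from datahub.specific.datajob import DataJobPatchBuilder

datajob_urn = make_data_job_urn(
    orchestrator="airflow", flow_id="nightly_etl", job_id="load_orders"
)
output_urn = make_dataset_urn(platform="snowflake", name="analytics.orders", env="PROD")

patch_builder = DataJobPatchBuilder(datajob_urn)
patch_builder.add_output_dataset(output_urn)  # patch path: /outputDatasetEdges/<quoted output_urn>

# build() returns the MetadataChangeProposals that carry the patch.
for mcp in patch_builder.build():
    print(mcp.entityUrn, mcp.aspectName)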