acryl-datahub-cloud 0.3.8.2rc3__py3-none-any.whl → 0.3.9rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub-cloud might be problematic.

Files changed (78)
  1. acryl_datahub_cloud/_codegen_config.json +1 -1
  2. acryl_datahub_cloud/acryl_cs_issues/acryl_customer.py +1 -1
  3. acryl_datahub_cloud/action_request/__init__.py +0 -0
  4. acryl_datahub_cloud/action_request/action_request_owner_source.py +174 -0
  5. acryl_datahub_cloud/api/__init__.py +1 -1
  6. acryl_datahub_cloud/api/client.py +2 -2
  7. acryl_datahub_cloud/datahub_reporting/datahub_dataset.py +6 -6
  8. acryl_datahub_cloud/datahub_reporting/datahub_form_reporting.py +67 -33
  9. acryl_datahub_cloud/datahub_reporting/extract_sql.py +4 -4
  10. acryl_datahub_cloud/datahub_usage_reporting/query_builder.py +1 -0
  11. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_patch_builder.py +21 -21
  12. acryl_datahub_cloud/datahub_usage_reporting/usage_feature_reporter.py +151 -141
  13. acryl_datahub_cloud/metadata/_urns/urn_defs.py +1064 -418
  14. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/actionrequest/__init__.py +6 -0
  15. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/dataplatforminstance/__init__.py +2 -0
  16. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/dataset/__init__.py +2 -0
  17. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/executorglobalconfig/__init__.py +15 -0
  18. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/executorpool/__init__.py +4 -0
  19. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  20. acryl_datahub_cloud/metadata/com/linkedin/pegasus2avro/metric/__init__.py +29 -0
  21. acryl_datahub_cloud/metadata/schema.avsc +778 -42
  22. acryl_datahub_cloud/metadata/schema_classes.py +1089 -61
  23. acryl_datahub_cloud/metadata/schemas/ActionRequestInfo.avsc +422 -12
  24. acryl_datahub_cloud/metadata/schemas/ActionRequestStatus.avsc +12 -0
  25. acryl_datahub_cloud/metadata/schemas/AssertionAnalyticsRunEvent.avsc +5 -3
  26. acryl_datahub_cloud/metadata/schemas/AssertionInfo.avsc +5 -3
  27. acryl_datahub_cloud/metadata/schemas/AssertionRunEvent.avsc +5 -3
  28. acryl_datahub_cloud/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  29. acryl_datahub_cloud/metadata/schemas/BusinessAttributes.avsc +6 -0
  30. acryl_datahub_cloud/metadata/schemas/ChartInfo.avsc +1 -0
  31. acryl_datahub_cloud/metadata/schemas/ChartKey.avsc +3 -3
  32. acryl_datahub_cloud/metadata/schemas/ContainerKey.avsc +1 -1
  33. acryl_datahub_cloud/metadata/schemas/CorpGroupKey.avsc +1 -1
  34. acryl_datahub_cloud/metadata/schemas/DashboardKey.avsc +3 -3
  35. acryl_datahub_cloud/metadata/schemas/DataFlowKey.avsc +1 -1
  36. acryl_datahub_cloud/metadata/schemas/DataHubActionInfo.avsc +1 -1
  37. acryl_datahub_cloud/metadata/schemas/DataHubConnectionKey.avsc +2 -1
  38. acryl_datahub_cloud/metadata/schemas/DataHubIngestionSourceInfo.avsc +9 -4
  39. acryl_datahub_cloud/metadata/schemas/DataHubMetricCubeDefinition.avsc +185 -0
  40. acryl_datahub_cloud/metadata/schemas/DataHubMetricCubeEvent.avsc +184 -0
  41. acryl_datahub_cloud/metadata/schemas/DataHubMetricCubeKey.avsc +22 -0
  42. acryl_datahub_cloud/metadata/schemas/DataJobKey.avsc +4 -4
  43. acryl_datahub_cloud/metadata/schemas/DataPlatformInstanceKey.avsc +2 -1
  44. acryl_datahub_cloud/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  45. acryl_datahub_cloud/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  46. acryl_datahub_cloud/metadata/schemas/DatasetKey.avsc +14 -13
  47. acryl_datahub_cloud/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  48. acryl_datahub_cloud/metadata/schemas/ExecutionRequestInput.avsc +6 -1
  49. acryl_datahub_cloud/metadata/schemas/ExecutionRequestSignal.avsc +1 -1
  50. acryl_datahub_cloud/metadata/schemas/GlossaryTerms.avsc +3 -1
  51. acryl_datahub_cloud/metadata/schemas/IcebergCatalogInfo.avsc +28 -0
  52. acryl_datahub_cloud/metadata/schemas/IcebergWarehouseInfo.avsc +96 -0
  53. acryl_datahub_cloud/metadata/schemas/IncidentActivityEvent.avsc +4 -1
  54. acryl_datahub_cloud/metadata/schemas/IncidentInfo.avsc +4 -1
  55. acryl_datahub_cloud/metadata/schemas/InputFields.avsc +3 -1
  56. acryl_datahub_cloud/metadata/schemas/MLFeatureKey.avsc +1 -1
  57. acryl_datahub_cloud/metadata/schemas/MLFeatureTableKey.avsc +1 -1
  58. acryl_datahub_cloud/metadata/schemas/MLModelGroupKey.avsc +1 -1
  59. acryl_datahub_cloud/metadata/schemas/MLModelKey.avsc +3 -3
  60. acryl_datahub_cloud/metadata/schemas/MLPrimaryKeyKey.avsc +1 -1
  61. acryl_datahub_cloud/metadata/schemas/MetadataChangeEvent.avsc +399 -176
  62. acryl_datahub_cloud/metadata/schemas/MonitorInfo.avsc +6 -4
  63. acryl_datahub_cloud/metadata/schemas/NotebookKey.avsc +1 -1
  64. acryl_datahub_cloud/metadata/schemas/Operation.avsc +4 -2
  65. acryl_datahub_cloud/metadata/schemas/RemoteExecutorGlobalConfigKey.avsc +21 -0
  66. acryl_datahub_cloud/metadata/schemas/RemoteExecutorPoolGlobalConfig.avsc +16 -0
  67. acryl_datahub_cloud/metadata/schemas/RemoteExecutorPoolInfo.avsc +85 -0
  68. acryl_datahub_cloud/metadata/schemas/RemoteExecutorPoolKey.avsc +1 -1
  69. acryl_datahub_cloud/metadata/schemas/RemoteExecutorStatus.avsc +5 -5
  70. acryl_datahub_cloud/metadata/schemas/SchemaFieldKey.avsc +2 -2
  71. acryl_datahub_cloud/metadata/schemas/SchemaMetadata.avsc +3 -1
  72. acryl_datahub_cloud/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  73. {acryl_datahub_cloud-0.3.8.2rc3.dist-info → acryl_datahub_cloud-0.3.9rc1.dist-info}/METADATA +42 -42
  74. {acryl_datahub_cloud-0.3.8.2rc3.dist-info → acryl_datahub_cloud-0.3.9rc1.dist-info}/RECORD +77 -67
  75. {acryl_datahub_cloud-0.3.8.2rc3.dist-info → acryl_datahub_cloud-0.3.9rc1.dist-info}/WHEEL +1 -1
  76. {acryl_datahub_cloud-0.3.8.2rc3.dist-info → acryl_datahub_cloud-0.3.9rc1.dist-info}/entry_points.txt +1 -0
  77. acryl_datahub_cloud/api/entity_versioning.py +0 -167
  78. {acryl_datahub_cloud-0.3.8.2rc3.dist-info → acryl_datahub_cloud-0.3.9rc1.dist-info}/top_level.txt +0 -0
@@ -172,6 +172,15 @@ class DataHubUsageFeatureReportingSourceConfig(
  description="Flag to enable polars streaming mode.'",
  )

+ # Running the whole pipeline in streaming mode was very unstable in the past.
+ # It seems like with the latest version of Polars it is much more stable.
+ # This option is only needed here until we are sure that the streaming mode is stable.
+ # then we can remove it and control it with the streaming_mode option.
+ experimental_full_streaming: bool = Field(
+ False,
+ description="Flag to enable full streaming mode.'",
+ )
+
  disable_write_usage: bool = Field(
  True,
  description="Flag to disable write usage statistics collection.'",
@@ -300,6 +309,12 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  if "siblings" in doc["_source"] and doc["_source"]["siblings"]
  else []
  ),
+ "combinedSearchRankingMultiplier": (
+ doc["_source"]["combinedSearchRankingMultiplier"]
+ if "combinedSearchRankingMultiplier" in doc["_source"]
+ and doc["_source"]["combinedSearchRankingMultiplier"]
+ else None
+ ),
  "isView": (
  "View" in doc["_source"]["typeNames"]
  if "typeNames" in doc["_source"] and doc["_source"]["typeNames"]
@@ -355,7 +370,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  for doc in results:
  if "platform" not in doc["_source"] or not doc["_source"]["platform"]:
  logger.warning(
- f"Platform not found in query { doc['_source']['urn']}. Skipping..."
+ f"Platform not found in query {doc['_source']['urn']}. Skipping..."
  )
  continue

@@ -544,9 +559,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  usageSearchScoreMultiplier=usage_search_score_multiplier,
  usageFreshnessScoreMultiplier=freshness_factor,
  customDatahubScoreMultiplier=regexp_factor,
- combinedSearchRankingMultiplier=usage_search_score_multiplier
- * freshness_factor
- * regexp_factor,
+ # We make sure the combinedSearchRankingMultiplier is never less than 1
+ combinedSearchRankingMultiplier=max(
+ 1, (usage_search_score_multiplier * freshness_factor * regexp_factor)
+ ),
  )

  def load_data_from_es(
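Note: the clamp introduced above is easy to verify in isolation. A minimal sketch of the new behavior (the function and argument names are illustrative, not from the source):

    def combined_multiplier(usage: float, freshness: float, custom: float) -> float:
        # Never let the combined multiplier fall below the neutral score of 1,
        # so sparse usage data cannot downrank an entity below its baseline.
        return max(1, usage * freshness * custom)

    assert combined_multiplier(0.5, 0.5, 1.0) == 1    # clamped up to 1
    assert combined_multiplier(2.0, 1.5, 1.0) == 3.0  # unaffected above 1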
@@ -672,10 +688,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

  @staticmethod
  def polars_to_arrow_schema(
- polars_schema: Dict[str, Union[DataTypeClass, polars.DataType]]
+ polars_schema: Dict[str, Union[DataTypeClass, polars.DataType]],
  ) -> pa.Schema:
  def convert_dtype(
- polars_dtype: Union[DataTypeClass, polars.DataType]
+ polars_dtype: Union[DataTypeClass, polars.DataType],
  ) -> pa.DataType:
  type_mapping: Dict[Union[DataTypeClass, polars.DataType], pa.DataType] = {
  polars.Boolean(): pa.bool_(),
@@ -933,14 +949,14 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
  if self.config.dataset_usage_enabled:
  with self.report.dataset_usage_processing_time as timer:
- self.report.report_ingestion_stage_start("generate dataset usage")
+ self.report.new_stage("generate dataset usage")
  yield from self.generate_dataset_usage_mcps()
  time_taken = timer.elapsed_seconds()
  logger.info(f"Dataset Usage generation took {time_taken:.3f} seconds")

  if self.config.dashboard_usage_enabled:
  with self.report.dashboard_usage_processing_time as timer:
- self.report.report_ingestion_stage_start("generate dashboard usage")
+ self.report.new_stage("generate dashboard usage")
  yield from self.generate_dashboard_usage_mcps()

  time_taken = timer.elapsed_seconds()
@@ -948,7 +964,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

  if self.config.chart_usage_enabled:
  with self.report.chart_usage_processing_time as timer:
- self.report.report_ingestion_stage_start("generate chart usage")
+ self.report.new_stage("generate chart usage")

  yield from self.generate_chart_usage_mcps()

@@ -957,7 +973,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

  if self.config.query_usage_enabled:
  with self.report.query_usage_processing_time as timer:
- self.report.report_ingestion_stage_start("generate query usage")
+ self.report.new_stage("generate query usage")

  yield from self.generate_query_usage_mcps()

@@ -968,7 +984,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  self, lazy_frame: polars.LazyFrame
  ) -> Iterable[MetadataWorkUnit]:
  num = 0
- for row in lazy_frame.collect().to_struct():
+ for row in lazy_frame.collect(
+ streaming=self.config.experimental_full_streaming
+ ).to_struct():
  num += 1

  if "siblings" in row and row["siblings"]:
@@ -979,113 +997,68 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  )

  if "queries_rank_percentile" in row:
- search_ranking_multipliers = self.search_score(
- urn=row["urn"],
- last_update_time=(
- row["last_modified_at"]
- if "last_modified_at" in row and row["last_modified_at"]
- else 0
- ),
- usage_percentile=(
- row["queries_rank_percentile"]
- if row["queries_rank_percentile"]
- else 0
- ),
+ # If usage data is missing we set the search ranking multipliers to 1
+ search_ranking_multipliers = (
+ self.search_score(
+ urn=row["urn"],
+ last_update_time=row.get("last_modified_at", 0) or 0,
+ usage_percentile=row.get("queries_rank_percentile", 0) or 0,
+ )
+ if row.get("queries_rank_percentile", 0)
+ else SearchRankingMultipliers()
  )
  elif "viewsCount30Days_rank_percentile" in row:
- search_ranking_multipliers = self.search_score(
- urn=row["urn"],
- last_update_time=(
- row["last_modified_at"]
- if "last_modified_at" in row and row["last_modified_at"]
- else 0
- ),
- usage_percentile=(
- row["viewsCount30Days_rank_percentile"]
- if row["viewsCount30Days_rank_percentile"]
- else 0
- ),
+ # If usage data is missing we set the search ranking multipliers to 1
+ search_ranking_multipliers = (
+ self.search_score(
+ urn=row["urn"],
+ last_update_time=row.get("last_modified_at", 0) or 0,
+ usage_percentile=row.get("viewsCount30Days_rank_percentile", 0)
+ or 0,
+ )
+ if row.get("viewsCount30Days_rank_percentile", 0)
+ else SearchRankingMultipliers()
  )
  logger.debug(f"Urn: {row['urn']} Score: {search_ranking_multipliers}")

  usage_feature = UsageFeaturesClass(
- queryCountLast30Days=(
- int(row["totalSqlQueries"])
- if "totalSqlQueries" in row and row["totalSqlQueries"]
- else 0
- ),
- usageCountLast30Days=(
- int(row["totalSqlQueries"])
- if "totalSqlQueries" in row and row["totalSqlQueries"]
- else 0
- ),
- queryCountRankLast30Days=(
- int(row["queries_rank"])
- if "queries_rank" in row and row["queries_rank"] is not None
- else None
- ),
- queryCountPercentileLast30Days=(
- int(row["queries_rank_percentile"])
- if "queries_rank_percentile" in row
- and row["queries_rank_percentile"]
- else 0
- ),
+ queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
+ usageCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
+ queryCountRankLast30Days=int(row.get("queries_rank"))
+ if row.get("queries_rank")
+ else None,
+ queryCountPercentileLast30Days=row.get("queries_rank_percentile", 0)
+ or 0,
  # queryCountPercentileLast30Days=int(
  # row["queries_rank_percentile"]) if "queries_rank_percentile" in row and row[
  # "queries_rank_percentile"] else 0,
  topUsersLast30Days=(
- list(chain.from_iterable(row["top_users"]))
- if row["top_users"]
- else None
- ),
- uniqueUserCountLast30Days=(
- int(row["distinct_user"]) if row["distinct_user"] else 0
- ),
- uniqueUserRankLast30Days=(
- int(row["distinct_user_rank"])
- if "distinct_user_rank" in row
- and row["distinct_user_rank"] is not None
+ list(chain.from_iterable(row.get("top_users")))
+ if row.get("top_users")
  else None
  ),
- uniqueUserPercentileLast30Days=(
- int(row["distinct_user_rank_percentile"])
- if "distinct_user_rank_percentile" in row
- and row["distinct_user_rank_percentile"]
- else 0
- ),
- writeCountLast30Days=(
- int(row["write_count"])
- if "write_count" in row and row["write_count"]
- else 0
- if not self.config.disable_write_usage
- else None
- ),
- writeCountPercentileLast30Days=(
- int(row["write_rank_percentile"])
- if "write_count" in row and row["write_rank_percentile"]
- else 0
- if not self.config.disable_write_usage
- else None
- ),
- writeCountRankLast30Days=(
- int(row["write_rank"])
- if "write_rank" in row and row["write_rank"]
- else None
+ uniqueUserCountLast30Days=int(row.get("distinct_user", 0) or 0),
+ uniqueUserRankLast30Days=int(row.get("distinct_user_rank"))
+ if row.get("distinct_user_rank")
+ else None,
+ uniqueUserPercentileLast30Days=int(
+ row.get("distinct_user_rank_percentile", 0) or 0
  ),
- viewCountTotal=(
- int(row["viewsTotal"])
- if "viewsTotal" in row and row["viewsTotal"]
- else 0
- ),
- viewCountLast30Days=(
- int(row["viewsCount30Days"])
- if "viewsCount30Days" in row and row["viewsCount30Days"]
- else 0
- ),
- viewCountPercentileLast30Days=(
- int(row["viewsCount30Days_rank_percentile"])
- if "viewsCount30Days_rank_percentile" in row
- else 0
+ writeCountLast30Days=int(row.get("write_rank_percentile", 0) or 0)
+ if not self.config.disable_write_usage
+ else None,
+ writeCountPercentileLast30Days=int(
+ row.get("write_rank_percentile", 0) or 0
+ )
+ if not self.config.disable_write_usage
+ else None,
+ writeCountRankLast30Days=int(row.get("write_rank") or 0)
+ if not self.config.disable_write_usage
+ else None,
+ viewCountTotal=int(row.get("viewsTotal", 0) or 0),
+ viewCountLast30Days=int(row.get("viewsCount30Days", 0) or 0),
+ viewCountPercentileLast30Days=int(
+ row.get("viewsCount30Days_rank_percentile", 0) or 0
  ),
  usageSearchScoreMultiplier=search_ranking_multipliers.usageSearchScoreMultiplier,
  usageFreshnessScoreMultiplier=search_ranking_multipliers.usageFreshnessScoreMultiplier,
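Note: the rewrite above leans on one coalescing idiom throughout: row.get(key, 0) handles a missing key, and the trailing `or 0` also covers a key that is present but null in the Polars struct. A small sketch of the idiom on a plain dict:

    row = {"totalSqlQueries": None, "queries_rank": 7}

    query_count = int(row.get("totalSqlQueries", 0) or 0)  # present but None -> 0
    missing = int(row.get("write_rank") or 0)               # absent key -> 0
    rank = int(row.get("queries_rank")) if row.get("queries_rank") else None  # -> 7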
@@ -1095,11 +1068,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

  yield from self.generate_usage_feature_mcp(row["urn"], usage_feature)

- if (
- "siblings" in row
- and row["siblings"]
- and self.config.sibling_usage_enabled
- ):
+ if row.get("siblings") and self.config.sibling_usage_enabled:
  for sibling in row["siblings"]:
  if dbt_platform_regexp.match(sibling):
  yield from self.generate_usage_feature_mcp(
@@ -1114,26 +1083,15 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  num += 1

  query_usage_features = QueryUsageFeaturesClass(
- queryCountLast30Days=(
- int(row["totalSqlQueries"])
- if "totalSqlQueries" in row and row["totalSqlQueries"]
- else 0
- ),
+ queryCountLast30Days=int(row.get("totalSqlQueries", 0) or 0),
  queryCountTotal=None, # This is not implemented
- runsPercentileLast30days=(
- int(row["queries_rank_percentile"])
- if "queries_rank_percentile" in row
- and row["queries_rank_percentile"]
- else 0
- ),
- lastExecutedAt=(
- int(row["last_modified_at"])
- if "last_modified_at" in row and row["last_modified_at"]
- else 0
+ runsPercentileLast30days=int(
+ row.get("queries_rank_percentile", 0) or 0
  ),
+ lastExecutedAt=int(row.get("last_modified_at", 0)),
  topUsersLast30Days=(
- list(chain.from_iterable(row["top_users"]))
- if row["top_users"]
+ list(chain.from_iterable(row.get("top_users", [])))
+ if row.get("top_users")
  else None
  ),
  queryCostLast30Days=None, # Not implemented yet
@@ -1180,16 +1138,17 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  def generate_dashboard_chart_usage(
  self, entity_index: str, usage_index: str
  ) -> polars.LazyFrame:
- soft_deleted_schema = {
+ entity_schema = {
  "entity_urn": polars.Categorical,
  "removed": polars.Boolean,
  "last_modified_at": polars.Int64,
  "siblings": polars.List(polars.String),
+ "combinedSearchRankingMultiplier": polars.Float64,
  "isView": polars.Boolean,
  }

- soft_deleted_df = self.load_data_from_es_to_lf(
- schema=soft_deleted_schema,
+ entities_df = self.load_data_from_es_to_lf(
+ schema=entity_schema,
  index=entity_index,
  query=QueryBuilder.get_dataset_entities_query(),
  process_function=self.soft_deleted_batch,
@@ -1220,7 +1179,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  )

  lf = (
- lf.join(soft_deleted_df, left_on="urn", right_on="entity_urn", how="inner")
+ lf.join(entities_df, left_on="urn", right_on="entity_urn", how="inner")
  .filter(polars.col("removed") == False) # noqa: E712
  .drop(["removed"])
  )
@@ -1268,8 +1227,10 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  )
  .drop(["first_viewsCount"])
  )
- lf = views_sum_with_top_users.join(incremental_views_sum, on="urn", how="left")
- lf = lf.with_columns(
+ views_with_inceremental_sum = views_sum_with_top_users.join(
+ incremental_views_sum, on="urn", how="left"
+ )
+ total_views = views_with_inceremental_sum.with_columns(
  polars.when(
  polars.col("total_user_count")
  .is_null()
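Note: the when/then chain that begins here computes viewsCount30Days; the hunk cuts the expression off, but the Polars pattern is a conditional column. A generic sketch of that pattern (toy columns, not the exact branch logic from the source):

    import polars as pl

    lf = pl.LazyFrame({"total_user_count": [None, 3], "viewsCount": [10, 5]})
    out = lf.with_columns(
        pl.when(pl.col("total_user_count").is_null())
        .then(pl.col("viewsCount"))
        .otherwise(pl.col("viewsCount") + pl.col("total_user_count"))
        .alias("viewsCount30Days")
    )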
@@ -1280,11 +1241,54 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  .alias("viewsCount30Days")
  )

- lf = self.gen_rank_and_percentile(
- lf, "viewsCount30Days", "urn", "platform", "viewsCount30Days_"
+ total_views_with_rank_and_percentiles = self.gen_rank_and_percentile(
+ total_views, "viewsCount30Days", "urn", "platform", "viewsCount30Days_"
+ ).drop(["siblings_right"])
+
+ total_views_with_rank_and_percentiles_with_zeroed_stale_usages = (
+ self.generate_empty_usage_for_stale_entities(
+ entities_df, total_views_with_rank_and_percentiles
+ )
  )

- return lf
+ return total_views_with_rank_and_percentiles_with_zeroed_stale_usages
+
+ def generate_empty_usage_for_stale_entities(
+ self, entities_lf: polars.LazyFrame, usages_lf: polars.LazyFrame
+ ) -> polars.LazyFrame:
+ # We need to merge datasets with existing search scores to make sure we can downrank them if there were no usage in the last n days
+ # We drop last_modified_at to not use it in merge because we are getting last_modified_at from the usage index
+ df_with_search_scores = (
+ entities_lf.filter(
+ polars.col("combinedSearchRankingMultiplier")
+ .is_not_null()
+ # We only want to downrank datasets that have a search score multiplier greater than 1. 1 is the minimum score of a dataset
+ .and_(polars.col("combinedSearchRankingMultiplier").ne(1))
+ ) # noqa: E712
+ .filter(polars.col("removed") == False) # noqa: E712
+ .drop(["removed"])
+ .drop(["last_modified_at"])
+ # We set this to 0 because we want to downrank datasets that have no usage
+ .with_columns(polars.lit(0).alias("combinedSearchRankingMultiplier"))
+ .rename({"entity_urn": "urn"})
+ )
+ common_fields = list(
+ set(usages_lf.columns).intersection(set(df_with_search_scores.columns))
+ )
+ usages_lf = df_with_search_scores.join(
+ usages_lf, on="urn", how="full", suffix="_right"
+ )
+ ## Merge all common fields automatically
+ for common_field in common_fields:
+ right_col = f"{common_field}_right"
+ usages_lf = usages_lf.with_columns(
+ [
+ polars.col(common_field)
+ .fill_null(polars.col(right_col))
+ .alias(common_field)
+ ]
+ ).drop(right_col)
+ return usages_lf

  def generate_query_usage(self) -> polars.LazyFrame:
  usage_index = "query_queryusagestatisticsaspect_v1"
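Note: the generate_empty_usage_for_stale_entities method added above relies on a full outer join plus per-column fill_null coalescing over the shared columns. A standalone sketch of that merge pattern (toy frames, not the source's schemas):

    import polars as pl

    left = pl.LazyFrame({"urn": ["a", "c"], "score": [0.0, 0.0]})
    right = pl.LazyFrame({"urn": ["a", "b"], "score": [2.5, 1.5]})

    joined = left.join(right, on="urn", how="full", suffix="_right")
    # Coalesce each shared column: prefer the left value, fall back to the right.
    for col in ["urn", "score"]:
        joined = joined.with_columns(
            pl.col(col).fill_null(pl.col(f"{col}_right")).alias(col)
        ).drop(f"{col}_right")
    print(joined.collect())  # "b" survives with 1.5; "c" keeps its zeroed score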
@@ -1365,16 +1369,21 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):

  # Polaris/pandas join merges the join column into one column and that's why we need to filter based on the removed column
  lf = (
- lf.join(datasets_lf, left_on="urn", right_on="entity_urn", how="inner")
+ lf.join(datasets_lf, left_on="urn", right_on="entity_urn", how="left")
  .filter(polars.col("removed") == False) # noqa: E712
  .drop(["removed"])
  )
+
  total_queries = lf.group_by("urn", "platform").agg(
  polars.col("totalSqlQueries").sum(),
  polars.col("last_modified_at").max().alias("last_modified_at"),
  polars.col("siblings").first().alias("siblings"),
  )

+ total_queries = self.generate_empty_usage_for_stale_entities(
+ datasets_lf, total_queries
+ )
+
  top_users = self.generate_top_users(lf)

  usage_with_top_users = total_queries.join(top_users, on="urn", how="left")
@@ -1443,9 +1452,9 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  if not self.config.streaming_mode:
  return polars.LazyFrame(data, schema)
  else:
- assert (
- self.temp_dir is not None
- ), "In Streaming mode temp dir should be set. Normally this should not happen..."
+ assert self.temp_dir is not None, (
+ "In Streaming mode temp dir should be set. Normally this should not happen..."
+ )

  with tempfile.NamedTemporaryFile(
  delete=False,
@@ -1510,6 +1519,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  "removed": polars.Boolean,
  "last_modified_at": polars.Int64,
  "siblings": polars.List(polars.String),
+ "combinedSearchRankingMultiplier": polars.Float64,
  "isView": polars.Boolean,
  }

@@ -1614,7 +1624,7 @@ class DataHubUsageFeatureReportingSource(StatefulIngestionSourceBase):
  time_taken = timer.elapsed_seconds()
  processed_count += len(results["hits"]["hits"])
  logger.info(
- f"Processed {len(results['hits']['hits'''])} data from {index} index in {time_taken:.3f} seconds. Total: {processed_count} processed."
+ f"Processed {len(results['hits']['hits'])} data from {index} index in {time_taken:.3f} seconds. Total: {processed_count} processed."
  )
  if len(results["hits"]["hits"]) < batch_size:
  break