acryl-datahub 1.1.0.5rc7__py3-none-any.whl → 1.1.0.5rc9__py3-none-any.whl

This diff shows the content of publicly available package versions as released to their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (59)
  1. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/METADATA +2620 -2622
  2. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/RECORD +59 -59
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +0 -7
  5. datahub/cli/cli_utils.py +73 -0
  6. datahub/cli/delete_cli.py +0 -6
  7. datahub/cli/docker_check.py +107 -12
  8. datahub/cli/docker_cli.py +148 -228
  9. datahub/cli/exists_cli.py +0 -4
  10. datahub/cli/get_cli.py +0 -4
  11. datahub/cli/ingest_cli.py +1 -20
  12. datahub/cli/put_cli.py +0 -6
  13. datahub/cli/quickstart_versioning.py +50 -5
  14. datahub/cli/specific/assertions_cli.py +0 -6
  15. datahub/cli/specific/datacontract_cli.py +0 -6
  16. datahub/cli/specific/dataproduct_cli.py +0 -22
  17. datahub/cli/specific/dataset_cli.py +0 -11
  18. datahub/cli/specific/forms_cli.py +0 -6
  19. datahub/cli/specific/group_cli.py +0 -4
  20. datahub/cli/specific/structuredproperties_cli.py +0 -7
  21. datahub/cli/specific/user_cli.py +0 -4
  22. datahub/cli/state_cli.py +0 -4
  23. datahub/cli/timeline_cli.py +0 -4
  24. datahub/entrypoints.py +4 -3
  25. datahub/ingestion/autogenerated/capability_summary.json +88 -23
  26. datahub/ingestion/extractor/schema_util.py +13 -4
  27. datahub/ingestion/graph/client.py +2 -2
  28. datahub/ingestion/run/pipeline.py +43 -0
  29. datahub/ingestion/source/bigquery_v2/bigquery.py +9 -1
  30. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  31. datahub/ingestion/source/dremio/dremio_source.py +1 -4
  32. datahub/ingestion/source/gcs/gcs_source.py +9 -1
  33. datahub/ingestion/source/identity/okta.py +0 -13
  34. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  35. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  36. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  37. datahub/ingestion/source/sigma/sigma.py +6 -1
  38. datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
  39. datahub/ingestion/source/snowflake/snowflake_queries.py +100 -58
  40. datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
  41. datahub/ingestion/source/snowflake/stored_proc_lineage.py +1 -1
  42. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  43. datahub/ingestion/source/sql/sql_common.py +8 -0
  44. datahub/ingestion/source/sql/teradata.py +993 -234
  45. datahub/ingestion/source/sql/vertica.py +0 -4
  46. datahub/ingestion/source/sql_queries.py +2 -2
  47. datahub/ingestion/source/superset.py +56 -1
  48. datahub/ingestion/source/tableau/tableau.py +40 -34
  49. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  50. datahub/ingestion/source/unity/source.py +9 -1
  51. datahub/sdk/lineage_client.py +2 -2
  52. datahub/sql_parsing/sql_parsing_aggregator.py +21 -12
  53. datahub/sql_parsing/sqlglot_lineage.py +40 -15
  54. datahub/upgrade/upgrade.py +46 -13
  55. datahub/utilities/server_config_util.py +8 -0
  56. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/WHEEL +0 -0
  57. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/entry_points.txt +0 -0
  58. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/licenses/LICENSE +0 -0
  59. {acryl_datahub-1.1.0.5rc7.dist-info → acryl_datahub-1.1.0.5rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_config.py:

@@ -1,6 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, List, Optional, Set
 
 import pydantic
@@ -53,6 +54,11 @@ DEFAULT_TEMP_TABLES_PATTERNS = [
 ]
 
 
+class QueryDedupStrategyType(Enum):
+    STANDARD = "STANDARD"
+    NONE = "NONE"
+
+
 class TagOption(StrEnum):
     with_lineage = "with_lineage"
     without_lineage = "without_lineage"
@@ -248,6 +254,11 @@ class SnowflakeV2Config(
         "This is useful if you have a large number of schemas and want to avoid bulk fetching the schema for each table/view.",
     )
 
+    query_dedup_strategy: QueryDedupStrategyType = Field(
+        default=QueryDedupStrategyType.STANDARD,
+        description=f"Experimental: Choose the strategy for query deduplication (default value is appropriate for most use-cases; make sure you understand performance implications before changing it). Allowed values are: {', '.join([s.name for s in QueryDedupStrategyType])}",
+    )
+
     _check_role_grants_removed = pydantic_removed_field("check_role_grants")
     _provision_role_removed = pydantic_removed_field("provision_role")
 
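For context, the new query_dedup_strategy field accepts a plain string in a recipe because pydantic coerces enums by value. A minimal standalone sketch (not DataHub code, just mirroring the field added above):

from enum import Enum

import pydantic


class QueryDedupStrategyType(Enum):
    STANDARD = "STANDARD"
    NONE = "NONE"


class SketchConfig(pydantic.BaseModel):
    # Mirrors the field added to SnowflakeV2Config above.
    query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD


# Pydantic coerces by enum value, so a YAML recipe can simply say
# `query_dedup_strategy: NONE`.
assert SketchConfig().query_dedup_strategy is QueryDedupStrategyType.STANDARD
assert (
    SketchConfig(query_dedup_strategy="NONE").query_dedup_strategy
    is QueryDedupStrategyType.NONE
)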
datahub/ingestion/source/snowflake/snowflake_queries.py:

@@ -28,6 +28,7 @@ from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.snowflake.constants import SnowflakeObjectDomain
 from datahub.ingestion.source.snowflake.snowflake_config import (
     DEFAULT_TEMP_TABLES_PATTERNS,
+    QueryDedupStrategyType,
     SnowflakeFilterConfig,
     SnowflakeIdentifierConfig,
 )
@@ -118,6 +119,8 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
     include_query_usage_statistics: bool = True
     include_operations: bool = True
 
+    query_dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD
+
 
 class SnowflakeQueriesSourceConfig(
     SnowflakeQueriesExtractorConfig, SnowflakeIdentifierConfig, SnowflakeFilterConfig
@@ -374,12 +377,13 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
     ) -> Iterable[
         Union[PreparsedQuery, TableRename, TableSwap, ObservedQuery, StoredProcCall]
     ]:
-        query_log_query = _build_enriched_query_log_query(
+        query_log_query = QueryLogQueryBuilder(
             start_time=self.config.window.start_time,
             end_time=self.config.window.end_time,
             bucket_duration=self.config.window.bucket_duration,
             deny_usernames=self.config.pushdown_deny_usernames,
-        )
+            dedup_strategy=self.config.query_dedup_strategy,
+        ).build_enriched_query_log_query()
 
         with self.structured_reporter.report_exc(
             "Error fetching query log from Snowflake"
@@ -710,63 +714,101 @@ class SnowflakeQueriesSource(Source):
         super().close()
 
 
-# Make sure we don't try to generate too much info for a single query.
-_MAX_TABLES_PER_QUERY = 20
-
-
-def _build_enriched_query_log_query(
-    start_time: datetime,
-    end_time: datetime,
-    bucket_duration: BucketDuration,
-    deny_usernames: Optional[List[str]],
-) -> str:
-    start_time_millis = int(start_time.timestamp() * 1000)
-    end_time_millis = int(end_time.timestamp() * 1000)
-
-    users_filter = "TRUE"
-    if deny_usernames:
-        user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
-        users_filter = f"user_name NOT IN ({user_not_in})"
-
-    time_bucket_size = bucket_duration.value
-    assert time_bucket_size in ("HOUR", "DAY", "MONTH")
+class QueryLogQueryBuilder:
+    def __init__(
+        self,
+        start_time: datetime,
+        end_time: datetime,
+        bucket_duration: BucketDuration,
+        deny_usernames: Optional[List[str]],
+        max_tables_per_query: int = 20,
+        dedup_strategy: QueryDedupStrategyType = QueryDedupStrategyType.STANDARD,
+    ):
+        self.start_time = start_time
+        self.end_time = end_time
+        self.start_time_millis = int(start_time.timestamp() * 1000)
+        self.end_time_millis = int(end_time.timestamp() * 1000)
+        self.max_tables_per_query = max_tables_per_query
+        self.dedup_strategy = dedup_strategy
+
+        self.users_filter = "TRUE"
+        if deny_usernames:
+            user_not_in = ",".join(f"'{user.upper()}'" for user in deny_usernames)
+            self.users_filter = f"user_name NOT IN ({user_not_in})"
+
+        self.time_bucket_size = bucket_duration.value
+        assert self.time_bucket_size in ("HOUR", "DAY", "MONTH")
+
+    def _query_fingerprinted_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            secondary_fingerprint_sql = """
+        CASE
+            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
+            -- Extract project id and hash it
+            THEN CAST(HASH(
+                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
+                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
+            ) AS VARCHAR)
+            ELSE NULL
+        END"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            secondary_fingerprint_sql = "NULL"
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )
+        return f"""
+    SELECT *,
+        -- TODO: Generate better fingerprints for each query by pushing down regex logic.
+        query_history.query_parameterized_hash as query_fingerprint,
+        -- Optional and additional hash to be used for query deduplication and final query identity
+        {secondary_fingerprint_sql} as query_secondary_fingerprint
+    FROM
+        snowflake.account_usage.query_history
+    WHERE
+        query_history.start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+        AND query_history.start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+        AND execution_status = 'SUCCESS'
+        AND {self.users_filter}"""
+
+    def _query_deduplicated_queries(self):
+        if self.dedup_strategy == QueryDedupStrategyType.STANDARD:
+            return f"""
+    SELECT
+        *,
+        DATE_TRUNC(
+            {self.time_bucket_size},
+            CONVERT_TIMEZONE('UTC', start_time)
+        ) AS bucket_start_time,
+        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
+    FROM
+        fingerprinted_queries
+    QUALIFY
+        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1"""
+        elif self.dedup_strategy == QueryDedupStrategyType.NONE:
+            return f"""
+    SELECT
+        *,
+        DATE_TRUNC(
+            {self.time_bucket_size},
+            CONVERT_TIMEZONE('UTC', start_time)
+        ) AS bucket_start_time,
+        1 AS query_count,
+    FROM
+        fingerprinted_queries"""
+        else:
+            raise NotImplementedError(
+                f"Strategy {self.dedup_strategy} is not implemented by the QueryLogQueryBuilder"
+            )
 
-    return f"""\
+    def build_enriched_query_log_query(self) -> str:
+        return f"""\
 WITH
 fingerprinted_queries as (
-    SELECT *,
-        -- TODO: Generate better fingerprints for each query by pushing down regex logic.
-        query_history.query_parameterized_hash as query_fingerprint,
-        -- Optional and additional hash to be used for query deduplication and final query identity
-        CASE
-            WHEN CONTAINS(query_history.query_text, '-- Hex query metadata:')
-            -- Extract project id and hash it
-            THEN CAST(HASH(
-                REGEXP_SUBSTR(query_history.query_text, '"project_id"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1),
-                REGEXP_SUBSTR(query_history.query_text, '"context"\\\\s*:\\\\s*"([^"]+)"', 1, 1, 'e', 1)
-            ) AS VARCHAR)
-            ELSE NULL
-        END as query_secondary_fingerprint
-    FROM
-        snowflake.account_usage.query_history
-    WHERE
-        query_history.start_time >= to_timestamp_ltz({start_time_millis}, 3) -- {start_time.isoformat()}
-        AND query_history.start_time < to_timestamp_ltz({end_time_millis}, 3) -- {end_time.isoformat()}
-        AND execution_status = 'SUCCESS'
-        AND {users_filter}
+    {self._query_fingerprinted_queries()}
 )
 , deduplicated_queries as (
-    SELECT
-        *,
-        DATE_TRUNC(
-            {time_bucket_size},
-            CONVERT_TIMEZONE('UTC', start_time)
-        ) AS bucket_start_time,
-        COUNT(*) OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint) AS query_count,
-    FROM
-        fingerprinted_queries
-    QUALIFY
-        ROW_NUMBER() OVER (PARTITION BY bucket_start_time, query_fingerprint, query_secondary_fingerprint ORDER BY start_time DESC) = 1
+    {self._query_deduplicated_queries()}
 )
 , raw_access_history AS (
     SELECT
@@ -780,9 +822,9 @@ fingerprinted_queries as (
     FROM
         snowflake.account_usage.access_history
     WHERE
-        query_start_time >= to_timestamp_ltz({start_time_millis}, 3) -- {start_time.isoformat()}
-        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3) -- {end_time.isoformat()}
-        AND {users_filter}
+        query_start_time >= to_timestamp_ltz({self.start_time_millis}, 3) -- {self.start_time.isoformat()}
+        AND query_start_time < to_timestamp_ltz({self.end_time_millis}, 3) -- {self.end_time.isoformat()}
+        AND {self.users_filter}
         AND query_id IN (
             SELECT query_id FROM deduplicated_queries
         )
@@ -795,7 +837,7 @@ fingerprinted_queries as (
         query_start_time,
         ARRAY_SLICE(
            FILTER(direct_objects_accessed, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}),
-            0, {_MAX_TABLES_PER_QUERY}
+            0, {self.max_tables_per_query}
         ) as direct_objects_accessed,
         -- TODO: Drop the columns.baseSources subfield.
         FILTER(objects_modified, o -> o:objectDomain IN {SnowflakeQuery.ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER}) as objects_modified,
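A sketch of how the two strategies diverge in the generated SQL: STANDARD collapses repeated queries per time bucket with a QUALIFY ROW_NUMBER() clause, while NONE keeps every row and hard-codes query_count = 1. The names come from the hunks above; the window values are hypothetical:

from datetime import datetime, timezone

for strategy in QueryDedupStrategyType:
    sql = QueryLogQueryBuilder(
        start_time=datetime(2024, 1, 1, tzinfo=timezone.utc),  # hypothetical window
        end_time=datetime(2024, 1, 2, tzinfo=timezone.utc),
        bucket_duration=BucketDuration.HOUR,
        deny_usernames=None,
        dedup_strategy=strategy,
    ).build_enriched_query_log_query()
    # STANDARD emits the QUALIFY deduplication clause; NONE does not.
    print(strategy.name, "deduplicates:", "QUALIFY" in sql)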
datahub/ingestion/source/snowflake/snowflake_v2.py:

@@ -32,6 +32,7 @@ from datahub.ingestion.api.source import (
 )
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
     SnowflakeEdition,
@@ -97,7 +98,14 @@ logger: logging.Logger = logging.getLogger(__name__)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -577,6 +585,7 @@ class SnowflakeV2Source(
 
         queries_extractor = SnowflakeQueriesExtractor(
             connection=self.connection,
+            # TODO: this should be its own section in main recipe
             config=SnowflakeQueriesExtractorConfig(
                 window=BaseTimeWindowConfig(
                     start_time=self.config.start_time,
@@ -591,6 +600,7 @@ class SnowflakeV2Source(
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
                 pushdown_deny_usernames=self.config.pushdown_deny_usernames,
+                query_dedup_strategy=self.config.query_dedup_strategy,
             ),
             structured_report=self.report,
             filters=self.filters,
datahub/ingestion/source/snowflake/stored_proc_lineage.py:

@@ -65,7 +65,7 @@ class StoredProcLineageTracker(Closeable):
         # { root_query_id -> StoredProcExecutionLineage }
         self._stored_proc_execution_lineage: FileBackedDict[
             StoredProcExecutionLineage
-        ] = FileBackedDict(shared_connection)
+        ] = FileBackedDict(shared_connection, tablename="stored_proc_lineage")
 
     def add_stored_proc_call(self, call: StoredProcCall) -> None:
        """Add a stored procedure call to track."""
datahub/ingestion/source/sql/hive_metastore.py:

@@ -52,7 +52,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import Dataset
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
     ViewPropertiesClass,
@@ -601,10 +600,7 @@ class HiveMetastoreSource(SQLAlchemySource):
             yield dpi_aspect
 
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[self.table_subtype]),
         ).as_workunit()
 
@@ -810,10 +806,7 @@ class HiveMetastoreSource(SQLAlchemySource):
 
         # Add views subtype
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
             aspect=SubTypesClass(typeNames=[self.view_subtype]),
         ).as_workunit()
 
@@ -824,10 +817,7 @@ class HiveMetastoreSource(SQLAlchemySource):
             viewLogic=dataset.view_definition if dataset.view_definition else "",
         )
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="viewProperties",
             aspect=view_properties_aspect,
         ).as_workunit()
 
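The dropped kwargs are redundant because MetadataChangeProposalWrapper defaults changeType to UPSERT and derives entityType and aspectName from the URN and the aspect object, which is what makes this simplification safe (behavior as understood from this diff; worth verifying against the emitter code). A short sketch:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import SubTypesClass

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.example_table,PROD)",  # hypothetical URN
    aspect=SubTypesClass(typeNames=["Table"]),
)
# The wrapper infers the omitted fields: "dataset", "subTypes", "UPSERT".
print(mcp.entityType, mcp.aspectName, mcp.changeType)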
datahub/ingestion/source/sql/sql_common.py:

@@ -292,6 +292,10 @@ class ProfileMetadata:
     SourceCapability.CONTAINERS,
     "Enabled by default",
     supported=True,
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
+    ],
 )
 @capability(
     SourceCapability.DESCRIPTIONS,
@@ -589,6 +593,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         )
 
         # Generate workunit for aggregated SQL parsing results
+        yield from self._generate_aggregator_workunits()
+
+    def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
+        """Generate work units from SQL parsing aggregator. Can be overridden by subclasses."""
         for mcp in self.aggregator.gen_metadata():
             yield mcp.as_workunit()
 
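The new _generate_aggregator_workunits method exists as an extension point, per its docstring. A hypothetical subclass sketch, assuming the imports already present in sql_common.py; the post-processing comment is illustrative, not from this diff:

class CustomSQLAlchemySource(SQLAlchemySource):  # hypothetical subclass
    def _generate_aggregator_workunits(self) -> Iterable[MetadataWorkUnit]:
        for mcp in self.aggregator.gen_metadata():
            # Subclasses can filter, tag, or count the aggregator's MCPs
            # here before emitting them as work units.
            yield mcp.as_workunit()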