acryl-datahub 1.0.0.2rc4-py3-none-any.whl → 1.0.0.3-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

Potentially problematic release.



Files changed (159)
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/config.py

@@ -192,6 +192,11 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="mysql",
     )
 
+    ODBC = DataPlatformPair(
+        powerbi_data_platform_name="Odbc",
+        datahub_data_platform_name="odbc",
+    )
+
 
 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -341,6 +346,13 @@ class PowerBiDashboardSourceConfig(
         "For Google BigQuery the datasource's server is google bigquery project name. "
         "For Databricks Unity Catalog the datasource's server is workspace FQDN.",
     )
+    # ODBC DSN to platform mapping
+    dsn_to_platform_name: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to DataHub data platform name. "
+        "For example with an ODBC connection string 'DSN=database' where the database type "
+        "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
@@ -501,7 +513,7 @@ class PowerBiDashboardSourceConfig(
     include_workspace_name_in_dataset_urn: bool = pydantic.Field(
         default=False,
         description="It is recommended to set this to true, as it helps prevent the overwriting of datasets."
-        "Read section #11560 at https://datahubproject.io/docs/how/updating-datahub/ before enabling this option."
+        "Read section #11560 at https://docs.datahub.com/docs/how/updating-datahub/ before enabling this option."
        "To maintain backward compatibility, this is set to False.",
    )
 
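The new dsn_to_platform_name option is a plain string-to-string mapping keyed by DSN. A minimal sketch of such a mapping, following the example given in the field description (the DSN names and platforms below are made up for illustration):

# Illustrative only: map the DSN that appears in an ODBC connect string
# (e.g. 'DSN=warehouse') to the DataHub platform name of the database behind it.
dsn_to_platform_name = {
    "warehouse": "postgres",    # 'DSN=warehouse' points at a PostgreSQL database
    "finance_dw": "snowflake",  # 'DSN=finance_dw' points at Snowflake
}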
datahub/ingestion/source/powerbi/m_query/data_classes.py

@@ -75,3 +75,4 @@ class FunctionName(Enum):
     AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
     MYSQL_DATA_ACCESS = "MySQL.Database"
+    ODBC_DATA_ACCESS = "Odbc.DataSource"
datahub/ingestion/source/powerbi/m_query/odbc.py (new file)

@@ -0,0 +1,185 @@
+import re
+from typing import Optional, Tuple, Union
+
+server_patterns = [
+    r"Server=([^:]+)[:][0-9]+/.*",
+    r"SERVER=\{([^}]*)\}",
+    r"SERVER=([^;]*)",
+    r"HOST=\{([^}]*)\}",
+    r"HOST=([^;]*)",
+    r"DATA SOURCE=\{([^}]*)\}",
+    r"DATA SOURCE=([^;]*)",
+    r"DSN=\{([^}]*)\}",
+    r"DSN=([^;]*)",
+    r"Server=([^;]*)",
+    r"S3OutputLocation=([^;]*)",
+    r"HTTPPath=([^;]*)",
+    r"Host=([^;]*)",
+]
+
+dsn_patterns = [
+    r"DSN\s*=\s*\"([^\"]+)\"",
+    r"DSN\s*=\s*\'([^\']+)\'",
+    r"DSN\s*=\s*([^;]+)",
+]
+
+platform_patterns = {
+    "mysql": r"mysql",
+    "postgres": r"post(gre(s|sql)?|gres)",
+    "mssql": r"(sql\s*server|mssql|sqlncli)",
+    "oracle": r"oracle",
+    "db2": r"db2",
+    "sqlite": r"sqlite",
+    "access": r"(access|\.mdb|\.accdb)",
+    "excel": r"(excel|\.xls)",
+    "firebird": r"firebird",
+    "informix": r"informix",
+    "sybase": r"sybase",
+    "teradata": r"teradata",
+    "hadoop": r"(hadoop|hive)",
+    "snowflake": r"snowflake",
+    "redshift": r"redshift",
+    "bigquery": r"bigquery",
+    "athena": r"(athena|aws\s*athena)",
+    "databricks": r"(databricks|spark)",
+}
+
+powerbi_platform_names = {
+    "mysql": "MySQL",
+    "postgres": "PostgreSQL",
+    "mssql": "SQL Server",
+    "oracle": "Oracle",
+    "db2": "IBM DB2",
+    "sqlite": "SQLite",
+    "access": "Microsoft Access",
+    "excel": "Microsoft Excel",
+    "firebird": "Firebird",
+    "informix": "IBM Informix",
+    "sybase": "SAP Sybase",
+    "teradata": "Teradata",
+    "hadoop": "Hadoop",
+    "snowflake": "Snowflake",
+    "redshift": "Amazon Redshift",
+    "bigquery": "Google BigQuery",
+    "athena": "Amazon Athena",
+    "databricks": "Databricks",
+}
+
+
+def extract_driver(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the driver name.
+    Handles whitespace in driver names and various connection string formats.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted driver name, or None if not found
+    """
+    # Match DRIVER={driver name} pattern
+    driver_match = re.search(r"DRIVER=\{([^}]*)}", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    # Alternative pattern for DRIVER=driver
+    driver_match = re.search(r"DRIVER=([^;]*)", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    return None
+
+
+def extract_dsn(connection_string: str) -> Union[str, None]:
+    """
+    Extract the DSN value from an ODBC connection string.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str or None: The extracted DSN value, or None if not found
+    """
+    for pattern in dsn_patterns:
+        match = re.search(pattern, connection_string, re.IGNORECASE)
+        if match:
+            return match.group(1).strip()
+
+    return None
+
+
+def extract_server(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the server name.
+    Handles various parameter names for server (SERVER, Host, Data Source, etc.)
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted server name, or None if not found
+    """
+    for pattern in server_patterns:
+        server_match = re.search(pattern, connection_string, re.IGNORECASE)
+        if server_match:
+            return server_match.group(1).strip()
+
+    # Special case for Athena: extract from AwsRegion if no server found
+    region_match = re.search(r"AwsRegion=([^;]*)", connection_string, re.IGNORECASE)
+    if region_match:
+        return f"aws-athena-{region_match.group(1).strip()}"
+
+    # Special case for Databricks: try to extract hostname from JDBC URL
+    jdbc_match = re.search(r"jdbc:spark://([^:;/]+)", connection_string, re.IGNORECASE)
+    if jdbc_match:
+        return jdbc_match.group(1).strip()
+
+    return None
+
+
+def extract_platform(connection_string: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Extract the database platform name from the ODBC driver name.
+    Returns the lowercase platform name.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+            Power BI platform name, or None if not recognized.
+    """
+    driver_name = extract_driver(connection_string)
+    if not driver_name:
+        return None, None
+
+    driver_lower = driver_name.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, driver_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
+
+
+def normalize_platform_name(platform: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Normalizes the platform name by matching it with predefined patterns and maps it to
+    a corresponding Power BI platform name.
+
+    Args:
+        platform (str): The platform name to normalize
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+            Power BI platform name, or None if not recognized.
+    """
+    platform_lower = platform.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, platform_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
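The module above is pure regex parsing with no external dependencies, so its behavior is easy to check interactively. A small usage sketch; the connection strings are made-up examples:

from datahub.ingestion.source.powerbi.m_query.odbc import (
    extract_driver,
    extract_dsn,
    extract_platform,
    extract_server,
)

conn = "DRIVER={PostgreSQL Unicode};SERVER=pg.internal;PORT=5432;DATABASE=sales"
print(extract_driver(conn))    # PostgreSQL Unicode
print(extract_server(conn))    # pg.internal
print(extract_platform(conn))  # ('postgres', 'PostgreSQL')

# DSN-only strings carry no driver, so the platform cannot be inferred directly
# and must come from the dsn_to_platform_name mapping shown earlier.
print(extract_platform("DSN=warehouse;UID=ingest"))  # (None, None)
print(extract_dsn("DSN=warehouse;UID=ingest"))       # warehouse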
datahub/ingestion/source/powerbi/m_query/pattern_handler.py

@@ -29,6 +29,12 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     Lineage,
     ReferencedTable,
 )
+from datahub.ingestion.source.powerbi.m_query.odbc import (
+    extract_dsn,
+    extract_platform,
+    extract_server,
+    normalize_platform_name,
+)
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
 from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
 from datahub.sql_parsing.sqlglot_lineage import (
@@ -155,6 +161,7 @@ class AbstractLineage(ABC):
                 tree_function.token_values(arg_list)
             ),
         )
+        logger.debug(f"DB Details: {arguments}")
 
         if len(arguments) < 2:
             logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
@@ -940,6 +947,147 @@ class NativeQueryLineage(AbstractLineage):
         )
 
 
+class OdbcLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} "
+            f"data-access function detail {data_access_func_detail}"
+        )
+
+        connect_string, _ = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+
+        if not connect_string:
+            self.reporter.warning(
+                title="Can not extract ODBC connect string",
+                message="Can not extract ODBC connect string from data access function. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-access-func-detail={data_access_func_detail}",
+            )
+            return Lineage.empty()
+
+        logger.debug(f"ODBC connect string: {connect_string}")
+        data_platform, powerbi_platform = extract_platform(connect_string)
+        server_name = extract_server(connect_string)
+
+        if not data_platform:
+            dsn = extract_dsn(connect_string)
+            if dsn:
+                logger.debug(f"Extracted DSN: {dsn}")
+                server_name = dsn
+            if dsn and self.config.dsn_to_platform_name:
+                logger.debug(f"Attempting to map DSN {dsn} to platform")
+                name = self.config.dsn_to_platform_name.get(dsn)
+                if name:
+                    logger.debug(f"Found DSN {dsn} mapped to platform {name}")
+                    data_platform, powerbi_platform = normalize_platform_name(name)
+
+        if not data_platform or not powerbi_platform:
+            self.reporter.warning(
+                title="Can not determine ODBC platform",
+                message="Can not determine platform from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+
+        platform_pair: DataPlatformPair = self.create_platform_pair(
+            data_platform, powerbi_platform
+        )
+
+        if not server_name and self.config.server_to_platform_instance:
+            self.reporter.warning(
+                title="Can not determine ODBC server name",
+                message="Can not determine server name with server_to_platform_instance mapping. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+        elif not server_name:
+            server_name = "unknown"
+
+        database_name = None
+        schema_name = None
+        table_name = None
+        qualified_table_name = None
+
+        temp_accessor: Optional[IdentifierAccessor] = (
+            data_access_func_detail.identifier_accessor
+        )
+
+        while temp_accessor:
+            logger.debug(
+                f"identifier = {temp_accessor.identifier} items = {temp_accessor.items}"
+            )
+            if temp_accessor.items.get("Kind") == "Database":
+                database_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Schema":
+                schema_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Table":
+                table_name = temp_accessor.items["Name"]
+
+            if temp_accessor.next is not None:
+                temp_accessor = temp_accessor.next
+            else:
+                break
+
+        if (
+            database_name is not None
+            and schema_name is not None
+            and table_name is not None
+        ):
+            qualified_table_name = f"{database_name}.{schema_name}.{table_name}"
+        elif database_name is not None and table_name is not None:
+            qualified_table_name = f"{database_name}.{table_name}"
+
+        if not qualified_table_name:
+            self.reporter.warning(
+                title="Can not determine qualified table name",
+                message="Can not determine qualified table name for ODBC data source. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-platform={data_platform}",
+            )
+            logger.warning(
+                f"Can not determine qualified table name for ODBC data source {data_platform} "
+                f"table {self.table.full_name}."
+            )
+            return Lineage.empty()
+
+        logger.debug(
+            f"ODBC Platform {data_platform} found qualified table name {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=platform_pair,
+            server=server_name,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=platform_pair,
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    @staticmethod
+    def create_platform_pair(
+        data_platform: str, powerbi_platform: str
+    ) -> DataPlatformPair:
+        return DataPlatformPair(data_platform, powerbi_platform)
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.ODBC.value
+
+
 class SupportedPattern(Enum):
     DATABRICKS_QUERY = (
         DatabricksLineage,
@@ -991,6 +1139,11 @@ class SupportedPattern(Enum):
         FunctionName.NATIVE_QUERY,
     )
 
+    ODBC = (
+        OdbcLineage,
+        FunctionName.ODBC_DATA_ACCESS,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]
 
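Taken together with odbc.py, the platform resolution order in OdbcLineage is: try the DRIVER embedded in the connect string first, then fall back to the DSN via the new dsn_to_platform_name config. A condensed, standalone sketch of that decision; the imported helpers are the real ones added above, while resolve_platform itself is a hypothetical wrapper for illustration:

from typing import Dict, Optional, Tuple

from datahub.ingestion.source.powerbi.m_query.odbc import (
    extract_dsn,
    extract_platform,
    normalize_platform_name,
)


def resolve_platform(
    connect_string: str, dsn_to_platform_name: Dict[str, str]
) -> Tuple[Optional[str], Optional[str]]:
    # 1) Try to recognize the platform from the ODBC driver name.
    data_platform, powerbi_platform = extract_platform(connect_string)
    if not data_platform:
        # 2) No driver match: look the DSN up in the user-supplied mapping.
        dsn = extract_dsn(connect_string)
        name = dsn_to_platform_name.get(dsn) if dsn else None
        if name:
            data_platform, powerbi_platform = normalize_platform_name(name)
    return data_platform, powerbi_platform


# resolve_platform("DSN=warehouse", {"warehouse": "snowflake"})
# -> ('snowflake', 'Snowflake')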
datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py

@@ -63,10 +63,10 @@ class SessionWithTimeout(requests.Session):
         super().__init__(*args, **kwargs)
         self.timeout = timeout
 
-    def request(self, method, url, **kwargs):
+    def request(self, method, url, *args, **kwargs):
         # Set the default timeout if none is provided
         kwargs.setdefault("timeout", self.timeout)
-        return super().request(method, url, **kwargs)
+        return super().request(method, url, *args, **kwargs)
 
 
 class DataResolverBase(ABC):
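The data_resolver.py change is small but meaningful: the old override dropped positional arguments, so a caller passing params or data positionally would silently lose them. A self-contained sketch of the corrected pattern; the class name here is illustrative, not the project's:

import requests


class SessionWithDefaultTimeout(requests.Session):
    """A requests.Session that applies a default timeout to every request."""

    def __init__(self, timeout: float = 30.0):
        super().__init__()
        self._default_timeout = timeout

    def request(self, method, url, *args, **kwargs):
        # Apply the default only when the caller did not specify a timeout,
        # and forward *args so positional parameters are not dropped.
        kwargs.setdefault("timeout", self._default_timeout)
        return super().request(method, url, *args, **kwargs)


session = SessionWithDefaultTimeout(timeout=10.0)
# session.get("https://example.com")  # inherits the 10s default timeout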
datahub/ingestion/source/redshift/usage.py

@@ -182,15 +182,16 @@ class RedshiftUsageExtractor:
         self.report.num_operational_stats_filtered = 0
 
         if self.config.include_operational_stats:
-            with self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS):
-                with PerfTimer() as timer:
-                    # Generate operation aspect workunits
-                    yield from self._gen_operation_aspect_workunits(
-                        self.connection, all_tables
-                    )
-                    self.report.operational_metadata_extraction_sec[
-                        self.config.database
-                    ] = timer.elapsed_seconds(digits=2)
+            with self.report.new_stage(
+                USAGE_EXTRACTION_OPERATIONAL_STATS
+            ), PerfTimer() as timer:
+                # Generate operation aspect workunits
+                yield from self._gen_operation_aspect_workunits(
+                    self.connection, all_tables
+                )
+                self.report.operational_metadata_extraction_sec[
+                    self.config.database
+                ] = timer.elapsed_seconds(digits=2)
 
         # Generate aggregate events
         with self.report.new_stage(USAGE_EXTRACTION_USAGE_AGGREGATION):
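The Redshift change is purely a flattening of two nested with blocks into one statement; runtime behavior is unchanged. A self-contained illustration of the equivalence, using a stand-in context manager:

from contextlib import contextmanager


@contextmanager
def stage(name: str):
    print(f"enter {name}")
    try:
        yield
    finally:
        print(f"exit {name}")


# Nested form (before): contexts are entered outer-to-inner, exited inner-to-outer.
with stage("operational_stats"):
    with stage("perf_timer"):
        pass

# Combined form (after): identical enter/exit order, one less indentation level.
with stage("operational_stats"), stage("perf_timer"):
    pass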
datahub/ingestion/source/sigma/config.py

@@ -1,8 +1,9 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 import pydantic
+from pydantic import BaseModel, Field
 
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
@@ -53,15 +54,82 @@ class Constant:
     DEFAULT_API_URL = "https://aws-api.sigmacomputing.com/v2"
 
 
+class WorkspaceCounts(BaseModel):
+    workbooks_count: int = 0
+    datasets_count: int = 0
+    elements_count: int = 0
+    pages_count: int = 0
+
+    def is_empty(self) -> bool:
+        return (
+            self.workbooks_count == 0
+            and self.datasets_count == 0
+            and self.elements_count == 0
+            and self.pages_count == 0
+        )
+
+    def as_obj(self) -> dict:
+        return {
+            "workbooks_count": self.workbooks_count,
+            "datasets_count": self.datasets_count,
+            "elements_count": self.elements_count,
+            "pages_count": self.pages_count,
+        }
+
+
+class SigmaWorkspaceEntityFilterReport(EntityFilterReport):
+    type: str = "workspace"
+
+    workspace_counts: Dict[str, WorkspaceCounts] = Field(
+        default_factory=dict,
+        description="Counts of workbooks, datasets, elements and pages in each workspace.",
+    )
+
+    def increment_workbooks_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].workbooks_count += 1
+
+    def increment_datasets_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].datasets_count += 1
+
+    def increment_elements_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].elements_count += 1
+
+    def increment_pages_count(self, workspace_id: str) -> None:
+        if workspace_id not in self.workspace_counts:
+            self.workspace_counts[workspace_id] = WorkspaceCounts()
+        self.workspace_counts[workspace_id].pages_count += 1
+
+    def as_obj(self) -> dict:
+        return {
+            "filtered": self.dropped_entities.as_obj(),
+            "processed": self.processed_entities.as_obj(),
+            "workspace_counts": {
+                key: item.as_obj() for key, item in self.workspace_counts.items()
+            },
+        }
+
+
 @dataclass
 class SigmaSourceReport(StaleEntityRemovalSourceReport):
-    workspaces: EntityFilterReport = EntityFilterReport.field(type="workspace")
-    number_of_workspaces: Optional[int] = None
+    workspaces: SigmaWorkspaceEntityFilterReport = field(
+        default_factory=SigmaWorkspaceEntityFilterReport
+    )
     non_accessible_workspaces_count: int = 0
-    shared_entities_count: int = 0
-    number_of_datasets: int = 0
-    number_of_workbooks: int = 0
+
+    datasets: EntityFilterReport = EntityFilterReport.field(type="dataset")
+    datasets_without_workspace: int = 0
+
+    workbooks: EntityFilterReport = EntityFilterReport.field(type="workbook")
+    workbooks_without_workspace: int = 0
+
     number_of_files_metadata: Dict[str, int] = field(default_factory=dict)
+    empty_workspaces: List[str] = field(default_factory=list)
 
 
 class PlatformDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
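The new WorkspaceCounts model is a plain pydantic container, so the per-workspace accounting is easy to exercise on its own. A small sketch using the class as added above (import path per this diff):

from datahub.ingestion.source.sigma.config import WorkspaceCounts

counts = WorkspaceCounts()   # all four counters default to 0
assert counts.is_empty()

# The report's increment_* helpers do this per workspace id.
counts.workbooks_count += 1
assert not counts.is_empty()
print(counts.as_obj())
# {'workbooks_count': 1, 'datasets_count': 0, 'elements_count': 0, 'pages_count': 0}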
datahub/ingestion/source/sigma/sigma.py

@@ -35,6 +35,7 @@ from datahub.ingestion.source.sigma.config import (
     PlatformDetail,
     SigmaSourceConfig,
     SigmaSourceReport,
+    WorkspaceCounts,
 )
 from datahub.ingestion.source.sigma.data_classes import (
     Element,
@@ -163,7 +164,6 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
     def _get_allowed_workspaces(self) -> List[Workspace]:
         all_workspaces = self.sigma_api.workspaces.values()
         logger.info(f"Number of workspaces = {len(all_workspaces)}")
-        self.reporter.number_of_workspaces = len(all_workspaces)
 
         allowed_workspaces = []
         for workspace in all_workspaces:
@@ -285,6 +285,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield self._gen_dataset_properties(dataset_urn, dataset)
 
         if dataset.workspaceId:
+            self.reporter.workspaces.increment_datasets_count(dataset.workspaceId)
             yield from add_entity_to_container(
                 container_key=self._gen_workspace_key(dataset.workspaceId),
                 entity_type="dataset",
@@ -468,6 +469,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         ).as_workunit()
 
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_elements_count(workbook.workspaceId)
+
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=chart_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -525,6 +528,7 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
         all_input_fields: List[InputFieldClass] = []
 
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_pages_count(workbook.workspaceId)
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -614,6 +618,8 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
 
         paths = workbook.path.split("/")[1:]
         if workbook.workspaceId:
+            self.reporter.workspaces.increment_workbooks_count(workbook.workspaceId)
+
             yield self._gen_entity_browsepath_aspect(
                 entity_urn=dashboard_urn,
                 parent_entity_urn=builder.make_container_urn(
@@ -667,6 +673,15 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
                 f"{workspace.name} ({workspace.workspaceId})"
             )
             yield from self._gen_workspace_workunit(workspace)
+            if self.reporter.workspaces.workspace_counts.get(
+                workspace.workspaceId, WorkspaceCounts()
+            ).is_empty():
+                logger.warning(
+                    f"Workspace {workspace.name} ({workspace.workspaceId}) is empty. If this is not expected, add the user associated with the Client ID/Secret to each workspace with missing metadata"
+                )
+                self.reporter.empty_workspaces.append(
+                    f"{workspace.name} ({workspace.workspaceId})"
+                )
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()
 
     def get_report(self) -> SourceReport: