acryl-datahub 0.15.0.6rc3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff covers two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (204)
  1. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
  2. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
  3. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +1 -1
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +3 -5
  46. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  47. datahub/ingestion/source/delta_lake/config.py +8 -1
  48. datahub/ingestion/source/delta_lake/report.py +4 -2
  49. datahub/ingestion/source/delta_lake/source.py +20 -5
  50. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  51. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  52. datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
  53. datahub/ingestion/source/elastic_search.py +26 -6
  54. datahub/ingestion/source/feast.py +27 -8
  55. datahub/ingestion/source/file.py +6 -3
  56. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  57. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  58. datahub/ingestion/source/ge_data_profiler.py +12 -15
  59. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  60. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  61. datahub/ingestion/source/identity/okta.py +37 -7
  62. datahub/ingestion/source/kafka/kafka.py +1 -1
  63. datahub/ingestion/source/kafka_connect/common.py +2 -7
  64. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  65. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  66. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  67. datahub/ingestion/source/looker/looker_common.py +3 -3
  68. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  69. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  70. datahub/ingestion/source/looker/looker_source.py +1 -1
  71. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  72. datahub/ingestion/source/looker/lookml_source.py +3 -2
  73. datahub/ingestion/source/metabase.py +57 -35
  74. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  75. datahub/ingestion/source/metadata/lineage.py +2 -2
  76. datahub/ingestion/source/mlflow.py +365 -35
  77. datahub/ingestion/source/mode.py +18 -8
  78. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  79. datahub/ingestion/source/nifi.py +37 -11
  80. datahub/ingestion/source/openapi.py +1 -1
  81. datahub/ingestion/source/openapi_parser.py +49 -17
  82. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  83. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  84. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  85. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  86. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  87. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  88. datahub/ingestion/source/preset.py +7 -4
  89. datahub/ingestion/source/pulsar.py +3 -2
  90. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  91. datahub/ingestion/source/redash.py +31 -7
  92. datahub/ingestion/source/redshift/config.py +4 -0
  93. datahub/ingestion/source/redshift/datashares.py +236 -0
  94. datahub/ingestion/source/redshift/lineage.py +6 -2
  95. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  96. datahub/ingestion/source/redshift/profile.py +1 -1
  97. datahub/ingestion/source/redshift/query.py +133 -33
  98. datahub/ingestion/source/redshift/redshift.py +46 -73
  99. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  100. datahub/ingestion/source/redshift/report.py +3 -0
  101. datahub/ingestion/source/s3/config.py +5 -5
  102. datahub/ingestion/source/s3/source.py +20 -41
  103. datahub/ingestion/source/salesforce.py +550 -275
  104. datahub/ingestion/source/schema_inference/object.py +1 -1
  105. datahub/ingestion/source/sigma/sigma.py +1 -1
  106. datahub/ingestion/source/slack/slack.py +31 -10
  107. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  108. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  109. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  110. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  112. datahub/ingestion/source/sql/athena.py +10 -16
  113. datahub/ingestion/source/sql/druid.py +1 -5
  114. datahub/ingestion/source/sql/hive.py +15 -6
  115. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  116. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  117. datahub/ingestion/source/sql/mssql/source.py +11 -5
  118. datahub/ingestion/source/sql/oracle.py +127 -63
  119. datahub/ingestion/source/sql/sql_common.py +6 -12
  120. datahub/ingestion/source/sql/sql_types.py +2 -2
  121. datahub/ingestion/source/sql/teradata.py +7 -5
  122. datahub/ingestion/source/sql/trino.py +2 -2
  123. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  124. datahub/ingestion/source/superset.py +222 -62
  125. datahub/ingestion/source/tableau/tableau.py +22 -6
  126. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  127. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  128. datahub/ingestion/source/unity/source.py +11 -1
  129. datahub/ingestion/source/vertexai.py +697 -0
  130. datahub/ingestion/source_config/pulsar.py +3 -1
  131. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  132. datahub/lite/duckdb_lite.py +3 -10
  133. datahub/lite/lite_local.py +1 -1
  134. datahub/lite/lite_util.py +4 -3
  135. datahub/metadata/_schema_classes.py +714 -417
  136. datahub/metadata/_urns/urn_defs.py +1673 -1649
  137. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  138. datahub/metadata/schema.avsc +16438 -16603
  139. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  140. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  141. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  142. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  143. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  144. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  145. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  146. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  147. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  148. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  149. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  150. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  151. datahub/metadata/schemas/DomainKey.avsc +2 -1
  152. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  153. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  154. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  155. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  156. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  157. datahub/metadata/schemas/InputFields.avsc +3 -1
  158. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  159. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  160. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  162. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  163. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  164. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  165. datahub/metadata/schemas/PostKey.avsc +2 -1
  166. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  168. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  169. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  170. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  171. datahub/pydantic/__init__.py +0 -0
  172. datahub/pydantic/compat.py +58 -0
  173. datahub/sdk/__init__.py +30 -12
  174. datahub/sdk/_all_entities.py +1 -1
  175. datahub/sdk/_attribution.py +4 -0
  176. datahub/sdk/_shared.py +251 -16
  177. datahub/sdk/_utils.py +35 -0
  178. datahub/sdk/container.py +29 -5
  179. datahub/sdk/dataset.py +118 -20
  180. datahub/sdk/{_entity.py → entity.py} +24 -1
  181. datahub/sdk/entity_client.py +1 -1
  182. datahub/sdk/main_client.py +23 -0
  183. datahub/sdk/resolver_client.py +17 -29
  184. datahub/sdk/search_client.py +50 -0
  185. datahub/sdk/search_filters.py +374 -0
  186. datahub/specific/dataset.py +3 -4
  187. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  188. datahub/sql_parsing/schema_resolver.py +1 -1
  189. datahub/sql_parsing/split_statements.py +20 -13
  190. datahub/sql_parsing/sql_parsing_common.py +7 -0
  191. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  192. datahub/sql_parsing/sqlglot_utils.py +1 -4
  193. datahub/testing/check_sql_parser_result.py +5 -6
  194. datahub/testing/compare_metadata_json.py +7 -6
  195. datahub/testing/pytest_hooks.py +56 -0
  196. datahub/upgrade/upgrade.py +2 -2
  197. datahub/utilities/file_backed_collections.py +3 -14
  198. datahub/utilities/ingest_utils.py +106 -0
  199. datahub/utilities/mapping.py +1 -1
  200. datahub/utilities/memory_footprint.py +3 -2
  201. datahub/utilities/sentinels.py +22 -0
  202. datahub/utilities/unified_diff.py +5 -1
  203. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  204. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/cassandra/cassandra.py

@@ -1,19 +1,14 @@
 import dataclasses
 import json
 import logging
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Union
 
 from datahub.emitter.mce_builder import (
-    make_data_platform_urn,
-    make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
 )
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ContainerKey,
-    add_dataset_to_container,
-    gen_containers,
 )
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -31,6 +26,7 @@ from datahub.ingestion.source.cassandra.cassandra_api import (
     CassandraColumn,
     CassandraEntities,
     CassandraKeyspace,
+    CassandraSharedDatasetFields,
     CassandraTable,
     CassandraView,
 )
@@ -51,24 +47,21 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
-    SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
-    DataPlatformInstanceClass,
     DatasetLineageTypeClass,
-    DatasetPropertiesClass,
     FineGrainedLineageClass,
     FineGrainedLineageDownstreamTypeClass,
     FineGrainedLineageUpstreamTypeClass,
-    OtherSchemaClass,
-    SubTypesClass,
     UpstreamClass,
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
+from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 
 logger = logging.getLogger(__name__)
 
@@ -133,6 +126,13 @@ class CassandraSource(StatefulIngestionSourceBase):
     def get_workunits_internal(
         self,
     ) -> Iterable[MetadataWorkUnit]:
+        for metadata in self._get_metadata():
+            if isinstance(metadata, MetadataWorkUnit):
+                yield metadata
+            else:
+                yield from metadata.as_workunits()
+
+    def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         if not self.cassandra_api.authenticate():
             return
         keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces()
@@ -145,7 +145,7 @@ class CassandraSource(StatefulIngestionSourceBase):
                 self.report.report_dropped(keyspace_name)
                 continue
 
-            yield from self._generate_keyspace_container(keyspace)
+            yield self._generate_keyspace_container(keyspace)
 
             try:
                 yield from self._extract_tables_from_keyspace(keyspace_name)
@@ -170,21 +170,20 @@ class CassandraSource(StatefulIngestionSourceBase):
         if self.config.is_profiling_enabled():
             yield from self.profiler.get_workunits(self.cassandra_data)
 
-    def _generate_keyspace_container(
-        self, keyspace: CassandraKeyspace
-    ) -> Iterable[MetadataWorkUnit]:
+    def _generate_keyspace_container(self, keyspace: CassandraKeyspace) -> Container:
         keyspace_container_key = self._generate_keyspace_container_key(
             keyspace.keyspace_name
         )
-        yield from gen_containers(
-            container_key=keyspace_container_key,
-            name=keyspace.keyspace_name,
+
+        return Container(
+            keyspace_container_key,
+            display_name=keyspace.keyspace_name,
             qualified_name=keyspace.keyspace_name,
+            subtype=DatasetContainerSubTypes.KEYSPACE,
             extra_properties={
                 "durable_writes": str(keyspace.durable_writes),
                 "replication": json.dumps(keyspace.replication),
            },
-            sub_types=[DatasetContainerSubTypes.KEYSPACE],
         )
 
     def _generate_keyspace_container_key(self, keyspace_name: str) -> ContainerKey:
@@ -196,105 +195,55 @@ class CassandraSource(StatefulIngestionSourceBase):
         )
 
     # get all tables for a given keyspace, iterate over them to extract column metadata
-    def _extract_tables_from_keyspace(
-        self, keyspace_name: str
-    ) -> Iterable[MetadataWorkUnit]:
+    def _extract_tables_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]:
         self.cassandra_data.keyspaces.append(keyspace_name)
         tables: List[CassandraTable] = self.cassandra_api.get_tables(keyspace_name)
         for table in tables:
-            # define the dataset urn for this table to be used downstream
-            table_name: str = table.table_name
-            dataset_name: str = f"{keyspace_name}.{table_name}"
-
-            if not self.config.table_pattern.allowed(dataset_name):
-                self.report.report_dropped(dataset_name)
-                continue
-
-            self.cassandra_data.tables.setdefault(keyspace_name, []).append(table_name)
-            self.report.report_entity_scanned(dataset_name, ent_type="Table")
-
-            dataset_urn = make_dataset_urn_with_platform_instance(
-                platform=self.platform,
-                name=dataset_name,
-                env=self.config.env,
-                platform_instance=self.config.platform_instance,
+            dataset = self._generate_table(keyspace_name, table)
+            if dataset:
+                yield dataset
+
+    def _generate_table(
+        self, keyspace_name: str, table: CassandraTable
+    ) -> Optional[Dataset]:
+        table_name: str = table.table_name
+        dataset_name: str = f"{keyspace_name}.{table_name}"
+
+        self.report.report_entity_scanned(dataset_name, ent_type="Table")
+        if not self.config.table_pattern.allowed(dataset_name):
+            self.report.report_dropped(dataset_name)
+            return None
+
+        self.cassandra_data.tables.setdefault(keyspace_name, []).append(table_name)
+
+        schema_fields = None
+        try:
+            schema_fields = self._extract_columns_from_table(keyspace_name, table_name)
+        except Exception as e:
+            self.report.failure(
+                message="Failed to extract columns from table",
+                context=dataset_name,
+                exc=e,
            )
 
-            # 1. Extract columns from table, then construct and emit the schemaMetadata aspect.
-            try:
-                yield from self._extract_columns_from_table(
-                    keyspace_name, table_name, dataset_urn
-                )
-            except Exception as e:
-                self.report.failure(
-                    message="Failed to extract columns from table",
-                    context=table_name,
-                    exc=e,
-                )
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=StatusClass(removed=False),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=SubTypesClass(
-                    typeNames=[
-                        DatasetSubTypes.TABLE,
-                    ]
-                ),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DatasetPropertiesClass(
-                    name=table_name,
-                    qualifiedName=f"{keyspace_name}.{table_name}",
-                    description=table.comment,
-                    customProperties={
-                        "bloom_filter_fp_chance": str(table.bloom_filter_fp_chance),
-                        "caching": json.dumps(table.caching),
-                        "compaction": json.dumps(table.compaction),
-                        "compression": json.dumps(table.compression),
-                        "crc_check_chance": str(table.crc_check_chance),
-                        "dclocal_read_repair_chance": str(
-                            table.dclocal_read_repair_chance
-                        ),
-                        "default_time_to_live": str(table.default_time_to_live),
-                        "extensions": json.dumps(table.extensions),
-                        "gc_grace_seconds": str(table.gc_grace_seconds),
-                        "max_index_interval": str(table.max_index_interval),
-                        "min_index_interval": str(table.min_index_interval),
-                        "memtable_flush_period_in_ms": str(
-                            table.memtable_flush_period_in_ms
-                        ),
-                        "read_repair_chance": str(table.read_repair_chance),
-                        "speculative_retry": str(table.speculative_retry),
-                    },
-                ),
-            ).as_workunit()
-
-            yield from add_dataset_to_container(
-                container_key=self._generate_keyspace_container_key(keyspace_name),
-                dataset_urn=dataset_urn,
-            )
-
-            if self.config.platform_instance:
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn,
-                    aspect=DataPlatformInstanceClass(
-                        platform=make_data_platform_urn(self.platform),
-                        instance=make_dataplatform_instance_urn(
-                            self.platform, self.config.platform_instance
-                        ),
-                    ),
-                ).as_workunit()
+        return Dataset(
+            platform=self.platform,
+            name=dataset_name,
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+            subtype=DatasetSubTypes.TABLE,
+            parent_container=self._generate_keyspace_container_key(keyspace_name),
+            schema=schema_fields,
+            display_name=table_name,
+            qualified_name=dataset_name,
+            description=table.comment,
+            custom_properties=self._get_dataset_custom_props(table),
+        )
 
     # get all columns for a given table, iterate over them to extract column metadata
     def _extract_columns_from_table(
-        self, keyspace_name: str, table_name: str, dataset_urn: str
-    ) -> Iterable[MetadataWorkUnit]:
+        self, keyspace_name: str, table_name: str
+    ) -> Optional[List[SchemaField]]:
         column_infos: List[CassandraColumn] = self.cassandra_api.get_columns(
             keyspace_name, table_name
         )
@@ -305,147 +254,117 @@ class CassandraSource(StatefulIngestionSourceBase):
             self.report.report_warning(
                 message="Table has no columns, skipping", context=table_name
             )
-            return
+            return None
 
+        # Tricky: we also save the column info to a global store.
         jsonable_column_infos: List[Dict[str, Any]] = []
         for column in column_infos:
            self.cassandra_data.columns.setdefault(table_name, []).append(column)
            jsonable_column_infos.append(dataclasses.asdict(column))
 
-        schema_metadata: SchemaMetadata = SchemaMetadata(
-            schemaName=table_name,
-            platform=make_data_platform_urn(self.platform),
-            version=0,
-            hash="",
-            platformSchema=OtherSchemaClass(
-                rawSchema=json.dumps(jsonable_column_infos)
-            ),
-            fields=schema_fields,
-        )
-
-        yield MetadataChangeProposalWrapper(
-            entityUrn=dataset_urn,
-            aspect=schema_metadata,
-        ).as_workunit()
+        return schema_fields
 
-    def _extract_views_from_keyspace(
-        self, keyspace_name: str
-    ) -> Iterable[MetadataWorkUnit]:
+    def _extract_views_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]:
         views: List[CassandraView] = self.cassandra_api.get_views(keyspace_name)
         for view in views:
-            view_name: str = view.view_name
-            dataset_name: str = f"{keyspace_name}.{view_name}"
-            self.report.report_entity_scanned(dataset_name)
-            dataset_urn: str = make_dataset_urn_with_platform_instance(
-                platform=self.platform,
-                name=dataset_name,
-                env=self.config.env,
-                platform_instance=self.config.platform_instance,
+            dataset = self._generate_view(keyspace_name, view)
+            if dataset:
+                yield dataset
+
+    def _generate_view(
+        self, keyspace_name: str, view: CassandraView
+    ) -> Optional[Dataset]:
+        view_name: str = view.view_name
+        dataset_name: str = f"{keyspace_name}.{view_name}"
+
+        self.report.report_entity_scanned(dataset_name, ent_type="View")
+        if not self.config.table_pattern.allowed(dataset_name):
+            # TODO: Maybe add a view_pattern instead of reusing table_pattern?
+            self.report.report_dropped(dataset_name)
+            return None
+
+        schema_fields = None
+        try:
+            schema_fields = self._extract_columns_from_table(keyspace_name, view_name)
+        except Exception as e:
+            self.report.failure(
+                message="Failed to extract columns from views",
+                context=view_name,
+                exc=e,
            )
 
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=StatusClass(removed=False),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=SubTypesClass(
-                    typeNames=[
-                        DatasetSubTypes.VIEW,
-                    ]
-                ),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=ViewPropertiesClass(
+        dataset = Dataset(
+            platform=self.platform,
+            name=dataset_name,
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+            subtype=DatasetSubTypes.VIEW,
+            parent_container=self._generate_keyspace_container_key(keyspace_name),
+            schema=schema_fields,
+            display_name=view_name,
+            qualified_name=dataset_name,
+            description=view.comment,
+            custom_properties=self._get_dataset_custom_props(view),
+            extra_aspects=[
+                ViewPropertiesClass(
                    materialized=True,
                    viewLogic=view.where_clause,  # Use the WHERE clause as view logic
                    viewLanguage="CQL",  # Use "CQL" as the language
                ),
-            ).as_workunit()
-
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DatasetPropertiesClass(
-                    name=view_name,
-                    qualifiedName=f"{keyspace_name}.{view_name}",
-                    description=view.comment,
-                    customProperties={
-                        "bloom_filter_fp_chance": str(view.bloom_filter_fp_chance),
-                        "caching": json.dumps(view.caching),
-                        "compaction": json.dumps(view.compaction),
-                        "compression": json.dumps(view.compression),
-                        "crc_check_chance": str(view.crc_check_chance),
-                        "include_all_columns": str(view.include_all_columns),
-                        "dclocal_read_repair_chance": str(
-                            view.dclocal_read_repair_chance
-                        ),
-                        "default_time_to_live": str(view.default_time_to_live),
-                        "extensions": json.dumps(view.extensions),
-                        "gc_grace_seconds": str(view.gc_grace_seconds),
-                        "max_index_interval": str(view.max_index_interval),
-                        "min_index_interval": str(view.min_index_interval),
-                        "memtable_flush_period_in_ms": str(
-                            view.memtable_flush_period_in_ms
-                        ),
-                        "read_repair_chance": str(view.read_repair_chance),
-                        "speculative_retry": str(view.speculative_retry),
-                    },
-                ),
-            ).as_workunit()
+            ],
+        )
 
-            try:
-                yield from self._extract_columns_from_table(
-                    keyspace_name, view_name, dataset_urn
-                )
-            except Exception as e:
-                self.report.failure(
-                    message="Failed to extract columns from views",
-                    context=view_name,
-                    exc=e,
+        # Construct and emit lineage off of 'base_table_name'
+        # NOTE: we don't need to use 'base_table_id' since table is always in same keyspace, see https://docs.datastax.com/en/cql-oss/3.3/cql/cql_reference/cqlCreateMaterializedView.html#cqlCreateMaterializedView__keyspace-name
+        upstream_urn: str = make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=f"{keyspace_name}.{view.base_table_name}",
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+        )
+        fineGrainedLineages = self.get_upstream_fields_of_field_in_datasource(
+            view_name, str(dataset.urn), upstream_urn
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    dataset=upstream_urn,
+                    type=DatasetLineageTypeClass.VIEW,
                )
+            ],
+            fineGrainedLineages=fineGrainedLineages,
+        )
 
-            # Construct and emit lineage off of 'base_table_name'
-            # NOTE: we don't need to use 'base_table_id' since table is always in same keyspace, see https://docs.datastax.com/en/cql-oss/3.3/cql/cql_reference/cqlCreateMaterializedView.html#cqlCreateMaterializedView__keyspace-name
-            upstream_urn: str = make_dataset_urn_with_platform_instance(
-                platform=self.platform,
-                name=f"{keyspace_name}.{view.table_name}",
-                env=self.config.env,
-                platform_instance=self.config.platform_instance,
-            )
-            fineGrainedLineages = self.get_upstream_fields_of_field_in_datasource(
-                view_name, dataset_urn, upstream_urn
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=UpstreamLineageClass(
-                    upstreams=[
-                        UpstreamClass(
-                            dataset=upstream_urn,
-                            type=DatasetLineageTypeClass.VIEW,
-                        )
-                    ],
-                    fineGrainedLineages=fineGrainedLineages,
-                ),
-            ).as_workunit()
-
-            yield from add_dataset_to_container(
-                container_key=self._generate_keyspace_container_key(keyspace_name),
-                dataset_urn=dataset_urn,
+        dataset.set_upstreams(upstream_lineage)
+
+        return dataset
+
+    def _get_dataset_custom_props(
+        self, dataset: CassandraSharedDatasetFields
+    ) -> Dict[str, str]:
+        props = {
+            "bloom_filter_fp_chance": str(dataset.bloom_filter_fp_chance),
+            "caching": json.dumps(dataset.caching),
+            "compaction": json.dumps(dataset.compaction),
+            "compression": json.dumps(dataset.compression),
+            "crc_check_chance": str(dataset.crc_check_chance),
+            "dclocal_read_repair_chance": str(dataset.dclocal_read_repair_chance),
+            "default_time_to_live": str(dataset.default_time_to_live),
+            "extensions": json.dumps(dataset.extensions),
+            "gc_grace_seconds": str(dataset.gc_grace_seconds),
+            "max_index_interval": str(dataset.max_index_interval),
+            "min_index_interval": str(dataset.min_index_interval),
+            "memtable_flush_period_in_ms": str(dataset.memtable_flush_period_in_ms),
+            "read_repair_chance": str(dataset.read_repair_chance),
+            "speculative_retry": str(dataset.speculative_retry),
+        }
+        if isinstance(dataset, CassandraView):
+            props.update(
+                {
+                    "include_all_columns": str(dataset.include_all_columns),
+                }
            )
-
-            if self.config.platform_instance:
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn,
-                    aspect=DataPlatformInstanceClass(
-                        platform=make_data_platform_urn(self.platform),
-                        instance=make_dataplatform_instance_urn(
-                            self.platform, self.config.platform_instance
-                        ),
-                    ),
-                ).as_workunit()
+        return props
 
     def get_upstream_fields_of_field_in_datasource(
         self, table_name: str, dataset_urn: str, upstream_urn: str
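
The cassandra.py hunks above replace hand-assembled MetadataChangeProposalWrapper work units with the new datahub.sdk entity classes: the source now yields Container and Dataset objects and flattens them into work units inside get_workunits_internal. A minimal sketch of that dispatch pattern, restated as a standalone helper (the function name and example values are illustrative; only the classes, keyword arguments, and as_workunits() call come from the diff above):

```python
from typing import Iterable, Union

from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.sdk.dataset import Dataset
from datahub.sdk.entity import Entity


def flatten_to_workunits(
    stream: Iterable[Union[MetadataWorkUnit, Entity]],
) -> Iterable[MetadataWorkUnit]:
    # Mirrors CassandraSource.get_workunits_internal: SDK entities expand into
    # one work unit per aspect, plain work units pass through unchanged.
    for item in stream:
        if isinstance(item, MetadataWorkUnit):
            yield item
        else:
            yield from item.as_workunits()


# An entity built with a subset of the keyword arguments used in _generate_table
# above; the platform and name values here are placeholders.
example = Dataset(
    platform="cassandra",
    name="my_keyspace.my_table",
    env="PROD",
    display_name="my_table",
    qualified_name="my_keyspace.my_table",
    description="Illustrative table",
)
workunits = list(flatten_to_workunits([example]))
```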
datahub/ingestion/source/cassandra/cassandra_api.py

@@ -23,9 +23,9 @@ class CassandraKeyspace:
 
 
 @dataclass
-class CassandraTable:
+class CassandraSharedDatasetFields:
     keyspace_name: str
-    table_name: str
+
     bloom_filter_fp_chance: Optional[float]
     caching: Optional[Dict[str, str]]
     comment: Optional[str]
@@ -43,6 +43,11 @@ class CassandraTable:
     speculative_retry: Optional[str]
 
 
+@dataclass
+class CassandraTable(CassandraSharedDatasetFields):
+    table_name: str
+
+
 @dataclass
 class CassandraColumn:
     keyspace_name: str
@@ -55,8 +60,10 @@ class CassandraColumn:
 
 
 @dataclass
-class CassandraView(CassandraTable):
+class CassandraView(CassandraSharedDatasetFields):
     view_name: str
+
+    base_table_name: str
     include_all_columns: Optional[bool]
     where_clause: str = ""
 
@@ -152,7 +159,8 @@ class CassandraAPI:
             self.report.failure(message="Failed to authenticate to Cassandra", exc=e)
             return False
 
-    def get(self, query: str, parameters: Optional[List] = []) -> List:
+    def get(self, query: str, parameters: Optional[List] = None) -> List:
+        parameters = parameters or []
         if not self._cassandra_session:
             return []
 
@@ -261,7 +269,7 @@ class CassandraAPI:
             views = self.get(CassandraQueries.GET_VIEWS_QUERY, [keyspace_name])
             view_list = [
                 CassandraView(
-                    table_name=row.base_table_name,
+                    base_table_name=row.base_table_name,
                     keyspace_name=row.keyspace_name,
                     view_name=row.view_name,
                     bloom_filter_fp_chance=row.bloom_filter_fp_chance,
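
The get() signature change above fixes the classic mutable-default-argument pitfall: a list created in the def line is built once and shared across every call. A small self-contained illustration of the problem and of the `parameters or []` idiom the patch adopts (not DataHub code):

```python
from typing import List, Optional


def broken(values: List[int] = []) -> List[int]:
    # The default list is created once, at function-definition time, and is
    # shared by every call, so state leaks between calls.
    values.append(1)
    return values


def fixed(values: Optional[List[int]] = None) -> List[int]:
    values = values or []  # same idiom as CassandraAPI.get() after the change
    values.append(1)
    return values


assert broken() == [1]
assert broken() == [1, 1]  # the shared default has accumulated state
assert fixed() == [1]
assert fixed() == [1]  # a fresh list on every call
```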
datahub/ingestion/source/common/gcp_credentials_config.py (new file)

@@ -0,0 +1,53 @@
+import json
+import tempfile
+from typing import Any, Dict, Optional
+
+from pydantic import Field, root_validator
+
+from datahub.configuration import ConfigModel
+from datahub.configuration.validate_multiline_string import pydantic_multiline_string
+
+
+class GCPCredential(ConfigModel):
+    project_id: Optional[str] = Field(description="Project id to set the credentials")
+    private_key_id: str = Field(description="Private key id")
+    private_key: str = Field(
+        description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
+    )
+    client_email: str = Field(description="Client email")
+    client_id: str = Field(description="Client Id")
+    auth_uri: str = Field(
+        default="https://accounts.google.com/o/oauth2/auth",
+        description="Authentication uri",
+    )
+    token_uri: str = Field(
+        default="https://oauth2.googleapis.com/token", description="Token uri"
+    )
+    auth_provider_x509_cert_url: str = Field(
+        default="https://www.googleapis.com/oauth2/v1/certs",
+        description="Auth provider x509 certificate url",
+    )
+    type: str = Field(default="service_account", description="Authentication type")
+    client_x509_cert_url: Optional[str] = Field(
+        default=None,
+        description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email",
+    )
+
+    _fix_private_key_newlines = pydantic_multiline_string("private_key")
+
+    @root_validator(skip_on_failure=True)
+    def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        if values.get("client_x509_cert_url") is None:
+            values["client_x509_cert_url"] = (
+                f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}"
+            )
+        return values
+
+    def create_credential_temp_file(self, project_id: Optional[str] = None) -> str:
+        configs = self.dict()
+        if project_id:
+            configs["project_id"] = project_id
+        with tempfile.NamedTemporaryFile(delete=False) as fp:
+            cred_json = json.dumps(configs, indent=4, separators=(",", ": "))
+            fp.write(cred_json.encode())
+            return fp.name
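
GCPCredential mirrors the fields of a GCP service-account key, and create_credential_temp_file() writes them back out as a JSON key file on disk. A hedged usage sketch: the field values below are placeholders, and pointing GOOGLE_APPLICATION_CREDENTIALS at the temp file is one common way a caller might consume it, not something this module does itself.

```python
import os

from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential

credential = GCPCredential(
    project_id="my-project",  # placeholder values throughout
    private_key_id="abc123",
    private_key="-----BEGIN PRIVATE KEY-----\nfake\n-----END PRIVATE KEY-----\n",
    client_email="svc@my-project.iam.gserviceaccount.com",
    client_id="1234567890",
)

# client_x509_cert_url is filled in by the root validator when omitted.
key_file = credential.create_credential_temp_file(project_id="my-project")
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_file
```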
datahub/ingestion/source/common/subtypes.py

@@ -60,8 +60,15 @@ class BIContainerSubTypes(StrEnum):
     MODE_COLLECTION = "Collection"
 
 
+class FlowContainerSubTypes(StrEnum):
+    MSSQL_JOB = "Job"
+    MSSQL_PROCEDURE_CONTAINER = "Procedures Container"
+
+
 class JobContainerSubTypes(StrEnum):
     NIFI_PROCESS_GROUP = "Process Group"
+    MSSQL_JOBSTEP = "Job Step"
+    MSSQL_STORED_PROCEDURE = "Stored Procedure"
 
 
 class BIAssetSubTypes(StrEnum):
@@ -85,3 +92,8 @@ class BIAssetSubTypes(StrEnum):
     # SAP Analytics Cloud
     SAC_STORY = "Story"
     SAC_APPLICATION = "Application"
+
+
+class MLAssetSubTypes(StrEnum):
+    MLFLOW_TRAINING_RUN = "ML Training Run"
+    MLFLOW_EXPERIMENT = "ML Experiment"
datahub/ingestion/source/csv_enricher.py

@@ -314,7 +314,7 @@ class CSVEnricherSource(Source):
             "datajob": EditableDataJobPropertiesClass,
             "dataflow": EditableDataFlowPropertiesClass,
             "notebook": EditableNotebookPropertiesClass,
-        }.get(entityType, None)
+        }.get(entityType)
 
         if not entityClass:
             raise ValueError(
@@ -640,8 +640,8 @@ class CSVEnricherSource(Source):
                )
             except Exception as e:
                 raise ConfigurationError(
-                    f"Cannot read remote file {self.config.filename}, error:{e}"
-                )
+                    f"Cannot read remote file {self.config.filename}: {e}"
+                ) from e
         else:
             with open(pathlib.Path(self.config.filename), encoding="utf-8-sig") as f:
                 rows = list(csv.DictReader(f, delimiter=self.config.delimiter))
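
The `raise ... from e` change above chains the original exception as __cause__, so the underlying read error stays visible in the traceback of the ConfigurationError. A tiny generic illustration (the ConfigurationError here is a local stand-in, not the datahub class):

```python
class ConfigurationError(Exception):
    """Stand-in for datahub.configuration.common.ConfigurationError."""


def read_remote(filename: str) -> str:
    try:
        raise OSError("connection reset")  # simulated read failure
    except Exception as e:
        # "from e" records the original exception as __cause__, so both
        # tracebacks appear when the error is logged.
        raise ConfigurationError(f"Cannot read remote file {filename}: {e}") from e


try:
    read_remote("https://example.com/enrichment.csv")
except ConfigurationError as err:
    assert isinstance(err.__cause__, OSError)
```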
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -454,10 +454,8 @@ class PathSpec(ConfigModel):
                 return None
             partition = partition_split[0]
             # If partition is in the form of /value1/value2/value3 we infer it from the path and assign partition_0, partition_1, partition_2 etc
-            num = 0
-            for partition_value in partition.split("/"):
+            for num, partition_value in enumerate(partition.split("/")):
                 partition_keys.append((f"partition_{num}", partition_value))
-                num += 1
             return partition_keys
 
         return None
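
The path_spec.py change swaps a manual counter for enumerate(), which yields (index, value) pairs directly. A standalone sketch of the resulting partition-key shape (the path value is made up):

```python
partition = "2024/03/15"
partition_keys = [
    (f"partition_{num}", value) for num, value in enumerate(partition.split("/"))
]
assert partition_keys == [
    ("partition_0", "2024"),
    ("partition_1", "03"),
    ("partition_2", "15"),
]
```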