acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/redshift/redshift.py
+++ b/datahub/ingestion/source/redshift/redshift.py
@@ -33,7 +33,10 @@ from datahub.ingestion.api.source import (
     TestableSource,
     TestConnectionReport,
 )
-from datahub.ingestion.api.source_helpers import create_dataset_props_patch_builder
+from datahub.ingestion.api.source_helpers import (
+    auto_workunit,
+    create_dataset_props_patch_builder,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
@@ -45,6 +48,7 @@ from datahub.ingestion.source.common.subtypes import (
     DatasetSubTypes,
 )
 from datahub.ingestion.source.redshift.config import RedshiftConfig
+from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
 from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
 from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor
 from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2
@@ -52,6 +56,7 @@ from datahub.ingestion.source.redshift.profile import RedshiftProfiler
 from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
 from datahub.ingestion.source.redshift.redshift_schema import (
     RedshiftColumn,
+    RedshiftDatabase,
     RedshiftDataDictionary,
     RedshiftSchema,
     RedshiftTable,
@@ -150,76 +155,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     - Table, row, and column statistics via optional SQL profiling
     - Table lineage
     - Usage statistics
-
-    ### Prerequisites
-
-    This source needs to access system tables that require extra permissions.
-    To grant these permissions, alter your DataHub Redshift user as follows:
-    ```sql
-    ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED;
-    GRANT SELECT ON pg_catalog.svv_table_info to datahub_user;
-    GRANT SELECT ON pg_catalog.svl_user_info to datahub_user;
-    ```
-
-    :::note
-
-    Giving a user unrestricted access to system tables gives the user visibility to data generated by other users. For example, STL_QUERY and STL_QUERYTEXT contain the full text of INSERT, UPDATE, and DELETE statements.
-
-    :::
-
-    ### Lineage
-
-    There are multiple lineage collector implementations, as Redshift does not support table lineage out of the box.
-
-    #### stl_scan_based
-    The stl_scan based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) and [stl_scan](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_SCAN.html) system tables to
-    discover lineage between tables.
-    Pros:
-    - Fast
-    - Reliable
-
-    Cons:
-    - Does not work with Spectrum/external tables because those scans do not show up in the stl_scan table.
-    - If a table depends on a view, the view won't be listed as a dependency; instead, the table will be connected to the view's dependencies.
-
-    #### sql_based
-    The sql_based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) to discover all the insert queries
-    and uses SQL parsing to discover the dependencies.
-
-    Pros:
-    - Works with Spectrum tables
-    - Views are connected properly if a table depends on them
-
-    Cons:
-    - Slow
-    - Less reliable, as the query parser can fail on certain queries
-
-    #### mixed
-    Uses both collectors above, first applying the sql_based one and then the stl_scan_based one.
-
-    Pros:
-    - Works with Spectrum tables
-    - Views are connected properly if a table depends on them
-    - A bit more reliable than the sql_based one alone
-
-    Cons:
-    - Slow
-    - May be incorrect at times, as the query parser can fail on certain queries
-
-    :::note
-
-    The Redshift stl tables used for getting data lineage retain at most seven days of log history, and sometimes closer to 2-5 days. This means you cannot extract lineage from queries issued outside that window.
-
-    :::
-
-    ### Profiling
-    Profiling runs SQL queries on the Redshift cluster to get statistics about the tables. To be able to do that, the user needs to have read access to the tables that should be profiled.
-
-    If you don't want to grant read access to the tables, you can enable table-level profiling, which collects table statistics without reading the data.
-    ```yaml
-    profiling:
-        profile_table_level_only: true
-    ```
     """
 
     # TODO: Replace with standardized types in sql_types.py
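For reference, the options described by the removed docstring can be combined in a single ingestion recipe. A minimal sketch follows, expressed as a Python dict: `profile_table_level_only` comes straight from the docstring above, while `table_lineage_mode`, the connection values, and the sink are illustrative assumptions rather than anything taken from this release.

```python
# Hedged sketch of a Redshift ingestion recipe; fields marked as assumptions
# above may differ from the connector's actual config schema.
recipe = {
    "source": {
        "type": "redshift",
        "config": {
            "host_port": "my-cluster.example.us-east-1.redshift.amazonaws.com:5439",
            "database": "dev",
            "username": "datahub_user",
            "password": "${REDSHIFT_PASSWORD}",
            # Pick one of the lineage collectors described in the docstring.
            "table_lineage_mode": "mixed",
            # Table-level statistics only, without reading table data.
            "profiling": {"profile_table_level_only": True},
        },
    },
    "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
}
```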
@@ -330,6 +265,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         self.config: RedshiftConfig = config
         self.report: RedshiftReport = RedshiftReport()
         self.classification_handler = ClassificationHandler(self.config, self.report)
+        self.datashares_helper = RedshiftDatasharesHelper(
+            self.config, self.report, self.ctx.graph
+        )
         self.platform = "redshift"
         self.domain_registry = None
         if self.config.domain:
@@ -361,6 +299,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             is_serverless=self.config.is_serverless
         )
 
+        self.db: Optional[RedshiftDatabase] = None
         self.db_tables: Dict[str, Dict[str, List[RedshiftTable]]] = {}
         self.db_views: Dict[str, Dict[str, List[RedshiftView]]] = {}
         self.db_schemas: Dict[str, Dict[str, RedshiftSchema]] = {}
@@ -424,6 +363,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         database = self.config.database
         logger.info(f"Processing db {database}")
+
+        self.db = self.data_dictionary.get_database_details(connection, database)
+        self.report.is_shared_database = (
+            self.db is not None and self.db.is_shared_database()
+        )
         with self.report.new_stage(METADATA_EXTRACTION):
             self.db_tables[database] = defaultdict()
             self.db_views[database] = defaultdict()
@@ -563,7 +507,10 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         schema_columns: Dict[str, Dict[str, List[RedshiftColumn]]] = {}
         schema_columns[schema.name] = self.data_dictionary.get_columns_for_schema(
-            conn=connection, schema=schema
+            conn=connection,
+            database=database,
+            schema=schema,
+            is_shared_database=self.report.is_shared_database,
         )
 
         if self.config.include_tables:
@@ -883,10 +830,14 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             domain_config=self.config.domain,
         )
 
-    def cache_tables_and_views(self, connection, database):
+    def cache_tables_and_views(
+        self, connection: redshift_connector.Connection, database: str
+    ) -> None:
         tables, views = self.data_dictionary.get_tables_and_views(
             conn=connection,
+            database=database,
             skip_external_tables=self.config.skip_external_tables,
+            is_shared_database=self.report.is_shared_database,
         )
         for schema in tables:
             if not is_schema_allowed(
@@ -1029,6 +980,28 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         database: str,
         lineage_extractor: RedshiftSqlLineageV2,
     ) -> Iterable[MetadataWorkUnit]:
+        if self.config.include_share_lineage:
+            outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
+            yield from auto_workunit(
+                self.datashares_helper.to_platform_resource(list(outbound_shares))
+            )
+
+            if self.db and self.db.is_shared_database():
+                inbound_share = self.db.get_inbound_share()
+                if inbound_share is None:
+                    self.report.warning(
+                        title="Upstream lineage of inbound datashare will be missing",
+                        message="Database options do not contain sufficient information",
+                        context=f"Database: {database}, Options {self.db.options}",
+                    )
+                else:
+                    for known_lineage in self.datashares_helper.generate_lineage(
+                        inbound_share, self.get_all_tables()[database]
+                    ):
+                        lineage_extractor.aggregator.add(known_lineage)
+
+        # TODO: distinguish between definition level lineage and audit log based lineage.
+        # Definition level lineage should never be skipped
         if not self._should_ingest_lineage():
             return
 
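The new block above hands the datashare platform resources to `auto_workunit`, which wraps emitted metadata change proposals into `MetadataWorkUnit`s. Below is a minimal sketch of that helper in isolation, assuming a working DataHub Python environment; the URN and aspect are placeholders, not values this connector emits.

```python
# Hedged sketch: wrap a hand-built MCP into work units via auto_workunit.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.source_helpers import auto_workunit
from datahub.metadata.schema_classes import StatusClass

mcps = [
    MetadataChangeProposalWrapper(
        entityUrn=(
            "urn:li:dataset:(urn:li:dataPlatform:redshift,"
            "consumer_db.public.orders,PROD)"
        ),
        aspect=StatusClass(removed=False),  # placeholder aspect
    )
]
for wu in auto_workunit(mcps):
    print(wu.id)  # each MCP becomes one MetadataWorkUnit
```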
--- a/datahub/ingestion/source/redshift/redshift_schema.py
+++ b/datahub/ingestion/source/redshift/redshift_schema.py
@@ -1,7 +1,8 @@
 import logging
+import re
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple, Union
 
 import redshift_connector
 
@@ -41,6 +42,9 @@ class RedshiftTable(BaseTable):
     serde_parameters: Optional[str] = None
     last_altered: Optional[datetime] = None
 
+    def is_external_table(self) -> bool:
+        return self.type == "EXTERNAL_TABLE"
+
 
 @dataclass
 class RedshiftView(BaseTable):
@@ -51,6 +55,9 @@ class RedshiftView(BaseTable):
     size_in_bytes: Optional[int] = None
     rows_count: Optional[int] = None
 
+    def is_external_table(self) -> bool:
+        return self.type == "EXTERNAL_TABLE"
+
 
 @dataclass
 class RedshiftSchema:
@@ -59,8 +66,119 @@ class RedshiftSchema:
     type: str
     owner: Optional[str] = None
     option: Optional[str] = None
+    external_platform: Optional[str] = None
     external_database: Optional[str] = None
 
+    def is_external_schema(self) -> bool:
+        return self.type == "external"
+
+    def get_upstream_schema_name(self) -> Optional[str]:
+        """Gets the schema name from the external schema option.
+
+        Returns:
+            Optional[str]: The schema name from the external schema option
+            if this is an external schema and has a valid option format, None otherwise.
+        """
+
+        if not self.is_external_schema() or not self.option:
+            return None
+
+        # For external schema on redshift, option is in form
+        # {"SCHEMA":"tickit"}
+        schema_match = re.search(r'"SCHEMA"\s*:\s*"([^"]*)"', self.option)
+        if not schema_match:
+            return None
+        else:
+            return schema_match.group(1)
+
+
+@dataclass
+class PartialInboundDatashare:
+    share_name: str
+    producer_namespace_prefix: str
+    consumer_database: str
+
+    def get_description(self) -> str:
+        return (
+            f"Namespace Prefix {self.producer_namespace_prefix} Share {self.share_name}"
+        )
+
+
+@dataclass
+class OutboundDatashare:
+    share_name: str
+    producer_namespace: str
+    source_database: str
+
+    def get_key(self) -> str:
+        return f"{self.producer_namespace}.{self.share_name}"
+
+
+@dataclass
+class InboundDatashare:
+    share_name: str
+    producer_namespace: str
+    consumer_database: str
+
+    def get_key(self) -> str:
+        return f"{self.producer_namespace}.{self.share_name}"
+
+    def get_description(self) -> str:
+        return f"Namespace {self.producer_namespace} Share {self.share_name}"
+
+
+@dataclass
+class RedshiftDatabase:
+    name: str
+    type: str
+    options: Optional[str] = None
+
+    def is_shared_database(self) -> bool:
+        return self.type == "shared"
+
+    # NOTE: ideally options are in form
+    # {"datashare_name":"xxx","datashare_producer_account":"1234","datashare_producer_namespace":"yyy"}
+    # however due to varchar(128) type of database table that captures options
+    # we may receive only partial information about inbound share
+    def get_inbound_share(
+        self,
+    ) -> Optional[Union[InboundDatashare, PartialInboundDatashare]]:
+        if not self.is_shared_database() or not self.options:
+            return None
+
+        # Convert into single regex ??
+        share_name_match = re.search(r'"datashare_name"\s*:\s*"([^"]*)"', self.options)
+        namespace_match = re.search(
+            r'"datashare_producer_namespace"\s*:\s*"([^"]*)"', self.options
+        )
+        partial_namespace_match = re.search(
+            r'"datashare_producer_namespace"\s*:\s*"([^"]*)$', self.options
+        )
+
+        if not share_name_match:
+            # We will always at least get share name
+            return None
+
+        share_name = share_name_match.group(1)
+        if namespace_match:
+            return InboundDatashare(
+                share_name=share_name,
+                producer_namespace=namespace_match.group(1),
+                consumer_database=self.name,
+            )
+        elif partial_namespace_match:
+            return PartialInboundDatashare(
+                share_name=share_name,
+                producer_namespace_prefix=partial_namespace_match.group(1),
+                consumer_database=self.name,
+            )
+        else:
+            return PartialInboundDatashare(
+                share_name=share_name,
+                producer_namespace_prefix="",
+                consumer_database=self.name,
+            )
+
 
 @dataclass
 class RedshiftExtraTableMeta:
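To make the new parsing helpers concrete, here is a small illustration (not part of the release) that feeds made-up option strings through them; it assumes acryl-datahub 1.0.0 is installed so the classes added in this hunk are importable.

```python
# Illustration only: exercises the helpers added above with fabricated options.
from datahub.ingestion.source.redshift.redshift_schema import (
    InboundDatashare,
    PartialInboundDatashare,
    RedshiftDatabase,
    RedshiftSchema,
)

# External schema: the upstream schema name is pulled out of the option JSON.
schema = RedshiftSchema(
    database="dev",
    name="spectrum_schema",
    type="external",
    option='{"SCHEMA":"tickit"}',
)
assert schema.get_upstream_schema_name() == "tickit"

# Complete options: both the share name and producer namespace are recoverable.
full = RedshiftDatabase(
    name="consumer_db",
    type="shared",
    options='{"datashare_name":"sales_share","datashare_producer_account":"1234",'
    '"datashare_producer_namespace":"a1b2c3d4"}',
)
assert isinstance(full.get_inbound_share(), InboundDatashare)

# Options truncated by the varchar(128) column: only a namespace prefix survives.
truncated = RedshiftDatabase(
    name="consumer_db",
    type="shared",
    options='{"datashare_name":"sales_share","datashare_producer_namespace":"a1b2',
)
assert isinstance(truncated.get_inbound_share(), PartialInboundDatashare)
```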
@@ -141,13 +259,31 @@ class RedshiftDataDictionary:
 
         return [db[0] for db in dbs]
 
+    @staticmethod
+    def get_database_details(
+        conn: redshift_connector.Connection, database: str
+    ) -> Optional[RedshiftDatabase]:
+        cursor = RedshiftDataDictionary.get_query_result(
+            conn,
+            RedshiftCommonQuery.get_database_details(database),
+        )
+
+        row = cursor.fetchone()
+        if row is None:
+            return None
+        return RedshiftDatabase(
+            name=database,
+            type=row[1],
+            options=row[2],
+        )
+
     @staticmethod
     def get_schemas(
         conn: redshift_connector.Connection, database: str
     ) -> List[RedshiftSchema]:
         cursor = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_schemas.format(database_name=database),
+            RedshiftCommonQuery.list_schemas(database),
         )
 
         schemas = cursor.fetchall()
@@ -158,8 +294,8 @@ class RedshiftDataDictionary:
                 database=database,
                 name=schema[field_names.index("schema_name")],
                 type=schema[field_names.index("schema_type")],
-                owner=schema[field_names.index("schema_owner_name")],
                 option=schema[field_names.index("schema_option")],
+                external_platform=schema[field_names.index("external_platform")],
                 external_database=schema[field_names.index("external_database")],
             )
             for schema in schemas
@@ -202,7 +338,9 @@ class RedshiftDataDictionary:
     def get_tables_and_views(
         self,
         conn: redshift_connector.Connection,
+        database: str,
         skip_external_tables: bool = False,
+        is_shared_database: bool = False,
     ) -> Tuple[Dict[str, List[RedshiftTable]], Dict[str, List[RedshiftView]]]:
         tables: Dict[str, List[RedshiftTable]] = {}
         views: Dict[str, List[RedshiftView]] = {}
@@ -213,7 +351,11 @@ class RedshiftDataDictionary:
 
         cur = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_tables(skip_external_tables=skip_external_tables),
+            RedshiftCommonQuery.list_tables(
+                database=database,
+                skip_external_tables=skip_external_tables,
+                is_shared_database=is_shared_database,
+            ),
         )
         field_names = [i[0] for i in cur.description]
         db_tables = cur.fetchall()
@@ -358,11 +500,18 @@ class RedshiftDataDictionary:
 
     @staticmethod
     def get_columns_for_schema(
-        conn: redshift_connector.Connection, schema: RedshiftSchema
+        conn: redshift_connector.Connection,
+        database: str,
+        schema: RedshiftSchema,
+        is_shared_database: bool = False,
     ) -> Dict[str, List[RedshiftColumn]]:
         cursor = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_columns.format(schema_name=schema.name),
+            RedshiftCommonQuery.list_columns(
+                database_name=database,
+                schema_name=schema.name,
+                is_shared_database=is_shared_database,
+            ),
         )
 
         table_columns: Dict[str, List[RedshiftColumn]] = {}
@@ -508,3 +657,34 @@ class RedshiftDataDictionary:
                     start_time=row[field_names.index("start_time")],
                 )
             rows = cursor.fetchmany()
+
+    @staticmethod
+    def get_outbound_datashares(
+        conn: redshift_connector.Connection,
+    ) -> Iterable[OutboundDatashare]:
+        cursor = conn.cursor()
+        cursor.execute(RedshiftCommonQuery.list_outbound_datashares())
+        for item in cursor.fetchall():
+            yield OutboundDatashare(
+                share_name=item[1],
+                producer_namespace=item[2],
+                source_database=item[3],
+            )
+
+    # NOTE: this is not used right now as it requires superuser privilege
+    # We can use this in future if the permissions are lowered.
+    @staticmethod
+    def get_inbound_datashare(
+        conn: redshift_connector.Connection,
+        database: str,
+    ) -> Optional[InboundDatashare]:
+        cursor = conn.cursor()
+        cursor.execute(RedshiftCommonQuery.get_inbound_datashare(database))
+        item = cursor.fetchone()
+        if item:
+            return InboundDatashare(
+                share_name=item[1],
+                producer_namespace=item[2],
+                consumer_database=item[3],
+            )
+        return None
--- a/datahub/ingestion/source/redshift/report.py
+++ b/datahub/ingestion/source/redshift/report.py
@@ -60,5 +60,8 @@ class RedshiftReport(
     sql_aggregator: Optional[SqlAggregatorReport] = None
     lineage_phases_timer: Dict[str, PerfTimer] = field(default_factory=dict)
 
+    is_shared_database: bool = False
+    outbound_shares_count: Optional[int] = None
+
     def report_dropped(self, key: str) -> None:
         self.filtered.append(key)
--- a/datahub/ingestion/source/s3/config.py
+++ b/datahub/ingestion/source/s3/config.py
@@ -5,7 +5,9 @@ import pydantic
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
@@ -152,10 +154,8 @@ class DataLakeSourceConfig(
         return path_specs
 
     @pydantic.validator("platform", always=True)
-    def platform_valid(cls, platform: str, values: dict) -> str:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_valid(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
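The validator change above loosens the annotation to `Any` and drops the redundant `None` default in `values.get`. For readers unfamiliar with the pattern, here is a hedged, standalone sketch (not the project's code) of how pydantic v1-style validators fall back to a previously validated field.

```python
# Hedged sketch: `values` holds fields validated earlier in declaration order,
# so a validator can infer a value from another field, as platform_valid does.
import pydantic


class ExampleConfig(pydantic.BaseModel):
    path: str
    platform: str = ""

    @pydantic.validator("platform", always=True)
    def infer_platform(cls, v, values):
        # Prefer an explicit value, otherwise infer from the path scheme.
        inferred = "s3" if str(values.get("path", "")).startswith("s3://") else "file"
        return v or inferred


print(ExampleConfig(path="s3://bucket/prefix").platform)  # -> s3
```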
--- a/datahub/ingestion/source/s3/source.py
+++ b/datahub/ingestion/source/s3/source.py
@@ -834,7 +834,7 @@ class S3Source(StatefulIngestionSourceBase):
                 min=min,
             )
             folders.extend(folders_list)
-            if not path_spec.traversal_method == FolderTraversalMethod.ALL:
+            if path_spec.traversal_method != FolderTraversalMethod.ALL:
                 return folders
         if folders:
             return folders
@@ -847,7 +847,7 @@ class S3Source(StatefulIngestionSourceBase):
         path_spec: PathSpec,
         bucket: "Bucket",
         prefix: str,
-    ) -> List[Folder]:
+    ) -> Iterable[Folder]:
         """
         Retrieves all the folders in a path by listing all the files in the prefix.
         If the prefix is a full path then only that folder will be extracted.
@@ -877,51 +877,30 @@ class S3Source(StatefulIngestionSourceBase):
         s3_objects = (
             obj
             for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-            if _is_allowed_path(path_spec, f"s3://{obj.bucket_name}/{obj.key}")
+            if _is_allowed_path(
+                path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+            )
         )
-
-        partitions: List[Folder] = []
         grouped_s3_objects_by_dirname = groupby_unsorted(
             s3_objects,
             key=lambda obj: obj.key.rsplit("/", 1)[0],
         )
-        for key, group in grouped_s3_objects_by_dirname:
-            file_size = 0
-            creation_time = None
-            modification_time = None
-
-            for item in group:
-                file_size += item.size
-                if creation_time is None or item.last_modified < creation_time:
-                    creation_time = item.last_modified
-                if modification_time is None or item.last_modified > modification_time:
-                    modification_time = item.last_modified
-                    max_file = item
-
-            if modification_time is None:
-                logger.warning(
-                    f"Unable to find any files in the folder {key}. Skipping..."
-                )
-                continue
-
-            id = path_spec.get_partition_from_path(
-                self.create_s3_path(max_file.bucket_name, max_file.key)
+        for _, group in grouped_s3_objects_by_dirname:
+            max_file = max(group, key=lambda x: x.last_modified)
+            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)
+
+            # If partition_id is None, it means the folder is not a partition
+            partition_id = path_spec.get_partition_from_path(max_file_s3_path)
+
+            yield Folder(
+                partition_id=partition_id,
+                is_partition=bool(partition_id),
+                creation_time=min(obj.last_modified for obj in group),
+                modification_time=max_file.last_modified,
+                sample_file=max_file_s3_path,
+                size=sum(obj.size for obj in group),
             )
 
-            # If id is None, it means the folder is not a partition
-            partitions.append(
-                Folder(
-                    partition_id=id,
-                    is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
-                    modification_time=modification_time,
-                    sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
-                    size=file_size,
-                )
-            )
-
-        return partitions
-
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
@@ -1000,7 +979,7 @@ class S3Source(StatefulIngestionSourceBase):
                 min=True,
             )
             dirs_to_process.append(dirs_to_process_min[0])
-        folders = []
+        folders: List[Folder] = []
         for dir in dirs_to_process:
             logger.info(f"Getting files from folder: {dir}")
             prefix_to_process = urlparse(dir).path.lstrip("/")