acryl-datahub 0.15.0.2rc8-py3-none-any.whl → 0.15.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=H_T6wHYf0kR8j3IPqRxXJTiVy6-s7Wx9qPis7M9HDuM,576
+ datahub/__init__.py,sha256=W_09oIthIpoet0P4t-RgCWaJ-k83wzO6HCCmtceQw44,573
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=IMtLWvGuiqoUSnNaCaFjhd86NHwuXSWXp2kUL-xDkk0,7950
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -61,7 +61,7 @@ datahub/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/cli/check_cli.py,sha256=9dXNyzZayHeoFjwFjLkMVyx6DiCZfeESyI-sYtGA6bE,12850
  datahub/cli/cli_utils.py,sha256=onbG7z9hIm0zCAm0a2ulTOsHC_NVkdIsbg__EMj02DQ,13540
  datahub/cli/config_utils.py,sha256=yuXw7RzpRY5x_-MAoqWbv46qUkIeRNAJL4_OeJpYdBE,4879
- datahub/cli/delete_cli.py,sha256=rJpyQhuRb_BnA1Fyot3Yu6-_x_3CoSsgjrpyJJCwEJY,23050
+ datahub/cli/delete_cli.py,sha256=oQ4Yy6hxZHcl67MYJiQumLs_8QmFEj7SPZFzxFXvDk8,23481
  datahub/cli/docker_check.py,sha256=rED4wHXqxcQ_qNFyIgFEZ85BHT9ZTE5YC-oUKqbRqi0,9432
  datahub/cli/docker_cli.py,sha256=w9ZQMRVlHwfJI2XDe7mO0lwnT7-dZoK6tPadSMgwEM8,36493
  datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
@@ -296,9 +296,9 @@ datahub/ingestion/source/dynamodb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
  datahub/ingestion/source/dynamodb/data_reader.py,sha256=vC77KpcP8LJN0g8wsPRDVw4sebv0ZWIP3tJkEIHaomA,3120
  datahub/ingestion/source/dynamodb/dynamodb.py,sha256=wcEQSfQak45yPNZN7pCUEQFmjyWCpqRk1WjJJz9E2Go,22395
  datahub/ingestion/source/fivetran/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/fivetran/config.py,sha256=kTfsu7oC4BSzkcFtetZr0UmiQ-B2af-_jbNYSFbDim4,8749
+ datahub/ingestion/source/fivetran/config.py,sha256=BP3KRfAQ6H5qyEeJNu9vNfZNwLoyj4Tl2kXiLVR5DNM,9027
  datahub/ingestion/source/fivetran/data_classes.py,sha256=ecdUJH5BEze0yv-uFpKWPNaNmV1gORDA2XMFk0zhcBw,595
- datahub/ingestion/source/fivetran/fivetran.py,sha256=mJ3gi4LWYqul0NyHdZ0U4fDv3WuKEl_yxc2oOd3q6bw,13318
+ datahub/ingestion/source/fivetran/fivetran.py,sha256=CVJhW7_os5BTRlzaUX2KOK6CkAVJ0mWQtgTnE6F3fhE,13760
  datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP_CyAT5Cian2N4a-lb8x1NKHk,12776
  datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -307,7 +307,7 @@ datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeN
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=VbZ-Xzryl5TMRapu7nlxlsXS8T8lFZcHK9AJnEadJ8Q,11111
  datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=_oTXN0fzB4kYyFclah9X_1ds32bLayQyyWgoPeHQMw4,12923
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/gcs/gcs_source.py,sha256=1cKnFR2s7JhNWklvB_XCrqDaFNG4RW3Itj8GSXF1s_A,6210
+ datahub/ingestion/source/gcs/gcs_source.py,sha256=5EZkrDqjRNQz_aUL1MLp0PTFm0Ztubmk0NYJGZTRLjU,6276
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
  datahub/ingestion/source/git/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvkYGr9judHJFsOk,4143
@@ -392,7 +392,7 @@ datahub/ingestion/source/redshift/exception.py,sha256=dxzYUIv5B_FAWhOuzG2u5We7FX
  datahub/ingestion/source/redshift/lineage.py,sha256=bUy0uJowrqSc33Z50fIxFlJkyhe-OPM_qgPh-smSTgM,43983
  datahub/ingestion/source/redshift/lineage_v2.py,sha256=OcVW_27sSaZOYZPTd2j-LS9SzFQ1kXz6cMzM2ZDWhJQ,16751
  datahub/ingestion/source/redshift/profile.py,sha256=T4H79ycq2tPobLM1tTLRtu581Qa8LlKxEok49m0AirU,4294
- datahub/ingestion/source/redshift/query.py,sha256=bY1D9RoOHaw89LgcXal7GYlJN0RG7PxXRRC-YKIdC8E,43105
+ datahub/ingestion/source/redshift/query.py,sha256=X0KlDPzM68j0SYKXhq50DkLbFUIbGuPmGCYYmr8E0v0,44353
  datahub/ingestion/source/redshift/redshift.py,sha256=x9dKocJdGPaNs2fRdaddaBtZNxmTJFwYDhXY5nl_5zM,44444
  datahub/ingestion/source/redshift/redshift_data_reader.py,sha256=zc69jwXHdF-w8J4Hq-ZQ6BjHQ75Ij2iNDMpoRJlcmlU,1724
  datahub/ingestion/source/redshift/redshift_schema.py,sha256=9IYeUsnISenq3eVB3k-s7zK8nInWDAYViFnDrNjtkb0,19149
@@ -990,8 +990,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.2rc8.dist-info/METADATA,sha256=qGt6zoYtVHkPYg1T2zfPy8vfb2R1mrre9uADpF_lrQ8,173250
- acryl_datahub-0.15.0.2rc8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- acryl_datahub-0.15.0.2rc8.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.2rc8.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.2rc8.dist-info/RECORD,,
+ acryl_datahub-0.15.0.3.dist-info/METADATA,sha256=jiS4oA2DAbgkw-RvujSYKPpN8mEjXv5qmPywbUU7h9M,173241
+ acryl_datahub-0.15.0.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ acryl_datahub-0.15.0.3.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.3.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.2rc8"
+ __version__ = "0.15.0.3"
 
 
  def is_dev_mode() -> bool:
datahub/cli/delete_cli.py CHANGED
@@ -265,6 +265,11 @@ def undo_by_filter(
  type=str,
  help="Urn of the entity to delete, for single entity deletion",
  )
+ @click.option(
+ "--urn-file",
+ required=False,
+ help="Path of file with urns (one per line) to be deleted",
+ )
  @click.option(
  "-a",
  "--aspect",
@@ -353,6 +358,7 @@ def undo_by_filter(
  @telemetry.with_telemetry()
  def by_filter(
  urn: Optional[str],
+ urn_file: Optional[str],
  aspect: Optional[str],
  force: bool,
  soft: bool,
@@ -373,6 +379,7 @@ def by_filter(
  # Validate the cli arguments.
  _validate_user_urn_and_filters(
  urn=urn,
+ urn_file=urn_file,
  entity_type=entity_type,
  platform=platform,
  env=env,
@@ -429,6 +436,12 @@ def by_filter(
  batch_size=batch_size,
  )
  )
+ elif urn_file:
+ with open(urn_file, "r") as r:
+ urns = []
+ for line in r.readlines():
+ urn = line.strip().strip('"')
+ urns.append(urn)
  else:
  urns = list(
  graph.get_urns_by_filter(
@@ -537,6 +550,7 @@ def _delete_urns_parallel(
 
  def _validate_user_urn_and_filters(
  urn: Optional[str],
+ urn_file: Optional[str],
  entity_type: Optional[str],
  platform: Optional[str],
  env: Optional[str],
@@ -549,9 +563,9 @@ def _validate_user_urn_and_filters(
  raise click.UsageError(
  "You cannot provide both an urn and a filter rule (entity-type / platform / env / query)."
  )
- elif not urn and not (entity_type or platform or env or query):
+ elif not urn and not urn_file and not (entity_type or platform or env or query):
  raise click.UsageError(
- "You must provide either an urn or at least one filter (entity-type / platform / env / query) in order to delete entities."
+ "You must provide either an urn or urn_file or at least one filter (entity-type / platform / env / query) in order to delete entities."
  )
  elif query:
  logger.warning(
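
Note: the new `--urn-file` branch above builds the urn list directly from the given file, one urn per line, stripping surrounding whitespace and double quotes. A minimal sketch of that parsing, outside the CLI (the file name and urn values below are made up for illustration):

    # Write an illustrative urn file: one urn per line, quotes optional.
    with open("urns.txt", "w") as f:
        f.write('"urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)"\n')
        f.write("urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)\n")

    # Same parsing as the new `elif urn_file:` branch in by_filter().
    urns = []
    with open("urns.txt", "r") as r:
        for line in r.readlines():
            urns.append(line.strip().strip('"'))

    print(urns)  # both entries come out as bare urns, quotes and newlines removed
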
datahub/ingestion/source/fivetran/config.py CHANGED
@@ -167,6 +167,10 @@ class PlatformDetail(ConfigModel):
  description="The database that all assets produced by this connector belong to. "
  "For destinations, this defaults to the fivetran log config's database.",
  )
+ include_schema_in_urn: bool = pydantic.Field(
+ default=True,
+ description="Include schema in the dataset URN. In some cases, the schema is not relevant to the dataset URN and Fivetran sets it to the source and destination table names in the connector.",
+ )
 
 
  class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
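
The new `include_schema_in_urn` flag on `PlatformDetail` is consumed in fivetran.py below: when it is disabled, the schema prefix is split off the Fivetran table identifier before the dataset URN is built. A minimal sketch of that behaviour (the table name is illustrative):

    # Fivetran reports lineage tables as "<schema>.<table>"; the value below is made up.
    source_table = "postgres_public.employee"

    # include_schema_in_urn=True (default): keep the full identifier.
    print(source_table)                   # postgres_public.employee

    # include_schema_in_urn=False: drop everything before the first dot,
    # mirroring lineage.source_table.split(".", 1)[1] in FivetranSource.
    print(source_table.split(".", 1)[1])  # employee

The related change further down also excludes boolean fields such as this flag from the source.* / destination.* job properties emitted for lineage.
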
datahub/ingestion/source/fivetran/fivetran.py CHANGED
@@ -119,21 +119,31 @@ class FivetranSource(StatefulIngestionSourceBase):
  )
 
  for lineage in connector.lineage:
+ source_table = (
+ lineage.source_table
+ if source_details.include_schema_in_urn
+ else lineage.source_table.split(".", 1)[1]
+ )
  input_dataset_urn = DatasetUrn.create_from_ids(
  platform_id=source_details.platform,
  table_name=(
- f"{source_details.database.lower()}.{lineage.source_table}"
+ f"{source_details.database.lower()}.{source_table}"
  if source_details.database
- else lineage.source_table
+ else source_table
  ),
  env=source_details.env,
  platform_instance=source_details.platform_instance,
  )
  input_dataset_urn_list.append(input_dataset_urn)
 
+ destination_table = (
+ lineage.destination_table
+ if destination_details.include_schema_in_urn
+ else lineage.destination_table.split(".", 1)[1]
+ )
  output_dataset_urn = DatasetUrn.create_from_ids(
  platform_id=destination_details.platform,
- table_name=f"{destination_details.database.lower()}.{lineage.destination_table}",
+ table_name=f"{destination_details.database.lower()}.{destination_table}",
  env=destination_details.env,
  platform_instance=destination_details.platform_instance,
  )
@@ -176,12 +186,12 @@ class FivetranSource(StatefulIngestionSourceBase):
  **{
  f"source.{k}": str(v)
  for k, v in source_details.dict().items()
- if v is not None
+ if v is not None and not isinstance(v, bool)
  },
  **{
  f"destination.{k}": str(v)
  for k, v in destination_details.dict().items()
- if v is not None
+ if v is not None and not isinstance(v, bool)
  },
  )
 
datahub/ingestion/source/gcs/gcs_source.py CHANGED
@@ -88,6 +88,7 @@ class GCSSource(StatefulIngestionSourceBase):
  super().__init__(config, ctx)
  self.config = config
  self.report = GCSSourceReport()
+ self.platform: str = PLATFORM_GCS
  self.s3_source = self.create_equivalent_s3_source(ctx)
 
  @classmethod
@@ -135,7 +136,7 @@ class GCSSource(StatefulIngestionSourceBase):
 
  def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
  config = self.create_equivalent_s3_config()
- return self.s3_source_overrides(S3Source(config, ctx))
+ return self.s3_source_overrides(S3Source(config, PipelineContext(ctx.run_id)))
 
  def s3_source_overrides(self, source: S3Source) -> S3Source:
  source.source_config.platform = PLATFORM_GCS
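
A hedged reading of the `create_equivalent_s3_source` change above: the wrapped S3Source now receives its own PipelineContext carrying only the parent run id, rather than sharing the GCS pipeline's context object and whatever state hangs off it. A minimal sketch of the pattern, assuming the usual datahub.ingestion.api.common import path (the run id is illustrative):

    from datahub.ingestion.api.common import PipelineContext

    parent_ctx = PipelineContext(run_id="gcs-ingest-2024")  # illustrative run id
    child_ctx = PipelineContext(parent_ctx.run_id)          # fresh context, same run id

    assert child_ctx.run_id == parent_ctx.run_id
    assert child_ctx is not parent_ctx  # the embedded S3 source no longer shares the parent context
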
datahub/ingestion/source/redshift/query.py CHANGED
@@ -797,61 +797,91 @@ class RedshiftServerlessQuery(RedshiftCommonQuery):
  db_name: str, start_time: datetime, end_time: datetime
  ) -> str:
  return """
- SELECT
- distinct cluster,
- target_schema,
- target_table,
- username,
- source_schema,
- source_table,
- query_text AS ddl,
- start_time AS timestamp
- FROM
- (
- SELECT
- sti.schema AS target_schema,
- sti.table AS target_table,
- sti.database AS cluster,
- qi.table_id AS target_table_id,
- qi.query_id AS query_id,
- qi.start_time AS start_time
- FROM
- SYS_QUERY_DETAIL qi
- JOIN
- SVV_TABLE_INFO sti on sti.table_id = qi.table_id
- WHERE
- start_time >= '{start_time}' and
- start_time < '{end_time}' and
- cluster = '{db_name}' and
- step_name = 'insert'
- ) AS target_tables
- JOIN
- (
+ WITH queries AS (
  SELECT
- sti.schema AS source_schema,
- sti.table AS source_table,
- qs.table_id AS source_table_id,
- qs.query_id AS query_id,
- sui.user_name AS username,
- LISTAGG(qt."text") WITHIN GROUP (ORDER BY sequence) AS query_text
+ sti.database as cluster,
+ sti.schema AS "schema",
+ sti.table AS "table",
+ qs.table_id AS table_id,
+ qs.query_id as query_id,
+ qs.step_name as step_name,
+ sui.user_name as username,
+ source,
+ MIN(qs.start_time) as "timestamp" -- multiple duplicate records with start_time increasing slightly by miliseconds
  FROM
  SYS_QUERY_DETAIL qs
  JOIN
  SVV_TABLE_INFO sti ON sti.table_id = qs.table_id
  LEFT JOIN
- SYS_QUERY_TEXT qt ON qt.query_id = qs.query_id
- LEFT JOIN
  SVV_USER_INFO sui ON qs.user_id = sui.user_id
  WHERE
- qs.step_name = 'scan' AND
- qs.source = 'Redshift(local)' AND
- qt.sequence < 16 AND -- See https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext
- sti.database = '{db_name}' AND -- this was required to not retrieve some internal redshift tables, try removing to see what happens
- sui.user_name <> 'rdsdb' -- not entirely sure about this filter
- GROUP BY sti.schema, sti.table, qs.table_id, qs.query_id, sui.user_name
- ) AS source_tables ON target_tables.query_id = source_tables.query_id
- WHERE source_tables.source_table_id <> target_tables.target_table_id
- ORDER BY cluster, target_schema, target_table, start_time ASC
+ cluster = '{db_name}' AND
+ qs.user_id <> 1 AND -- this is user 'rdsdb'
+ qs.start_time >= '{start_time}' AND
+ qs.start_time < '{end_time}'
+ GROUP BY cluster, "schema", "table", qs.table_id, query_id, step_name, username, source -- to be sure we are not making duplicates ourselves the list of group by must match whatever we use in "group by" and "where" of subsequent queries ("cluster" is already set to single value in this query)
+ ),
+ unique_query_text AS (
+ SELECT
+ query_id,
+ sequence,
+ text
+ FROM (
+ SELECT
+ query_id,
+ "sequence",
+ text,
+ ROW_NUMBER() OVER (
+ PARTITION BY query_id, sequence
+ ) as rn
+ FROM SYS_QUERY_TEXT
+ )
+ WHERE rn = 1
+ ),
+ scan_queries AS (
+ SELECT
+ "schema" as source_schema,
+ "table" as source_table,
+ table_id as source_table_id,
+ queries.query_id as query_id,
+ username,
+ LISTAGG(qt."text") WITHIN GROUP (ORDER BY sequence) AS query_text
+ FROM
+ "queries" LEFT JOIN
+ unique_query_text qt ON qt.query_id = queries.query_id
+ WHERE
+ source = 'Redshift(local)' AND
+ step_name = 'scan' AND
+ qt.sequence < 16 -- truncating query to not exceed Redshift limit on LISTAGG function (each sequence has at most 4k characters, limit is 64k, divided by 4k gives 16, starts count from 0)
+ GROUP BY source_schema, source_table, source_table_id, queries.query_id, username
+ ),
+ insert_queries AS (
+ SELECT
+ "schema" as target_schema,
+ "table" as target_table,
+ table_id as target_table_id,
+ query_id,
+ cluster,
+ min("timestamp") as "timestamp"
+ FROM
+ queries
+ WHERE
+ step_name = 'insert'
+ GROUP BY cluster, target_schema, target_table, target_table_id, query_id
+ )
+ SELECT
+ cluster,
+ target_schema,
+ target_table,
+ username,
+ source_schema,
+ source_table,
+ query_text AS ddl,
+ "timestamp"
+ FROM scan_queries
+ JOIN insert_queries on insert_queries.query_id = scan_queries.query_id
+ WHERE source_table_id <> target_table_id
+ ORDER BY cluster, target_schema, target_table, "timestamp" ASC;
  """.format(
  # We need the original database name for filtering
  db_name=db_name,