acryl-datahub 0.15.0.2rc8-py3-none-any.whl → 0.15.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=H_T6wHYf0kR8j3IPqRxXJTiVy6-s7Wx9qPis7M9HDuM,576
+ datahub/__init__.py,sha256=W_09oIthIpoet0P4t-RgCWaJ-k83wzO6HCCmtceQw44,573
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=IMtLWvGuiqoUSnNaCaFjhd86NHwuXSWXp2kUL-xDkk0,7950
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -61,7 +61,7 @@ datahub/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/cli/check_cli.py,sha256=9dXNyzZayHeoFjwFjLkMVyx6DiCZfeESyI-sYtGA6bE,12850
  datahub/cli/cli_utils.py,sha256=onbG7z9hIm0zCAm0a2ulTOsHC_NVkdIsbg__EMj02DQ,13540
  datahub/cli/config_utils.py,sha256=yuXw7RzpRY5x_-MAoqWbv46qUkIeRNAJL4_OeJpYdBE,4879
- datahub/cli/delete_cli.py,sha256=rJpyQhuRb_BnA1Fyot3Yu6-_x_3CoSsgjrpyJJCwEJY,23050
+ datahub/cli/delete_cli.py,sha256=oQ4Yy6hxZHcl67MYJiQumLs_8QmFEj7SPZFzxFXvDk8,23481
  datahub/cli/docker_check.py,sha256=rED4wHXqxcQ_qNFyIgFEZ85BHT9ZTE5YC-oUKqbRqi0,9432
  datahub/cli/docker_cli.py,sha256=w9ZQMRVlHwfJI2XDe7mO0lwnT7-dZoK6tPadSMgwEM8,36493
  datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
@@ -296,9 +296,9 @@ datahub/ingestion/source/dynamodb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
  datahub/ingestion/source/dynamodb/data_reader.py,sha256=vC77KpcP8LJN0g8wsPRDVw4sebv0ZWIP3tJkEIHaomA,3120
  datahub/ingestion/source/dynamodb/dynamodb.py,sha256=wcEQSfQak45yPNZN7pCUEQFmjyWCpqRk1WjJJz9E2Go,22395
  datahub/ingestion/source/fivetran/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/fivetran/config.py,sha256=kTfsu7oC4BSzkcFtetZr0UmiQ-B2af-_jbNYSFbDim4,8749
+ datahub/ingestion/source/fivetran/config.py,sha256=BP3KRfAQ6H5qyEeJNu9vNfZNwLoyj4Tl2kXiLVR5DNM,9027
  datahub/ingestion/source/fivetran/data_classes.py,sha256=ecdUJH5BEze0yv-uFpKWPNaNmV1gORDA2XMFk0zhcBw,595
- datahub/ingestion/source/fivetran/fivetran.py,sha256=mJ3gi4LWYqul0NyHdZ0U4fDv3WuKEl_yxc2oOd3q6bw,13318
+ datahub/ingestion/source/fivetran/fivetran.py,sha256=CVJhW7_os5BTRlzaUX2KOK6CkAVJ0mWQtgTnE6F3fhE,13760
  datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP_CyAT5Cian2N4a-lb8x1NKHk,12776
  datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -307,7 +307,7 @@ datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeN
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=VbZ-Xzryl5TMRapu7nlxlsXS8T8lFZcHK9AJnEadJ8Q,11111
  datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=_oTXN0fzB4kYyFclah9X_1ds32bLayQyyWgoPeHQMw4,12923
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/gcs/gcs_source.py,sha256=1cKnFR2s7JhNWklvB_XCrqDaFNG4RW3Itj8GSXF1s_A,6210
+ datahub/ingestion/source/gcs/gcs_source.py,sha256=5EZkrDqjRNQz_aUL1MLp0PTFm0Ztubmk0NYJGZTRLjU,6276
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
  datahub/ingestion/source/git/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvkYGr9judHJFsOk,4143
@@ -392,7 +392,7 @@ datahub/ingestion/source/redshift/exception.py,sha256=dxzYUIv5B_FAWhOuzG2u5We7FX
  datahub/ingestion/source/redshift/lineage.py,sha256=bUy0uJowrqSc33Z50fIxFlJkyhe-OPM_qgPh-smSTgM,43983
  datahub/ingestion/source/redshift/lineage_v2.py,sha256=OcVW_27sSaZOYZPTd2j-LS9SzFQ1kXz6cMzM2ZDWhJQ,16751
  datahub/ingestion/source/redshift/profile.py,sha256=T4H79ycq2tPobLM1tTLRtu581Qa8LlKxEok49m0AirU,4294
- datahub/ingestion/source/redshift/query.py,sha256=bY1D9RoOHaw89LgcXal7GYlJN0RG7PxXRRC-YKIdC8E,43105
+ datahub/ingestion/source/redshift/query.py,sha256=X0KlDPzM68j0SYKXhq50DkLbFUIbGuPmGCYYmr8E0v0,44353
  datahub/ingestion/source/redshift/redshift.py,sha256=x9dKocJdGPaNs2fRdaddaBtZNxmTJFwYDhXY5nl_5zM,44444
  datahub/ingestion/source/redshift/redshift_data_reader.py,sha256=zc69jwXHdF-w8J4Hq-ZQ6BjHQ75Ij2iNDMpoRJlcmlU,1724
  datahub/ingestion/source/redshift/redshift_schema.py,sha256=9IYeUsnISenq3eVB3k-s7zK8nInWDAYViFnDrNjtkb0,19149
@@ -990,8 +990,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.2rc8.dist-info/METADATA,sha256=qGt6zoYtVHkPYg1T2zfPy8vfb2R1mrre9uADpF_lrQ8,173250
- acryl_datahub-0.15.0.2rc8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- acryl_datahub-0.15.0.2rc8.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.2rc8.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.2rc8.dist-info/RECORD,,
+ acryl_datahub-0.15.0.3.dist-info/METADATA,sha256=jiS4oA2DAbgkw-RvujSYKPpN8mEjXv5qmPywbUU7h9M,173241
+ acryl_datahub-0.15.0.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ acryl_datahub-0.15.0.3.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.3.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.2rc8"
+ __version__ = "0.15.0.3"
 
 
  def is_dev_mode() -> bool:
datahub/cli/delete_cli.py CHANGED
@@ -265,6 +265,11 @@ def undo_by_filter(
  type=str,
  help="Urn of the entity to delete, for single entity deletion",
  )
+ @click.option(
+ "--urn-file",
+ required=False,
+ help="Path of file with urns (one per line) to be deleted",
+ )
  @click.option(
  "-a",
  "--aspect",
@@ -353,6 +358,7 @@ def undo_by_filter(
  @telemetry.with_telemetry()
  def by_filter(
  urn: Optional[str],
+ urn_file: Optional[str],
  aspect: Optional[str],
  force: bool,
  soft: bool,
@@ -373,6 +379,7 @@ def by_filter(
  # Validate the cli arguments.
  _validate_user_urn_and_filters(
  urn=urn,
+ urn_file=urn_file,
  entity_type=entity_type,
  platform=platform,
  env=env,
@@ -429,6 +436,12 @@ def by_filter(
  batch_size=batch_size,
  )
  )
+ elif urn_file:
+ with open(urn_file, "r") as r:
+ urns = []
+ for line in r.readlines():
+ urn = line.strip().strip('"')
+ urns.append(urn)
  else:
  urns = list(
  graph.get_urns_by_filter(
@@ -537,6 +550,7 @@ def _delete_urns_parallel(
 
  def _validate_user_urn_and_filters(
  urn: Optional[str],
+ urn_file: Optional[str],
  entity_type: Optional[str],
  platform: Optional[str],
  env: Optional[str],
@@ -549,9 +563,9 @@ def _validate_user_urn_and_filters(
  raise click.UsageError(
  "You cannot provide both an urn and a filter rule (entity-type / platform / env / query)."
  )
- elif not urn and not (entity_type or platform or env or query):
+ elif not urn and not urn_file and not (entity_type or platform or env or query):
  raise click.UsageError(
- "You must provide either an urn or at least one filter (entity-type / platform / env / query) in order to delete entities."
+ "You must provide either an urn or urn_file or at least one filter (entity-type / platform / env / query) in order to delete entities."
  )
  elif query:
  logger.warning(
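
Note: the new `--urn-file` branch above builds the urn list directly from the given file, one urn per line, stripping surrounding whitespace and double quotes. A minimal sketch of that parsing, outside the CLI (the file name and urn values below are made up for illustration):

    # Write an illustrative urn file: one urn per line, quotes optional.
    with open("urns.txt", "w") as f:
        f.write('"urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)"\n')
        f.write("urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)\n")

    # Same parsing as the new `elif urn_file:` branch in by_filter().
    urns = []
    with open("urns.txt", "r") as r:
        for line in r.readlines():
            urns.append(line.strip().strip('"'))

    print(urns)  # both entries come out as bare urns, quotes and newlines removed
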
datahub/ingestion/source/fivetran/config.py CHANGED
@@ -167,6 +167,10 @@ class PlatformDetail(ConfigModel):
  description="The database that all assets produced by this connector belong to. "
  "For destinations, this defaults to the fivetran log config's database.",
  )
+ include_schema_in_urn: bool = pydantic.Field(
+ default=True,
+ description="Include schema in the dataset URN. In some cases, the schema is not relevant to the dataset URN and Fivetran sets it to the source and destination table names in the connector.",
+ )
 
 
  class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
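
The new `include_schema_in_urn` flag on `PlatformDetail` is consumed in fivetran.py below: when it is disabled, the schema prefix is split off the Fivetran table identifier before the dataset URN is built. A minimal sketch of that behaviour (the table name is illustrative):

    # Fivetran reports lineage tables as "<schema>.<table>"; the value below is made up.
    source_table = "postgres_public.employee"

    # include_schema_in_urn=True (default): keep the full identifier.
    print(source_table)                   # postgres_public.employee

    # include_schema_in_urn=False: drop everything before the first dot,
    # mirroring lineage.source_table.split(".", 1)[1] in FivetranSource.
    print(source_table.split(".", 1)[1])  # employee

The related change further down also excludes boolean fields such as this flag from the source.* / destination.* job properties emitted for lineage.
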
datahub/ingestion/source/fivetran/fivetran.py CHANGED
@@ -119,21 +119,31 @@ class FivetranSource(StatefulIngestionSourceBase):
  )
 
  for lineage in connector.lineage:
+ source_table = (
+ lineage.source_table
+ if source_details.include_schema_in_urn
+ else lineage.source_table.split(".", 1)[1]
+ )
  input_dataset_urn = DatasetUrn.create_from_ids(
  platform_id=source_details.platform,
  table_name=(
- f"{source_details.database.lower()}.{lineage.source_table}"
+ f"{source_details.database.lower()}.{source_table}"
  if source_details.database
- else lineage.source_table
+ else source_table
  ),
  env=source_details.env,
  platform_instance=source_details.platform_instance,
  )
  input_dataset_urn_list.append(input_dataset_urn)
 
+ destination_table = (
+ lineage.destination_table
+ if destination_details.include_schema_in_urn
+ else lineage.destination_table.split(".", 1)[1]
+ )
  output_dataset_urn = DatasetUrn.create_from_ids(
  platform_id=destination_details.platform,
- table_name=f"{destination_details.database.lower()}.{lineage.destination_table}",
+ table_name=f"{destination_details.database.lower()}.{destination_table}",
  env=destination_details.env,
  platform_instance=destination_details.platform_instance,
  )
@@ -176,12 +186,12 @@ class FivetranSource(StatefulIngestionSourceBase):
  **{
  f"source.{k}": str(v)
  for k, v in source_details.dict().items()
- if v is not None
+ if v is not None and not isinstance(v, bool)
  },
  **{
  f"destination.{k}": str(v)
  for k, v in destination_details.dict().items()
- if v is not None
+ if v is not None and not isinstance(v, bool)
  },
  )
 
datahub/ingestion/source/gcs/gcs_source.py CHANGED
@@ -88,6 +88,7 @@ class GCSSource(StatefulIngestionSourceBase):
  super().__init__(config, ctx)
  self.config = config
  self.report = GCSSourceReport()
+ self.platform: str = PLATFORM_GCS
  self.s3_source = self.create_equivalent_s3_source(ctx)
 
  @classmethod
@@ -135,7 +136,7 @@ class GCSSource(StatefulIngestionSourceBase):
 
  def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
  config = self.create_equivalent_s3_config()
- return self.s3_source_overrides(S3Source(config, ctx))
+ return self.s3_source_overrides(S3Source(config, PipelineContext(ctx.run_id)))
 
  def s3_source_overrides(self, source: S3Source) -> S3Source:
  source.source_config.platform = PLATFORM_GCS
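
A hedged reading of the `create_equivalent_s3_source` change above: the wrapped S3Source now receives its own PipelineContext carrying only the parent run id, rather than sharing the GCS pipeline's context object and whatever state hangs off it. A minimal sketch of the pattern, assuming the usual datahub.ingestion.api.common import path (the run id is illustrative):

    from datahub.ingestion.api.common import PipelineContext

    parent_ctx = PipelineContext(run_id="gcs-ingest-2024")  # illustrative run id
    child_ctx = PipelineContext(parent_ctx.run_id)          # fresh context, same run id

    assert child_ctx.run_id == parent_ctx.run_id
    assert child_ctx is not parent_ctx  # the embedded S3 source no longer shares the parent context
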
datahub/ingestion/source/redshift/query.py CHANGED
@@ -797,61 +797,91 @@ class RedshiftServerlessQuery(RedshiftCommonQuery):
  db_name: str, start_time: datetime, end_time: datetime
  ) -> str:
  return """
- SELECT
- distinct cluster,
- target_schema,
- target_table,
- username,
- source_schema,
- source_table,
- query_text AS ddl,
- start_time AS timestamp
- FROM
- (
- SELECT
- sti.schema AS target_schema,
- sti.table AS target_table,
- sti.database AS cluster,
- qi.table_id AS target_table_id,
- qi.query_id AS query_id,
- qi.start_time AS start_time
- FROM
- SYS_QUERY_DETAIL qi
- JOIN
- SVV_TABLE_INFO sti on sti.table_id = qi.table_id
- WHERE
- start_time >= '{start_time}' and
- start_time < '{end_time}' and
- cluster = '{db_name}' and
- step_name = 'insert'
- ) AS target_tables
- JOIN
- (
+ WITH queries AS (
  SELECT
- sti.schema AS source_schema,
- sti.table AS source_table,
- qs.table_id AS source_table_id,
- qs.query_id AS query_id,
- sui.user_name AS username,
- LISTAGG(qt."text") WITHIN GROUP (ORDER BY sequence) AS query_text
+ sti.database as cluster,
+ sti.schema AS "schema",
+ sti.table AS "table",
+ qs.table_id AS table_id,
+ qs.query_id as query_id,
+ qs.step_name as step_name,
+ sui.user_name as username,
+ source,
+ MIN(qs.start_time) as "timestamp" -- multiple duplicate records with start_time increasing slightly by miliseconds
  FROM
  SYS_QUERY_DETAIL qs
  JOIN
  SVV_TABLE_INFO sti ON sti.table_id = qs.table_id
  LEFT JOIN
- SYS_QUERY_TEXT qt ON qt.query_id = qs.query_id
- LEFT JOIN
  SVV_USER_INFO sui ON qs.user_id = sui.user_id
  WHERE
- qs.step_name = 'scan' AND
- qs.source = 'Redshift(local)' AND
- qt.sequence < 16 AND -- See https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext
- sti.database = '{db_name}' AND -- this was required to not retrieve some internal redshift tables, try removing to see what happens
- sui.user_name <> 'rdsdb' -- not entirely sure about this filter
- GROUP BY sti.schema, sti.table, qs.table_id, qs.query_id, sui.user_name
- ) AS source_tables ON target_tables.query_id = source_tables.query_id
- WHERE source_tables.source_table_id <> target_tables.target_table_id
- ORDER BY cluster, target_schema, target_table, start_time ASC
+ cluster = '{db_name}' AND
+ qs.user_id <> 1 AND -- this is user 'rdsdb'
+ qs.start_time >= '{start_time}' AND
+ qs.start_time < '{end_time}'
+ GROUP BY cluster, "schema", "table", qs.table_id, query_id, step_name, username, source -- to be sure we are not making duplicates ourselves the list of group by must match whatever we use in "group by" and "where" of subsequent queries ("cluster" is already set to single value in this query)
+ ),
+ unique_query_text AS (
+ SELECT
+ query_id,
+ sequence,
+ text
+ FROM (
+ SELECT
+ query_id,
+ "sequence",
+ text,
+ ROW_NUMBER() OVER (
+ PARTITION BY query_id, sequence
+ ) as rn
+ FROM SYS_QUERY_TEXT
+ )
+ WHERE rn = 1
+ ),
+ scan_queries AS (
+ SELECT
+ "schema" as source_schema,
+ "table" as source_table,
+ table_id as source_table_id,
+ queries.query_id as query_id,
+ username,
+ LISTAGG(qt."text") WITHIN GROUP (ORDER BY sequence) AS query_text
+ FROM
+ "queries" LEFT JOIN
+ unique_query_text qt ON qt.query_id = queries.query_id
+ WHERE
+ source = 'Redshift(local)' AND
+ step_name = 'scan' AND
+ qt.sequence < 16 -- truncating query to not exceed Redshift limit on LISTAGG function (each sequence has at most 4k characters, limit is 64k, divided by 4k gives 16, starts count from 0)
+ GROUP BY source_schema, source_table, source_table_id, queries.query_id, username
+ ),
+ insert_queries AS (
+ SELECT
+ "schema" as target_schema,
+ "table" as target_table,
+ table_id as target_table_id,
+ query_id,
+ cluster,
+ min("timestamp") as "timestamp"
+ FROM
+ queries
+ WHERE
+ step_name = 'insert'
+ GROUP BY cluster, target_schema, target_table, target_table_id, query_id
+ )
+ SELECT
+ cluster,
+ target_schema,
+ target_table,
+ username,
+ source_schema,
+ source_table,
+ query_text AS ddl,
+ "timestamp"
+ FROM scan_queries
+ JOIN insert_queries on insert_queries.query_id = scan_queries.query_id
+ WHERE source_table_id <> target_table_id
+ ORDER BY cluster, target_schema, target_table, "timestamp" ASC;
  """.format(
  # We need the original database name for filtering
  db_name=db_name,