acryl-datahub 0.15.0.2rc2__py3-none-any.whl → 0.15.0.2rc4__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/METADATA +2333 -2333
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/RECORD +44 -44
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +12 -1
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/ingestion/api/source.py +2 -0
- datahub/ingestion/graph/client.py +4 -2
- datahub/ingestion/source/aws/glue.py +14 -1
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +31 -33
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -11
- datahub/ingestion/source/bigquery_v2/lineage.py +7 -7
- datahub/ingestion/source/bigquery_v2/usage.py +57 -57
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +0 -3
- datahub/ingestion/source/datahub/config.py +6 -0
- datahub/ingestion/source/datahub/datahub_source.py +12 -2
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +10 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
- datahub/ingestion/source/looker/looker_config.py +8 -3
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/redshift/redshift.py +32 -34
- datahub/ingestion/source/redshift/usage.py +29 -29
- datahub/ingestion/source/s3/source.py +10 -14
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +18 -16
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +46 -47
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -39
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/tableau/tableau.py +119 -31
- datahub/ingestion/source/unity/source.py +71 -71
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/metadata/_schema_classes.py +2 -2
- datahub/metadata/_urns/urn_defs.py +15 -15
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/utilities/perf_timer.py +11 -6
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc2.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/top_level.txt +0 -0
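
Most of the hunks below replace paired report_ingestion_stage_start(...) calls with a report.new_stage(...) context manager, and record stage durations via timer.elapsed_seconds(digits=2). The following is a minimal illustrative sketch of that context-manager pattern, assuming a hypothetical StageTimingReport class; it only approximates the idea and is not the package's actual IngestionStageReport implementation.

```python
# Minimal sketch (not the package's actual IngestionStageReport): the hunks below
# replace paired report_ingestion_stage_start(...) calls with a context manager,
# roughly along these lines. StageTimingReport and its field name are assumptions.
import contextlib
import time
from dataclasses import dataclass, field
from typing import Dict, Iterator


@dataclass
class StageTimingReport:
    ingestion_stage_durations: Dict[str, float] = field(default_factory=dict)

    @contextlib.contextmanager
    def new_stage(self, stage: str) -> Iterator[None]:
        # Time the wrapped block and record its duration under the stage name,
        # so the final stage is timed without needing a trailing "End" stage.
        start = time.perf_counter()
        try:
            yield
        finally:
            self.ingestion_stage_durations[stage] = round(
                time.perf_counter() - start, 2
            )


report = StageTimingReport()
with report.new_stage("project-1: Metadata Extraction"):
    pass  # emit workunits for this stage here
```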
@@ -248,9 +248,9 @@ class BigQuerySchemaGenerator:
     def get_project_workunits(
         self, project: BigqueryProject
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
+        with self.report.new_stage(f"{project.id}: {METADATA_EXTRACTION}"):
+            logger.info(f"Processing project: {project.id}")
+            yield from self._process_project(project)
 
     def get_dataplatform_instance_aspect(
         self, dataset_urn: str, project_id: str
@@ -405,11 +405,11 @@ class BigQuerySchemaGenerator:
 
         if self.config.is_profiling_enabled():
             logger.info(f"Starting profiling project {project_id}")
-            self.report.
-
-
-
-
+            with self.report.new_stage(f"{project_id}: {PROFILING}"):
+                yield from self.profiler.get_workunits(
+                    project_id=project_id,
+                    tables=db_tables,
+                )
 
     def _process_project_datasets(
         self,
@@ -1203,9 +1203,9 @@ class BigQuerySchemaGenerator:
                 report=self.report,
             )
 
-            self.report.metadata_extraction_sec[
-
-            )
+            self.report.metadata_extraction_sec[
+                f"{project_id}.{dataset.name}"
+            ] = timer.elapsed_seconds(digits=2)
 
     def get_core_table_details(
         self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str
@@ -330,11 +330,11 @@ class BigqueryLineageExtractor:
             projects = ["*"]  # project_id not used when using exported metadata
 
         for project in projects:
-            self.report.
-
-
-
-
+            with self.report.new_stage(f"{project}: {LINEAGE_EXTRACTION}"):
+                yield from self.generate_lineage(
+                    project,
+                    table_refs,
+                )
 
         if self.redundant_run_skip_handler:
             # Update the checkpoint state for this run.
@@ -368,8 +368,8 @@ class BigqueryLineageExtractor:
         self.report.lineage_metadata_entries[project_id] = len(lineage)
         logger.info(f"Built lineage map containing {len(lineage)} entries.")
         logger.debug(f"lineage metadata is {lineage}")
-        self.report.lineage_extraction_sec[project_id] =
-
+        self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds(
+            digits=2
         )
         self.report.lineage_mem_size[project_id] = humanfriendly.format_size(
             memory_footprint.total_size(lineage)
@@ -495,62 +495,62 @@ class BigQueryUsageExtractor:
     def _generate_operational_workunits(
         self, usage_state: BigQueryUsageState, table_refs: Collection[str]
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_OPERATIONAL_STATS}"):
+            for audit_event in usage_state.standalone_events():
+                try:
+                    operational_wu = self._create_operation_workunit(
+                        audit_event, table_refs
+                    )
+                    if operational_wu:
+                        yield operational_wu
+                        self.report.num_operational_stats_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate operation workunit",
+                        context=f"{audit_event}",
+                        exc=e,
+                    )
 
     def _generate_usage_workunits(
         self, usage_state: BigQueryUsageState
     ) -> Iterable[MetadataWorkUnit]:
-        self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with self.report.new_stage(f"*: {USAGE_EXTRACTION_USAGE_AGGREGATION}"):
+            top_n = (
+                self.config.usage.top_n_queries
+                if self.config.usage.include_top_n_queries
+                else 0
+            )
+            for entry in usage_state.usage_statistics(top_n=top_n):
+                try:
+                    query_freq = [
+                        (
+                            self.uuid_to_query.get(
+                                query_hash, usage_state.queries[query_hash]
+                            ),
+                            count,
+                        )
+                        for query_hash, count in entry.query_freq
+                    ]
+                    yield make_usage_workunit(
+                        bucket_start_time=datetime.fromisoformat(entry.timestamp),
+                        resource=BigQueryTableRef.from_string_name(entry.resource),
+                        query_count=entry.query_count,
+                        query_freq=query_freq,
+                        user_freq=entry.user_freq,
+                        column_freq=entry.column_freq,
+                        bucket_duration=self.config.bucket_duration,
+                        resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
+                        top_n_queries=self.config.usage.top_n_queries,
+                        format_sql_queries=self.config.usage.format_sql_queries,
+                        queries_character_limit=self.config.usage.queries_character_limit,
+                    )
+                    self.report.num_usage_workunits_emitted += 1
+                except Exception as e:
+                    self.report.warning(
+                        message="Unable to generate usage statistics workunit",
+                        context=f"{entry.timestamp}, {entry.resource}",
+                        exc=e,
                     )
-                    for query_hash, count in entry.query_freq
-                ]
-                yield make_usage_workunit(
-                    bucket_start_time=datetime.fromisoformat(entry.timestamp),
-                    resource=BigQueryTableRef.from_string_name(entry.resource),
-                    query_count=entry.query_count,
-                    query_freq=query_freq,
-                    user_freq=entry.user_freq,
-                    column_freq=entry.column_freq,
-                    bucket_duration=self.config.bucket_duration,
-                    resource_urn_builder=self.identifiers.gen_dataset_urn_from_raw_ref,
-                    top_n_queries=self.config.usage.top_n_queries,
-                    format_sql_queries=self.config.usage.format_sql_queries,
-                    queries_character_limit=self.config.usage.queries_character_limit,
-                )
-                self.report.num_usage_workunits_emitted += 1
-            except Exception as e:
-                self.report.warning(
-                    message="Unable to generate usage statistics workunit",
-                    context=f"{entry.timestamp}, {entry.resource}",
-                    exc=e,
-                )
 
     def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]:
         if self.config.use_exported_bigquery_audit_metadata:
@@ -559,10 +559,10 @@ class BigQueryUsageExtractor:
         for project_id in projects:
             with PerfTimer() as timer:
                 try:
-                    self.report.
-                        project_id
-                    )
-
+                    with self.report.new_stage(
+                        f"{project_id}: {USAGE_EXTRACTION_INGESTION}"
+                    ):
+                        yield from self._get_parsed_bigquery_log_events(project_id)
                 except Exception as e:
                     self.report.usage_failed_extraction.append(project_id)
                     self.report.warning(
@@ -572,8 +572,8 @@ class BigQueryUsageExtractor:
                     )
                     self.report_status(f"usage-extraction-{project_id}", False)
 
-            self.report.usage_extraction_sec[project_id] =
-
+            self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds(
+                digits=2
             )
 
     def _store_usage_event(
@@ -70,30 +70,30 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
            tables = cassandra_data.tables.get(keyspace_name, [])
-            self.report.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
+                with ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor:
+                    future_to_dataset = {
+                        executor.submit(
+                            self.generate_profile,
+                            keyspace_name,
+                            table_name,
+                            cassandra_data.columns.get(table_name, []),
+                        ): table_name
+                        for table_name in tables
+                    }
+                    for future in as_completed(future_to_dataset):
+                        table_name = future_to_dataset[future]
+                        try:
+                            yield from future.result()
+                        except Exception as exc:
+                            self.report.profiling_skipped_other[table_name] += 1
+                            self.report.failure(
+                                message="Failed to profile for table",
+                                context=f"{keyspace_name}.{table_name}",
+                                exc=exc,
+                            )
 
     def generate_profile(
         self,
@@ -54,9 +54,6 @@ class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport
         else:
             raise KeyError(f"Unknown entity {ent_type}.")
 
-    def set_ingestion_stage(self, keyspace: str, stage: str) -> None:
-        self.report_ingestion_stage_start(f"{keyspace}: {stage}")
-
     # TODO Need to create seperate common config for profiling report
     profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
     profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
@@ -108,6 +108,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
 
     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
 
+    drop_duplicate_schema_fields: bool = Field(
+        default=False,
+        description="Whether to drop duplicate schema fields in the schemaMetadata aspect. "
+        "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
@@ -12,7 +12,10 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
-from datahub.ingestion.api.source_helpers import
+from datahub.ingestion.api.source_helpers import (
+    auto_fix_duplicate_schema_field_paths,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
@@ -57,7 +60,14 @@ class DataHubSource(StatefulIngestionSourceBase):
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         # Exactly replicate data from DataHub source
-        return [
+        return [
+            (
+                auto_fix_duplicate_schema_field_paths
+                if self.config.drop_duplicate_schema_fields
+                else None
+            ),
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.report.stop_time = datetime.now(tz=timezone.utc)
@@ -472,8 +472,8 @@ class DremioSource(StatefulIngestionSourceBase):
                 env=self.config.env,
                 platform_instance=self.config.platform_instance,
             )
-            self.report.
-
+            with self.report.new_stage(f"{dataset_info.resource_name}: {PROFILING}"):
+                yield from self.profiler.get_workunits(dataset_info, dataset_urn)
 
     def generate_view_lineage(
         self, dataset_urn: str, parents: List[str]
@@ -141,40 +141,36 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
-                self.report.
-
+                with self.report.new_stage("Expired Token Cleanup"):
+                    self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
-                self.report.
-
+                with self.report.new_stage("Truncate Indices"):
+                    self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
-                self.report.
-
-                )
-                self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
+                with self.report.new_stage("Soft Deleted Entities Cleanup"):
+                    self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
         if self.config.dataprocess_cleanup.enabled:
             try:
-                self.report.
-
+                with self.report.new_stage("Data Process Cleanup"):
+                    yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
         if self.config.execution_request_cleanup.enabled:
             try:
-                self.report.
-
+                with self.report.new_stage("Execution request Cleanup"):
+                    self.execution_request_cleanup.run()
             except Exception as e:
                 self.report.failure("While trying to cleanup execution request ", exc=e)
-        # Otherwise last stage's duration does not get calculated.
-        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
     )
 
     keep_history_max_days: int = Field(
-
+        90,
         description="Maximum number of days to keep execution requests for, per ingestion source",
     )
 
@@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Maximum runtime in seconds for the cleanup task",
     )
 
+    limit_entities_delete: Optional[int] = Field(
+        10000, description="Max number of execution requests to hard delete."
+    )
+
     max_read_errors: int = Field(
         default=10,
         description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
     ergc_delete_errors: int = 0
     ergc_start_time: Optional[datetime.datetime] = None
     ergc_end_time: Optional[datetime.datetime] = None
+    ergc_delete_limit_reached: bool = False
+    ergc_runtime_limit_reached: bool = False
 
 
 class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
         self.graph = graph
         self.report = report
         self.instance_id = int(time.time())
+        self.last_print_time = 0.0
 
         if config is not None:
             self.config = config
         else:
             self.config = DatahubExecutionRequestCleanupConfig()
 
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
     def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
         input_aspect = (
             entry.get("aspects", {})
@@ -175,6 +189,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
+            self._print_report()
             self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
@@ -225,15 +240,12 @@ class DatahubExecutionRequestCleanup:
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
         try:
-            logger.info(
-                f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-            )
             self.graph.delete_entity(entry.urn, True)
+            self.report.ergc_records_deleted += 1
         except Exception as e:
             self.report.ergc_delete_errors += 1
             self.report.failure(
@@ -252,10 +264,23 @@ class DatahubExecutionRequestCleanup:
                 >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
         ):
+            self.report.ergc_runtime_limit_reached = True
            logger.info(f"ergc({self.instance_id}): max runtime reached.")
            return True
        return False
 
+    def _reached_delete_limit(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+        ):
+            logger.info(
+                f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+            )
+            self.report.ergc_delete_limit_reached = True
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
|
if not self.config.enabled:
|
|
261
286
|
logger.info(
|
|
@@ -274,7 +299,7 @@ class DatahubExecutionRequestCleanup:
|
|
|
274
299
|
)
|
|
275
300
|
|
|
276
301
|
for entry in self._scroll_garbage_records():
|
|
277
|
-
if self._reached_runtime_limit():
|
|
302
|
+
if self._reached_runtime_limit() or self._reached_delete_limit():
|
|
278
303
|
break
|
|
279
304
|
self._delete_entry(entry)
|
|
280
305
|
|
|
@@ -231,6 +231,15 @@ class SoftDeletedEntitiesCleanup:
     def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
+
+        batch_size = self.config.batch_size
+        if entity_type == "DATA_PROCESS_INSTANCE":
+            # Due to a bug in Data process instance querying this is a temp workaround
+            # to avoid a giant stacktrace by having a smaller batch size in first call
+            # This will be remove in future version after server with fix has been
+            # around for a while
+            batch_size = 10
+
         while True:
             try:
                 result = self.ctx.graph.execute_graphql(
@@ -240,7 +249,7 @@ class SoftDeletedEntitiesCleanup:
                             "types": [entity_type],
                             "query": "*",
                             "scrollId": scroll_id if scroll_id else None,
-                            "count":
+                            "count": batch_size,
                             "orFilters": [
                                 {
                                     "and": [
@@ -263,6 +272,10 @@ class SoftDeletedEntitiesCleanup:
             scroll_across_entities = result.get("scrollAcrossEntities")
             if not scroll_across_entities or not scroll_across_entities.get("count"):
                 break
+            if entity_type == "DATA_PROCESS_INSTANCE":
+                # Temp workaround. See note in beginning of the function
+                # We make the batch size = config after call has succeeded once
+                batch_size = self.config.batch_size
             scroll_id = scroll_across_entities.get("nextScrollId")
             self.report.num_queries_found += scroll_across_entities.get("count")
             for query in scroll_across_entities.get("searchResults"):
@@ -300,11 +300,16 @@ class LookerDashboardSourceConfig(
 
     folder_path_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="Allow or deny dashboards from specific folders. "
+        description="Allow or deny dashboards from specific folders using their fully qualified paths. "
         "For example: \n"
         "deny: \n"
-        " -
-        "This pattern will deny the ingestion of all dashboards and looks within the
+        " - Shared/deprecated \n"
+        "This pattern will deny the ingestion of all dashboards and looks within the Shared/deprecated folder. \n"
+        "allow: \n"
+        " - Shared/sales \n"
+        "This pattern will allow only the ingestion of dashboards within the Shared/sales folder. \n"
+        "To get the correct path from Looker, take the folder hierarchy shown in the UI and join it with slashes. "
+        "For example, Shared -> Customer Reports -> Sales becomes Shared/Customer Reports/Sales. "
         "Dashboards will only be ingested if they're allowed by both this config and dashboard_pattern.",
     )
 
@@ -485,7 +485,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
         self.filtered_reports.append(view)
 
 
-@platform_name("PowerBI")
+@platform_name("PowerBI Report Server")
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")