acryl-datahub 0.15.0.2rc3__py3-none-any.whl → 0.15.0.2rc4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.

This release of acryl-datahub has been flagged as potentially problematic by the registry.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=qe2wpod5rnOz0vzxn9B60E8BhINzy_rYJpwuvpI0NW8,576
+ datahub/__init__.py,sha256=p_vuZBTCmvG7HqtMpeL-FQZeXWYJszD5ZoagGWD4_-w,576
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -164,7 +164,7 @@ datahub/ingestion/glossary/classifier.py,sha256=zp8Fe3he80H5Zz1EwymKjThUPkTpw6Pg
  datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
  datahub/ingestion/glossary/datahub_classifier.py,sha256=8VhwuLDhyOqqOr0jqAPIgorb4eAOnvTr4m13Y2Wy1-E,7515
  datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/graph/client.py,sha256=R50K7NmE3TYgVXvdLnvLZn7N0fkiCXOK0MoJz9ueglA,64963
+ datahub/ingestion/graph/client.py,sha256=tXAzgeUnNfawgKDwKZsAuSDP0zAkhTKQb07Y3gNksgY,65056
  datahub/ingestion/graph/config.py,sha256=_oha8Je7P80ZmrkZUAaRHyYbdMmTkMI5JkYjEP2Ri1Q,751
  datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
  datahub/ingestion/graph/filters.py,sha256=UeUZQHoimavIYx-jXLA0WGkOUe10TaO8uEZkfa-QgNE,6188
@@ -217,9 +217,9 @@ datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm
  datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
  datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
- datahub/ingestion/source/aws/glue.py,sha256=lJW3QHHz1_SWqLEB-vUSTxSuL0EgUQ0ptdQns_NLNds,57343
+ datahub/ingestion/source/aws/glue.py,sha256=DcGZzaeY6hVa1wN74o6spNH1B6OFu_6Mn4uoRdlZa44,57647
  datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
- datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
+ datahub/ingestion/source/aws/s3_util.py,sha256=pikTe9SuiKdN-TZ8eOhB0PYq0aUgUPDpxwtTLsVofRs,2834
  datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
  datahub/ingestion/source/aws/sagemaker_processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/aws/sagemaker_processors/common.py,sha256=NvYfI8LHgDvhEZE7qp6qF1NSZ0_SQKhg3ivtdjsdpFg,2172
@@ -304,8 +304,8 @@ datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gc/datahub_gc.py,sha256=6O-TxU2uCJ1Y8NNzJDufUd3ymapo--E3hTeIuy_QDtY,12763
  datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeNOjEingKpYy-XvCPeaAb4k,17125
- datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=9jsyCIspWSSYSAVPHjKHr05885rXxM6FCH7KzTBceic,10139
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=zRtgC_AcZui4qGf9jBASI3R-CrYZxNe3Pm-gNSLT3rw,11420
+ datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=VbZ-Xzryl5TMRapu7nlxlsXS8T8lFZcHK9AJnEadJ8Q,11111
+ datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=PTx1XmD4Jv9SzXzHqUbFpT3vKGCtkD01MeBUaq1p4no,12083
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
@@ -376,7 +376,7 @@ datahub/ingestion/source/powerbi/rest_api_wrapper/profiling_utils.py,sha256=bgcP
  datahub/ingestion/source/powerbi/rest_api_wrapper/query.py,sha256=VNw1Uvli6g0pnu9FpigYmnCdEPbVEipz7vdZU_WmHf4,616
  datahub/ingestion/source/powerbi_report_server/__init__.py,sha256=N9fGcrHXBbuPmx9rpGjd_jkMC3smXmfiwISDP1QZapk,324
  datahub/ingestion/source/powerbi_report_server/constants.py,sha256=i_hXcvPHjwk3VpTT5ef7s8dN9F6pJzPyRUiG5UzCCYI,3544
- datahub/ingestion/source/powerbi_report_server/report_server.py,sha256=lW7Xtdw8SlZRo2njvg4o5h7mdflxlGzovGCPjPUvmaU,20162
+ datahub/ingestion/source/powerbi_report_server/report_server.py,sha256=Xsvu_FeteYNyLW_U0pER94-zQLLGUzU5tUEkhsLTQ2Y,20176
  datahub/ingestion/source/powerbi_report_server/report_server_domain.py,sha256=bBYr9fz4zPEFeZZOkldzKm4SBMQdisdp-MMtaYI0n3s,11783
  datahub/ingestion/source/profiling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/profiling/common.py,sha256=4sZ58AeBV64KRfKAgjkg-UyNjAc3YERahQMmW4algAw,1426
@@ -403,7 +403,7 @@ datahub/ingestion/source/s3/config.py,sha256=Zs1nrBZKLImteZreIcSMMRLj8vBGgxakNDs
  datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=k7S9Xcmgr3-CvWrd5NEX-V8JSrcAwkm7vbHPTVZicow,3620
  datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
  datahub/ingestion/source/s3/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
- datahub/ingestion/source/s3/source.py,sha256=USjq86vUU7hKYKi8bhplBhHOjvoQTgguy91uFE24rUI,47336
+ datahub/ingestion/source/s3/source.py,sha256=GXnyvMdr6dKaXdGros8GHEFwh1jwWiX3pKebvWxO2n8,47264
  datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/sac/sac.py,sha256=zPSO9ukuyhvNaaVzeAYpA-_sFma_XMcCQMPaGvDWuTk,30226
  datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
@@ -566,12 +566,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
  datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
  datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
- datahub/metadata/_schema_classes.py,sha256=IAWpWPxOeGmvmc96dapE0CySk1Rikbh-YieT-K9YTMY,964636
+ datahub/metadata/_schema_classes.py,sha256=gWKn2rcsptEOQn4wWc7rZyeyXvDf4v7Q2UbIr9oU6Cg,964662
  datahub/metadata/schema.avsc,sha256=CeVb_Z7k0e5kmeqDUXUW7JDL6KSKBCdfAZzqRI_mLZo,729869
  datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
- datahub/metadata/_urns/urn_defs.py,sha256=gcUHCVwelD5aSuPgE1vmao242tQQiHe2m9kH8Fs1y1E,107287
+ datahub/metadata/_urns/urn_defs.py,sha256=UNmGpVCcFB9_mXuSA4V3xwD_WtHaJ8WHjLTtdP7ojoQ,107852
  datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
  datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
  datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
@@ -678,7 +678,7 @@ datahub/metadata/schemas/DataContractKey.avsc,sha256=Oceu7P26--E0812IFrX3RiEY0Kt
  datahub/metadata/schemas/DataContractProperties.avsc,sha256=RCxuJMlZwqEE0iHTpuXvcH6zRFoOt7ysQFPrJRp3RqE,4763
  datahub/metadata/schemas/DataContractStatus.avsc,sha256=5yvT43AIB13Dn_h0-4s7fsL7BTuXhkK5pi2KJug4_qg,1029
  datahub/metadata/schemas/DataFlowInfo.avsc,sha256=tDRTd1rA3v_7kwUVbQbb-cuo6D-t3pcuE4fiRz4D8f0,4682
- datahub/metadata/schemas/DataFlowKey.avsc,sha256=Dt4mywv7C7Co4Ui4W-2WyRaLk_QkKHopwa7hWiYXo-w,1304
+ datahub/metadata/schemas/DataFlowKey.avsc,sha256=4QVVgv4SICNkRMQbQjxGtSXuMHqlKEno9BgTstDxNvU,1323
  datahub/metadata/schemas/DataHubAccessTokenInfo.avsc,sha256=WS77M5w7GJFxUAiyXaxUvBqO0XFV2FnKPxXSXYbXHTE,1646
  datahub/metadata/schemas/DataHubAccessTokenKey.avsc,sha256=3EspNIxgb_I4WwV0a2o4NJOB5yODVr9J-wZzkZanEgo,483
  datahub/metadata/schemas/DataHubActionKey.avsc,sha256=bjiKcoyvUPQKaGUi2ICBMJ_ukwnt7dh0szJS4WBZE0A,448
@@ -705,7 +705,7 @@ datahub/metadata/schemas/DataHubViewInfo.avsc,sha256=U3fBIoG9ietLUpOknfQGNekqBdP
  datahub/metadata/schemas/DataHubViewKey.avsc,sha256=p53axIdSVbubo3r23Vpsed7NqRcQBMGveVikEHAVAok,424
  datahub/metadata/schemas/DataJobInfo.avsc,sha256=--obUbt_4X2paB39EeRKP13sBSiK-r0nq070EamoV1w,7212
  datahub/metadata/schemas/DataJobInputOutput.avsc,sha256=H1O8eAzZV34tvULdu67iBSWkdn08rt7wS208b8Nisbk,15268
- datahub/metadata/schemas/DataJobKey.avsc,sha256=4F3myS-O6n7AlUqTvCkMSFvsYAjVhUq6uaQVbqLoYdM,1583
+ datahub/metadata/schemas/DataJobKey.avsc,sha256=fVlqnz1Ljan3YKzDTrMRFD3z09Ge7Gqt4aor5K_AgkY,1602
  datahub/metadata/schemas/DataPlatformInfo.avsc,sha256=WGPFumBNHbR75vsLrivnRCbBc8vSCuxDw2UlylMieh4,2686
  datahub/metadata/schemas/DataPlatformInstance.avsc,sha256=SNd3v_YyyLaDflv8Rd5cQR9GrVuky_cDTkYM6FqJiM8,1058
  datahub/metadata/schemas/DataPlatformInstanceKey.avsc,sha256=sXUV5EMT6N-x8d6s8ebcJ5JdFIOsJCtiiU5Jtm-ncIk,800
@@ -986,8 +986,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.2rc3.dist-info/METADATA,sha256=OWdxPqF4pJ1N91lZlAy8xWSInWXiBRWM_T8YmesJsyo,173662
- acryl_datahub-0.15.0.2rc3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- acryl_datahub-0.15.0.2rc3.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.2rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.2rc3.dist-info/RECORD,,
+ acryl_datahub-0.15.0.2rc4.dist-info/METADATA,sha256=27j9HorUgTZCHO9ZGywLNkPIBkNDl5RHU6xVV_v5fpI,173662
+ acryl_datahub-0.15.0.2rc4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ acryl_datahub-0.15.0.2rc4.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.2rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.2rc4.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
  
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.2rc3"
+ __version__ = "0.15.0.2rc4"
  
  
  def is_dev_mode() -> bool:
datahub/ingestion/graph/client.py CHANGED
@@ -248,9 +248,11 @@ class DataHubGraph(DatahubRestEmitter):
          with DatahubRestSink(PipelineContext(run_id=run_id), sink_config) as sink:
              yield sink
          if sink.report.failures:
+             logger.error(
+                 f"Failed to emit {len(sink.report.failures)} records\n{sink.report.as_string()}"
+             )
              raise OperationalError(
-                 f"Failed to emit {len(sink.report.failures)} records",
-                 info=sink.report.as_obj(),
+                 f"Failed to emit {len(sink.report.failures)} records"
              )
  
      def emit_all(
datahub/ingestion/source/aws/glue.py CHANGED
@@ -248,6 +248,9 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
      "Enabled by default when stateful ingestion is turned on.",
  )
  @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+ @capability(
+     SourceCapability.LINEAGE_FINE, "Support via the `emit_s3_lineage` config field"
+ )
  class GlueSource(StatefulIngestionSourceBase):
      """
      Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub.
@@ -284,12 +287,22 @@ class GlueSource(StatefulIngestionSourceBase):
          "Action": [
              "glue:GetDataflowGraph",
              "glue:GetJobs",
+             "s3:GetObject",
          ],
          "Resource": "*"
      }
      ```
  
-     plus `s3:GetObject` for the job script locations.
+     For profiling datasets, the following additional permissions are required:
+     ```json
+     {
+         "Effect": "Allow",
+         "Action": [
+             "glue:GetPartitions",
+         ],
+         "Resource": "*"
+     }
+     ```
  
      """
  
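The fine-grained lineage capability added above is driven by the Glue source's `emit_s3_lineage` config flag. A hedged sketch of enabling it in a programmatic recipe; the region and the console sink are placeholders.

```python
from datahub.ingestion.run.pipeline import Pipeline

# Illustrative recipe: ingest Glue metadata and emit S3 lineage edges.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "glue",
            "config": {
                "aws_region": "us-east-1",  # placeholder region
                "emit_s3_lineage": True,    # the config field named in the capability text above
            },
        },
        "sink": {"type": "console"},        # placeholder sink for local testing
    }
)
pipeline.run()
pipeline.raise_from_status()
```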
datahub/ingestion/source/aws/s3_util.py CHANGED
@@ -1,6 +1,11 @@
  import logging
  import os
- from typing import Optional
+ from collections import defaultdict
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional
+
+ if TYPE_CHECKING:
+     from mypy_boto3_s3.service_resource import ObjectSummary
+
  
  S3_PREFIXES = ["s3://", "s3n://", "s3a://"]
  
@@ -68,3 +73,21 @@ def get_key_prefix(s3_uri: str) -> str:
          f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}"
      )
      return strip_s3_prefix(s3_uri).split("/", maxsplit=1)[1]
+
+
+ def group_s3_objects_by_dirname(
+     s3_objects: Iterable["ObjectSummary"],
+ ) -> Dict[str, List["ObjectSummary"]]:
+     """
+     Groups S3 objects by their directory name.
+
+     If a s3_object in the root directory (i.e., s3://bucket/file.txt), it is grouped under '/'.
+     """
+     grouped_s3_objs = defaultdict(list)
+     for obj in s3_objects:
+         if "/" in obj.key:
+             dirname = obj.key.rsplit("/", 1)[0]
+         else:
+             dirname = "/"
+         grouped_s3_objs[dirname].append(obj)
+     return grouped_s3_objs
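
For orientation, a small usage sketch of the `group_s3_objects_by_dirname` helper added above. The `SimpleNamespace` objects are stand-ins for boto3 `ObjectSummary` instances (the helper only reads the `.key` attribute), so the snippet runs without an S3 connection.

```python
from types import SimpleNamespace

from datahub.ingestion.source.aws.s3_util import group_s3_objects_by_dirname

# Fake "ObjectSummary" records: only the .key attribute is needed for grouping.
objects = [
    SimpleNamespace(key="logs/2024/01/part-0000.parquet"),
    SimpleNamespace(key="logs/2024/01/part-0001.parquet"),
    SimpleNamespace(key="logs/2024/02/part-0000.parquet"),
    SimpleNamespace(key="top-level-file.txt"),  # no "/" in the key, so grouped under "/"
]

grouped = group_s3_objects_by_dirname(objects)

assert sorted(grouped) == ["/", "logs/2024/01", "logs/2024/02"]
assert len(grouped["logs/2024/01"]) == 2  # both part files share a "directory"
```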
datahub/ingestion/source/gc/execution_request_cleanup.py CHANGED
@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
      )
  
      keep_history_max_days: int = Field(
-         30,
+         90,
          description="Maximum number of days to keep execution requests for, per ingestion source",
      )
  
@@ -48,6 +48,10 @@
          description="Maximum runtime in seconds for the cleanup task",
      )
  
+     limit_entities_delete: Optional[int] = Field(
+         10000, description="Max number of execution requests to hard delete."
+     )
+
      max_read_errors: int = Field(
          default=10,
          description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
      ergc_delete_errors: int = 0
      ergc_start_time: Optional[datetime.datetime] = None
      ergc_end_time: Optional[datetime.datetime] = None
+     ergc_delete_limit_reached: bool = False
+     ergc_runtime_limit_reached: bool = False
  
  
  class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
          self.graph = graph
          self.report = report
          self.instance_id = int(time.time())
+         self.last_print_time = 0.0
  
          if config is not None:
              self.config = config
          else:
              self.config = DatahubExecutionRequestCleanupConfig()
  
+     def _print_report(self) -> None:
+         time_taken = round(time.time() - self.last_print_time, 1)
+         # Print report every 2 minutes
+         if time_taken > 120:
+             self.last_print_time = time.time()
+             logger.info(f"\n{self.report.as_string()}")
+
      def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
          input_aspect = (
              entry.get("aspects", {})
@@ -175,6 +189,7 @@
          running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
  
          for entry in self._scroll_execution_requests():
+             self._print_report()
              self.report.ergc_records_read += 1
              key = entry.ingestion_source
  
@@ -225,15 +240,12 @@
                      f"record timestamp: {entry.requested_at}."
                  )
              )
-             self.report.ergc_records_deleted += 1
              yield entry
  
      def _delete_entry(self, entry: CleanupRecord) -> None:
          try:
-             logger.info(
-                 f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-             )
              self.graph.delete_entity(entry.urn, True)
+             self.report.ergc_records_deleted += 1
          except Exception as e:
              self.report.ergc_delete_errors += 1
              self.report.failure(
@@ -252,10 +264,23 @@
                  >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
              )
          ):
+             self.report.ergc_runtime_limit_reached = True
              logger.info(f"ergc({self.instance_id}): max runtime reached.")
              return True
          return False
  
+     def _reached_delete_limit(self) -> bool:
+         if (
+             self.config.limit_entities_delete
+             and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+         ):
+             logger.info(
+                 f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+             )
+             self.report.ergc_delete_limit_reached = True
+             return True
+         return False
+
      def run(self) -> None:
          if not self.config.enabled:
              logger.info(
@@ -274,7 +299,7 @@
              )
  
          for entry in self._scroll_garbage_records():
-             if self._reached_runtime_limit():
+             if self._reached_runtime_limit() or self._reached_delete_limit():
                  break
              self._delete_entry(entry)
  
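The cleanup loop above stops on either the runtime budget or the new hard-delete cap. A minimal sketch of overriding these knobs directly on the config model; the field names come from the hunks above, and the values are purely illustrative.

```python
from datahub.ingestion.source.gc.execution_request_cleanup import (
    DatahubExecutionRequestCleanupConfig,
)

config = DatahubExecutionRequestCleanupConfig(
    keep_history_max_days=90,    # matches the new default shown above
    limit_entities_delete=5000,  # stop hard-deleting after 5000 execution requests
    runtime_limit_seconds=1800,  # illustrative runtime budget for the cleanup task
)

print(config.limit_entities_delete)
```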
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py CHANGED
@@ -231,6 +231,15 @@ class SoftDeletedEntitiesCleanup:
      def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
          assert self.ctx.graph
          scroll_id: Optional[str] = None
+
+         batch_size = self.config.batch_size
+         if entity_type == "DATA_PROCESS_INSTANCE":
+             # Due to a bug in Data process instance querying this is a temp workaround
+             # to avoid a giant stacktrace by having a smaller batch size in first call
+             # This will be remove in future version after server with fix has been
+             # around for a while
+             batch_size = 10
+
          while True:
              try:
                  result = self.ctx.graph.execute_graphql(
@@ -240,7 +249,7 @@
                      "types": [entity_type],
                      "query": "*",
                      "scrollId": scroll_id if scroll_id else None,
-                     "count": self.config.batch_size,
+                     "count": batch_size,
                      "orFilters": [
                          {
                              "and": [
@@ -263,6 +272,10 @@
              scroll_across_entities = result.get("scrollAcrossEntities")
              if not scroll_across_entities or not scroll_across_entities.get("count"):
                  break
+             if entity_type == "DATA_PROCESS_INSTANCE":
+                 # Temp workaround. See note in beginning of the function
+                 # We make the batch size = config after call has succeeded once
+                 batch_size = self.config.batch_size
              scroll_id = scroll_across_entities.get("nextScrollId")
              self.report.num_queries_found += scroll_across_entities.get("count")
              for query in scroll_across_entities.get("searchResults"):
datahub/ingestion/source/powerbi_report_server/report_server.py CHANGED
@@ -485,7 +485,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
          self.filtered_reports.append(view)
  
  
- @platform_name("PowerBI")
+ @platform_name("PowerBI Report Server")
  @config_class(PowerBiReportServerDashboardSourceConfig)
  @support_status(SupportStatus.INCUBATING)
  @capability(SourceCapability.OWNERSHIP, "Enabled by default")
datahub/ingestion/source/s3/source.py CHANGED
@@ -6,9 +6,8 @@ import pathlib
  import re
  import time
  from datetime import datetime
- from itertools import groupby
  from pathlib import PurePath
- from typing import Any, Dict, Iterable, List, Optional, Tuple
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
  from urllib.parse import urlparse
  
  import smart_open.compression as so_compression
@@ -41,6 +40,7 @@ from datahub.ingestion.source.aws.s3_util import (
      get_bucket_name,
      get_bucket_relative_path,
      get_key_prefix,
+     group_s3_objects_by_dirname,
      strip_s3_prefix,
  )
  from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
@@ -75,6 +75,9 @@ from datahub.metadata.schema_classes import (
  from datahub.telemetry import stats, telemetry
  from datahub.utilities.perf_timer import PerfTimer
  
+ if TYPE_CHECKING:
+     from mypy_boto3_s3.service_resource import Bucket
+
  # hide annoying debug errors from py4j
  logging.getLogger("py4j").setLevel(logging.ERROR)
  logger: logging.Logger = logging.getLogger(__name__)
@@ -842,7 +845,7 @@ class S3Source(StatefulIngestionSourceBase):
      def get_folder_info(
          self,
          path_spec: PathSpec,
-         bucket: Any, # Todo: proper type
+         bucket: "Bucket",
          prefix: str,
      ) -> List[Folder]:
          """
@@ -857,22 +860,15 @@
  
          Parameters:
              path_spec (PathSpec): The path specification used to determine partitioning.
-             bucket (Any): The S3 bucket object.
+             bucket (Bucket): The S3 bucket object.
              prefix (str): The prefix path in the S3 bucket to list objects from.
  
          Returns:
              List[Folder]: A list of Folder objects representing the partitions found.
          """
-
-         prefix_to_list = prefix
-         files = list(
-             bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
-         )
-         files = sorted(files, key=lambda a: a.last_modified)
-         grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
          partitions: List[Folder] = []
-         for key, group in grouped_files:
+         s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+         for key, group in group_s3_objects_by_dirname(s3_objects).items():
              file_size = 0
              creation_time = None
              modification_time = None
@@ -904,7 +900,7 @@
              Folder(
                  partition_id=id,
                  is_partition=bool(id),
-                 creation_time=creation_time if creation_time else None,
+                 creation_time=creation_time if creation_time else None, # type: ignore[arg-type]
                  modification_time=modification_time,
                  sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
                  size=file_size,
datahub/metadata/_schema_classes.py CHANGED
@@ -14262,7 +14262,7 @@ class DataFlowKeyClass(_Aspect):
  
  
      ASPECT_NAME = 'dataFlowKey'
-     ASPECT_INFO = {'keyForEntity': 'dataFlow', 'entityCategory': 'core', 'entityAspects': ['domains', 'deprecation', 'versionInfo', 'dataFlowInfo', 'editableDataFlowProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'incidentsSummary', 'forms', 'subTypes', 'testResults']}
+     ASPECT_INFO = {'keyForEntity': 'dataFlow', 'entityCategory': 'core', 'entityAspects': ['domains', 'deprecation', 'versionInfo', 'dataFlowInfo', 'editableDataFlowProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'container', 'browsePathsV2', 'structuredProperties', 'incidentsSummary', 'forms', 'subTypes', 'testResults']}
      RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataFlowKey")
  
      def __init__(self,
@@ -14678,7 +14678,7 @@ class DataJobKeyClass(_Aspect):
  
  
      ASPECT_NAME = 'dataJobKey'
-     ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
+     ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'container', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
      RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataJobKey")
  
      def __init__(self,
datahub/metadata/_urns/urn_defs.py CHANGED
@@ -10,7 +10,7 @@
  
  # This file contains classes corresponding to entity URNs.
  
- from typing import ClassVar, List, Optional, Type, TYPE_CHECKING
+ from typing import ClassVar, List, Optional, Type, TYPE_CHECKING, Union
  
  import functools
  from deprecated.sphinx import deprecated as _sphinx_deprecated
@@ -213,7 +213,7 @@ class SchemaFieldUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "schemaField"
      _URN_PARTS: ClassVar[int] = 2
  
-     def __init__(self, parent: str, field_path: str, *, _allow_coercion: bool = True) -> None:
+     def __init__(self, parent: Union["Urn", str], field_path: str, *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
              field_path = UrnEncoder.encode_string(field_path)
@@ -604,7 +604,7 @@ class DataJobUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "dataJob"
      _URN_PARTS: ClassVar[int] = 2
  
-     def __init__(self, flow: str, job_id: str, *, _allow_coercion: bool = True) -> None:
+     def __init__(self, flow: Union["DataFlowUrn", str], job_id: str, *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
              job_id = UrnEncoder.encode_string(job_id)
@@ -1435,10 +1435,10 @@ class DataPlatformInstanceUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "dataPlatformInstance"
      _URN_PARTS: ClassVar[int] = 2
  
-     def __init__(self, platform: str, instance: str, *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], instance: str, *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              instance = UrnEncoder.encode_string(instance)
  
          # Validation logic.
@@ -1678,10 +1678,10 @@ class DatasetUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "dataset"
      _URN_PARTS: ClassVar[int] = 3
  
-     def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              name = UrnEncoder.encode_string(name)
              env = env.upper()
              env = UrnEncoder.encode_string(env)
@@ -1771,10 +1771,10 @@ class MlModelUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "mlModel"
      _URN_PARTS: ClassVar[int] = 3
  
-     def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              name = UrnEncoder.encode_string(name)
              env = env.upper()
              env = UrnEncoder.encode_string(env)
@@ -1889,10 +1889,10 @@ class MlModelDeploymentUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "mlModelDeployment"
      _URN_PARTS: ClassVar[int] = 3
  
-     def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              name = UrnEncoder.encode_string(name)
              env = env.upper()
              env = UrnEncoder.encode_string(env)
@@ -1953,10 +1953,10 @@ class MlFeatureTableUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "mlFeatureTable"
      _URN_PARTS: ClassVar[int] = 2
  
-     def __init__(self, platform: str, name: str, *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], name: str, *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              name = UrnEncoder.encode_string(name)
  
          # Validation logic.
@@ -2385,10 +2385,10 @@ class MlModelGroupUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "mlModelGroup"
      _URN_PARTS: ClassVar[int] = 3
  
-     def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              name = UrnEncoder.encode_string(name)
              env = env.upper()
              env = UrnEncoder.encode_string(env)
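
Taken together, these `Union[...]` signatures let callers pass an existing URN object where previously only a string was accepted, with both forms coercing to the same URN. A minimal sketch, assuming the public re-export module `datahub.metadata.urns`:

```python
from datahub.metadata.urns import DataPlatformUrn, DatasetUrn

# Platform given as a plain string (coerced via DataPlatformUrn internally).
by_name = DatasetUrn(platform="snowflake", name="db.schema.table", env="PROD")

# Platform given as an already-constructed DataPlatformUrn.
by_urn = DatasetUrn(platform=DataPlatformUrn("snowflake"), name="db.schema.table", env="PROD")

assert by_name.urn() == by_urn.urn()
```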
datahub/metadata/schemas/DataFlowKey.avsc CHANGED
@@ -17,6 +17,7 @@
      "glossaryTerms",
      "institutionalMemory",
      "dataPlatformInstance",
+     "container",
      "browsePathsV2",
      "structuredProperties",
      "incidentsSummary",
datahub/metadata/schemas/DataJobKey.avsc CHANGED
@@ -20,6 +20,7 @@
      "glossaryTerms",
      "institutionalMemory",
      "dataPlatformInstance",
+     "container",
      "browsePathsV2",
      "structuredProperties",
      "forms",