acryl-datahub 0.15.0.2rc3__py3-none-any.whl → 0.15.0.2rc4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.

This release of acryl-datahub has been flagged as potentially problematic by the registry.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=qe2wpod5rnOz0vzxn9B60E8BhINzy_rYJpwuvpI0NW8,576
+ datahub/__init__.py,sha256=p_vuZBTCmvG7HqtMpeL-FQZeXWYJszD5ZoagGWD4_-w,576
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -164,7 +164,7 @@ datahub/ingestion/glossary/classifier.py,sha256=zp8Fe3he80H5Zz1EwymKjThUPkTpw6Pg
  datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
  datahub/ingestion/glossary/datahub_classifier.py,sha256=8VhwuLDhyOqqOr0jqAPIgorb4eAOnvTr4m13Y2Wy1-E,7515
  datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/graph/client.py,sha256=R50K7NmE3TYgVXvdLnvLZn7N0fkiCXOK0MoJz9ueglA,64963
+ datahub/ingestion/graph/client.py,sha256=tXAzgeUnNfawgKDwKZsAuSDP0zAkhTKQb07Y3gNksgY,65056
  datahub/ingestion/graph/config.py,sha256=_oha8Je7P80ZmrkZUAaRHyYbdMmTkMI5JkYjEP2Ri1Q,751
  datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
  datahub/ingestion/graph/filters.py,sha256=UeUZQHoimavIYx-jXLA0WGkOUe10TaO8uEZkfa-QgNE,6188
@@ -217,9 +217,9 @@ datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm
  datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
  datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
- datahub/ingestion/source/aws/glue.py,sha256=lJW3QHHz1_SWqLEB-vUSTxSuL0EgUQ0ptdQns_NLNds,57343
+ datahub/ingestion/source/aws/glue.py,sha256=DcGZzaeY6hVa1wN74o6spNH1B6OFu_6Mn4uoRdlZa44,57647
  datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
- datahub/ingestion/source/aws/s3_util.py,sha256=OFypcgmVC6jnZM90-gjcPpAMtTV1lbnreCaMhCzNlzs,2149
+ datahub/ingestion/source/aws/s3_util.py,sha256=pikTe9SuiKdN-TZ8eOhB0PYq0aUgUPDpxwtTLsVofRs,2834
  datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
  datahub/ingestion/source/aws/sagemaker_processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/aws/sagemaker_processors/common.py,sha256=NvYfI8LHgDvhEZE7qp6qF1NSZ0_SQKhg3ivtdjsdpFg,2172
@@ -304,8 +304,8 @@ datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57
  datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gc/datahub_gc.py,sha256=6O-TxU2uCJ1Y8NNzJDufUd3ymapo--E3hTeIuy_QDtY,12763
  datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeNOjEingKpYy-XvCPeaAb4k,17125
- datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=9jsyCIspWSSYSAVPHjKHr05885rXxM6FCH7KzTBceic,10139
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=zRtgC_AcZui4qGf9jBASI3R-CrYZxNe3Pm-gNSLT3rw,11420
+ datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=VbZ-Xzryl5TMRapu7nlxlsXS8T8lFZcHK9AJnEadJ8Q,11111
+ datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=PTx1XmD4Jv9SzXzHqUbFpT3vKGCtkD01MeBUaq1p4no,12083
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
@@ -376,7 +376,7 @@ datahub/ingestion/source/powerbi/rest_api_wrapper/profiling_utils.py,sha256=bgcP
  datahub/ingestion/source/powerbi/rest_api_wrapper/query.py,sha256=VNw1Uvli6g0pnu9FpigYmnCdEPbVEipz7vdZU_WmHf4,616
  datahub/ingestion/source/powerbi_report_server/__init__.py,sha256=N9fGcrHXBbuPmx9rpGjd_jkMC3smXmfiwISDP1QZapk,324
  datahub/ingestion/source/powerbi_report_server/constants.py,sha256=i_hXcvPHjwk3VpTT5ef7s8dN9F6pJzPyRUiG5UzCCYI,3544
- datahub/ingestion/source/powerbi_report_server/report_server.py,sha256=lW7Xtdw8SlZRo2njvg4o5h7mdflxlGzovGCPjPUvmaU,20162
+ datahub/ingestion/source/powerbi_report_server/report_server.py,sha256=Xsvu_FeteYNyLW_U0pER94-zQLLGUzU5tUEkhsLTQ2Y,20176
  datahub/ingestion/source/powerbi_report_server/report_server_domain.py,sha256=bBYr9fz4zPEFeZZOkldzKm4SBMQdisdp-MMtaYI0n3s,11783
  datahub/ingestion/source/profiling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/profiling/common.py,sha256=4sZ58AeBV64KRfKAgjkg-UyNjAc3YERahQMmW4algAw,1426
@@ -403,7 +403,7 @@ datahub/ingestion/source/s3/config.py,sha256=Zs1nrBZKLImteZreIcSMMRLj8vBGgxakNDs
  datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=k7S9Xcmgr3-CvWrd5NEX-V8JSrcAwkm7vbHPTVZicow,3620
  datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
  datahub/ingestion/source/s3/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
- datahub/ingestion/source/s3/source.py,sha256=USjq86vUU7hKYKi8bhplBhHOjvoQTgguy91uFE24rUI,47336
+ datahub/ingestion/source/s3/source.py,sha256=GXnyvMdr6dKaXdGros8GHEFwh1jwWiX3pKebvWxO2n8,47264
  datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/sac/sac.py,sha256=zPSO9ukuyhvNaaVzeAYpA-_sFma_XMcCQMPaGvDWuTk,30226
  datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
@@ -566,12 +566,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
  datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
  datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
- datahub/metadata/_schema_classes.py,sha256=IAWpWPxOeGmvmc96dapE0CySk1Rikbh-YieT-K9YTMY,964636
+ datahub/metadata/_schema_classes.py,sha256=gWKn2rcsptEOQn4wWc7rZyeyXvDf4v7Q2UbIr9oU6Cg,964662
  datahub/metadata/schema.avsc,sha256=CeVb_Z7k0e5kmeqDUXUW7JDL6KSKBCdfAZzqRI_mLZo,729869
  datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
- datahub/metadata/_urns/urn_defs.py,sha256=gcUHCVwelD5aSuPgE1vmao242tQQiHe2m9kH8Fs1y1E,107287
+ datahub/metadata/_urns/urn_defs.py,sha256=UNmGpVCcFB9_mXuSA4V3xwD_WtHaJ8WHjLTtdP7ojoQ,107852
  datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
  datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
  datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
@@ -678,7 +678,7 @@ datahub/metadata/schemas/DataContractKey.avsc,sha256=Oceu7P26--E0812IFrX3RiEY0Kt
  datahub/metadata/schemas/DataContractProperties.avsc,sha256=RCxuJMlZwqEE0iHTpuXvcH6zRFoOt7ysQFPrJRp3RqE,4763
  datahub/metadata/schemas/DataContractStatus.avsc,sha256=5yvT43AIB13Dn_h0-4s7fsL7BTuXhkK5pi2KJug4_qg,1029
  datahub/metadata/schemas/DataFlowInfo.avsc,sha256=tDRTd1rA3v_7kwUVbQbb-cuo6D-t3pcuE4fiRz4D8f0,4682
- datahub/metadata/schemas/DataFlowKey.avsc,sha256=Dt4mywv7C7Co4Ui4W-2WyRaLk_QkKHopwa7hWiYXo-w,1304
+ datahub/metadata/schemas/DataFlowKey.avsc,sha256=4QVVgv4SICNkRMQbQjxGtSXuMHqlKEno9BgTstDxNvU,1323
  datahub/metadata/schemas/DataHubAccessTokenInfo.avsc,sha256=WS77M5w7GJFxUAiyXaxUvBqO0XFV2FnKPxXSXYbXHTE,1646
  datahub/metadata/schemas/DataHubAccessTokenKey.avsc,sha256=3EspNIxgb_I4WwV0a2o4NJOB5yODVr9J-wZzkZanEgo,483
  datahub/metadata/schemas/DataHubActionKey.avsc,sha256=bjiKcoyvUPQKaGUi2ICBMJ_ukwnt7dh0szJS4WBZE0A,448
@@ -705,7 +705,7 @@ datahub/metadata/schemas/DataHubViewInfo.avsc,sha256=U3fBIoG9ietLUpOknfQGNekqBdP
  datahub/metadata/schemas/DataHubViewKey.avsc,sha256=p53axIdSVbubo3r23Vpsed7NqRcQBMGveVikEHAVAok,424
  datahub/metadata/schemas/DataJobInfo.avsc,sha256=--obUbt_4X2paB39EeRKP13sBSiK-r0nq070EamoV1w,7212
  datahub/metadata/schemas/DataJobInputOutput.avsc,sha256=H1O8eAzZV34tvULdu67iBSWkdn08rt7wS208b8Nisbk,15268
- datahub/metadata/schemas/DataJobKey.avsc,sha256=4F3myS-O6n7AlUqTvCkMSFvsYAjVhUq6uaQVbqLoYdM,1583
+ datahub/metadata/schemas/DataJobKey.avsc,sha256=fVlqnz1Ljan3YKzDTrMRFD3z09Ge7Gqt4aor5K_AgkY,1602
  datahub/metadata/schemas/DataPlatformInfo.avsc,sha256=WGPFumBNHbR75vsLrivnRCbBc8vSCuxDw2UlylMieh4,2686
  datahub/metadata/schemas/DataPlatformInstance.avsc,sha256=SNd3v_YyyLaDflv8Rd5cQR9GrVuky_cDTkYM6FqJiM8,1058
  datahub/metadata/schemas/DataPlatformInstanceKey.avsc,sha256=sXUV5EMT6N-x8d6s8ebcJ5JdFIOsJCtiiU5Jtm-ncIk,800
@@ -986,8 +986,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.15.0.2rc3.dist-info/METADATA,sha256=OWdxPqF4pJ1N91lZlAy8xWSInWXiBRWM_T8YmesJsyo,173662
- acryl_datahub-0.15.0.2rc3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- acryl_datahub-0.15.0.2rc3.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
- acryl_datahub-0.15.0.2rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.15.0.2rc3.dist-info/RECORD,,
+ acryl_datahub-0.15.0.2rc4.dist-info/METADATA,sha256=27j9HorUgTZCHO9ZGywLNkPIBkNDl5RHU6xVV_v5fpI,173662
+ acryl_datahub-0.15.0.2rc4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ acryl_datahub-0.15.0.2rc4.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+ acryl_datahub-0.15.0.2rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.15.0.2rc4.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
  
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.15.0.2rc3"
+ __version__ = "0.15.0.2rc4"
  
  
  def is_dev_mode() -> bool:
datahub/ingestion/graph/client.py CHANGED
@@ -248,9 +248,11 @@ class DataHubGraph(DatahubRestEmitter):
          with DatahubRestSink(PipelineContext(run_id=run_id), sink_config) as sink:
              yield sink
          if sink.report.failures:
+             logger.error(
+                 f"Failed to emit {len(sink.report.failures)} records\n{sink.report.as_string()}"
+             )
              raise OperationalError(
-                 f"Failed to emit {len(sink.report.failures)} records",
-                 info=sink.report.as_obj(),
+                 f"Failed to emit {len(sink.report.failures)} records"
              )
  
      def emit_all(
datahub/ingestion/source/aws/glue.py CHANGED
@@ -248,6 +248,9 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
      "Enabled by default when stateful ingestion is turned on.",
  )
  @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+ @capability(
+     SourceCapability.LINEAGE_FINE, "Support via the `emit_s3_lineage` config field"
+ )
  class GlueSource(StatefulIngestionSourceBase):
      """
      Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub.
@@ -284,12 +287,22 @@ class GlueSource(StatefulIngestionSourceBase):
          "Action": [
              "glue:GetDataflowGraph",
              "glue:GetJobs",
+             "s3:GetObject",
          ],
          "Resource": "*"
      }
      ```
  
-     plus `s3:GetObject` for the job script locations.
+     For profiling datasets, the following additional permissions are required:
+     ```json
+     {
+         "Effect": "Allow",
+         "Action": [
+             "glue:GetPartitions",
+         ],
+         "Resource": "*"
+     }
+     ```
  
      """
  
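The fine-grained lineage capability added above is driven by the Glue source's `emit_s3_lineage` config flag. A hedged sketch of enabling it in a programmatic recipe; the region and the console sink are placeholders.

```python
from datahub.ingestion.run.pipeline import Pipeline

# Illustrative recipe: ingest Glue metadata and emit S3 lineage edges.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "glue",
            "config": {
                "aws_region": "us-east-1",  # placeholder region
                "emit_s3_lineage": True,    # the config field named in the capability text above
            },
        },
        "sink": {"type": "console"},        # placeholder sink for local testing
    }
)
pipeline.run()
pipeline.raise_from_status()
```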
datahub/ingestion/source/aws/s3_util.py CHANGED
@@ -1,6 +1,11 @@
  import logging
  import os
- from typing import Optional
+ from collections import defaultdict
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional
+
+ if TYPE_CHECKING:
+     from mypy_boto3_s3.service_resource import ObjectSummary
+
  
  S3_PREFIXES = ["s3://", "s3n://", "s3a://"]
  
@@ -68,3 +73,21 @@ def get_key_prefix(s3_uri: str) -> str:
          f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}"
      )
      return strip_s3_prefix(s3_uri).split("/", maxsplit=1)[1]
+
+
+ def group_s3_objects_by_dirname(
+     s3_objects: Iterable["ObjectSummary"],
+ ) -> Dict[str, List["ObjectSummary"]]:
+     """
+     Groups S3 objects by their directory name.
+
+     If a s3_object in the root directory (i.e., s3://bucket/file.txt), it is grouped under '/'.
+     """
+     grouped_s3_objs = defaultdict(list)
+     for obj in s3_objects:
+         if "/" in obj.key:
+             dirname = obj.key.rsplit("/", 1)[0]
+         else:
+             dirname = "/"
+         grouped_s3_objs[dirname].append(obj)
+     return grouped_s3_objs
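
For orientation, a small usage sketch of the `group_s3_objects_by_dirname` helper added above. The `SimpleNamespace` objects are stand-ins for boto3 `ObjectSummary` instances (the helper only reads the `.key` attribute), so the snippet runs without an S3 connection.

```python
from types import SimpleNamespace

from datahub.ingestion.source.aws.s3_util import group_s3_objects_by_dirname

# Fake "ObjectSummary" records: only the .key attribute is needed for grouping.
objects = [
    SimpleNamespace(key="logs/2024/01/part-0000.parquet"),
    SimpleNamespace(key="logs/2024/01/part-0001.parquet"),
    SimpleNamespace(key="logs/2024/02/part-0000.parquet"),
    SimpleNamespace(key="top-level-file.txt"),  # no "/" in the key, so grouped under "/"
]

grouped = group_s3_objects_by_dirname(objects)

assert sorted(grouped) == ["/", "logs/2024/01", "logs/2024/02"]
assert len(grouped["logs/2024/01"]) == 2  # both part files share a "directory"
```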
datahub/ingestion/source/gc/execution_request_cleanup.py CHANGED
@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
      )
  
      keep_history_max_days: int = Field(
-         30,
+         90,
          description="Maximum number of days to keep execution requests for, per ingestion source",
      )
  
@@ -48,6 +48,10 @@
          description="Maximum runtime in seconds for the cleanup task",
      )
  
+     limit_entities_delete: Optional[int] = Field(
+         10000, description="Max number of execution requests to hard delete."
+     )
+
      max_read_errors: int = Field(
          default=10,
          description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
      ergc_delete_errors: int = 0
      ergc_start_time: Optional[datetime.datetime] = None
      ergc_end_time: Optional[datetime.datetime] = None
+     ergc_delete_limit_reached: bool = False
+     ergc_runtime_limit_reached: bool = False
  
  
  class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
          self.graph = graph
          self.report = report
          self.instance_id = int(time.time())
+         self.last_print_time = 0.0
  
          if config is not None:
              self.config = config
          else:
              self.config = DatahubExecutionRequestCleanupConfig()
  
+     def _print_report(self) -> None:
+         time_taken = round(time.time() - self.last_print_time, 1)
+         # Print report every 2 minutes
+         if time_taken > 120:
+             self.last_print_time = time.time()
+             logger.info(f"\n{self.report.as_string()}")
+
      def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
          input_aspect = (
              entry.get("aspects", {})
@@ -175,6 +189,7 @@
          running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
  
          for entry in self._scroll_execution_requests():
+             self._print_report()
              self.report.ergc_records_read += 1
              key = entry.ingestion_source
  
@@ -225,15 +240,12 @@
                      f"record timestamp: {entry.requested_at}."
                  )
              )
-             self.report.ergc_records_deleted += 1
              yield entry
  
      def _delete_entry(self, entry: CleanupRecord) -> None:
          try:
-             logger.info(
-                 f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-             )
              self.graph.delete_entity(entry.urn, True)
+             self.report.ergc_records_deleted += 1
          except Exception as e:
              self.report.ergc_delete_errors += 1
              self.report.failure(
@@ -252,10 +264,23 @@
                  >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
              )
          ):
+             self.report.ergc_runtime_limit_reached = True
              logger.info(f"ergc({self.instance_id}): max runtime reached.")
              return True
          return False
  
+     def _reached_delete_limit(self) -> bool:
+         if (
+             self.config.limit_entities_delete
+             and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+         ):
+             logger.info(
+                 f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+             )
+             self.report.ergc_delete_limit_reached = True
+             return True
+         return False
+
      def run(self) -> None:
          if not self.config.enabled:
              logger.info(
@@ -274,7 +299,7 @@
              )
  
          for entry in self._scroll_garbage_records():
-             if self._reached_runtime_limit():
+             if self._reached_runtime_limit() or self._reached_delete_limit():
                  break
              self._delete_entry(entry)
  
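The cleanup loop above stops on either the runtime budget or the new hard-delete cap. A minimal sketch of overriding these knobs directly on the config model; the field names come from the hunks above, and the values are purely illustrative.

```python
from datahub.ingestion.source.gc.execution_request_cleanup import (
    DatahubExecutionRequestCleanupConfig,
)

config = DatahubExecutionRequestCleanupConfig(
    keep_history_max_days=90,    # matches the new default shown above
    limit_entities_delete=5000,  # stop hard-deleting after 5000 execution requests
    runtime_limit_seconds=1800,  # illustrative runtime budget for the cleanup task
)

print(config.limit_entities_delete)
```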
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py CHANGED
@@ -231,6 +231,15 @@ class SoftDeletedEntitiesCleanup:
      def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
          assert self.ctx.graph
          scroll_id: Optional[str] = None
+
+         batch_size = self.config.batch_size
+         if entity_type == "DATA_PROCESS_INSTANCE":
+             # Due to a bug in Data process instance querying this is a temp workaround
+             # to avoid a giant stacktrace by having a smaller batch size in first call
+             # This will be remove in future version after server with fix has been
+             # around for a while
+             batch_size = 10
+
          while True:
              try:
                  result = self.ctx.graph.execute_graphql(
@@ -240,7 +249,7 @@
                      "types": [entity_type],
                      "query": "*",
                      "scrollId": scroll_id if scroll_id else None,
-                     "count": self.config.batch_size,
+                     "count": batch_size,
                      "orFilters": [
                          {
                              "and": [
@@ -263,6 +272,10 @@
              scroll_across_entities = result.get("scrollAcrossEntities")
              if not scroll_across_entities or not scroll_across_entities.get("count"):
                  break
+             if entity_type == "DATA_PROCESS_INSTANCE":
+                 # Temp workaround. See note in beginning of the function
+                 # We make the batch size = config after call has succeeded once
+                 batch_size = self.config.batch_size
              scroll_id = scroll_across_entities.get("nextScrollId")
              self.report.num_queries_found += scroll_across_entities.get("count")
              for query in scroll_across_entities.get("searchResults"):
datahub/ingestion/source/powerbi_report_server/report_server.py CHANGED
@@ -485,7 +485,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
          self.filtered_reports.append(view)
  
  
- @platform_name("PowerBI")
+ @platform_name("PowerBI Report Server")
  @config_class(PowerBiReportServerDashboardSourceConfig)
  @support_status(SupportStatus.INCUBATING)
  @capability(SourceCapability.OWNERSHIP, "Enabled by default")
datahub/ingestion/source/s3/source.py CHANGED
@@ -6,9 +6,8 @@ import pathlib
  import re
  import time
  from datetime import datetime
- from itertools import groupby
  from pathlib import PurePath
- from typing import Any, Dict, Iterable, List, Optional, Tuple
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
  from urllib.parse import urlparse
  
  import smart_open.compression as so_compression
@@ -41,6 +40,7 @@ from datahub.ingestion.source.aws.s3_util import (
      get_bucket_name,
      get_bucket_relative_path,
      get_key_prefix,
+     group_s3_objects_by_dirname,
      strip_s3_prefix,
  )
  from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
@@ -75,6 +75,9 @@ from datahub.metadata.schema_classes import (
  from datahub.telemetry import stats, telemetry
  from datahub.utilities.perf_timer import PerfTimer
  
+ if TYPE_CHECKING:
+     from mypy_boto3_s3.service_resource import Bucket
+
  # hide annoying debug errors from py4j
  logging.getLogger("py4j").setLevel(logging.ERROR)
  logger: logging.Logger = logging.getLogger(__name__)
@@ -842,7 +845,7 @@ class S3Source(StatefulIngestionSourceBase):
      def get_folder_info(
          self,
          path_spec: PathSpec,
-         bucket: Any, # Todo: proper type
+         bucket: "Bucket",
          prefix: str,
      ) -> List[Folder]:
          """
@@ -857,22 +860,15 @@
  
          Parameters:
              path_spec (PathSpec): The path specification used to determine partitioning.
-             bucket (Any): The S3 bucket object.
+             bucket (Bucket): The S3 bucket object.
              prefix (str): The prefix path in the S3 bucket to list objects from.
  
          Returns:
              List[Folder]: A list of Folder objects representing the partitions found.
          """
-
-         prefix_to_list = prefix
-         files = list(
-             bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
-         )
-         files = sorted(files, key=lambda a: a.last_modified)
-         grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
          partitions: List[Folder] = []
-         for key, group in grouped_files:
+         s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+         for key, group in group_s3_objects_by_dirname(s3_objects).items():
              file_size = 0
              creation_time = None
              modification_time = None
@@ -904,7 +900,7 @@
              Folder(
                  partition_id=id,
                  is_partition=bool(id),
-                 creation_time=creation_time if creation_time else None,
+                 creation_time=creation_time if creation_time else None, # type: ignore[arg-type]
                  modification_time=modification_time,
                  sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
                  size=file_size,
datahub/metadata/_schema_classes.py CHANGED
@@ -14262,7 +14262,7 @@ class DataFlowKeyClass(_Aspect):
  
  
      ASPECT_NAME = 'dataFlowKey'
-     ASPECT_INFO = {'keyForEntity': 'dataFlow', 'entityCategory': 'core', 'entityAspects': ['domains', 'deprecation', 'versionInfo', 'dataFlowInfo', 'editableDataFlowProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'incidentsSummary', 'forms', 'subTypes', 'testResults']}
+     ASPECT_INFO = {'keyForEntity': 'dataFlow', 'entityCategory': 'core', 'entityAspects': ['domains', 'deprecation', 'versionInfo', 'dataFlowInfo', 'editableDataFlowProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'container', 'browsePathsV2', 'structuredProperties', 'incidentsSummary', 'forms', 'subTypes', 'testResults']}
      RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataFlowKey")
  
      def __init__(self,
@@ -14678,7 +14678,7 @@ class DataJobKeyClass(_Aspect):
  
  
      ASPECT_NAME = 'dataJobKey'
-     ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
+     ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'container', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
      RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataJobKey")
  
      def __init__(self,
datahub/metadata/_urns/urn_defs.py CHANGED
@@ -10,7 +10,7 @@
  
  # This file contains classes corresponding to entity URNs.
  
- from typing import ClassVar, List, Optional, Type, TYPE_CHECKING
+ from typing import ClassVar, List, Optional, Type, TYPE_CHECKING, Union
  
  import functools
  from deprecated.sphinx import deprecated as _sphinx_deprecated
@@ -213,7 +213,7 @@ class SchemaFieldUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "schemaField"
      _URN_PARTS: ClassVar[int] = 2
  
-     def __init__(self, parent: str, field_path: str, *, _allow_coercion: bool = True) -> None:
+     def __init__(self, parent: Union["Urn", str], field_path: str, *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
              field_path = UrnEncoder.encode_string(field_path)
@@ -604,7 +604,7 @@ class DataJobUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "dataJob"
      _URN_PARTS: ClassVar[int] = 2
  
-     def __init__(self, flow: str, job_id: str, *, _allow_coercion: bool = True) -> None:
+     def __init__(self, flow: Union["DataFlowUrn", str], job_id: str, *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
              job_id = UrnEncoder.encode_string(job_id)
@@ -1435,10 +1435,10 @@ class DataPlatformInstanceUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "dataPlatformInstance"
      _URN_PARTS: ClassVar[int] = 2
  
-     def __init__(self, platform: str, instance: str, *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], instance: str, *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              instance = UrnEncoder.encode_string(instance)
  
          # Validation logic.
@@ -1678,10 +1678,10 @@ class DatasetUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "dataset"
      _URN_PARTS: ClassVar[int] = 3
  
-     def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              name = UrnEncoder.encode_string(name)
              env = env.upper()
              env = UrnEncoder.encode_string(env)
@@ -1771,10 +1771,10 @@ class MlModelUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "mlModel"
      _URN_PARTS: ClassVar[int] = 3
  
-     def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              name = UrnEncoder.encode_string(name)
              env = env.upper()
              env = UrnEncoder.encode_string(env)
@@ -1889,10 +1889,10 @@ class MlModelDeploymentUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "mlModelDeployment"
      _URN_PARTS: ClassVar[int] = 3
  
-     def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              name = UrnEncoder.encode_string(name)
              env = env.upper()
              env = UrnEncoder.encode_string(env)
@@ -1953,10 +1953,10 @@ class MlFeatureTableUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "mlFeatureTable"
      _URN_PARTS: ClassVar[int] = 2
  
-     def __init__(self, platform: str, name: str, *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], name: str, *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              name = UrnEncoder.encode_string(name)
  
          # Validation logic.
@@ -2385,10 +2385,10 @@ class MlModelGroupUrn(_SpecificUrn):
      ENTITY_TYPE: ClassVar[str] = "mlModelGroup"
      _URN_PARTS: ClassVar[int] = 3
  
-     def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+     def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
          if _allow_coercion:
              # Field coercion logic (if any is required).
-             platform = DataPlatformUrn(platform).urn()
+             platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
              name = UrnEncoder.encode_string(name)
              env = env.upper()
              env = UrnEncoder.encode_string(env)
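
Taken together, these `Union[...]` signatures let callers pass an existing URN object where previously only a string was accepted, with both forms coercing to the same URN. A minimal sketch, assuming the public re-export module `datahub.metadata.urns`:

```python
from datahub.metadata.urns import DataPlatformUrn, DatasetUrn

# Platform given as a plain string (coerced via DataPlatformUrn internally).
by_name = DatasetUrn(platform="snowflake", name="db.schema.table", env="PROD")

# Platform given as an already-constructed DataPlatformUrn.
by_urn = DatasetUrn(platform=DataPlatformUrn("snowflake"), name="db.schema.table", env="PROD")

assert by_name.urn() == by_urn.urn()
```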
datahub/metadata/schemas/DataFlowKey.avsc CHANGED
@@ -17,6 +17,7 @@
      "glossaryTerms",
      "institutionalMemory",
      "dataPlatformInstance",
+     "container",
      "browsePathsV2",
      "structuredProperties",
      "incidentsSummary",
datahub/metadata/schemas/DataJobKey.avsc CHANGED
@@ -20,6 +20,7 @@
      "glossaryTerms",
      "institutionalMemory",
      "dataPlatformInstance",
+     "container",
      "browsePathsV2",
      "structuredProperties",
      "forms",