acryl-datahub 0.15.0.2rc3__py3-none-any.whl → 0.15.0.2rc4__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/METADATA +2405 -2405
- {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/RECORD +17 -17
- datahub/__init__.py +1 -1
- datahub/ingestion/graph/client.py +4 -2
- datahub/ingestion/source/aws/glue.py +14 -1
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +14 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/s3/source.py +10 -14
- datahub/metadata/_schema_classes.py +2 -2
- datahub/metadata/_urns/urn_defs.py +15 -15
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/RECORD RENAMED

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=p_vuZBTCmvG7HqtMpeL-FQZeXWYJszD5ZoagGWD4_-w,576
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -164,7 +164,7 @@ datahub/ingestion/glossary/classifier.py,sha256=zp8Fe3he80H5Zz1EwymKjThUPkTpw6Pg
 datahub/ingestion/glossary/classifier_registry.py,sha256=yFOYLQhDgCLqXYMG3L1BquXafeLcZDcmp8meyw6k9ts,307
 datahub/ingestion/glossary/datahub_classifier.py,sha256=8VhwuLDhyOqqOr0jqAPIgorb4eAOnvTr4m13Y2Wy1-E,7515
 datahub/ingestion/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/graph/client.py,sha256=
+datahub/ingestion/graph/client.py,sha256=tXAzgeUnNfawgKDwKZsAuSDP0zAkhTKQb07Y3gNksgY,65056
 datahub/ingestion/graph/config.py,sha256=_oha8Je7P80ZmrkZUAaRHyYbdMmTkMI5JkYjEP2Ri1Q,751
 datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
 datahub/ingestion/graph/filters.py,sha256=UeUZQHoimavIYx-jXLA0WGkOUe10TaO8uEZkfa-QgNE,6188
@@ -217,9 +217,9 @@ datahub/ingestion/source/abs/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm
 datahub/ingestion/source/abs/source.py,sha256=pzxW-R_cWGKPneEhX8JWdTZiX2k1kAZOPKgMxp9mAEI,24533
 datahub/ingestion/source/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/aws/aws_common.py,sha256=DfdQgkJ_s2isFx8WvqKTlAcBk4KE8SgfpmA5BgC3fgY,17716
-datahub/ingestion/source/aws/glue.py,sha256=
+datahub/ingestion/source/aws/glue.py,sha256=DcGZzaeY6hVa1wN74o6spNH1B6OFu_6Mn4uoRdlZa44,57647
 datahub/ingestion/source/aws/s3_boto_utils.py,sha256=Wyp9k9tapsCuw9dyH4FCXJr_wmeLaYFoCtKvrV6SEDk,3892
-datahub/ingestion/source/aws/s3_util.py,sha256=
+datahub/ingestion/source/aws/s3_util.py,sha256=pikTe9SuiKdN-TZ8eOhB0PYq0aUgUPDpxwtTLsVofRs,2834
 datahub/ingestion/source/aws/sagemaker.py,sha256=Bl2tkBYnrindgx61VHYgNovUF_Kp_fXNcivQn28vC2w,5254
 datahub/ingestion/source/aws/sagemaker_processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/aws/sagemaker_processors/common.py,sha256=NvYfI8LHgDvhEZE7qp6qF1NSZ0_SQKhg3ivtdjsdpFg,2172
@@ -304,8 +304,8 @@ datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57
 datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/gc/datahub_gc.py,sha256=6O-TxU2uCJ1Y8NNzJDufUd3ymapo--E3hTeIuy_QDtY,12763
 datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=86Tm3NNWMf0xM4TklNIEeNOjEingKpYy-XvCPeaAb4k,17125
-datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=
-datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=
+datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=VbZ-Xzryl5TMRapu7nlxlsXS8T8lFZcHK9AJnEadJ8Q,11111
+datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=PTx1XmD4Jv9SzXzHqUbFpT3vKGCtkD01MeBUaq1p4no,12083
 datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/gcs/gcs_source.py,sha256=iwvj4JwjyVWRP1Vq106sUtQhh0GuOYVSu9zCa1wCZN0,6189
 datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
@@ -376,7 +376,7 @@ datahub/ingestion/source/powerbi/rest_api_wrapper/profiling_utils.py,sha256=bgcP
 datahub/ingestion/source/powerbi/rest_api_wrapper/query.py,sha256=VNw1Uvli6g0pnu9FpigYmnCdEPbVEipz7vdZU_WmHf4,616
 datahub/ingestion/source/powerbi_report_server/__init__.py,sha256=N9fGcrHXBbuPmx9rpGjd_jkMC3smXmfiwISDP1QZapk,324
 datahub/ingestion/source/powerbi_report_server/constants.py,sha256=i_hXcvPHjwk3VpTT5ef7s8dN9F6pJzPyRUiG5UzCCYI,3544
-datahub/ingestion/source/powerbi_report_server/report_server.py,sha256=
+datahub/ingestion/source/powerbi_report_server/report_server.py,sha256=Xsvu_FeteYNyLW_U0pER94-zQLLGUzU5tUEkhsLTQ2Y,20176
 datahub/ingestion/source/powerbi_report_server/report_server_domain.py,sha256=bBYr9fz4zPEFeZZOkldzKm4SBMQdisdp-MMtaYI0n3s,11783
 datahub/ingestion/source/profiling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/profiling/common.py,sha256=4sZ58AeBV64KRfKAgjkg-UyNjAc3YERahQMmW4algAw,1426
@@ -403,7 +403,7 @@ datahub/ingestion/source/s3/config.py,sha256=Zs1nrBZKLImteZreIcSMMRLj8vBGgxakNDs
 datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=k7S9Xcmgr3-CvWrd5NEX-V8JSrcAwkm7vbHPTVZicow,3620
 datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
 datahub/ingestion/source/s3/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
-datahub/ingestion/source/s3/source.py,sha256=
+datahub/ingestion/source/s3/source.py,sha256=GXnyvMdr6dKaXdGros8GHEFwh1jwWiX3pKebvWxO2n8,47264
 datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sac/sac.py,sha256=zPSO9ukuyhvNaaVzeAYpA-_sFma_XMcCQMPaGvDWuTk,30226
 datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
@@ -566,12 +566,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
 datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
 datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
 datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
-datahub/metadata/_schema_classes.py,sha256=
+datahub/metadata/_schema_classes.py,sha256=gWKn2rcsptEOQn4wWc7rZyeyXvDf4v7Q2UbIr9oU6Cg,964662
 datahub/metadata/schema.avsc,sha256=CeVb_Z7k0e5kmeqDUXUW7JDL6KSKBCdfAZzqRI_mLZo,729869
 datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
 datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
 datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
-datahub/metadata/_urns/urn_defs.py,sha256=
+datahub/metadata/_urns/urn_defs.py,sha256=UNmGpVCcFB9_mXuSA4V3xwD_WtHaJ8WHjLTtdP7ojoQ,107852
 datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
 datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
 datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
@@ -678,7 +678,7 @@ datahub/metadata/schemas/DataContractKey.avsc,sha256=Oceu7P26--E0812IFrX3RiEY0Kt
 datahub/metadata/schemas/DataContractProperties.avsc,sha256=RCxuJMlZwqEE0iHTpuXvcH6zRFoOt7ysQFPrJRp3RqE,4763
 datahub/metadata/schemas/DataContractStatus.avsc,sha256=5yvT43AIB13Dn_h0-4s7fsL7BTuXhkK5pi2KJug4_qg,1029
 datahub/metadata/schemas/DataFlowInfo.avsc,sha256=tDRTd1rA3v_7kwUVbQbb-cuo6D-t3pcuE4fiRz4D8f0,4682
-datahub/metadata/schemas/DataFlowKey.avsc,sha256=
+datahub/metadata/schemas/DataFlowKey.avsc,sha256=4QVVgv4SICNkRMQbQjxGtSXuMHqlKEno9BgTstDxNvU,1323
 datahub/metadata/schemas/DataHubAccessTokenInfo.avsc,sha256=WS77M5w7GJFxUAiyXaxUvBqO0XFV2FnKPxXSXYbXHTE,1646
 datahub/metadata/schemas/DataHubAccessTokenKey.avsc,sha256=3EspNIxgb_I4WwV0a2o4NJOB5yODVr9J-wZzkZanEgo,483
 datahub/metadata/schemas/DataHubActionKey.avsc,sha256=bjiKcoyvUPQKaGUi2ICBMJ_ukwnt7dh0szJS4WBZE0A,448
@@ -705,7 +705,7 @@ datahub/metadata/schemas/DataHubViewInfo.avsc,sha256=U3fBIoG9ietLUpOknfQGNekqBdP
 datahub/metadata/schemas/DataHubViewKey.avsc,sha256=p53axIdSVbubo3r23Vpsed7NqRcQBMGveVikEHAVAok,424
 datahub/metadata/schemas/DataJobInfo.avsc,sha256=--obUbt_4X2paB39EeRKP13sBSiK-r0nq070EamoV1w,7212
 datahub/metadata/schemas/DataJobInputOutput.avsc,sha256=H1O8eAzZV34tvULdu67iBSWkdn08rt7wS208b8Nisbk,15268
-datahub/metadata/schemas/DataJobKey.avsc,sha256=
+datahub/metadata/schemas/DataJobKey.avsc,sha256=fVlqnz1Ljan3YKzDTrMRFD3z09Ge7Gqt4aor5K_AgkY,1602
 datahub/metadata/schemas/DataPlatformInfo.avsc,sha256=WGPFumBNHbR75vsLrivnRCbBc8vSCuxDw2UlylMieh4,2686
 datahub/metadata/schemas/DataPlatformInstance.avsc,sha256=SNd3v_YyyLaDflv8Rd5cQR9GrVuky_cDTkYM6FqJiM8,1058
 datahub/metadata/schemas/DataPlatformInstanceKey.avsc,sha256=sXUV5EMT6N-x8d6s8ebcJ5JdFIOsJCtiiU5Jtm-ncIk,800
@@ -986,8 +986,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
-acryl_datahub-0.15.0.
+acryl_datahub-0.15.0.2rc4.dist-info/METADATA,sha256=27j9HorUgTZCHO9ZGywLNkPIBkNDl5RHU6xVV_v5fpI,173662
+acryl_datahub-0.15.0.2rc4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+acryl_datahub-0.15.0.2rc4.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
+acryl_datahub-0.15.0.2rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0.2rc4.dist-info/RECORD,,

datahub/__init__.py CHANGED

datahub/ingestion/graph/client.py CHANGED

@@ -248,9 +248,11 @@ class DataHubGraph(DatahubRestEmitter):
         with DatahubRestSink(PipelineContext(run_id=run_id), sink_config) as sink:
             yield sink
         if sink.report.failures:
+            logger.error(
+                f"Failed to emit {len(sink.report.failures)} records\n{sink.report.as_string()}"
+            )
             raise OperationalError(
-                f"Failed to emit {len(sink.report.failures)} records",
-                info=sink.report.as_obj(),
+                f"Failed to emit {len(sink.report.failures)} records"
             )
 
     def emit_all(

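The behavioral change above is small but worth spelling out: the full sink report is now written to the error log, while the raised OperationalError carries only the short message. Below is a minimal, self-contained sketch of that pattern; the names FakeReport and emit_with_report are stand-ins for illustration, not DataHub classes.

```python
import logging

logger = logging.getLogger(__name__)


class OperationalError(Exception):
    """Stand-in for datahub's OperationalError."""


class FakeReport:
    failures = ["record-1 rejected"]

    def as_string(self) -> str:
        return "failures:\n  - record-1 rejected"


def emit_with_report(report: FakeReport) -> None:
    if report.failures:
        # The detailed report goes to the log; the exception keeps a short message.
        logger.error(
            f"Failed to emit {len(report.failures)} records\n{report.as_string()}"
        )
        raise OperationalError(f"Failed to emit {len(report.failures)} records")
```
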
datahub/ingestion/source/aws/glue.py CHANGED

@@ -248,6 +248,9 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
     "Enabled by default when stateful ingestion is turned on.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+@capability(
+    SourceCapability.LINEAGE_FINE, "Support via the `emit_s3_lineage` config field"
+)
 class GlueSource(StatefulIngestionSourceBase):
     """
     Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub.
@@ -284,12 +287,22 @@ class GlueSource(StatefulIngestionSourceBase):
         "Action": [
             "glue:GetDataflowGraph",
             "glue:GetJobs",
+            "s3:GetObject",
         ],
         "Resource": "*"
     }
     ```
 
-
+    For profiling datasets, the following additional permissions are required:
+    ```json
+    {
+        "Effect": "Allow",
+        "Action": [
+            "glue:GetPartitions",
+        ],
+        "Resource": "*"
+    }
+    ```
 
     """
 
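For context on the newly advertised LINEAGE_FINE capability, a Glue ingestion recipe enables it through the `emit_s3_lineage` flag referenced above. A rough sketch follows, written as the Python dict a YAML recipe parses to; `emit_s3_lineage` comes from this diff, while the other fields (aws_region, the datahub-rest sink) are assumed typical values rather than taken from it.

```python
# Hypothetical recipe, expressed as the dict a YAML recipe would parse to.
glue_recipe = {
    "source": {
        "type": "glue",
        "config": {
            "aws_region": "us-east-1",   # assumed typical Glue source field
            "emit_s3_lineage": True,     # the config field named by the new capability
        },
    },
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "http://localhost:8080"},
    },
}
```
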
datahub/ingestion/source/aws/s3_util.py CHANGED

@@ -1,6 +1,11 @@
 import logging
 import os
-from
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional
+
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import ObjectSummary
+
 
 S3_PREFIXES = ["s3://", "s3n://", "s3a://"]
 
@@ -68,3 +73,21 @@ def get_key_prefix(s3_uri: str) -> str:
             f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}"
         )
     return strip_s3_prefix(s3_uri).split("/", maxsplit=1)[1]
+
+
+def group_s3_objects_by_dirname(
+    s3_objects: Iterable["ObjectSummary"],
+) -> Dict[str, List["ObjectSummary"]]:
+    """
+    Groups S3 objects by their directory name.
+
+    If a s3_object in the root directory (i.e., s3://bucket/file.txt), it is grouped under '/'.
+    """
+    grouped_s3_objs = defaultdict(list)
+    for obj in s3_objects:
+        if "/" in obj.key:
+            dirname = obj.key.rsplit("/", 1)[0]
+        else:
+            dirname = "/"
+        grouped_s3_objs[dirname].append(obj)
+    return grouped_s3_objs

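A quick usage sketch of the new helper. The real callers pass boto3 ObjectSummary objects, but the function only reads `.key`, so a tiny stand-in class (FakeObjectSummary, hypothetical) is enough to illustrate the grouping; this assumes a version of the package that includes the function above.

```python
from dataclasses import dataclass

from datahub.ingestion.source.aws.s3_util import group_s3_objects_by_dirname


@dataclass
class FakeObjectSummary:
    key: str


objs = [
    FakeObjectSummary("logs/2024/01/events.json"),
    FakeObjectSummary("logs/2024/02/events.json"),
    FakeObjectSummary("README.txt"),  # no "/" in the key, so it lands under "/"
]

grouped = group_s3_objects_by_dirname(objs)
print({dirname: [o.key for o in group] for dirname, group in grouped.items()})
# {'logs/2024/01': [...], 'logs/2024/02': [...], '/': ['README.txt']}
```
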
datahub/ingestion/source/gc/execution_request_cleanup.py CHANGED

@@ -29,7 +29,7 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
     )
 
     keep_history_max_days: int = Field(
-
+        90,
         description="Maximum number of days to keep execution requests for, per ingestion source",
     )
 
@@ -48,6 +48,10 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Maximum runtime in seconds for the cleanup task",
     )
 
+    limit_entities_delete: Optional[int] = Field(
+        10000, description="Max number of execution requests to hard delete."
+    )
+
     max_read_errors: int = Field(
         default=10,
         description="Maximum number of read errors before aborting",
@@ -65,6 +69,8 @@ class DatahubExecutionRequestCleanupReport(SourceReport):
     ergc_delete_errors: int = 0
     ergc_start_time: Optional[datetime.datetime] = None
     ergc_end_time: Optional[datetime.datetime] = None
+    ergc_delete_limit_reached: bool = False
+    ergc_runtime_limit_reached: bool = False
 
 
 class CleanupRecord(BaseModel):
@@ -85,12 +91,20 @@ class DatahubExecutionRequestCleanup:
         self.graph = graph
         self.report = report
         self.instance_id = int(time.time())
+        self.last_print_time = 0.0
 
         if config is not None:
             self.config = config
         else:
             self.config = DatahubExecutionRequestCleanupConfig()
 
+    def _print_report(self) -> None:
+        time_taken = round(time.time() - self.last_print_time, 1)
+        # Print report every 2 minutes
+        if time_taken > 120:
+            self.last_print_time = time.time()
+            logger.info(f"\n{self.report.as_string()}")
+
     def _to_cleanup_record(self, entry: Dict) -> CleanupRecord:
         input_aspect = (
             entry.get("aspects", {})
@@ -175,6 +189,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
+            self._print_report()
             self.report.ergc_records_read += 1
             key = entry.ingestion_source
 
@@ -225,15 +240,12 @@ class DatahubExecutionRequestCleanup:
                         f"record timestamp: {entry.requested_at}."
                     )
                 )
-                self.report.ergc_records_deleted += 1
                 yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
         try:
-            logger.info(
-                f"ergc({self.instance_id}): going to delete ExecutionRequest {entry.request_id}"
-            )
             self.graph.delete_entity(entry.urn, True)
+            self.report.ergc_records_deleted += 1
         except Exception as e:
             self.report.ergc_delete_errors += 1
             self.report.failure(
@@ -252,10 +264,23 @@ class DatahubExecutionRequestCleanup:
                 >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
         ):
+            self.report.ergc_runtime_limit_reached = True
             logger.info(f"ergc({self.instance_id}): max runtime reached.")
             return True
         return False
 
+    def _reached_delete_limit(self) -> bool:
+        if (
+            self.config.limit_entities_delete
+            and self.report.ergc_records_deleted >= self.config.limit_entities_delete
+        ):
+            logger.info(
+                f"ergc({self.instance_id}): max delete limit reached: {self.config.limit_entities_delete}."
+            )
+            self.report.ergc_delete_limit_reached = True
+            return True
+        return False
+
     def run(self) -> None:
         if not self.config.enabled:
             logger.info(
@@ -274,7 +299,7 @@ class DatahubExecutionRequestCleanup:
             )
 
         for entry in self._scroll_garbage_records():
-            if self._reached_runtime_limit():
+            if self._reached_runtime_limit() or self._reached_delete_limit():
                 break
             self._delete_entry(entry)
 
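A short, hypothetical example of setting the new delete cap alongside an existing knob; the field names (keep_history_max_days, limit_entities_delete) come from the diff above, the values are illustrative only.

```python
from datahub.ingestion.source.gc.execution_request_cleanup import (
    DatahubExecutionRequestCleanupConfig,
)

# Keep 30 days of execution-request history and stop after at most 5000 hard deletes per run.
config = DatahubExecutionRequestCleanupConfig(
    keep_history_max_days=30,
    limit_entities_delete=5000,
)
```
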
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py CHANGED

@@ -231,6 +231,15 @@ class SoftDeletedEntitiesCleanup:
     def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
+
+        batch_size = self.config.batch_size
+        if entity_type == "DATA_PROCESS_INSTANCE":
+            # Due to a bug in Data process instance querying this is a temp workaround
+            # to avoid a giant stacktrace by having a smaller batch size in first call
+            # This will be remove in future version after server with fix has been
+            # around for a while
+            batch_size = 10
+
         while True:
             try:
                 result = self.ctx.graph.execute_graphql(
@@ -240,7 +249,7 @@ class SoftDeletedEntitiesCleanup:
                         "types": [entity_type],
                         "query": "*",
                         "scrollId": scroll_id if scroll_id else None,
-                        "count":
+                        "count": batch_size,
                         "orFilters": [
                             {
                                 "and": [
@@ -263,6 +272,10 @@ class SoftDeletedEntitiesCleanup:
             scroll_across_entities = result.get("scrollAcrossEntities")
             if not scroll_across_entities or not scroll_across_entities.get("count"):
                 break
+            if entity_type == "DATA_PROCESS_INSTANCE":
+                # Temp workaround. See note in beginning of the function
+                # We make the batch size = config after call has succeeded once
+                batch_size = self.config.batch_size
             scroll_id = scroll_across_entities.get("nextScrollId")
             self.report.num_queries_found += scroll_across_entities.get("count")
             for query in scroll_across_entities.get("searchResults"):

datahub/ingestion/source/powerbi_report_server/report_server.py CHANGED

@@ -485,7 +485,7 @@ class PowerBiReportServerDashboardSourceReport(SourceReport):
         self.filtered_reports.append(view)
 
 
-@platform_name("PowerBI")
+@platform_name("PowerBI Report Server")
 @config_class(PowerBiReportServerDashboardSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")

datahub/ingestion/source/s3/source.py CHANGED

@@ -6,9 +6,8 @@ import pathlib
 import re
 import time
 from datetime import datetime
-from itertools import groupby
 from pathlib import PurePath
-from typing import
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
 from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
@@ -41,6 +40,7 @@ from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
+    group_s3_objects_by_dirname,
     strip_s3_prefix,
 )
 from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
@@ -75,6 +75,9 @@ from datahub.metadata.schema_classes import (
 from datahub.telemetry import stats, telemetry
 from datahub.utilities.perf_timer import PerfTimer
 
+if TYPE_CHECKING:
+    from mypy_boto3_s3.service_resource import Bucket
+
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
@@ -842,7 +845,7 @@ class S3Source(StatefulIngestionSourceBase):
     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket:
+        bucket: "Bucket",
         prefix: str,
     ) -> List[Folder]:
         """
@@ -857,22 +860,15 @@
 
         Parameters:
         path_spec (PathSpec): The path specification used to determine partitioning.
-        bucket (
+        bucket (Bucket): The S3 bucket object.
         prefix (str): The prefix path in the S3 bucket to list objects from.
 
         Returns:
         List[Folder]: A list of Folder objects representing the partitions found.
         """
-
-        prefix_to_list = prefix
-        files = list(
-            bucket.objects.filter(Prefix=f"{prefix_to_list}").page_size(PAGE_SIZE)
-        )
-        files = sorted(files, key=lambda a: a.last_modified)
-        grouped_files = groupby(files, lambda x: x.key.rsplit("/", 1)[0])
-
         partitions: List[Folder] = []
-
+        s3_objects = bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
+        for key, group in group_s3_objects_by_dirname(s3_objects).items():
             file_size = 0
             creation_time = None
             modification_time = None
@@ -904,7 +900,7 @@
                 Folder(
                     partition_id=id,
                     is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,
+                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
                     modification_time=modification_time,
                     sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
                     size=file_size,

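A side note on the two grouping approaches seen above: itertools.groupby only merges adjacent items, so its result depends on the input ordering, while the dict-based helper groups keys in a single pass regardless of order. A standalone illustration with made-up keys (not DataHub code):

```python
from collections import defaultdict
from itertools import groupby

keys = ["a/1.csv", "b/2.csv", "a/3.csv"]

# groupby only merges *adjacent* items, so unsorted input splits "a" into two groups.
print([(d, list(g)) for d, g in groupby(keys, key=lambda k: k.rsplit("/", 1)[0])])
# [('a', ['a/1.csv']), ('b', ['b/2.csv']), ('a', ['a/3.csv'])]

# Dict-based grouping handles any order in one pass.
grouped = defaultdict(list)
for k in keys:
    grouped[k.rsplit("/", 1)[0]].append(k)
print(dict(grouped))
# {'a': ['a/1.csv', 'a/3.csv'], 'b': ['b/2.csv']}
```
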
datahub/metadata/_schema_classes.py CHANGED

@@ -14262,7 +14262,7 @@ class DataFlowKeyClass(_Aspect):
 
 
     ASPECT_NAME = 'dataFlowKey'
-    ASPECT_INFO = {'keyForEntity': 'dataFlow', 'entityCategory': 'core', 'entityAspects': ['domains', 'deprecation', 'versionInfo', 'dataFlowInfo', 'editableDataFlowProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'incidentsSummary', 'forms', 'subTypes', 'testResults']}
+    ASPECT_INFO = {'keyForEntity': 'dataFlow', 'entityCategory': 'core', 'entityAspects': ['domains', 'deprecation', 'versionInfo', 'dataFlowInfo', 'editableDataFlowProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'container', 'browsePathsV2', 'structuredProperties', 'incidentsSummary', 'forms', 'subTypes', 'testResults']}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataFlowKey")
 
     def __init__(self,
@@ -14678,7 +14678,7 @@ class DataJobKeyClass(_Aspect):
 
 
     ASPECT_NAME = 'dataJobKey'
-    ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
+    ASPECT_INFO = {'keyForEntity': 'dataJob', 'entityCategory': '_unset_', 'entityAspects': ['datahubIngestionRunSummary', 'datahubIngestionCheckpoint', 'domains', 'deprecation', 'versionInfo', 'dataJobInfo', 'dataJobInputOutput', 'editableDataJobProperties', 'ownership', 'status', 'globalTags', 'browsePaths', 'glossaryTerms', 'institutionalMemory', 'dataPlatformInstance', 'container', 'browsePathsV2', 'structuredProperties', 'forms', 'subTypes', 'incidentsSummary', 'testResults', 'dataTransformLogic']}
     RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.metadata.key.DataJobKey")
 
     def __init__(self,

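The only change in both hunks is the addition of 'container' to the supported entityAspects, i.e. dataFlow and dataJob entities can now carry a container aspect. A hedged sketch of emitting one: the URNs below are made up, and the snippet assumes the standard MetadataChangeProposalWrapper / ContainerClass APIs and a DataHub GMS at the given address.

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import ContainerClass

# Hypothetical URNs, for illustration only.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataFlow:(airflow,daily_etl,PROD)",
    aspect=ContainerClass(container="urn:li:container:my_airflow_instance"),
)

DatahubRestEmitter("http://localhost:8080").emit(mcp)
```
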
datahub/metadata/_urns/urn_defs.py CHANGED

@@ -10,7 +10,7 @@
 
 # This file contains classes corresponding to entity URNs.
 
-from typing import ClassVar, List, Optional, Type, TYPE_CHECKING
+from typing import ClassVar, List, Optional, Type, TYPE_CHECKING, Union
 
 import functools
 from deprecated.sphinx import deprecated as _sphinx_deprecated
@@ -213,7 +213,7 @@ class SchemaFieldUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "schemaField"
     _URN_PARTS: ClassVar[int] = 2
 
-    def __init__(self, parent: str, field_path: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, parent: Union["Urn", str], field_path: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
             field_path = UrnEncoder.encode_string(field_path)
@@ -604,7 +604,7 @@ class DataJobUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "dataJob"
     _URN_PARTS: ClassVar[int] = 2
 
-    def __init__(self, flow: str, job_id: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, flow: Union["DataFlowUrn", str], job_id: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
             job_id = UrnEncoder.encode_string(job_id)
@@ -1435,10 +1435,10 @@ class DataPlatformInstanceUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "dataPlatformInstance"
     _URN_PARTS: ClassVar[int] = 2
 
-    def __init__(self, platform: str, instance: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], instance: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             instance = UrnEncoder.encode_string(instance)
 
         # Validation logic.
@@ -1678,10 +1678,10 @@ class DatasetUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "dataset"
     _URN_PARTS: ClassVar[int] = 3
 
-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
@@ -1771,10 +1771,10 @@ class MlModelUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlModel"
     _URN_PARTS: ClassVar[int] = 3
 
-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
@@ -1889,10 +1889,10 @@ class MlModelDeploymentUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlModelDeployment"
     _URN_PARTS: ClassVar[int] = 3
 
-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)
@@ -1953,10 +1953,10 @@ class MlFeatureTableUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlFeatureTable"
     _URN_PARTS: ClassVar[int] = 2
 
-    def __init__(self, platform: str, name: str, *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
             name = UrnEncoder.encode_string(name)
 
         # Validation logic.
@@ -2385,10 +2385,10 @@ class MlModelGroupUrn(_SpecificUrn):
     ENTITY_TYPE: ClassVar[str] = "mlModelGroup"
     _URN_PARTS: ClassVar[int] = 3
 
-    def __init__(self, platform: str, name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
+    def __init__(self, platform: Union["DataPlatformUrn", str], name: str, env: str = "PROD", *, _allow_coercion: bool = True) -> None:
         if _allow_coercion:
             # Field coercion logic (if any is required).
-            platform = DataPlatformUrn(platform).urn()
+            platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()
            name = UrnEncoder.encode_string(name)
             env = env.upper()
             env = UrnEncoder.encode_string(env)

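In practice, these signature changes mean the generated URN constructors now accept either a string or an already-built platform/flow URN for their first argument. A small sketch, assuming the public re-exports in datahub.metadata.urns and a version that includes this change; the dataset and flow names are made up.

```python
from datahub.metadata.urns import DataFlowUrn, DataJobUrn, DataPlatformUrn, DatasetUrn

# Both forms now type-check and produce the same URN.
by_string = DatasetUrn("snowflake", "analytics.public.orders", "PROD")
by_urn = DatasetUrn(DataPlatformUrn("snowflake"), "analytics.public.orders", "PROD")
assert by_string.urn() == by_urn.urn()

# DataJobUrn similarly accepts a DataFlowUrn for its first argument.
flow = DataFlowUrn("airflow", "daily_etl", "prod")
job = DataJobUrn(flow, "extract_task")
```
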
{acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/WHEEL RENAMED
File without changes

{acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/entry_points.txt RENAMED
File without changes

{acryl_datahub-0.15.0.2rc3.dist-info → acryl_datahub-0.15.0.2rc4.dist-info}/top_level.txt RENAMED
File without changes