acryl-datahub 0.15.0rc20__py3-none-any.whl → 0.15.0rc22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc22.dist-info}/METADATA +2478 -2478
- {acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc22.dist-info}/RECORD +28 -26
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +56 -68
- datahub/cli/ingest_cli.py +110 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/sink/datahub_rest.py +12 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +16 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +23 -0
- datahub/ingestion/source/tableau/tableau.py +42 -3
- datahub/ingestion/source/tableau/tableau_common.py +12 -5
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/metadata/_schema_classes.py +400 -400
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/schema.avsc +17221 -17574
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/utilities/file_backed_collections.py +35 -2
- {acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc22.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc22.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc20.dist-info → acryl_datahub-0.15.0rc22.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
datahub/__init__.py,sha256=
|
|
1
|
+
datahub/__init__.py,sha256=T0tNQ0v5Y2QyvLqZg1tU0kxvIjYvmZ8eZdrD_d8Uwe4,575
|
|
2
2
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
3
3
|
datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
|
|
4
4
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -52,7 +52,7 @@ datahub/api/entities/forms/forms_graphql_constants.py,sha256=DKpnKlMKTjmnyrCTvp6
|
|
|
52
52
|
datahub/api/entities/platformresource/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
53
53
|
datahub/api/entities/platformresource/platform_resource.py,sha256=pVAjv6NoH746Mfvdak7ji0eqlEcEeV-Ji7M5gyNXmds,10603
|
|
54
54
|
datahub/api/entities/structuredproperties/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
55
|
-
datahub/api/entities/structuredproperties/structuredproperties.py,sha256=
|
|
55
|
+
datahub/api/entities/structuredproperties/structuredproperties.py,sha256=YO4mdn6BziOzvzoFe-g2KfZlOZy8gqwMyyzj_7vF4BY,8845
|
|
56
56
|
datahub/api/graphql/__init__.py,sha256=5yl0dJxO-2d_QuykdJrDIbWq4ja9bo0t2dAEh89JOog,142
|
|
57
57
|
datahub/api/graphql/assertion.py,sha256=ponITypRQ8vE8kiqRNpvdoniNJzi4aeBK97UvkF0VhA,2818
|
|
58
58
|
datahub/api/graphql/base.py,sha256=9q637r6v-RGOd8Mk8HW2g0vt9zpqFexsQ5R6TPEHVbs,1614
|
|
@@ -67,7 +67,7 @@ datahub/cli/docker_cli.py,sha256=QGoWFp8ZZsXOSMbgu0Q4snMmMmtP3epWAN-fYglUNEc,364
|
|
|
67
67
|
datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
|
|
68
68
|
datahub/cli/exists_cli.py,sha256=IsuU86R-g7BJjAl1vULH6d-BWJHAKa4XHLZl5WxGUEM,1233
|
|
69
69
|
datahub/cli/get_cli.py,sha256=VV80BCXfZ0-C8fr2k43SIuN9DB-fOYP9StWsTHnXwFw,2327
|
|
70
|
-
datahub/cli/ingest_cli.py,sha256=
|
|
70
|
+
datahub/cli/ingest_cli.py,sha256=nRoZvVpsGPXmEZCvSOBfsZ61Ep1fCqYRVp79RBnHSnI,22393
|
|
71
71
|
datahub/cli/json_file.py,sha256=nWo-VVthaaW4Do1eUqgrzk0fShb29MjiKXvZVOTq76c,943
|
|
72
72
|
datahub/cli/lite_cli.py,sha256=UmlMMquce6lHiPaKUBBT0XQtqR9SHEmrGlJyKV9YY60,13030
|
|
73
73
|
datahub/cli/migrate.py,sha256=p42vixwKzi9OHQnIa0K2FxwGvt-1OxXeuYGJzfu5Sqo,17939
|
|
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
|
|
|
119
119
|
datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
|
|
120
120
|
datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
|
|
121
121
|
datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
|
|
122
|
-
datahub/emitter/rest_emitter.py,sha256=
|
|
122
|
+
datahub/emitter/rest_emitter.py,sha256=3kG_aPKy9pLibd4SJNtdJxn792c5TJliFjjCOw6NoUM,15533
|
|
123
123
|
datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
|
|
124
124
|
datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
|
|
125
125
|
datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
|
|
@@ -180,7 +180,7 @@ datahub/ingestion/sink/blackhole.py,sha256=-jYcWo4i8q7312bCIoHrGr7nT9JdPvA7c4jvS
|
|
|
180
180
|
datahub/ingestion/sink/console.py,sha256=TZfhA0Ec2eNCrMH7RRy2JOdUE-U-hkoIQrPm1CmKLQs,591
|
|
181
181
|
datahub/ingestion/sink/datahub_kafka.py,sha256=_cjuXu5I6G0zJ2UK7hMbaKjMPZXeIwRMgm7CVeTiNtc,2578
|
|
182
182
|
datahub/ingestion/sink/datahub_lite.py,sha256=7u2aWm7ENLshKHl-PkjJg6Mrw4bWs8sTfKIBz4mm8Ak,1879
|
|
183
|
-
datahub/ingestion/sink/datahub_rest.py,sha256=
|
|
183
|
+
datahub/ingestion/sink/datahub_rest.py,sha256=ME8OygJgd7AowrokJLmdjYHxIQEy5jXWS0yKwOLR934,12592
|
|
184
184
|
datahub/ingestion/sink/file.py,sha256=SxXJPJpkIGoaqRjCcSmj2ZE3xE4rLlBABBGwpTj5LWI,3271
|
|
185
185
|
datahub/ingestion/sink/sink_registry.py,sha256=JRBWx8qEYg0ubSTyhqwgSWctgxwyp6fva9GoN2LwBao,490
|
|
186
186
|
datahub/ingestion/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -266,9 +266,9 @@ datahub/ingestion/source/data_lake_common/path_spec.py,sha256=u3u2eMe70V5vur-j8m
|
|
|
266
266
|
datahub/ingestion/source/datahub/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
267
267
|
datahub/ingestion/source/datahub/config.py,sha256=pOXt0b1PX6D7dtD4RuKwdmr6sQKnXSf6LHxfPUMhP8s,3658
|
|
268
268
|
datahub/ingestion/source/datahub/datahub_api_reader.py,sha256=hlKADVEPoTFiRGKqRsMF5mL4fSu_IrIW8Nx7LpEzvkM,2134
|
|
269
|
-
datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=
|
|
269
|
+
datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=F8JrOjSrmJ2B6m1MWh83A1EYFDcGMla749HUeQWMnL0,9464
|
|
270
270
|
datahub/ingestion/source/datahub/datahub_kafka_reader.py,sha256=8x9_u5kRjgSmu7c295ZIZjxP6bgoZZbWsKRicuLStRQ,4145
|
|
271
|
-
datahub/ingestion/source/datahub/datahub_source.py,sha256=
|
|
271
|
+
datahub/ingestion/source/datahub/datahub_source.py,sha256=2jDnsHEzpGhr00qQI9unSUJYD6Cb1McYFKOVbA-Zcm4,8487
|
|
272
272
|
datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vBCU0XxGcZR6Xxs,940
|
|
273
273
|
datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
|
|
274
274
|
datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -321,7 +321,7 @@ datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR
|
|
|
321
321
|
datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
|
|
322
322
|
datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
323
323
|
datahub/ingestion/source/kafka/kafka.py,sha256=9SR7bqp9J0rPYde5IClhnAuVNy9ItsB8-ZeXtTc_mEY,26442
|
|
324
|
-
datahub/ingestion/source/kafka/kafka_connect.py,sha256=
|
|
324
|
+
datahub/ingestion/source/kafka/kafka_connect.py,sha256=Jm1MYky_OPIwvVHuEjgOjK0e6-jA-dYnsLZ7r-Y_9mA,56208
|
|
325
325
|
datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
|
|
326
326
|
datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
327
327
|
datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
|
|
@@ -398,7 +398,7 @@ datahub/ingestion/source/s3/config.py,sha256=Zs1nrBZKLImteZreIcSMMRLj8vBGgxakNDs
|
|
|
398
398
|
datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=k7S9Xcmgr3-CvWrd5NEX-V8JSrcAwkm7vbHPTVZicow,3620
|
|
399
399
|
datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
|
|
400
400
|
datahub/ingestion/source/s3/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
|
|
401
|
-
datahub/ingestion/source/s3/source.py,sha256=
|
|
401
|
+
datahub/ingestion/source/s3/source.py,sha256=8O_vu1J91h7owQlYyK27AZAQHxKsDpNC_jsLNpMed98,47336
|
|
402
402
|
datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
403
403
|
datahub/ingestion/source/sac/sac.py,sha256=zPSO9ukuyhvNaaVzeAYpA-_sFma_XMcCQMPaGvDWuTk,30226
|
|
404
404
|
datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
|
|
@@ -427,13 +427,13 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
|
|
|
427
427
|
datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
|
|
428
428
|
datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
|
|
429
429
|
datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
|
|
430
|
-
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=
|
|
430
|
+
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=EnTJoRIQKcZOIYfb_NUff_YA8IdIroaFD1JHUn-M6ok,23346
|
|
431
431
|
datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
|
|
432
432
|
datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
|
|
433
|
-
datahub/ingestion/source/snowflake/snowflake_query.py,sha256=
|
|
433
|
+
datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
|
|
434
434
|
datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
|
|
435
|
-
datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=
|
|
436
|
-
datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=
|
|
435
|
+
datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
|
|
436
|
+
datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=vof3mNImstnlL8kc0OkTHzMIqnbEkt9RmnYBX1JX0oE,40386
|
|
437
437
|
datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
|
|
438
438
|
datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
|
|
439
439
|
datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
|
|
@@ -486,9 +486,11 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
|
|
|
486
486
|
datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
|
|
487
487
|
datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
|
|
488
488
|
datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
489
|
-
datahub/ingestion/source/tableau/tableau.py,sha256=
|
|
490
|
-
datahub/ingestion/source/tableau/tableau_common.py,sha256=
|
|
491
|
-
datahub/ingestion/source/tableau/tableau_constant.py,sha256=
|
|
489
|
+
datahub/ingestion/source/tableau/tableau.py,sha256=P_DUuUvXk5u2ihA0JghtRkYc_KI_yQR2ZiQVe9IUvsU,138197
|
|
490
|
+
datahub/ingestion/source/tableau/tableau_common.py,sha256=9gQLq_3BlAsKll83uVlnWJRWaIDtFtREUyuimXF13Z0,26219
|
|
491
|
+
datahub/ingestion/source/tableau/tableau_constant.py,sha256=jVQMgLXND5aPL6XLETKp81BehRkvyLTU_Vhhe_1NOkI,2576
|
|
492
|
+
datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=PEGfcoUcBdsnOa5EzCqy1IiuQ3OZ9fZVEMzDqhhHOto,922
|
|
493
|
+
datahub/ingestion/source/tableau/tableau_validation.py,sha256=l0DuXUuxJwEXMzo61xLx-KLc5u6tiz2n0e9EepJdWEM,1808
|
|
492
494
|
datahub/ingestion/source/unity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
493
495
|
datahub/ingestion/source/unity/analyze_profiler.py,sha256=2pqkFY30CfN4aHgFZZntjeG0hNhBytZJvXC13VfTc1I,4689
|
|
494
496
|
datahub/ingestion/source/unity/config.py,sha256=m4-n7mYz4Ct4L1QdfJFklwHyj8boKCbV7Sb3Ou6AT3Q,14756
|
|
@@ -559,12 +561,12 @@ datahub/lite/lite_registry.py,sha256=bpH0kasP-LtwwUFNA2QsOIehfekAYfJtN-AkQLmSWnw
|
|
|
559
561
|
datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1949
|
|
560
562
|
datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
|
|
561
563
|
datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
|
|
562
|
-
datahub/metadata/_schema_classes.py,sha256=
|
|
563
|
-
datahub/metadata/schema.avsc,sha256=
|
|
564
|
+
datahub/metadata/_schema_classes.py,sha256=FTLom36n7gr6zxYfPWWoy9AmdnB4KOIXYRoVZbS9kog,955042
|
|
565
|
+
datahub/metadata/schema.avsc,sha256=D-rNu2SC2tyvqju8pQwGNGGT9zy1_fzxzoigH5YmUvo,722242
|
|
564
566
|
datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
|
|
565
567
|
datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
|
|
566
568
|
datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
|
|
567
|
-
datahub/metadata/_urns/urn_defs.py,sha256=
|
|
569
|
+
datahub/metadata/_urns/urn_defs.py,sha256=LFHZGzHlDA0KJes1Xg7-lWetXusi7bubA7Q5hu4ER88,107119
|
|
568
570
|
datahub/metadata/com/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
569
571
|
datahub/metadata/com/linkedin/__init__.py,sha256=gsAIuTxzfJdI7a9ybZlgMIHMAYksM1SxGxXjtySgKSc,202
|
|
570
572
|
datahub/metadata/com/linkedin/events/__init__.py,sha256=s_dR0plZF-rOxxIbE8ojekJqwiHzl2WYR-Z3kW6kKS0,298
|
|
@@ -882,7 +884,7 @@ datahub/testing/__init__.py,sha256=TywIuzGQvzJsNhI_PGD1RFk11M3RtGl9jIMtAVVHIkg,2
|
|
|
882
884
|
datahub/testing/check_imports.py,sha256=EKuJmgUA46uOrlaOy0fCvPB7j9POkpJ0ExhO_pT3YAk,1356
|
|
883
885
|
datahub/testing/check_sql_parser_result.py,sha256=f7U7IUSbfV4VACdNI857wPZ9tAZ9j6mXiXmcJNT_RzM,2671
|
|
884
886
|
datahub/testing/check_str_enum.py,sha256=yqk0XXHOGteN-IGqCp5JHy0Kca13BnI09ZqKc4Nwl3E,1187
|
|
885
|
-
datahub/testing/compare_metadata_json.py,sha256=
|
|
887
|
+
datahub/testing/compare_metadata_json.py,sha256=pVJB2qLoKzEJLBXqFT-qGrxpA1y76y-mIbvJf0NnAD0,5274
|
|
886
888
|
datahub/testing/docker_utils.py,sha256=g169iy_jNR_mg0p8X31cChZqjOryutAIHUYLq3xqueY,2415
|
|
887
889
|
datahub/testing/doctest.py,sha256=1_8WEhHZ2eRQtw8vsXKzr9L5zzvs0Tcr6q4mnkyyvtw,295
|
|
888
890
|
datahub/testing/mcp_diff.py,sha256=_sBFhmclYXJGQ_JYDrvKWXNGXt9ACvqeQvFaZrRHa8Q,10729
|
|
@@ -900,7 +902,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
|
|
|
900
902
|
datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
|
|
901
903
|
datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
|
|
902
904
|
datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
|
|
903
|
-
datahub/utilities/file_backed_collections.py,sha256=
|
|
905
|
+
datahub/utilities/file_backed_collections.py,sha256=I2GxSYtVzfo38pQpv2FyoBeWISiKD4zUi0t34jPCNrU,21957
|
|
904
906
|
datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
|
|
905
907
|
datahub/utilities/hive_schema_to_avro.py,sha256=2-7NI9haCAYbyUmHTb-QPxjn4WbmnDDKAIGmHJ11-1E,11622
|
|
906
908
|
datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
|
|
@@ -974,8 +976,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
974
976
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
975
977
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
976
978
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
977
|
-
acryl_datahub-0.15.
|
|
978
|
-
acryl_datahub-0.15.
|
|
979
|
-
acryl_datahub-0.15.
|
|
980
|
-
acryl_datahub-0.15.
|
|
981
|
-
acryl_datahub-0.15.
|
|
979
|
+
acryl_datahub-0.15.0rc22.dist-info/METADATA,sha256=48jbXm5fKitlO7rhjtNA1FcJT9Y7ypQ25EtatHbSeqY,173559
|
|
980
|
+
acryl_datahub-0.15.0rc22.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
981
|
+
acryl_datahub-0.15.0rc22.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
|
|
982
|
+
acryl_datahub-0.15.0rc22.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
983
|
+
acryl_datahub-0.15.0rc22.dist-info/RECORD,,
|
datahub/__init__.py
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from contextlib import contextmanager
|
|
3
2
|
from enum import Enum
|
|
4
3
|
from pathlib import Path
|
|
5
|
-
from typing import
|
|
4
|
+
from typing import List, Optional
|
|
6
5
|
|
|
7
6
|
import yaml
|
|
8
7
|
from pydantic import validator
|
|
@@ -10,6 +9,7 @@ from ruamel.yaml import YAML
|
|
|
10
9
|
|
|
11
10
|
from datahub.configuration.common import ConfigModel
|
|
12
11
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
12
|
+
from datahub.ingestion.api.global_context import get_graph_context, set_graph_context
|
|
13
13
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
14
14
|
from datahub.metadata.schema_classes import (
|
|
15
15
|
PropertyValueClass,
|
|
@@ -24,23 +24,10 @@ logger = logging.getLogger(__name__)
|
|
|
24
24
|
class StructuredPropertiesConfig:
|
|
25
25
|
"""Configuration class to hold the graph client"""
|
|
26
26
|
|
|
27
|
-
_graph: Optional[DataHubGraph] = None
|
|
28
|
-
|
|
29
|
-
@classmethod
|
|
30
|
-
@contextmanager
|
|
31
|
-
def use_graph(cls, graph: DataHubGraph) -> Generator[None, None, None]:
|
|
32
|
-
"""Context manager to temporarily set a custom graph"""
|
|
33
|
-
previous_graph = cls._graph
|
|
34
|
-
cls._graph = graph
|
|
35
|
-
try:
|
|
36
|
-
yield
|
|
37
|
-
finally:
|
|
38
|
-
cls._graph = previous_graph
|
|
39
|
-
|
|
40
27
|
@classmethod
|
|
41
|
-
def
|
|
28
|
+
def get_graph_required(cls) -> DataHubGraph:
|
|
42
29
|
"""Get the current graph, falling back to default if none set"""
|
|
43
|
-
return
|
|
30
|
+
return get_graph_context() or get_default_graph()
|
|
44
31
|
|
|
45
32
|
|
|
46
33
|
class AllowedTypes(Enum):
|
|
@@ -79,7 +66,7 @@ class TypeQualifierAllowedTypes(ConfigModel):
|
|
|
79
66
|
@validator("allowed_types", each_item=True)
|
|
80
67
|
def validate_allowed_types(cls, v):
|
|
81
68
|
if v:
|
|
82
|
-
graph = StructuredPropertiesConfig.
|
|
69
|
+
graph = StructuredPropertiesConfig.get_graph_required()
|
|
83
70
|
validated_urn = Urn.make_entity_type_urn(v)
|
|
84
71
|
if not graph.exists(validated_urn):
|
|
85
72
|
raise ValueError(
|
|
@@ -106,7 +93,7 @@ class StructuredProperties(ConfigModel):
|
|
|
106
93
|
@validator("entity_types", each_item=True)
|
|
107
94
|
def validate_entity_types(cls, v):
|
|
108
95
|
if v:
|
|
109
|
-
graph = StructuredPropertiesConfig.
|
|
96
|
+
graph = StructuredPropertiesConfig.get_graph_required()
|
|
110
97
|
validated_urn = Urn.make_entity_type_urn(v)
|
|
111
98
|
if not graph.exists(validated_urn):
|
|
112
99
|
raise ValueError(
|
|
@@ -136,63 +123,64 @@ class StructuredProperties(ConfigModel):
|
|
|
136
123
|
|
|
137
124
|
@staticmethod
|
|
138
125
|
def create(file: str, graph: Optional[DataHubGraph] = None) -> None:
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
126
|
+
with set_graph_context(graph):
|
|
127
|
+
graph = StructuredPropertiesConfig.get_graph_required()
|
|
128
|
+
|
|
142
129
|
with open(file) as fp:
|
|
143
130
|
structuredproperties: List[dict] = yaml.safe_load(fp)
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
131
|
+
for structuredproperty_raw in structuredproperties:
|
|
132
|
+
structuredproperty = StructuredProperties.parse_obj(
|
|
133
|
+
structuredproperty_raw
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
if not structuredproperty.type.islower():
|
|
137
|
+
structuredproperty.type = structuredproperty.type.lower()
|
|
138
|
+
logger.warning(
|
|
139
|
+
f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
|
|
147
140
|
)
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
|
|
152
|
-
)
|
|
153
|
-
if not AllowedTypes.check_allowed_type(structuredproperty.type):
|
|
154
|
-
raise ValueError(
|
|
155
|
-
f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
|
|
156
|
-
)
|
|
157
|
-
mcp = MetadataChangeProposalWrapper(
|
|
158
|
-
entityUrn=structuredproperty.urn,
|
|
159
|
-
aspect=StructuredPropertyDefinitionClass(
|
|
160
|
-
qualifiedName=structuredproperty.fqn,
|
|
161
|
-
valueType=Urn.make_data_type_urn(structuredproperty.type),
|
|
162
|
-
displayName=structuredproperty.display_name,
|
|
163
|
-
description=structuredproperty.description,
|
|
164
|
-
entityTypes=[
|
|
165
|
-
Urn.make_entity_type_urn(entity_type)
|
|
166
|
-
for entity_type in structuredproperty.entity_types or []
|
|
167
|
-
],
|
|
168
|
-
cardinality=structuredproperty.cardinality,
|
|
169
|
-
immutable=structuredproperty.immutable,
|
|
170
|
-
allowedValues=(
|
|
171
|
-
[
|
|
172
|
-
PropertyValueClass(
|
|
173
|
-
value=v.value, description=v.description
|
|
174
|
-
)
|
|
175
|
-
for v in structuredproperty.allowed_values
|
|
176
|
-
]
|
|
177
|
-
if structuredproperty.allowed_values
|
|
178
|
-
else None
|
|
179
|
-
),
|
|
180
|
-
typeQualifier=(
|
|
181
|
-
{
|
|
182
|
-
"allowedTypes": structuredproperty.type_qualifier.allowed_types
|
|
183
|
-
}
|
|
184
|
-
if structuredproperty.type_qualifier
|
|
185
|
-
else None
|
|
186
|
-
),
|
|
187
|
-
),
|
|
141
|
+
if not AllowedTypes.check_allowed_type(structuredproperty.type):
|
|
142
|
+
raise ValueError(
|
|
143
|
+
f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
|
|
188
144
|
)
|
|
189
|
-
|
|
145
|
+
mcp = MetadataChangeProposalWrapper(
|
|
146
|
+
entityUrn=structuredproperty.urn,
|
|
147
|
+
aspect=StructuredPropertyDefinitionClass(
|
|
148
|
+
qualifiedName=structuredproperty.fqn,
|
|
149
|
+
valueType=Urn.make_data_type_urn(structuredproperty.type),
|
|
150
|
+
displayName=structuredproperty.display_name,
|
|
151
|
+
description=structuredproperty.description,
|
|
152
|
+
entityTypes=[
|
|
153
|
+
Urn.make_entity_type_urn(entity_type)
|
|
154
|
+
for entity_type in structuredproperty.entity_types or []
|
|
155
|
+
],
|
|
156
|
+
cardinality=structuredproperty.cardinality,
|
|
157
|
+
immutable=structuredproperty.immutable,
|
|
158
|
+
allowedValues=(
|
|
159
|
+
[
|
|
160
|
+
PropertyValueClass(
|
|
161
|
+
value=v.value, description=v.description
|
|
162
|
+
)
|
|
163
|
+
for v in structuredproperty.allowed_values
|
|
164
|
+
]
|
|
165
|
+
if structuredproperty.allowed_values
|
|
166
|
+
else None
|
|
167
|
+
),
|
|
168
|
+
typeQualifier=(
|
|
169
|
+
{
|
|
170
|
+
"allowedTypes": structuredproperty.type_qualifier.allowed_types
|
|
171
|
+
}
|
|
172
|
+
if structuredproperty.type_qualifier
|
|
173
|
+
else None
|
|
174
|
+
),
|
|
175
|
+
),
|
|
176
|
+
)
|
|
177
|
+
graph.emit_mcp(mcp)
|
|
190
178
|
|
|
191
|
-
|
|
179
|
+
logger.info(f"Created structured property {structuredproperty.urn}")
|
|
192
180
|
|
|
193
181
|
@classmethod
|
|
194
182
|
def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
|
|
195
|
-
with
|
|
183
|
+
with set_graph_context(graph):
|
|
196
184
|
structured_property: Optional[
|
|
197
185
|
StructuredPropertyDefinitionClass
|
|
198
186
|
] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
|
datahub/cli/ingest_cli.py
CHANGED
|
@@ -27,6 +27,7 @@ from datahub.utilities.perf_timer import PerfTimer
|
|
|
27
27
|
|
|
28
28
|
logger = logging.getLogger(__name__)
|
|
29
29
|
|
|
30
|
+
INGEST_SRC_TABLE_COLUMNS = ["runId", "source", "startTime", "status", "URN"]
|
|
30
31
|
RUNS_TABLE_COLUMNS = ["runId", "rows", "created at"]
|
|
31
32
|
RUN_TABLE_COLUMNS = ["urn", "aspect name", "created at"]
|
|
32
33
|
|
|
@@ -437,6 +438,115 @@ def mcps(path: str) -> None:
|
|
|
437
438
|
sys.exit(ret)
|
|
438
439
|
|
|
439
440
|
|
|
441
|
+
@ingest.command()
|
|
442
|
+
@click.argument("page_offset", type=int, default=0)
|
|
443
|
+
@click.argument("page_size", type=int, default=100)
|
|
444
|
+
@click.option("--urn", type=str, default=None, help="Filter by ingestion source URN.")
|
|
445
|
+
@click.option(
|
|
446
|
+
"--source", type=str, default=None, help="Filter by ingestion source name."
|
|
447
|
+
)
|
|
448
|
+
@upgrade.check_upgrade
|
|
449
|
+
@telemetry.with_telemetry()
|
|
450
|
+
def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None:
|
|
451
|
+
"""List ingestion source runs with their details, optionally filtered by URN or source."""
|
|
452
|
+
|
|
453
|
+
query = """
|
|
454
|
+
query listIngestionRuns($input: ListIngestionSourcesInput!) {
|
|
455
|
+
listIngestionSources(input: $input) {
|
|
456
|
+
ingestionSources {
|
|
457
|
+
urn
|
|
458
|
+
name
|
|
459
|
+
executions {
|
|
460
|
+
executionRequests {
|
|
461
|
+
id
|
|
462
|
+
result {
|
|
463
|
+
startTimeMs
|
|
464
|
+
status
|
|
465
|
+
}
|
|
466
|
+
}
|
|
467
|
+
}
|
|
468
|
+
}
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
"""
|
|
472
|
+
|
|
473
|
+
# filter by urn and/or source using CONTAINS
|
|
474
|
+
filters = []
|
|
475
|
+
if urn:
|
|
476
|
+
filters.append({"field": "urn", "values": [urn], "condition": "CONTAIN"})
|
|
477
|
+
if source:
|
|
478
|
+
filters.append({"field": "name", "values": [source], "condition": "CONTAIN"})
|
|
479
|
+
|
|
480
|
+
variables = {
|
|
481
|
+
"input": {
|
|
482
|
+
"start": page_offset,
|
|
483
|
+
"count": page_size,
|
|
484
|
+
"filters": filters,
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
client = get_default_graph()
|
|
489
|
+
session = client._session
|
|
490
|
+
gms_host = client.config.server
|
|
491
|
+
|
|
492
|
+
url = f"{gms_host}/api/graphql"
|
|
493
|
+
try:
|
|
494
|
+
response = session.post(url, json={"query": query, "variables": variables})
|
|
495
|
+
response.raise_for_status()
|
|
496
|
+
except Exception as e:
|
|
497
|
+
click.echo(f"Error fetching data: {str(e)}")
|
|
498
|
+
return
|
|
499
|
+
|
|
500
|
+
try:
|
|
501
|
+
data = response.json()
|
|
502
|
+
except ValueError:
|
|
503
|
+
click.echo("Failed to parse JSON response from server.")
|
|
504
|
+
return
|
|
505
|
+
|
|
506
|
+
if not data:
|
|
507
|
+
click.echo("No response received from the server.")
|
|
508
|
+
return
|
|
509
|
+
|
|
510
|
+
# when urn or source filter does not match, exit gracefully
|
|
511
|
+
if (
|
|
512
|
+
not isinstance(data.get("data"), dict)
|
|
513
|
+
or "listIngestionSources" not in data["data"]
|
|
514
|
+
):
|
|
515
|
+
click.echo("No matching ingestion sources found. Please check your filters.")
|
|
516
|
+
return
|
|
517
|
+
|
|
518
|
+
ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
|
|
519
|
+
if not ingestion_sources:
|
|
520
|
+
click.echo("No ingestion sources or executions found.")
|
|
521
|
+
return
|
|
522
|
+
|
|
523
|
+
rows = []
|
|
524
|
+
for ingestion_source in ingestion_sources:
|
|
525
|
+
urn = ingestion_source.get("urn", "N/A")
|
|
526
|
+
name = ingestion_source.get("name", "N/A")
|
|
527
|
+
|
|
528
|
+
executions = ingestion_source.get("executions", {}).get("executionRequests", [])
|
|
529
|
+
for execution in executions:
|
|
530
|
+
execution_id = execution.get("id", "N/A")
|
|
531
|
+
start_time = execution.get("result", {}).get("startTimeMs", "N/A")
|
|
532
|
+
start_time = (
|
|
533
|
+
datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S")
|
|
534
|
+
if start_time != "N/A"
|
|
535
|
+
else "N/A"
|
|
536
|
+
)
|
|
537
|
+
status = execution.get("result", {}).get("status", "N/A")
|
|
538
|
+
|
|
539
|
+
rows.append([execution_id, name, start_time, status, urn])
|
|
540
|
+
|
|
541
|
+
click.echo(
|
|
542
|
+
tabulate(
|
|
543
|
+
rows,
|
|
544
|
+
headers=INGEST_SRC_TABLE_COLUMNS,
|
|
545
|
+
tablefmt="grid",
|
|
546
|
+
)
|
|
547
|
+
)
|
|
548
|
+
|
|
549
|
+
|
|
440
550
|
@ingest.command()
|
|
441
551
|
@click.argument("page_offset", type=int, default=0)
|
|
442
552
|
@click.argument("page_size", type=int, default=100)
|
datahub/emitter/rest_emitter.py
CHANGED
|
@@ -46,8 +46,18 @@ _DEFAULT_RETRY_MAX_TIMES = int(
|
|
|
46
46
|
os.getenv("DATAHUB_REST_EMITTER_DEFAULT_RETRY_MAX_TIMES", "4")
|
|
47
47
|
)
|
|
48
48
|
|
|
49
|
-
# The limit is 16mb. We will use a max of 15mb to have some space
|
|
50
|
-
|
|
49
|
+
# The limit is 16mb. We will use a max of 15mb to have some space
|
|
50
|
+
# for overhead like request headers.
|
|
51
|
+
# This applies to pretty much all calls to GMS.
|
|
52
|
+
INGEST_MAX_PAYLOAD_BYTES = 15 * 1024 * 1024
|
|
53
|
+
|
|
54
|
+
# This limit is somewhat arbitrary. All GMS endpoints will timeout
|
|
55
|
+
# and return a 500 if processing takes too long. To avoid sending
|
|
56
|
+
# too much to the backend and hitting a timeout, we try to limit
|
|
57
|
+
# the number of MCPs we send in a batch.
|
|
58
|
+
BATCH_INGEST_MAX_PAYLOAD_LENGTH = int(
|
|
59
|
+
os.getenv("DATAHUB_REST_EMITTER_BATCH_MAX_PAYLOAD_LENGTH", 200)
|
|
60
|
+
)
|
|
51
61
|
|
|
52
62
|
|
|
53
63
|
class DataHubRestEmitter(Closeable, Emitter):
|
|
@@ -290,11 +300,14 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
290
300
|
# As a safety mechanism, we need to make sure we don't exceed the max payload size for GMS.
|
|
291
301
|
# If we will exceed the limit, we need to break it up into chunks.
|
|
292
302
|
mcp_obj_chunks: List[List[str]] = []
|
|
293
|
-
current_chunk_size =
|
|
303
|
+
current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
|
|
294
304
|
for mcp_obj in mcp_objs:
|
|
295
305
|
mcp_obj_size = len(json.dumps(mcp_obj))
|
|
296
306
|
|
|
297
|
-
if
|
|
307
|
+
if (
|
|
308
|
+
mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
|
|
309
|
+
or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
|
|
310
|
+
):
|
|
298
311
|
mcp_obj_chunks.append([])
|
|
299
312
|
current_chunk_size = 0
|
|
300
313
|
mcp_obj_chunks[-1].append(mcp_obj)
|
|
@@ -18,7 +18,10 @@ from datahub.configuration.common import (
|
|
|
18
18
|
)
|
|
19
19
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
20
20
|
from datahub.emitter.mcp_builder import mcps_from_mce
|
|
21
|
-
from datahub.emitter.rest_emitter import
|
|
21
|
+
from datahub.emitter.rest_emitter import (
|
|
22
|
+
BATCH_INGEST_MAX_PAYLOAD_LENGTH,
|
|
23
|
+
DataHubRestEmitter,
|
|
24
|
+
)
|
|
22
25
|
from datahub.ingestion.api.common import RecordEnvelope, WorkUnit
|
|
23
26
|
from datahub.ingestion.api.sink import (
|
|
24
27
|
NoopWriteCallback,
|
|
@@ -71,6 +74,14 @@ class DatahubRestSinkConfig(DatahubClientConfig):
|
|
|
71
74
|
# Only applies in async batch mode.
|
|
72
75
|
max_per_batch: pydantic.PositiveInt = 100
|
|
73
76
|
|
|
77
|
+
@pydantic.validator("max_per_batch", always=True)
|
|
78
|
+
def validate_max_per_batch(cls, v):
|
|
79
|
+
if v > BATCH_INGEST_MAX_PAYLOAD_LENGTH:
|
|
80
|
+
raise ValueError(
|
|
81
|
+
f"max_per_batch must be less than or equal to {BATCH_INGEST_MAX_PAYLOAD_LENGTH}"
|
|
82
|
+
)
|
|
83
|
+
return v
|
|
84
|
+
|
|
74
85
|
|
|
75
86
|
@dataclasses.dataclass
|
|
76
87
|
class DataHubRestSinkReport(SinkReport):
|
|
@@ -147,6 +147,47 @@ class DataHubDatabaseReader:
|
|
|
147
147
|
version
|
|
148
148
|
"""
|
|
149
149
|
|
|
150
|
+
def execute_server_cursor(
|
|
151
|
+
self, query: str, params: Dict[str, Any]
|
|
152
|
+
) -> Iterable[Dict[str, Any]]:
|
|
153
|
+
with self.engine.connect() as conn:
|
|
154
|
+
if self.engine.dialect.name == "postgresql":
|
|
155
|
+
with conn.begin(): # Transaction required for PostgreSQL server-side cursor
|
|
156
|
+
conn = conn.execution_options(
|
|
157
|
+
stream_results=True,
|
|
158
|
+
yield_per=self.config.database_query_batch_size,
|
|
159
|
+
)
|
|
160
|
+
result = conn.execute(query, params)
|
|
161
|
+
for row in result:
|
|
162
|
+
yield dict(row)
|
|
163
|
+
elif self.engine.dialect.name == "mysql": # MySQL
|
|
164
|
+
import MySQLdb
|
|
165
|
+
|
|
166
|
+
with contextlib.closing(
|
|
167
|
+
conn.connection.cursor(MySQLdb.cursors.SSCursor)
|
|
168
|
+
) as cursor:
|
|
169
|
+
logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
|
|
170
|
+
cursor.execute(query, params)
|
|
171
|
+
|
|
172
|
+
columns = [desc[0] for desc in cursor.description]
|
|
173
|
+
while True:
|
|
174
|
+
rows = cursor.fetchmany(self.config.database_query_batch_size)
|
|
175
|
+
if not rows:
|
|
176
|
+
break # Use break instead of return in generator
|
|
177
|
+
for row in rows:
|
|
178
|
+
yield dict(zip(columns, row))
|
|
179
|
+
else:
|
|
180
|
+
raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
|
|
181
|
+
|
|
182
|
+
def _get_rows(
|
|
183
|
+
self, from_createdon: datetime, stop_time: datetime
|
|
184
|
+
) -> Iterable[Dict[str, Any]]:
|
|
185
|
+
params = {
|
|
186
|
+
"exclude_aspects": list(self.config.exclude_aspects),
|
|
187
|
+
"since_createdon": from_createdon.strftime(DATETIME_FORMAT),
|
|
188
|
+
}
|
|
189
|
+
yield from self.execute_server_cursor(self.query, params)
|
|
190
|
+
|
|
150
191
|
def get_aspects(
|
|
151
192
|
self, from_createdon: datetime, stop_time: datetime
|
|
152
193
|
) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
|
|
@@ -159,27 +200,6 @@ class DataHubDatabaseReader:
|
|
|
159
200
|
if mcp:
|
|
160
201
|
yield mcp, row["createdon"]
|
|
161
202
|
|
|
162
|
-
def _get_rows(
|
|
163
|
-
self, from_createdon: datetime, stop_time: datetime
|
|
164
|
-
) -> Iterable[Dict[str, Any]]:
|
|
165
|
-
with self.engine.connect() as conn:
|
|
166
|
-
with contextlib.closing(conn.connection.cursor()) as cursor:
|
|
167
|
-
cursor.execute(
|
|
168
|
-
self.query,
|
|
169
|
-
{
|
|
170
|
-
"exclude_aspects": list(self.config.exclude_aspects),
|
|
171
|
-
"since_createdon": from_createdon.strftime(DATETIME_FORMAT),
|
|
172
|
-
},
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
columns = [desc[0] for desc in cursor.description]
|
|
176
|
-
while True:
|
|
177
|
-
rows = cursor.fetchmany(self.config.database_query_batch_size)
|
|
178
|
-
if not rows:
|
|
179
|
-
return
|
|
180
|
-
for row in rows:
|
|
181
|
-
yield dict(zip(columns, row))
|
|
182
|
-
|
|
183
203
|
def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
|
|
184
204
|
"""
|
|
185
205
|
Fetches all soft-deleted entities from the database.
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from datetime import datetime, timezone
|
|
2
|
+
from datetime import datetime, timedelta, timezone
|
|
3
3
|
from functools import partial
|
|
4
4
|
from typing import Dict, Iterable, List, Optional
|
|
5
5
|
|
|
@@ -26,6 +26,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
|
|
|
26
26
|
StatefulIngestionSourceBase,
|
|
27
27
|
)
|
|
28
28
|
from datahub.metadata.schema_classes import ChangeTypeClass
|
|
29
|
+
from datahub.utilities.progress_timer import ProgressTimer
|
|
29
30
|
|
|
30
31
|
logger = logging.getLogger(__name__)
|
|
31
32
|
|
|
@@ -105,11 +106,17 @@ class DataHubSource(StatefulIngestionSourceBase):
|
|
|
105
106
|
self, from_createdon: datetime, reader: DataHubDatabaseReader
|
|
106
107
|
) -> Iterable[MetadataWorkUnit]:
|
|
107
108
|
logger.info(f"Fetching database aspects starting from {from_createdon}")
|
|
109
|
+
progress = ProgressTimer(report_every=timedelta(seconds=60))
|
|
108
110
|
mcps = reader.get_aspects(from_createdon, self.report.stop_time)
|
|
109
111
|
for i, (mcp, createdon) in enumerate(mcps):
|
|
110
112
|
if not self.urn_pattern.allowed(str(mcp.entityUrn)):
|
|
111
113
|
continue
|
|
112
114
|
|
|
115
|
+
if progress.should_report():
|
|
116
|
+
logger.info(
|
|
117
|
+
f"Ingested {i} database aspects so far, currently at {createdon}"
|
|
118
|
+
)
|
|
119
|
+
|
|
113
120
|
yield mcp.as_workunit()
|
|
114
121
|
self.report.num_database_aspects_ingested += 1
|
|
115
122
|
|