acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/METADATA +2470 -2470
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/RECORD +38 -33
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/rest_emitter.py +16 -1
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +96 -0
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +54 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +12 -1
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -51
- datahub/ingestion/source/snowflake/snowflake_queries.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +52 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +24 -28
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +14 -0
- datahub/ingestion/source/tableau/tableau.py +4 -5
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source_report/ingestion_stage.py +1 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
- datahub/sql_parsing/tool_meta_extractor.py +116 -5
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
datahub/__init__.py,sha256=
|
|
1
|
+
datahub/__init__.py,sha256=gbsVKK_ULsM259cMG08Rrx6A9_72Iy7zxyDkQZ37NCw,576
|
|
2
2
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
3
3
|
datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
|
|
4
4
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -52,7 +52,7 @@ datahub/api/entities/forms/forms_graphql_constants.py,sha256=DKpnKlMKTjmnyrCTvp6
|
|
|
52
52
|
datahub/api/entities/platformresource/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
53
53
|
datahub/api/entities/platformresource/platform_resource.py,sha256=pVAjv6NoH746Mfvdak7ji0eqlEcEeV-Ji7M5gyNXmds,10603
|
|
54
54
|
datahub/api/entities/structuredproperties/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
55
|
-
datahub/api/entities/structuredproperties/structuredproperties.py,sha256=
|
|
55
|
+
datahub/api/entities/structuredproperties/structuredproperties.py,sha256=wtIt-zqW8_PamvSPpRR28tKkiOAZ7ME3lufoaTeGnlU,8116
|
|
56
56
|
datahub/api/graphql/__init__.py,sha256=5yl0dJxO-2d_QuykdJrDIbWq4ja9bo0t2dAEh89JOog,142
|
|
57
57
|
datahub/api/graphql/assertion.py,sha256=ponITypRQ8vE8kiqRNpvdoniNJzi4aeBK97UvkF0VhA,2818
|
|
58
58
|
datahub/api/graphql/base.py,sha256=9q637r6v-RGOd8Mk8HW2g0vt9zpqFexsQ5R6TPEHVbs,1614
|
|
@@ -100,7 +100,7 @@ datahub/configuration/kafka.py,sha256=MlIwpd5FFyOyjdDXW_X9JTLNk7f988sPMgevkcZYVg
|
|
|
100
100
|
datahub/configuration/kafka_consumer_config.py,sha256=LivsObTt9yC3WoGnslJbF_x4ojfNdxMIMEhb8vvJfcA,2133
|
|
101
101
|
datahub/configuration/pattern_utils.py,sha256=Q5IB9RfWOOo5FvRVBU7XkhiwHCxSQ1NTMfUlWtWI9qc,699
|
|
102
102
|
datahub/configuration/pydantic_migration_helpers.py,sha256=4C_COAVZ5iJ8yxcWNgXZNWsY7ULogICNZ368oNF7zWg,1462
|
|
103
|
-
datahub/configuration/source_common.py,sha256=
|
|
103
|
+
datahub/configuration/source_common.py,sha256=CXs5xJzeFpnrKojVZbHZOvoBDnrkEogN-e8hqMERzfo,2606
|
|
104
104
|
datahub/configuration/time_window_config.py,sha256=xuwoftrHWC069qfumErCV3BCuE-RWVsDLAefNWn_RFc,5347
|
|
105
105
|
datahub/configuration/toml.py,sha256=Ohc5sAWLPoAinPYL8njyheZ3ak81fC2Sp8IbBbESPGg,380
|
|
106
106
|
datahub/configuration/validate_field_deprecation.py,sha256=byBKOlGhPkRO_aCXlJ9BQk7-v3RqCnwI5_sho72ZDMQ,1268
|
|
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
|
|
|
119
119
|
datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
|
|
120
120
|
datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
|
|
121
121
|
datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
|
|
122
|
-
datahub/emitter/rest_emitter.py,sha256=
|
|
122
|
+
datahub/emitter/rest_emitter.py,sha256=d5Zjo3GXDu9rUqlSsK9aOx-yEbHjDFZHelfq_ZFeb5M,16393
|
|
123
123
|
datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
|
|
124
124
|
datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
|
|
125
125
|
datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
|
|
@@ -144,6 +144,7 @@ datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188m
|
|
|
144
144
|
datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
|
|
145
145
|
datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
146
146
|
datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
|
|
147
|
+
datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=9mzg3vhCFPI9zHSi_j3FvJtK8kUvb8PmRSz-TM3tt6k,4323
|
|
147
148
|
datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
148
149
|
datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
|
|
149
150
|
datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
|
|
@@ -195,7 +196,7 @@ datahub/ingestion/source/ge_profiling_config.py,sha256=P-9pd20koFvpxeEL_pqFvKWWz
|
|
|
195
196
|
datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
|
|
196
197
|
datahub/ingestion/source/ldap.py,sha256=Vnzg8tpwBYeyM-KBVVsUJvGZGBMJiCJ_i_FhxaFRQ9A,18627
|
|
197
198
|
datahub/ingestion/source/metabase.py,sha256=oemiMdzjfr82Hx6rdwTNBzFM8962LDkosYh7SD_I5cY,31717
|
|
198
|
-
datahub/ingestion/source/mlflow.py,sha256
|
|
199
|
+
datahub/ingestion/source/mlflow.py,sha256=-yWUuAEVBiNN-elz8Pgn0UeGsC3fVB20z1zKNIr4LXI,12309
|
|
199
200
|
datahub/ingestion/source/mode.py,sha256=n_5em3jADCr5gWTLDOP4O4bRS0Zt_TCZtW8uFPxn-DI,63043
|
|
200
201
|
datahub/ingestion/source/mongodb.py,sha256=vZue4Nz0xaBoCUsQr3_0OIRkWRxeE_IH_Y_QKZ1s7S0,21077
|
|
201
202
|
datahub/ingestion/source/nifi.py,sha256=ttsjZ9aRUvINmewvKFIQD8Rwa4jcl35WFG-F-jPGPWQ,56146
|
|
@@ -313,28 +314,32 @@ datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvk
|
|
|
313
314
|
datahub/ingestion/source/grafana/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
314
315
|
datahub/ingestion/source/grafana/grafana_source.py,sha256=3pU3xodPgS5lmnjuQ_u7F0XPzD_Y8MnPlMxRJ86qz4g,4960
|
|
315
316
|
datahub/ingestion/source/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
316
|
-
datahub/ingestion/source/iceberg/iceberg.py,sha256=
|
|
317
|
-
datahub/ingestion/source/iceberg/iceberg_common.py,sha256=
|
|
317
|
+
datahub/ingestion/source/iceberg/iceberg.py,sha256=Pi2QD8v0HOpqr8M9la78Nlm3Be9iy3G4pCZqu2NitZM,27253
|
|
318
|
+
datahub/ingestion/source/iceberg/iceberg_common.py,sha256=4efWbnj8iWWNcO6_lFXFZRIzaKVPWhd1MmmxdJafemw,8684
|
|
318
319
|
datahub/ingestion/source/iceberg/iceberg_profiler.py,sha256=hLT1Le_TEUoFXvsJSlrRB1qbTiTe-YVGCof5TFHMyd8,9908
|
|
319
320
|
datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
320
321
|
datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR2woGLRJwBXUkgXq0,28809
|
|
321
322
|
datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
|
|
322
323
|
datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
323
324
|
datahub/ingestion/source/kafka/kafka.py,sha256=9SR7bqp9J0rPYde5IClhnAuVNy9ItsB8-ZeXtTc_mEY,26442
|
|
324
|
-
datahub/ingestion/source/kafka/kafka_connect.py,sha256=Jm1MYky_OPIwvVHuEjgOjK0e6-jA-dYnsLZ7r-Y_9mA,56208
|
|
325
325
|
datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
|
|
326
|
+
datahub/ingestion/source/kafka_connect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
327
|
+
datahub/ingestion/source/kafka_connect/common.py,sha256=Ekb1K_J1eTgiH7LSP1AbEIf7NQh_2Vyu1lYX_Ggcqk4,7049
|
|
328
|
+
datahub/ingestion/source/kafka_connect/kafka_connect.py,sha256=8bjVFDkwjgs3gP7Y3itzABGfBcY_WbMQ5PWjrm-g93A,14249
|
|
329
|
+
datahub/ingestion/source/kafka_connect/sink_connectors.py,sha256=ESuJE5SFLLvss9OwDEIB8SAko4rhzaWZ-4dKY0Dh0N8,12900
|
|
330
|
+
datahub/ingestion/source/kafka_connect/source_connectors.py,sha256=_765fSMDAWAe0Cf_F4VNHfOWKNhtqBA1Ep2jL3rf-qc,21263
|
|
326
331
|
datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
327
332
|
datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
|
|
328
|
-
datahub/ingestion/source/looker/looker_common.py,sha256=
|
|
333
|
+
datahub/ingestion/source/looker/looker_common.py,sha256=0-5xhq7vE9YFj8tzbvRI7RnP1cD_oxnN4NpSXjKzxjE,61726
|
|
329
334
|
datahub/ingestion/source/looker/looker_config.py,sha256=87WAgdJ_QWdTq25RBwgIqfc2kq7dubSpzbEtXb2ihMw,13182
|
|
330
335
|
datahub/ingestion/source/looker/looker_connection.py,sha256=yDmC6lDsHmL2e_Pw8ULylwOIHPWPp_6gT1iyLvD0fTw,2075
|
|
331
336
|
datahub/ingestion/source/looker/looker_constant.py,sha256=GMKYtNXlpojPxa9azridKfcGLSJwKdUCTesp7U8dIrQ,402
|
|
332
337
|
datahub/ingestion/source/looker/looker_dataclasses.py,sha256=ULWLFWsV2cKmTuOFavD8QjEBmnXmvjyr8RbUB62DwJQ,12178
|
|
333
338
|
datahub/ingestion/source/looker/looker_file_loader.py,sha256=c1ewDrIb9VJg1o-asbwX9gL83kgL01vIETzzbmZIhmw,4267
|
|
334
|
-
datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=
|
|
339
|
+
datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=0gaYjBv4wkbbLWVgvaAV6JyWAFb0utTG6TCve2d9xss,11511
|
|
335
340
|
datahub/ingestion/source/looker/looker_liquid_tag.py,sha256=mO4G4MNA4YZFvZaDBpdiJ2vP3irC82kY34RdaK4Pbfs,3100
|
|
336
341
|
datahub/ingestion/source/looker/looker_query_model.py,sha256=N0jBbFruiCIIGT6sJn6tNeppeQ78KGTkOwTLirhxFNc,2144
|
|
337
|
-
datahub/ingestion/source/looker/looker_source.py,sha256=
|
|
342
|
+
datahub/ingestion/source/looker/looker_source.py,sha256=IUI2FpYG2_bhVnP_LZeQymjc5D0F7lqYTADktzV3yr8,65735
|
|
338
343
|
datahub/ingestion/source/looker/looker_template_language.py,sha256=EG4ZfVZ0x53lgaYh2ohzL4ZCy9KsX0TA51XqCmsCd2Q,14328
|
|
339
344
|
datahub/ingestion/source/looker/looker_usage.py,sha256=qegMr-Rnqz3xNGSBfsuD3S_BPXf7UEMhwFN7DPQeLNo,22914
|
|
340
345
|
datahub/ingestion/source/looker/looker_view_id_cache.py,sha256=92gDy6NONhJYBp92z_IBzDVZvezmUIkaBCZY1bdk6mE,4392
|
|
@@ -351,15 +356,15 @@ datahub/ingestion/source/metadata/lineage.py,sha256=XiZGuY6k3O9qBmgo7AzosIndJHwr
|
|
|
351
356
|
datahub/ingestion/source/neo4j/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
352
357
|
datahub/ingestion/source/neo4j/neo4j_source.py,sha256=L9WiZ5yZrIDMrgj3gYU9j6zz3TRMXYpcWxeTegD7sFg,12409
|
|
353
358
|
datahub/ingestion/source/powerbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
354
|
-
datahub/ingestion/source/powerbi/config.py,sha256=
|
|
355
|
-
datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py,sha256
|
|
359
|
+
datahub/ingestion/source/powerbi/config.py,sha256=9cgMrwp3DQqv14iTqulAONkbVe5nhycJ1ElkA_275go,22732
|
|
360
|
+
datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py,sha256=-njW1kJOy-LY5JFwJLhVQ0bMBj9NQz5TZhQqsSi_KsM,2285
|
|
356
361
|
datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule,sha256=PRWzuZMMhKdOVoAaE8csHvUFbZHxYe5meJHgrqlgiuw,19795
|
|
357
362
|
datahub/ingestion/source/powerbi/powerbi.py,sha256=7UsAEqaFlkWONcXJdQ2hotUYYn46ks6Fe71KXEMh7lI,54495
|
|
358
363
|
datahub/ingestion/source/powerbi/m_query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
359
364
|
datahub/ingestion/source/powerbi/m_query/data_classes.py,sha256=EbaEasEOGZ73jz0cQofH9ez65wSvRBof0R6GQaIVLnM,2009
|
|
360
365
|
datahub/ingestion/source/powerbi/m_query/native_sql_parser.py,sha256=zzKVDGeUM3Yv3-zNah4D6mSnr6jXsstNuLmzczcPQEE,3683
|
|
361
366
|
datahub/ingestion/source/powerbi/m_query/parser.py,sha256=pB1LGdb02Ryf8Pr8JPSgiOmLE6mEAgDbKodtKOOY6LU,5782
|
|
362
|
-
datahub/ingestion/source/powerbi/m_query/pattern_handler.py,sha256=
|
|
367
|
+
datahub/ingestion/source/powerbi/m_query/pattern_handler.py,sha256=yDNdzPREzegarux_I9F_PsLe1hlSFa-uwoORrEgbO-c,32485
|
|
363
368
|
datahub/ingestion/source/powerbi/m_query/resolver.py,sha256=TMt84_JaCazpP7vcneW0O3cUjtbIuh8Yid78JWfDxsI,16953
|
|
364
369
|
datahub/ingestion/source/powerbi/m_query/tree_function.py,sha256=h77DunhlgOP0fAg8UXDXxxInOi7Pay85_d1Ca4YqyKs,6134
|
|
365
370
|
datahub/ingestion/source/powerbi/m_query/validator.py,sha256=crG-VZy2XPieiDliP9yVMgiFcc8b2xbZyDFEATXqEAQ,1155
|
|
@@ -427,19 +432,19 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
|
|
|
427
432
|
datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
|
|
428
433
|
datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
|
|
429
434
|
datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
|
|
430
|
-
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=
|
|
435
|
+
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=uMGmMEl4hWEmN7GxMyDBdwlIPAW7WmOnu41kZ0dvCG4,21551
|
|
431
436
|
datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
|
|
432
|
-
datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=
|
|
437
|
+
datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=8QEihOfivalVR9vLo6vCUL-vnZfAGgMio0uhPYX0jTo,25883
|
|
433
438
|
datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
|
|
434
439
|
datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
|
|
435
440
|
datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
|
|
436
|
-
datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=
|
|
441
|
+
datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=K-KEr3OpwMHye08lXAy-5doUUGoGJP3b-ntJAGU_NBY,42472
|
|
437
442
|
datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
|
|
438
443
|
datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
|
|
439
444
|
datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
|
|
440
445
|
datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
|
|
441
446
|
datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
|
|
442
|
-
datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=
|
|
447
|
+
datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=ecaTCJNAQ_IJOPInPGXA3jv1dE5lztSU82UhpBygiq0,31654
|
|
443
448
|
datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
444
449
|
datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
|
|
445
450
|
datahub/ingestion/source/sql/clickhouse.py,sha256=jzvaXP5Wr0SMhj2rtuvVE821xnfpKiXhO3cm0xblgHs,27299
|
|
@@ -467,8 +472,8 @@ datahub/ingestion/source/sql/trino.py,sha256=FEn_BQ3pm23hKx94ek5kk5IXGNYcBqZEhll
|
|
|
467
472
|
datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5foKrf71SydzEltc3WsVAhQc,5732
|
|
468
473
|
datahub/ingestion/source/sql/vertica.py,sha256=_9OgSgIgqBml0av063rb8nACiT3SAmzpw0ouyF91wv8,33382
|
|
469
474
|
datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
|
|
470
|
-
datahub/ingestion/source/sql/mssql/job_models.py,sha256=
|
|
471
|
-
datahub/ingestion/source/sql/mssql/source.py,sha256
|
|
475
|
+
datahub/ingestion/source/sql/mssql/job_models.py,sha256=ztXDrD4anhzwWvACIm9fucE2WhMDMKkJ4alMYOQOqWA,7083
|
|
476
|
+
datahub/ingestion/source/sql/mssql/source.py,sha256=ODdsOIbDA3X0E7En6GT15mD49W6RW9sXLwRoUgw2a8I,30925
|
|
472
477
|
datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
|
|
473
478
|
datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
474
479
|
datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
|
|
@@ -486,11 +491,11 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
|
|
|
486
491
|
datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
|
|
487
492
|
datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
|
|
488
493
|
datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
489
|
-
datahub/ingestion/source/tableau/tableau.py,sha256=
|
|
494
|
+
datahub/ingestion/source/tableau/tableau.py,sha256=80m5a8g2q5jnm5pFtL4s_SmSeGPQIuZGqGe9RGn6OSI,138248
|
|
490
495
|
datahub/ingestion/source/tableau/tableau_common.py,sha256=9gQLq_3BlAsKll83uVlnWJRWaIDtFtREUyuimXF13Z0,26219
|
|
491
|
-
datahub/ingestion/source/tableau/tableau_constant.py,sha256=
|
|
492
|
-
datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=
|
|
493
|
-
datahub/ingestion/source/tableau/tableau_validation.py,sha256=
|
|
496
|
+
datahub/ingestion/source/tableau/tableau_constant.py,sha256=ZcAeHsQUXVVL26ORly0ByZk_GJAFbxaKuJAlX_sYMac,2686
|
|
497
|
+
datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=nSyx9RzC6TCQDm-cTVJ657qT8iDwzk_8JMKpohhmOc4,1046
|
|
498
|
+
datahub/ingestion/source/tableau/tableau_validation.py,sha256=pd--LcTLTfrFsouhCOvGC_2IjeMfKbJV81EEo3ibMwE,1820
|
|
494
499
|
datahub/ingestion/source/unity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
495
500
|
datahub/ingestion/source/unity/analyze_profiler.py,sha256=2pqkFY30CfN4aHgFZZntjeG0hNhBytZJvXC13VfTc1I,4689
|
|
496
501
|
datahub/ingestion/source/unity/config.py,sha256=m4-n7mYz4Ct4L1QdfJFklwHyj8boKCbV7Sb3Ou6AT3Q,14756
|
|
@@ -501,7 +506,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=2-pYQ-3B9UVUwO1yB9iTdi3DqgqZ2JrpQ
|
|
|
501
506
|
datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
|
|
502
507
|
datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
|
|
503
508
|
datahub/ingestion/source/unity/report.py,sha256=0Y-ciHVTI6ZKNCJ5zWoQh3Ze1c_GMqmTMKFwzXDuuOg,2788
|
|
504
|
-
datahub/ingestion/source/unity/source.py,sha256=
|
|
509
|
+
datahub/ingestion/source/unity/source.py,sha256=ydjqJ91q-Mir_aJSmEdo1umgPvQ5la67rBhC04IyV78,41938
|
|
505
510
|
datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9dEBi2k9f15o,11436
|
|
506
511
|
datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
507
512
|
datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
|
|
@@ -512,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
|
|
|
512
517
|
datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
|
|
513
518
|
datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
|
|
514
519
|
datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
515
|
-
datahub/ingestion/source_report/ingestion_stage.py,sha256=
|
|
520
|
+
datahub/ingestion/source_report/ingestion_stage.py,sha256=pJcJeLSjaixlLqQyQtE3bfUcvXEVwrSaWWtU4iU9UEo,1557
|
|
516
521
|
datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
|
|
517
522
|
datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
|
|
518
523
|
datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -871,12 +876,12 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
|
|
|
871
876
|
datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
|
|
872
877
|
datahub/sql_parsing/schema_resolver.py,sha256=9INZWdxA2dMSLK6RXaVqjbjyLY_VKMhCkQv_Xd6Ln3I,10848
|
|
873
878
|
datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
|
|
874
|
-
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=
|
|
879
|
+
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=LBs1RjRqh3natrx4WfgRQGNpI56o12jtbABO5ipEBWA,69889
|
|
875
880
|
datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
|
|
876
881
|
datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
|
|
877
882
|
datahub/sql_parsing/sqlglot_lineage.py,sha256=CLDOc0HNqL_539eahOP3QOoldIYC6CF29id4Xe3TlEM,47018
|
|
878
883
|
datahub/sql_parsing/sqlglot_utils.py,sha256=n6yufzEGwSlFeCSU540hEldIuab0q8KGqm9x0vSawkc,14699
|
|
879
|
-
datahub/sql_parsing/tool_meta_extractor.py,sha256=
|
|
884
|
+
datahub/sql_parsing/tool_meta_extractor.py,sha256=7tY4FAClhFcqwc23lGVlnT6Dequ_5Xcpbt0hDvnlLzM,6670
|
|
880
885
|
datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
881
886
|
datahub/telemetry/stats.py,sha256=YltbtC3fe6rl1kcxn1A-mSnVpECTPm5k-brrUt7QxTI,967
|
|
882
887
|
datahub/telemetry/telemetry.py,sha256=gzla-QGNsynGg2FqFxiDDFQ0emG53MJ9lhOA2-UUg-Y,15047
|
|
@@ -976,8 +981,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
976
981
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
977
982
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
978
983
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
979
|
-
acryl_datahub-0.15.
|
|
980
|
-
acryl_datahub-0.15.
|
|
981
|
-
acryl_datahub-0.15.
|
|
982
|
-
acryl_datahub-0.15.
|
|
983
|
-
acryl_datahub-0.15.
|
|
984
|
+
acryl_datahub-0.15.0.1rc2.dist-info/METADATA,sha256=gLix1LBWIrfQF-dcU1JsLeAAFkHuALY9dHVboVmjIJg,173642
|
|
985
|
+
acryl_datahub-0.15.0.1rc2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
986
|
+
acryl_datahub-0.15.0.1rc2.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
|
|
987
|
+
acryl_datahub-0.15.0.1rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
988
|
+
acryl_datahub-0.15.0.1rc2.dist-info/RECORD,,
|
|
@@ -57,7 +57,7 @@ hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource
|
|
|
57
57
|
iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource
|
|
58
58
|
json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource
|
|
59
59
|
kafka = datahub.ingestion.source.kafka.kafka:KafkaSource
|
|
60
|
-
kafka-connect = datahub.ingestion.source.
|
|
60
|
+
kafka-connect = datahub.ingestion.source.kafka_connect.kafka_connect:KafkaConnectSource
|
|
61
61
|
ldap = datahub.ingestion.source.ldap:LDAPSource
|
|
62
62
|
looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource
|
|
63
63
|
lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource
|
datahub/__init__.py
CHANGED
|
@@ -14,7 +14,7 @@ from datahub.metadata.schema_classes import (
|
|
|
14
14
|
PropertyValueClass,
|
|
15
15
|
StructuredPropertyDefinitionClass,
|
|
16
16
|
)
|
|
17
|
-
from datahub.metadata.urns import StructuredPropertyUrn, Urn
|
|
17
|
+
from datahub.metadata.urns import DataTypeUrn, StructuredPropertyUrn, Urn
|
|
18
18
|
from datahub.utilities.urns._urn_base import URN_TYPES
|
|
19
19
|
|
|
20
20
|
logging.basicConfig(level=logging.INFO)
|
|
@@ -86,19 +86,31 @@ class StructuredProperties(ConfigModel):
|
|
|
86
86
|
|
|
87
87
|
@validator("type")
|
|
88
88
|
def validate_type(cls, v: str) -> str:
|
|
89
|
-
#
|
|
90
|
-
|
|
89
|
+
# This logic is somewhat hacky, since we need to deal with
|
|
90
|
+
# 1. fully qualified urns
|
|
91
|
+
# 2. raw data types, that need to get the datahub namespace prefix
|
|
92
|
+
# While keeping the user-facing interface and error messages clean.
|
|
93
|
+
|
|
94
|
+
if not v.startswith("urn:li:") and not v.islower():
|
|
95
|
+
# Convert to lowercase if needed
|
|
96
|
+
v = v.lower()
|
|
91
97
|
logger.warning(
|
|
92
|
-
f"Structured property type should be lowercase. Updated to {v
|
|
98
|
+
f"Structured property type should be lowercase. Updated to {v}"
|
|
93
99
|
)
|
|
94
|
-
|
|
100
|
+
|
|
101
|
+
urn = Urn.make_data_type_urn(v)
|
|
95
102
|
|
|
96
103
|
# Check if type is allowed
|
|
97
|
-
|
|
104
|
+
data_type_urn = DataTypeUrn.from_string(urn)
|
|
105
|
+
unqualified_data_type = data_type_urn.id
|
|
106
|
+
if unqualified_data_type.startswith("datahub."):
|
|
107
|
+
unqualified_data_type = unqualified_data_type[len("datahub.") :]
|
|
108
|
+
if not AllowedTypes.check_allowed_type(unqualified_data_type):
|
|
98
109
|
raise ValueError(
|
|
99
|
-
f"Type {
|
|
110
|
+
f"Type {unqualified_data_type} is not allowed. Allowed types are {AllowedTypes.values()}"
|
|
100
111
|
)
|
|
101
|
-
|
|
112
|
+
|
|
113
|
+
return urn
|
|
102
114
|
|
|
103
115
|
@property
|
|
104
116
|
def fqn(self) -> str:
|
|
@@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin):
|
|
|
63
63
|
default=None,
|
|
64
64
|
description="A holder for platform -> platform_instance mappings to generate correct dataset urns",
|
|
65
65
|
)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class PlatformDetail(ConfigModel):
|
|
69
|
+
platform_instance: Optional[str] = Field(
|
|
70
|
+
default=None,
|
|
71
|
+
description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
|
|
72
|
+
"with platform instance name used in ingestion "
|
|
73
|
+
"recipe of other datahub sources.",
|
|
74
|
+
)
|
|
75
|
+
env: str = Field(
|
|
76
|
+
default=DEFAULT_ENV,
|
|
77
|
+
description="The environment that all assets produced by DataHub platform ingestion source belong to",
|
|
78
|
+
)
|
datahub/emitter/rest_emitter.py
CHANGED
|
@@ -291,6 +291,7 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
291
291
|
mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
|
|
292
292
|
async_flag: Optional[bool] = None,
|
|
293
293
|
) -> int:
|
|
294
|
+
logger.debug("Attempting to emit batch mcps")
|
|
294
295
|
url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
|
|
295
296
|
for mcp in mcps:
|
|
296
297
|
ensure_has_system_metadata(mcp)
|
|
@@ -303,15 +304,22 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
303
304
|
current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
|
|
304
305
|
for mcp_obj in mcp_objs:
|
|
305
306
|
mcp_obj_size = len(json.dumps(mcp_obj))
|
|
307
|
+
logger.debug(
|
|
308
|
+
f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
|
|
309
|
+
)
|
|
306
310
|
|
|
307
311
|
if (
|
|
308
312
|
mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
|
|
309
313
|
or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
|
|
310
314
|
):
|
|
315
|
+
logger.debug("Decided to create new chunk")
|
|
311
316
|
mcp_obj_chunks.append([])
|
|
312
317
|
current_chunk_size = 0
|
|
313
318
|
mcp_obj_chunks[-1].append(mcp_obj)
|
|
314
319
|
current_chunk_size += mcp_obj_size
|
|
320
|
+
logger.debug(
|
|
321
|
+
f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
|
|
322
|
+
)
|
|
315
323
|
|
|
316
324
|
for mcp_obj_chunk in mcp_obj_chunks:
|
|
317
325
|
# TODO: We're calling json.dumps on each MCP object twice, once to estimate
|
|
@@ -338,8 +346,15 @@ class DataHubRestEmitter(Closeable, Emitter):
|
|
|
338
346
|
|
|
339
347
|
def _emit_generic(self, url: str, payload: str) -> None:
|
|
340
348
|
curl_command = make_curl_command(self._session, "POST", url, payload)
|
|
349
|
+
payload_size = len(payload)
|
|
350
|
+
if payload_size > INGEST_MAX_PAYLOAD_BYTES:
|
|
351
|
+
# since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
|
|
352
|
+
logger.warning(
|
|
353
|
+
f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
|
|
354
|
+
)
|
|
341
355
|
logger.debug(
|
|
342
|
-
"Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
|
|
356
|
+
"Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
|
|
357
|
+
payload_size,
|
|
343
358
|
curl_command,
|
|
344
359
|
)
|
|
345
360
|
try:
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Iterable, List
|
|
4
|
+
|
|
5
|
+
from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
|
|
6
|
+
from datahub.emitter.serialization_helper import pre_json_transform
|
|
7
|
+
from datahub.ingestion.api.source import SourceReport
|
|
8
|
+
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
9
|
+
from datahub.metadata.schema_classes import (
|
|
10
|
+
DatasetProfileClass,
|
|
11
|
+
SchemaFieldClass,
|
|
12
|
+
SchemaMetadataClass,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class EnsureAspectSizeProcessor:
    """Trim oversized schema and dataset-profile aspects in a workunit stream.

    Aspects are modified in place and every removal is recorded as a warning
    on the supplied report. The size budget defaults to
    ``INGEST_MAX_PAYLOAD_BYTES``.
    """

    def __init__(
        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
    ):
        """Store the report that receives warnings and the size budget to enforce."""
        self.payload_constraint = payload_constraint
        self.report = report

    def ensure_dataset_profile_size(
        self, dataset_urn: str, profile: DatasetProfileClass
    ) -> None:
        """Drop sample values from field profiles once the size budget is used up.

        Field profiles are visited in order while a running total of retained
        sample-value sizes is kept. A field whose sample values would push that
        total above ``payload_constraint`` has its ``sampleValues`` replaced by
        an empty list and a warning is reported; fields visited afterwards are
        still considered against the unchanged running total.

        This is a heuristic: only sample values are measured, using ``len()``
        of each non-empty value, not the serialized size of the whole aspect.

        Args:
            dataset_urn: URN of the dataset, used only in the warning context.
            profile: Dataset profile aspect; mutated in place.
        """
        if not profile.fieldProfiles:
            return

        logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")

        retained_size = 0
        for field in profile.fieldProfiles:
            if not field.sampleValues:
                logger.debug(f"Field {field.fieldPath} has no sample values")
                continue

            # Empty / None samples contribute nothing to the estimate.
            values_len = sum(len(sample) for sample in field.sampleValues if sample)
            logger.debug(
                f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
            )

            if retained_size + values_len <= self.payload_constraint:
                retained_size += values_len
                continue

            # Over budget: keep the field profile itself, discard only its samples.
            field.sampleValues = []
            self.report.warning(
                title="Dataset profile truncated due to size constraint",
                message="Dataset profile contained too much data and would have caused ingestion to fail",
                context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
            )

    def ensure_schema_metadata_size(
        self, dataset_urn: str, schema: SchemaMetadataClass
    ) -> None:
        """Remove schema fields that do not fit into the size budget.

        Each field is sized as the length of its JSON serialization. Fields are
        accepted in order while the running total stays strictly below
        ``payload_constraint``; a field that does not fit is dropped with a
        warning, and later fields are still considered against the total of
        the fields accepted so far. ``schema.fields`` is then replaced by the
        accepted list.

        Args:
            dataset_urn: URN of the dataset, used only in the warning context.
            schema: Schema metadata aspect; mutated in place.
        """
        logger.debug(f"Amount of schema fields: {len(schema.fields)}")

        kept_fields: List[SchemaFieldClass] = []
        kept_size = 0
        for field in schema.fields:
            serialized = json.dumps(pre_json_transform(field.to_obj()))
            field_size = len(serialized)
            logger.debug(f"Field {field.fieldPath} takes total {field_size}")

            if kept_size + field_size >= self.payload_constraint:
                self.report.warning(
                    title="Schema truncated due to size constraint",
                    message="Dataset schema contained too much data and would have caused ingestion to fail",
                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
                )
                continue

            kept_fields.append(field)
            kept_size += field_size

        schema.fields = kept_fields

    def ensure_aspect_size(
        self,
        stream: Iterable[MetadataWorkUnit],
    ) -> Iterable[MetadataWorkUnit]:
        """Yield every workunit from ``stream`` after trimming suspect aspects.

        We have hard limitation of aspect size being 16 MB. Some aspects can
        exceed that value causing an exception on GMS side and failure of the
        entire ingestion. A workunit carrying a schema metadata aspect gets its
        schema trimmed; otherwise, one carrying a dataset profile aspect gets
        its profile trimmed. All workunits are passed through, trimmed or not.
        """
        for wu in stream:
            logger.debug(f"Ensuring size of workunit: {wu.id}")

            schema = wu.get_aspect_of_type(SchemaMetadataClass)
            if schema:
                self.ensure_schema_metadata_size(wu.get_urn(), schema)
            else:
                # Only look for a profile when no schema aspect was found.
                profile = wu.get_aspect_of_type(DatasetProfileClass)
                if profile:
                    self.ensure_dataset_profile_size(wu.get_urn(), profile)

            yield wu
|
|
@@ -10,6 +10,7 @@ from pyiceberg.exceptions import (
|
|
|
10
10
|
NoSuchNamespaceError,
|
|
11
11
|
NoSuchPropertyException,
|
|
12
12
|
NoSuchTableError,
|
|
13
|
+
ServerError,
|
|
13
14
|
)
|
|
14
15
|
from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
|
|
15
16
|
from pyiceberg.table import Table
|
|
@@ -145,6 +146,13 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|
|
145
146
|
self.report.report_no_listed_namespaces(len(namespaces))
|
|
146
147
|
tables_count = 0
|
|
147
148
|
for namespace in namespaces:
|
|
149
|
+
namespace_repr = ".".join(namespace)
|
|
150
|
+
if not self.config.namespace_pattern.allowed(namespace_repr):
|
|
151
|
+
LOGGER.info(
|
|
152
|
+
f"Namespace {namespace_repr} is not allowed by config pattern, skipping"
|
|
153
|
+
)
|
|
154
|
+
self.report.report_dropped(f"{namespace_repr}.*")
|
|
155
|
+
continue
|
|
148
156
|
try:
|
|
149
157
|
tables = catalog.list_tables(namespace)
|
|
150
158
|
tables_count += len(tables)
|
|
@@ -181,6 +189,9 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|
|
181
189
|
if not self.config.table_pattern.allowed(dataset_name):
|
|
182
190
|
# Dataset name is rejected by pattern, report as dropped.
|
|
183
191
|
self.report.report_dropped(dataset_name)
|
|
192
|
+
LOGGER.debug(
|
|
193
|
+
f"Skipping table {dataset_name} due to not being allowed by the config pattern"
|
|
194
|
+
)
|
|
184
195
|
return
|
|
185
196
|
try:
|
|
186
197
|
if not hasattr(thread_local, "local_catalog"):
|
|
@@ -219,6 +230,22 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|
|
219
230
|
LOGGER.warning(
|
|
220
231
|
f"NoSuchTableError while processing table {dataset_path}, skipping it.",
|
|
221
232
|
)
|
|
233
|
+
except FileNotFoundError as e:
|
|
234
|
+
self.report.report_warning(
|
|
235
|
+
"file-not-found",
|
|
236
|
+
f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}",
|
|
237
|
+
)
|
|
238
|
+
LOGGER.warning(
|
|
239
|
+
f"FileNotFoundError while processing table {dataset_path}, skipping it."
|
|
240
|
+
)
|
|
241
|
+
except ServerError as e:
|
|
242
|
+
self.report.report_warning(
|
|
243
|
+
"iceberg-rest-server-error",
|
|
244
|
+
f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}",
|
|
245
|
+
)
|
|
246
|
+
LOGGER.warning(
|
|
247
|
+
f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
|
|
248
|
+
)
|
|
222
249
|
except Exception as e:
|
|
223
250
|
self.report.report_failure("general", f"Failed to create workunit: {e}")
|
|
224
251
|
LOGGER.exception(
|
|
@@ -269,7 +296,6 @@ class IcebergSource(StatefulIngestionSourceBase):
|
|
|
269
296
|
] = table.current_snapshot().manifest_list
|
|
270
297
|
dataset_properties = DatasetPropertiesClass(
|
|
271
298
|
name=table.name()[-1],
|
|
272
|
-
tags=[],
|
|
273
299
|
description=table.metadata.properties.get("comment", None),
|
|
274
300
|
customProperties=custom_properties,
|
|
275
301
|
)
|
|
@@ -68,6 +68,10 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
|
|
|
68
68
|
default=AllowDenyPattern.allow_all(),
|
|
69
69
|
description="Regex patterns for tables to filter in ingestion.",
|
|
70
70
|
)
|
|
71
|
+
namespace_pattern: AllowDenyPattern = Field(
|
|
72
|
+
default=AllowDenyPattern.allow_all(),
|
|
73
|
+
description="Regex patterns for namespaces to filter in ingestion.",
|
|
74
|
+
)
|
|
71
75
|
user_ownership_property: Optional[str] = Field(
|
|
72
76
|
default="owner",
|
|
73
77
|
description="Iceberg table property to look for a `CorpUser` owner. Can only hold a single user value. If property has no value, no owner information will be emitted.",
|
|
File without changes
|