acryl-datahub 0.15.0rc25__py3-none-any.whl → 0.15.0.1rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (39) hide show
  1. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/METADATA +2470 -2470
  2. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/RECORD +38 -33
  3. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/entry_points.txt +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
  6. datahub/configuration/source_common.py +13 -0
  7. datahub/emitter/rest_emitter.py +16 -1
  8. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +96 -0
  9. datahub/ingestion/source/iceberg/iceberg.py +27 -1
  10. datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
  11. datahub/ingestion/source/kafka_connect/__init__.py +0 -0
  12. datahub/ingestion/source/kafka_connect/common.py +202 -0
  13. datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
  14. datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
  15. datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
  16. datahub/ingestion/source/looker/looker_common.py +54 -2
  17. datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
  18. datahub/ingestion/source/looker/looker_source.py +12 -1
  19. datahub/ingestion/source/mlflow.py +30 -5
  20. datahub/ingestion/source/powerbi/config.py +1 -14
  21. datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
  22. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
  23. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -51
  24. datahub/ingestion/source/snowflake/snowflake_queries.py +0 -3
  25. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +52 -2
  26. datahub/ingestion/source/snowflake/snowflake_v2.py +24 -28
  27. datahub/ingestion/source/sql/mssql/job_models.py +30 -1
  28. datahub/ingestion/source/sql/mssql/source.py +14 -0
  29. datahub/ingestion/source/tableau/tableau.py +4 -5
  30. datahub/ingestion/source/tableau/tableau_constant.py +3 -1
  31. datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
  32. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  33. datahub/ingestion/source/unity/source.py +4 -0
  34. datahub/ingestion/source_report/ingestion_stage.py +1 -0
  35. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  36. datahub/sql_parsing/tool_meta_extractor.py +116 -5
  37. datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
  38. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/WHEEL +0 -0
  39. {acryl_datahub-0.15.0rc25.dist-info → acryl_datahub-0.15.0.1rc2.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
1
- datahub/__init__.py,sha256=U5x9yuhIDpX_smTqMjMG3FyKLV9w8fZiOK34-Pl2vd0,575
1
+ datahub/__init__.py,sha256=gbsVKK_ULsM259cMG08Rrx6A9_72Iy7zxyDkQZ37NCw,576
2
2
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
3
3
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
4
4
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -52,7 +52,7 @@ datahub/api/entities/forms/forms_graphql_constants.py,sha256=DKpnKlMKTjmnyrCTvp6
52
52
  datahub/api/entities/platformresource/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
53
  datahub/api/entities/platformresource/platform_resource.py,sha256=pVAjv6NoH746Mfvdak7ji0eqlEcEeV-Ji7M5gyNXmds,10603
54
54
  datahub/api/entities/structuredproperties/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
- datahub/api/entities/structuredproperties/structuredproperties.py,sha256=tYEVp2oqJa9FhlrnbAf2Zw82WqicJI9lF0P5U9soY9E,7502
55
+ datahub/api/entities/structuredproperties/structuredproperties.py,sha256=wtIt-zqW8_PamvSPpRR28tKkiOAZ7ME3lufoaTeGnlU,8116
56
56
  datahub/api/graphql/__init__.py,sha256=5yl0dJxO-2d_QuykdJrDIbWq4ja9bo0t2dAEh89JOog,142
57
57
  datahub/api/graphql/assertion.py,sha256=ponITypRQ8vE8kiqRNpvdoniNJzi4aeBK97UvkF0VhA,2818
58
58
  datahub/api/graphql/base.py,sha256=9q637r6v-RGOd8Mk8HW2g0vt9zpqFexsQ5R6TPEHVbs,1614
@@ -100,7 +100,7 @@ datahub/configuration/kafka.py,sha256=MlIwpd5FFyOyjdDXW_X9JTLNk7f988sPMgevkcZYVg
100
100
  datahub/configuration/kafka_consumer_config.py,sha256=LivsObTt9yC3WoGnslJbF_x4ojfNdxMIMEhb8vvJfcA,2133
101
101
  datahub/configuration/pattern_utils.py,sha256=Q5IB9RfWOOo5FvRVBU7XkhiwHCxSQ1NTMfUlWtWI9qc,699
102
102
  datahub/configuration/pydantic_migration_helpers.py,sha256=4C_COAVZ5iJ8yxcWNgXZNWsY7ULogICNZ368oNF7zWg,1462
103
- datahub/configuration/source_common.py,sha256=68LZOuB23zSEcfgQJE1wZQnyYQHVVnEZK3Sniv_nEQs,2107
103
+ datahub/configuration/source_common.py,sha256=CXs5xJzeFpnrKojVZbHZOvoBDnrkEogN-e8hqMERzfo,2606
104
104
  datahub/configuration/time_window_config.py,sha256=xuwoftrHWC069qfumErCV3BCuE-RWVsDLAefNWn_RFc,5347
105
105
  datahub/configuration/toml.py,sha256=Ohc5sAWLPoAinPYL8njyheZ3ak81fC2Sp8IbBbESPGg,380
106
106
  datahub/configuration/validate_field_deprecation.py,sha256=byBKOlGhPkRO_aCXlJ9BQk7-v3RqCnwI5_sho72ZDMQ,1268
@@ -119,7 +119,7 @@ datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
119
119
  datahub/emitter/mcp_builder.py,sha256=ju-1dZMKs5dlWcTi4zcNRVmhkfhmfX3JFULZSbgxSFs,9968
120
120
  datahub/emitter/mcp_patch_builder.py,sha256=W85q1maVUMpOIo5lwLRn82rLXRVoZ_gurl_a-pvVCpE,4291
121
121
  datahub/emitter/request_helper.py,sha256=33ORG3S3OVy97_jlWBRn7yUM5XCIkRN6WSdJvN7Ofcg,670
122
- datahub/emitter/rest_emitter.py,sha256=3kG_aPKy9pLibd4SJNtdJxn792c5TJliFjjCOw6NoUM,15533
122
+ datahub/emitter/rest_emitter.py,sha256=d5Zjo3GXDu9rUqlSsK9aOx-yEbHjDFZHelfq_ZFeb5M,16393
123
123
  datahub/emitter/serialization_helper.py,sha256=q12Avmf70Vy4ttQGMJoTKlE5EsybMKNg2w3MQeZiHvk,3652
124
124
  datahub/emitter/sql_parsing_builder.py,sha256=Cr5imZrm3dYDSCACt5MFscgHCtVbHTD6IjUmsvsKoEs,11991
125
125
  datahub/emitter/synchronized_file_emitter.py,sha256=s4ATuxalI4GDAkrZTaGSegxBdvvNPZ9jRSdtElU0kNs,1805
@@ -144,6 +144,7 @@ datahub/ingestion/api/transform.py,sha256=X0GpjMJzYkLuZx8MTWxH50cWGm9rGsnn3k188m
144
144
  datahub/ingestion/api/workunit.py,sha256=e8n8RfSjHZZm2R4ShNH0UuMtUkMjyqqM2j2t7oL74lo,6327
145
145
  datahub/ingestion/api/auto_work_units/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
146
  datahub/ingestion/api/auto_work_units/auto_dataset_properties_aspect.py,sha256=ID_6N3nWl2qohsSGizUCqo3d2MNyDeVbyWroQpSOSsc,5059
147
+ datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py,sha256=9mzg3vhCFPI9zHSi_j3FvJtK8kUvb8PmRSz-TM3tt6k,4323
147
148
  datahub/ingestion/extractor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
148
149
  datahub/ingestion/extractor/extractor_registry.py,sha256=f7CLfW3pr29QZkXSHbp7HjUrsdw7ejQJmot-tiSPcqc,342
149
150
  datahub/ingestion/extractor/json_ref_patch.py,sha256=4g3ZWHn7rwS74jUvSXJiGpi-UKHhiSYKKgBeU4E5ukE,1448
@@ -195,7 +196,7 @@ datahub/ingestion/source/ge_profiling_config.py,sha256=P-9pd20koFvpxeEL_pqFvKWWz
195
196
  datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
196
197
  datahub/ingestion/source/ldap.py,sha256=Vnzg8tpwBYeyM-KBVVsUJvGZGBMJiCJ_i_FhxaFRQ9A,18627
197
198
  datahub/ingestion/source/metabase.py,sha256=oemiMdzjfr82Hx6rdwTNBzFM8962LDkosYh7SD_I5cY,31717
198
- datahub/ingestion/source/mlflow.py,sha256=SxCt4jtxQcpPWEI2rRNagCiE_6TWr2RroqmxRd_td1Y,11565
199
+ datahub/ingestion/source/mlflow.py,sha256=-yWUuAEVBiNN-elz8Pgn0UeGsC3fVB20z1zKNIr4LXI,12309
199
200
  datahub/ingestion/source/mode.py,sha256=n_5em3jADCr5gWTLDOP4O4bRS0Zt_TCZtW8uFPxn-DI,63043
200
201
  datahub/ingestion/source/mongodb.py,sha256=vZue4Nz0xaBoCUsQr3_0OIRkWRxeE_IH_Y_QKZ1s7S0,21077
201
202
  datahub/ingestion/source/nifi.py,sha256=ttsjZ9aRUvINmewvKFIQD8Rwa4jcl35WFG-F-jPGPWQ,56146
@@ -313,28 +314,32 @@ datahub/ingestion/source/git/git_import.py,sha256=5CT6vMDb0MDctCtShnxb3JVihULtvk
313
314
  datahub/ingestion/source/grafana/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
314
315
  datahub/ingestion/source/grafana/grafana_source.py,sha256=3pU3xodPgS5lmnjuQ_u7F0XPzD_Y8MnPlMxRJ86qz4g,4960
315
316
  datahub/ingestion/source/iceberg/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
316
- datahub/ingestion/source/iceberg/iceberg.py,sha256=fjqp3VBW5W5-54X_-ubkRZiAmdHvuMbxRbC4UYzEr4U,25900
317
- datahub/ingestion/source/iceberg/iceberg_common.py,sha256=TS3_ZYZ47Fe02CmzEo1z0pvy7yjXuG1VlwqNxa0U6pc,8506
317
+ datahub/ingestion/source/iceberg/iceberg.py,sha256=Pi2QD8v0HOpqr8M9la78Nlm3Be9iy3G4pCZqu2NitZM,27253
318
+ datahub/ingestion/source/iceberg/iceberg_common.py,sha256=4efWbnj8iWWNcO6_lFXFZRIzaKVPWhd1MmmxdJafemw,8684
318
319
  datahub/ingestion/source/iceberg/iceberg_profiler.py,sha256=hLT1Le_TEUoFXvsJSlrRB1qbTiTe-YVGCof5TFHMyd8,9908
319
320
  datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
320
321
  datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR2woGLRJwBXUkgXq0,28809
321
322
  datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
322
323
  datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
323
324
  datahub/ingestion/source/kafka/kafka.py,sha256=9SR7bqp9J0rPYde5IClhnAuVNy9ItsB8-ZeXtTc_mEY,26442
324
- datahub/ingestion/source/kafka/kafka_connect.py,sha256=Jm1MYky_OPIwvVHuEjgOjK0e6-jA-dYnsLZ7r-Y_9mA,56208
325
325
  datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
326
+ datahub/ingestion/source/kafka_connect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
327
+ datahub/ingestion/source/kafka_connect/common.py,sha256=Ekb1K_J1eTgiH7LSP1AbEIf7NQh_2Vyu1lYX_Ggcqk4,7049
328
+ datahub/ingestion/source/kafka_connect/kafka_connect.py,sha256=8bjVFDkwjgs3gP7Y3itzABGfBcY_WbMQ5PWjrm-g93A,14249
329
+ datahub/ingestion/source/kafka_connect/sink_connectors.py,sha256=ESuJE5SFLLvss9OwDEIB8SAko4rhzaWZ-4dKY0Dh0N8,12900
330
+ datahub/ingestion/source/kafka_connect/source_connectors.py,sha256=_765fSMDAWAe0Cf_F4VNHfOWKNhtqBA1Ep2jL3rf-qc,21263
326
331
  datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
327
332
  datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
328
- datahub/ingestion/source/looker/looker_common.py,sha256=cIrsc0nDEsODxx9tjvSm2A3-OlCkPKCu4W2L8PxzLWs,59926
333
+ datahub/ingestion/source/looker/looker_common.py,sha256=0-5xhq7vE9YFj8tzbvRI7RnP1cD_oxnN4NpSXjKzxjE,61726
329
334
  datahub/ingestion/source/looker/looker_config.py,sha256=87WAgdJ_QWdTq25RBwgIqfc2kq7dubSpzbEtXb2ihMw,13182
330
335
  datahub/ingestion/source/looker/looker_connection.py,sha256=yDmC6lDsHmL2e_Pw8ULylwOIHPWPp_6gT1iyLvD0fTw,2075
331
336
  datahub/ingestion/source/looker/looker_constant.py,sha256=GMKYtNXlpojPxa9azridKfcGLSJwKdUCTesp7U8dIrQ,402
332
337
  datahub/ingestion/source/looker/looker_dataclasses.py,sha256=ULWLFWsV2cKmTuOFavD8QjEBmnXmvjyr8RbUB62DwJQ,12178
333
338
  datahub/ingestion/source/looker/looker_file_loader.py,sha256=c1ewDrIb9VJg1o-asbwX9gL83kgL01vIETzzbmZIhmw,4267
334
- datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=QTTCW-rPNUoazQG_sTJbCARXJzQ7NKS-XKURp2AAWls,11106
339
+ datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=0gaYjBv4wkbbLWVgvaAV6JyWAFb0utTG6TCve2d9xss,11511
335
340
  datahub/ingestion/source/looker/looker_liquid_tag.py,sha256=mO4G4MNA4YZFvZaDBpdiJ2vP3irC82kY34RdaK4Pbfs,3100
336
341
  datahub/ingestion/source/looker/looker_query_model.py,sha256=N0jBbFruiCIIGT6sJn6tNeppeQ78KGTkOwTLirhxFNc,2144
337
- datahub/ingestion/source/looker/looker_source.py,sha256=AByQxWVfOBqOtZPaR_cw9SB-tFZtfppiKRkFSbcK1GA,65346
342
+ datahub/ingestion/source/looker/looker_source.py,sha256=IUI2FpYG2_bhVnP_LZeQymjc5D0F7lqYTADktzV3yr8,65735
338
343
  datahub/ingestion/source/looker/looker_template_language.py,sha256=EG4ZfVZ0x53lgaYh2ohzL4ZCy9KsX0TA51XqCmsCd2Q,14328
339
344
  datahub/ingestion/source/looker/looker_usage.py,sha256=qegMr-Rnqz3xNGSBfsuD3S_BPXf7UEMhwFN7DPQeLNo,22914
340
345
  datahub/ingestion/source/looker/looker_view_id_cache.py,sha256=92gDy6NONhJYBp92z_IBzDVZvezmUIkaBCZY1bdk6mE,4392
@@ -351,15 +356,15 @@ datahub/ingestion/source/metadata/lineage.py,sha256=XiZGuY6k3O9qBmgo7AzosIndJHwr
351
356
  datahub/ingestion/source/neo4j/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
352
357
  datahub/ingestion/source/neo4j/neo4j_source.py,sha256=L9WiZ5yZrIDMrgj3gYU9j6zz3TRMXYpcWxeTegD7sFg,12409
353
358
  datahub/ingestion/source/powerbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
354
- datahub/ingestion/source/powerbi/config.py,sha256=LV8BOm2zzF9t0RMwQVVUNB0bStzBPo8A6JkaW0xlgsQ,23241
355
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py,sha256=AIU89lVPoCWlzc_RfUjDJwRQ11akPtnGpBTluBMCKio,2242
359
+ datahub/ingestion/source/powerbi/config.py,sha256=9cgMrwp3DQqv14iTqulAONkbVe5nhycJ1ElkA_275go,22732
360
+ datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py,sha256=-njW1kJOy-LY5JFwJLhVQ0bMBj9NQz5TZhQqsSi_KsM,2285
356
361
  datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule,sha256=PRWzuZMMhKdOVoAaE8csHvUFbZHxYe5meJHgrqlgiuw,19795
357
362
  datahub/ingestion/source/powerbi/powerbi.py,sha256=7UsAEqaFlkWONcXJdQ2hotUYYn46ks6Fe71KXEMh7lI,54495
358
363
  datahub/ingestion/source/powerbi/m_query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
359
364
  datahub/ingestion/source/powerbi/m_query/data_classes.py,sha256=EbaEasEOGZ73jz0cQofH9ez65wSvRBof0R6GQaIVLnM,2009
360
365
  datahub/ingestion/source/powerbi/m_query/native_sql_parser.py,sha256=zzKVDGeUM3Yv3-zNah4D6mSnr6jXsstNuLmzczcPQEE,3683
361
366
  datahub/ingestion/source/powerbi/m_query/parser.py,sha256=pB1LGdb02Ryf8Pr8JPSgiOmLE6mEAgDbKodtKOOY6LU,5782
362
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py,sha256=h1gBh9Gz1zF3m-hA0WMQaBdEtcZUW9vScNKEvqIWCfk,32442
367
+ datahub/ingestion/source/powerbi/m_query/pattern_handler.py,sha256=yDNdzPREzegarux_I9F_PsLe1hlSFa-uwoORrEgbO-c,32485
363
368
  datahub/ingestion/source/powerbi/m_query/resolver.py,sha256=TMt84_JaCazpP7vcneW0O3cUjtbIuh8Yid78JWfDxsI,16953
364
369
  datahub/ingestion/source/powerbi/m_query/tree_function.py,sha256=h77DunhlgOP0fAg8UXDXxxInOi7Pay85_d1Ca4YqyKs,6134
365
370
  datahub/ingestion/source/powerbi/m_query/validator.py,sha256=crG-VZy2XPieiDliP9yVMgiFcc8b2xbZyDFEATXqEAQ,1155
@@ -427,19 +432,19 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
427
432
  datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
428
433
  datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
429
434
  datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
430
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=suMICPFPvoV6shkjD_14JunLc8jAZBINzlFk2mYldkU,23676
435
+ datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=uMGmMEl4hWEmN7GxMyDBdwlIPAW7WmOnu41kZ0dvCG4,21551
431
436
  datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
432
- datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
437
+ datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=8QEihOfivalVR9vLo6vCUL-vnZfAGgMio0uhPYX0jTo,25883
433
438
  datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
434
439
  datahub/ingestion/source/snowflake/snowflake_report.py,sha256=_-rD7Q4MzKY8fYzJHSBnGX4gurwujL3UoRzcP_TZURs,6468
435
440
  datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=z5ZPgh-TILAz0DeIwDxRCsj980CM2BbftXiFpM1dV_Y,21674
436
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=vof3mNImstnlL8kc0OkTHzMIqnbEkt9RmnYBX1JX0oE,40386
441
+ datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=K-KEr3OpwMHye08lXAy-5doUUGoGJP3b-ntJAGU_NBY,42472
437
442
  datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-gPdzwtON9dJa0eqHt-8Yr5h2Q,6366
438
443
  datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
439
444
  datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
440
445
  datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
441
446
  datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
442
- datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=lo_3asTuIZbF-LuEUcYL-9NIZ720n7oB9mYA6WVTWA4,31960
447
+ datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=ecaTCJNAQ_IJOPInPGXA3jv1dE5lztSU82UhpBygiq0,31654
443
448
  datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
444
449
  datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
445
450
  datahub/ingestion/source/sql/clickhouse.py,sha256=jzvaXP5Wr0SMhj2rtuvVE821xnfpKiXhO3cm0xblgHs,27299
@@ -467,8 +472,8 @@ datahub/ingestion/source/sql/trino.py,sha256=FEn_BQ3pm23hKx94ek5kk5IXGNYcBqZEhll
467
472
  datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5foKrf71SydzEltc3WsVAhQc,5732
468
473
  datahub/ingestion/source/sql/vertica.py,sha256=_9OgSgIgqBml0av063rb8nACiT3SAmzpw0ouyF91wv8,33382
469
474
  datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
470
- datahub/ingestion/source/sql/mssql/job_models.py,sha256=eMyR0Efl5kvi7QNgNXzd5_6PdDKYly_552Y8OGSj9PY,6012
471
- datahub/ingestion/source/sql/mssql/source.py,sha256=-B0bnFKEReciYzQ4p_2xJJzdn-H8vYz2MQ_h-1B0ibs,30329
475
+ datahub/ingestion/source/sql/mssql/job_models.py,sha256=ztXDrD4anhzwWvACIm9fucE2WhMDMKkJ4alMYOQOqWA,7083
476
+ datahub/ingestion/source/sql/mssql/source.py,sha256=ODdsOIbDA3X0E7En6GT15mD49W6RW9sXLwRoUgw2a8I,30925
472
477
  datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
473
478
  datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
474
479
  datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
@@ -486,11 +491,11 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
486
491
  datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
487
492
  datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
488
493
  datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
489
- datahub/ingestion/source/tableau/tableau.py,sha256=P_DUuUvXk5u2ihA0JghtRkYc_KI_yQR2ZiQVe9IUvsU,138197
494
+ datahub/ingestion/source/tableau/tableau.py,sha256=80m5a8g2q5jnm5pFtL4s_SmSeGPQIuZGqGe9RGn6OSI,138248
490
495
  datahub/ingestion/source/tableau/tableau_common.py,sha256=9gQLq_3BlAsKll83uVlnWJRWaIDtFtREUyuimXF13Z0,26219
491
- datahub/ingestion/source/tableau/tableau_constant.py,sha256=jVQMgLXND5aPL6XLETKp81BehRkvyLTU_Vhhe_1NOkI,2576
492
- datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=PEGfcoUcBdsnOa5EzCqy1IiuQ3OZ9fZVEMzDqhhHOto,922
493
- datahub/ingestion/source/tableau/tableau_validation.py,sha256=l0DuXUuxJwEXMzo61xLx-KLc5u6tiz2n0e9EepJdWEM,1808
496
+ datahub/ingestion/source/tableau/tableau_constant.py,sha256=ZcAeHsQUXVVL26ORly0ByZk_GJAFbxaKuJAlX_sYMac,2686
497
+ datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=nSyx9RzC6TCQDm-cTVJ657qT8iDwzk_8JMKpohhmOc4,1046
498
+ datahub/ingestion/source/tableau/tableau_validation.py,sha256=pd--LcTLTfrFsouhCOvGC_2IjeMfKbJV81EEo3ibMwE,1820
494
499
  datahub/ingestion/source/unity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
495
500
  datahub/ingestion/source/unity/analyze_profiler.py,sha256=2pqkFY30CfN4aHgFZZntjeG0hNhBytZJvXC13VfTc1I,4689
496
501
  datahub/ingestion/source/unity/config.py,sha256=m4-n7mYz4Ct4L1QdfJFklwHyj8boKCbV7Sb3Ou6AT3Q,14756
@@ -501,7 +506,7 @@ datahub/ingestion/source/unity/proxy.py,sha256=2-pYQ-3B9UVUwO1yB9iTdi3DqgqZ2JrpQ
501
506
  datahub/ingestion/source/unity/proxy_profiling.py,sha256=WLqvYP6MziaisA4LYL4T_GA-kPt6Xdde7bfaYsjYw40,9663
502
507
  datahub/ingestion/source/unity/proxy_types.py,sha256=qrvHiwPzl5cPX-KRvcIGGeJVdr0I8XUQmoAI6ErZ-v8,9371
503
508
  datahub/ingestion/source/unity/report.py,sha256=0Y-ciHVTI6ZKNCJ5zWoQh3Ze1c_GMqmTMKFwzXDuuOg,2788
504
- datahub/ingestion/source/unity/source.py,sha256=YdUPCMJtpmvYVnRNnpqb4BVowFobkLvSJ_K2gHwrvCI,41752
509
+ datahub/ingestion/source/unity/source.py,sha256=ydjqJ91q-Mir_aJSmEdo1umgPvQ5la67rBhC04IyV78,41938
505
510
  datahub/ingestion/source/unity/usage.py,sha256=igRxYg8usukTAA229uJWi-0y-Zd0yOq9dEBi2k9f15o,11436
506
511
  datahub/ingestion/source/usage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
507
512
  datahub/ingestion/source/usage/clickhouse_usage.py,sha256=8nQqNAPKqivywjzsvqH0-HWFwjd4gECpw_xahLXk5ek,9970
@@ -512,7 +517,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
512
517
  datahub/ingestion/source_config/operation_config.py,sha256=Q0NlqiEh4s4DFIII5NsAp5hxWTVyyJz-ldcQmH-B47s,3504
513
518
  datahub/ingestion/source_config/pulsar.py,sha256=sklDkh62CrWV-i7Ifh6R3T3smYVso6gyRJG8HVc6RdA,5533
514
519
  datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
515
- datahub/ingestion/source_report/ingestion_stage.py,sha256=zijPKvJKnB298cMoG6j_w6QRva9t1RWqnmycmbsKdgs,1499
520
+ datahub/ingestion/source_report/ingestion_stage.py,sha256=pJcJeLSjaixlLqQyQtE3bfUcvXEVwrSaWWtU4iU9UEo,1557
516
521
  datahub/ingestion/source_report/pulsar.py,sha256=iKhzy644AjoFTV-gxyqBoXKMLwSMPxJFxU-3WDQRww0,1037
517
522
  datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
518
523
  datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -871,12 +876,12 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
871
876
  datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
872
877
  datahub/sql_parsing/schema_resolver.py,sha256=9INZWdxA2dMSLK6RXaVqjbjyLY_VKMhCkQv_Xd6Ln3I,10848
873
878
  datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
874
- datahub/sql_parsing/sql_parsing_aggregator.py,sha256=F-aj7yqOwbo7FpxduFO5a7cLWkojL_Npv3_dlfHPNGY,69877
879
+ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=LBs1RjRqh3natrx4WfgRQGNpI56o12jtbABO5ipEBWA,69889
875
880
  datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
876
881
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
877
882
  datahub/sql_parsing/sqlglot_lineage.py,sha256=CLDOc0HNqL_539eahOP3QOoldIYC6CF29id4Xe3TlEM,47018
878
883
  datahub/sql_parsing/sqlglot_utils.py,sha256=n6yufzEGwSlFeCSU540hEldIuab0q8KGqm9x0vSawkc,14699
879
- datahub/sql_parsing/tool_meta_extractor.py,sha256=pE-pkRKBfNTXEJkaQM9NlG807mc-X6OtetgskJySCs8,2908
884
+ datahub/sql_parsing/tool_meta_extractor.py,sha256=7tY4FAClhFcqwc23lGVlnT6Dequ_5Xcpbt0hDvnlLzM,6670
880
885
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
881
886
  datahub/telemetry/stats.py,sha256=YltbtC3fe6rl1kcxn1A-mSnVpECTPm5k-brrUt7QxTI,967
882
887
  datahub/telemetry/telemetry.py,sha256=gzla-QGNsynGg2FqFxiDDFQ0emG53MJ9lhOA2-UUg-Y,15047
@@ -976,8 +981,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
976
981
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
977
982
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
978
983
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
979
- acryl_datahub-0.15.0rc25.dist-info/METADATA,sha256=G7pCS6M1IzZefvuba5jKgr6IvehacgjV1JsauQfsMsk,173639
980
- acryl_datahub-0.15.0rc25.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
981
- acryl_datahub-0.15.0rc25.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
982
- acryl_datahub-0.15.0rc25.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
983
- acryl_datahub-0.15.0rc25.dist-info/RECORD,,
984
+ acryl_datahub-0.15.0.1rc2.dist-info/METADATA,sha256=gLix1LBWIrfQF-dcU1JsLeAAFkHuALY9dHVboVmjIJg,173642
985
+ acryl_datahub-0.15.0.1rc2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
986
+ acryl_datahub-0.15.0.1rc2.dist-info/entry_points.txt,sha256=xnPSPLK3bJGADxe4TDS4wL4u0FT_PGlahDa-ENYdYCQ,9512
987
+ acryl_datahub-0.15.0.1rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
988
+ acryl_datahub-0.15.0.1rc2.dist-info/RECORD,,
@@ -57,7 +57,7 @@ hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource
57
57
  iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource
58
58
  json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource
59
59
  kafka = datahub.ingestion.source.kafka.kafka:KafkaSource
60
- kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource
60
+ kafka-connect = datahub.ingestion.source.kafka_connect.kafka_connect:KafkaConnectSource
61
61
  ldap = datahub.ingestion.source.ldap:LDAPSource
62
62
  looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource
63
63
  lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
3
3
 
4
4
  # Published at https://pypi.org/project/acryl-datahub/.
5
5
  __package_name__ = "acryl-datahub"
6
- __version__ = "0.15.0rc25"
6
+ __version__ = "0.15.0.1rc2"
7
7
 
8
8
 
9
9
  def is_dev_mode() -> bool:
@@ -14,7 +14,7 @@ from datahub.metadata.schema_classes import (
14
14
  PropertyValueClass,
15
15
  StructuredPropertyDefinitionClass,
16
16
  )
17
- from datahub.metadata.urns import StructuredPropertyUrn, Urn
17
+ from datahub.metadata.urns import DataTypeUrn, StructuredPropertyUrn, Urn
18
18
  from datahub.utilities.urns._urn_base import URN_TYPES
19
19
 
20
20
  logging.basicConfig(level=logging.INFO)
@@ -86,19 +86,31 @@ class StructuredProperties(ConfigModel):
86
86
 
87
87
  @validator("type")
88
88
  def validate_type(cls, v: str) -> str:
89
- # Convert to lowercase if needed
90
- if not v.islower():
89
+ # This logic is somewhat hacky, since we need to deal with
90
+ # 1. fully qualified urns
91
+ # 2. raw data types, that need to get the datahub namespace prefix
92
+ # While keeping the user-facing interface and error messages clean.
93
+
94
+ if not v.startswith("urn:li:") and not v.islower():
95
+ # Convert to lowercase if needed
96
+ v = v.lower()
91
97
  logger.warning(
92
- f"Structured property type should be lowercase. Updated to {v.lower()}"
98
+ f"Structured property type should be lowercase. Updated to {v}"
93
99
  )
94
- v = v.lower()
100
+
101
+ urn = Urn.make_data_type_urn(v)
95
102
 
96
103
  # Check if type is allowed
97
- if not AllowedTypes.check_allowed_type(v):
104
+ data_type_urn = DataTypeUrn.from_string(urn)
105
+ unqualified_data_type = data_type_urn.id
106
+ if unqualified_data_type.startswith("datahub."):
107
+ unqualified_data_type = unqualified_data_type[len("datahub.") :]
108
+ if not AllowedTypes.check_allowed_type(unqualified_data_type):
98
109
  raise ValueError(
99
- f"Type {v} is not allowed. Allowed types are {AllowedTypes.values()}"
110
+ f"Type {unqualified_data_type} is not allowed. Allowed types are {AllowedTypes.values()}"
100
111
  )
101
- return v
112
+
113
+ return urn
102
114
 
103
115
  @property
104
116
  def fqn(self) -> str:
@@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin):
63
63
  default=None,
64
64
  description="A holder for platform -> platform_instance mappings to generate correct dataset urns",
65
65
  )
66
+
67
+
68
+ class PlatformDetail(ConfigModel):
69
+ platform_instance: Optional[str] = Field(
70
+ default=None,
71
+ description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match "
72
+ "with platform instance name used in ingestion "
73
+ "recipe of other datahub sources.",
74
+ )
75
+ env: str = Field(
76
+ default=DEFAULT_ENV,
77
+ description="The environment that all assets produced by DataHub platform ingestion source belong to",
78
+ )
@@ -291,6 +291,7 @@ class DataHubRestEmitter(Closeable, Emitter):
291
291
  mcps: List[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
292
292
  async_flag: Optional[bool] = None,
293
293
  ) -> int:
294
+ logger.debug("Attempting to emit batch mcps")
294
295
  url = f"{self._gms_server}/aspects?action=ingestProposalBatch"
295
296
  for mcp in mcps:
296
297
  ensure_has_system_metadata(mcp)
@@ -303,15 +304,22 @@ class DataHubRestEmitter(Closeable, Emitter):
303
304
  current_chunk_size = INGEST_MAX_PAYLOAD_BYTES
304
305
  for mcp_obj in mcp_objs:
305
306
  mcp_obj_size = len(json.dumps(mcp_obj))
307
+ logger.debug(
308
+ f"Iterating through object with size {mcp_obj_size} (type: {mcp_obj.get('aspectName')}"
309
+ )
306
310
 
307
311
  if (
308
312
  mcp_obj_size + current_chunk_size > INGEST_MAX_PAYLOAD_BYTES
309
313
  or len(mcp_obj_chunks[-1]) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
310
314
  ):
315
+ logger.debug("Decided to create new chunk")
311
316
  mcp_obj_chunks.append([])
312
317
  current_chunk_size = 0
313
318
  mcp_obj_chunks[-1].append(mcp_obj)
314
319
  current_chunk_size += mcp_obj_size
320
+ logger.debug(
321
+ f"Decided to send {len(mcps)} mcps in {len(mcp_obj_chunks)} chunks"
322
+ )
315
323
 
316
324
  for mcp_obj_chunk in mcp_obj_chunks:
317
325
  # TODO: We're calling json.dumps on each MCP object twice, once to estimate
@@ -338,8 +346,15 @@ class DataHubRestEmitter(Closeable, Emitter):
338
346
 
339
347
  def _emit_generic(self, url: str, payload: str) -> None:
340
348
  curl_command = make_curl_command(self._session, "POST", url, payload)
349
+ payload_size = len(payload)
350
+ if payload_size > INGEST_MAX_PAYLOAD_BYTES:
351
+ # since we know total payload size here, we could simply avoid sending such payload at all and report a warning, with current approach we are going to cause whole ingestion to fail
352
+ logger.warning(
353
+ f"Apparent payload size exceeded {INGEST_MAX_PAYLOAD_BYTES}, might fail with an exception due to the size"
354
+ )
341
355
  logger.debug(
342
- "Attempting to emit to DataHub GMS; using curl equivalent to:\n%s",
356
+ "Attempting to emit aspect (size: %s) to DataHub GMS; using curl equivalent to:\n%s",
357
+ payload_size,
343
358
  curl_command,
344
359
  )
345
360
  try:
@@ -0,0 +1,96 @@
1
+ import json
2
+ import logging
3
+ from typing import Iterable, List
4
+
5
+ from datahub.emitter.rest_emitter import INGEST_MAX_PAYLOAD_BYTES
6
+ from datahub.emitter.serialization_helper import pre_json_transform
7
+ from datahub.ingestion.api.source import SourceReport
8
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
9
+ from datahub.metadata.schema_classes import (
10
+ DatasetProfileClass,
11
+ SchemaFieldClass,
12
+ SchemaMetadataClass,
13
+ )
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class EnsureAspectSizeProcessor:
    """Work-unit stream processor that trims aspects suspected of being oversized.

    Only schema metadata and dataset profile aspects are inspected; every
    truncation is recorded as a warning on the supplied source report.
    """

    def __init__(
        self, report: SourceReport, payload_constraint: int = INGEST_MAX_PAYLOAD_BYTES
    ):
        """Remember where to report truncations and the size budget to enforce."""
        self.payload_constraint = payload_constraint
        self.report = report

    def ensure_dataset_profile_size(
        self, dataset_urn: str, profile: DatasetProfileClass
    ) -> None:
        """Drop sample values from field profiles once the size budget is used up.

        A field's cost is the summed ``len()`` of its non-empty sample values.
        Fields are visited in order; a field whose cost would push the running
        total above ``payload_constraint`` gets its sample values cleared (and
        a warning reported), and its cost is not added to the running total.
        The profile is modified in place. The heuristic is deliberately simple
        and may be adjusted in the future.
        """
        if not profile.fieldProfiles:
            return

        logger.debug(f"Length of field profiles: {len(profile.fieldProfiles)}")
        sample_fields_size = 0
        for field in profile.fieldProfiles:
            if not field.sampleValues:
                logger.debug(f"Field {field.fieldPath} has no sample values")
                continue

            # Empty / None samples contribute nothing to the estimate.
            values_len = sum(len(value) for value in field.sampleValues if value)
            logger.debug(
                f"Field {field.fieldPath} has {len(field.sampleValues)} sample values, taking total bytes {values_len}"
            )

            if sample_fields_size + values_len <= self.payload_constraint:
                sample_fields_size += values_len
                continue

            # Over budget: keep the field profile itself, discard only its samples.
            field.sampleValues = []
            self.report.warning(
                title="Dataset profile truncated due to size constraint",
                message="Dataset profile contained too much data and would have caused ingestion to fail",
                context=f"Sample values for field {field.fieldPath} were removed from dataset profile for {dataset_urn} due to aspect size constraints",
            )

    def ensure_schema_metadata_size(
        self, dataset_urn: str, schema: SchemaMetadataClass
    ) -> None:
        """Keep only the schema fields that fit within the size budget.

        Each field is costed as the length of its JSON serialization. Fields
        are accepted in order while the running total stays strictly below
        ``payload_constraint``; every other field is removed from the schema
        and reported as a warning. ``schema.fields`` is replaced in place.
        The heuristic is deliberately simple and may be adjusted in the future.
        """
        logger.debug(f"Amount of schema fields: {len(schema.fields)}")
        accepted_fields: List[SchemaFieldClass] = []
        total_fields_size = 0

        for field in schema.fields:
            serialized = json.dumps(pre_json_transform(field.to_obj()))
            field_size = len(serialized)
            logger.debug(f"Field {field.fieldPath} takes total {field_size}")

            if total_fields_size + field_size >= self.payload_constraint:
                # Does not fit: skip it, but later (smaller) fields may still fit.
                self.report.warning(
                    title="Schema truncated due to size constraint",
                    message="Dataset schema contained too much data and would have caused ingestion to fail",
                    context=f"Field {field.fieldPath} was removed from schema for {dataset_urn} due to aspect size constraints",
                )
                continue

            total_fields_size += field_size
            accepted_fields.append(field)

        schema.fields = accepted_fields

    def ensure_aspect_size(
        self,
        stream: Iterable[MetadataWorkUnit],
    ) -> Iterable[MetadataWorkUnit]:
        """Yield every work unit from ``stream``, trimming suspect aspects first.

        Aspect size has a hard limit of 16 MB; exceeding it causes an exception
        on the GMS side and fails the entire ingestion. Schema metadata is
        checked first; the dataset profile is only looked up when the work unit
        carries no (truthy) schema metadata aspect.
        """
        for wu in stream:
            logger.debug(f"Ensuring size of workunit: {wu.id}")

            schema = wu.get_aspect_of_type(SchemaMetadataClass)
            if schema:
                self.ensure_schema_metadata_size(wu.get_urn(), schema)
            else:
                profile = wu.get_aspect_of_type(DatasetProfileClass)
                if profile:
                    self.ensure_dataset_profile_size(wu.get_urn(), profile)

            yield wu
@@ -10,6 +10,7 @@ from pyiceberg.exceptions import (
10
10
  NoSuchNamespaceError,
11
11
  NoSuchPropertyException,
12
12
  NoSuchTableError,
13
+ ServerError,
13
14
  )
14
15
  from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
15
16
  from pyiceberg.table import Table
@@ -145,6 +146,13 @@ class IcebergSource(StatefulIngestionSourceBase):
145
146
  self.report.report_no_listed_namespaces(len(namespaces))
146
147
  tables_count = 0
147
148
  for namespace in namespaces:
149
+ namespace_repr = ".".join(namespace)
150
+ if not self.config.namespace_pattern.allowed(namespace_repr):
151
+ LOGGER.info(
152
+ f"Namespace {namespace_repr} is not allowed by config pattern, skipping"
153
+ )
154
+ self.report.report_dropped(f"{namespace_repr}.*")
155
+ continue
148
156
  try:
149
157
  tables = catalog.list_tables(namespace)
150
158
  tables_count += len(tables)
@@ -181,6 +189,9 @@ class IcebergSource(StatefulIngestionSourceBase):
181
189
  if not self.config.table_pattern.allowed(dataset_name):
182
190
  # Dataset name is rejected by pattern, report as dropped.
183
191
  self.report.report_dropped(dataset_name)
192
+ LOGGER.debug(
193
+ f"Skipping table {dataset_name} due to not being allowed by the config pattern"
194
+ )
184
195
  return
185
196
  try:
186
197
  if not hasattr(thread_local, "local_catalog"):
@@ -219,6 +230,22 @@ class IcebergSource(StatefulIngestionSourceBase):
219
230
  LOGGER.warning(
220
231
  f"NoSuchTableError while processing table {dataset_path}, skipping it.",
221
232
  )
233
+ except FileNotFoundError as e:
234
+ self.report.report_warning(
235
+ "file-not-found",
236
+ f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}",
237
+ )
238
+ LOGGER.warning(
239
+ f"FileNotFoundError while processing table {dataset_path}, skipping it."
240
+ )
241
+ except ServerError as e:
242
+ self.report.report_warning(
243
+ "iceberg-rest-server-error",
244
+ f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}",
245
+ )
246
+ LOGGER.warning(
247
+ f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
248
+ )
222
249
  except Exception as e:
223
250
  self.report.report_failure("general", f"Failed to create workunit: {e}")
224
251
  LOGGER.exception(
@@ -269,7 +296,6 @@ class IcebergSource(StatefulIngestionSourceBase):
269
296
  ] = table.current_snapshot().manifest_list
270
297
  dataset_properties = DatasetPropertiesClass(
271
298
  name=table.name()[-1],
272
- tags=[],
273
299
  description=table.metadata.properties.get("comment", None),
274
300
  customProperties=custom_properties,
275
301
  )
@@ -68,6 +68,10 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin)
68
68
  default=AllowDenyPattern.allow_all(),
69
69
  description="Regex patterns for tables to filter in ingestion.",
70
70
  )
71
+ namespace_pattern: AllowDenyPattern = Field(
72
+ default=AllowDenyPattern.allow_all(),
73
+ description="Regex patterns for namespaces to filter in ingestion.",
74
+ )
71
75
  user_ownership_property: Optional[str] = Field(
72
76
  default="owner",
73
77
  description="Iceberg table property to look for a `CorpUser` owner. Can only hold a single user value. If property has no value, no owner information will be emitted.",
File without changes