acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414) hide show
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,92 @@
1
+ from typing import List, Optional, Union
2
+
3
+ from pydantic.fields import Field
4
+
5
+ from datahub.configuration.common import AllowDenyPattern
6
+ from datahub.configuration.source_common import DatasetSourceConfigMixin
7
+ from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
8
+ from datahub.ingestion.source.azure.azure_common import AzureConnectionConfig
9
+ from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
10
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
11
+ StatefulStaleMetadataRemovalConfig,
12
+ )
13
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
14
+ StatefulIngestionConfigBase,
15
+ )
16
+ from datahub.ingestion.source_config.operation_config import is_profiling_enabled
17
+
18
+
19
# Configuration model for the Excel ingestion source.
#
# Groups the settings that select which workbooks and worksheets are ingested
# (path list plus allow/deny patterns), optional AWS / Azure connection
# settings with their tag-extraction switches, and the profiling and stateful
# ingestion sub-configurations.
class ExcelSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
    # --- Input selection -------------------------------------------------
    path_list: List[str] = Field(
        description="List of paths to Excel files or folders to ingest."
    )

    path_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="Regex patterns for file paths to filter in ingestion.",
    )

    # --- AWS / S3 ----------------------------------------------------------
    aws_config: Optional[AwsConnectionConfig] = Field(
        default=None,
        description="AWS configuration",
    )

    use_s3_bucket_tags: Optional[bool] = Field(
        default=False,
        description="Whether or not to create tags in datahub from the s3 bucket",
    )

    use_s3_object_tags: Optional[bool] = Field(
        default=False,
        description="Whether or not to create tags in datahub from the s3 object",
    )

    # Accepts either a flag or a CA bundle path, as the description explains.
    verify_ssl: Union[bool, str] = Field(
        default=True,
        description=(
            "Either a boolean, in which case it controls whether we verify the server's TLS certificate, "
            "or a string, in which case it must be a path to a CA bundle to use."
        ),
    )

    # --- Azure Blob Storage ------------------------------------------------
    azure_config: Optional[AzureConnectionConfig] = Field(
        default=None,
        description="Azure configuration",
    )

    use_abs_blob_tags: Optional[bool] = Field(
        default=False,
        description="Whether to create tags in datahub from the abs blob tags",
    )

    # --- URN and worksheet handling ----------------------------------------
    convert_urns_to_lowercase: bool = Field(
        default=False,
        description="Enable to convert the Excel asset urns to lowercase",
    )

    active_sheet_only: bool = Field(
        default=False,
        description="Enable to only ingest the active sheet of the workbook. If not set, all sheets will be ingested.",
    )

    worksheet_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description=(
            "Regex patterns for worksheets to ingest. "
            "Worksheets are specified as 'filename_without_extension.worksheet_name'. "
            "For example to allow the worksheet Sheet1 from file report.xlsx, use the pattern: 'report.Sheet1'."
        ),
    )

    # --- Profiling -----------------------------------------------------------
    profile_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description=(
            "Regex patterns for worksheets to profile. "
            "Worksheets are specified as 'filename_without_extension.worksheet_name'. "
            "For example to allow the worksheet Sheet1 from file report.xlsx, use the pattern: 'report.Sheet1'."
        ),
    )

    profiling: GEProfilingConfig = Field(
        default=GEProfilingConfig(),
        description="Configuration for profiling",
    )

    # --- Stateful ingestion --------------------------------------------------
    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
        default=None,
        description="Configuration for stateful ingestion and stale metadata removal.",
    )

    def is_profiling_enabled(self) -> bool:
        """Report whether profiling should run for this source.

        Profiling is on only when ``profiling.enabled`` is truthy and the
        module-level ``is_profiling_enabled`` helper also accepts
        ``profiling.operation_config``.
        """
        profiling_cfg = self.profiling
        # The name below resolves to the imported module-level helper, not to
        # this method, because it is looked up as a global.
        return profiling_cfg.enabled and is_profiling_enabled(
            profiling_cfg.operation_config
        )
@@ -0,0 +1,539 @@
1
+ import logging
2
+ import re
3
+ from collections import Counter
4
+ from dataclasses import dataclass
5
+ from io import BytesIO
6
+ from typing import Any, Dict, Iterator, List, Optional, Union
7
+
8
+ import openpyxl
9
+ import pandas as pd
10
+ from openpyxl.workbook import Workbook
11
+
12
+ from datahub.ingestion.source.excel.report import ExcelSourceReport
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
@dataclass
class ExcelTable:
    """A single table extracted from one worksheet by ``ExcelFile.get_table``."""

    # Data rows of the table (header and footer rows excluded); columns are
    # named after the detected header row.
    df: pd.DataFrame
    # 1-based row number of the header row within the worksheet.
    header_row: int
    # 0-based index of the first footer row; equals the total number of
    # worksheet rows when no footer was detected.
    footer_row: int
    # Number of data rows in ``df``.
    row_count: int
    # Number of columns in ``df``.
    column_count: int
    # Workbook properties merged with key/value pairs found above the header
    # and in the footer.
    metadata: Dict[str, Any]
    # Worksheet title with surrounding whitespace stripped.
    sheet_name: str
26
+
27
+
28
class ExcelFile:
    """Wrap an Excel workbook and locate the data table on each worksheet.

    The workbook is read with openpyxl from an in-memory buffer. Each
    worksheet is treated as holding at most one table, optionally preceded and
    followed by free-form rows (titles, key/value metadata, totals, notes).
    The header row and the start of the footer are found heuristically, so
    results on unusual layouts are best-effort.
    """

    wb: "Workbook"
    filename: str
    data: BytesIO
    sheet_list: List[str]
    active_sheet: str
    properties: Dict[str, Any]
    report: "ExcelSourceReport"

    # Text that looks like a column title: starts with a capital letter and
    # contains only letters and whitespace.
    _HEADER_TEXT_RE = re.compile(r"^[A-Z][a-zA-Z\s]*$")
    # Text that spells a plain integer or decimal number.
    _NUMERIC_TEXT_RE = re.compile(r"^-?\d+(\.\d+)?$")
    # Substrings (matched case-insensitively) that mark a row as a likely
    # summary/footer row.
    _FOOTER_INDICATORS = (
        "total",
        "sum",
        "average",
        "mean",
        "source",
        "note",
        "footnote",
    )

    def __init__(
        self,
        filename: str,
        data: BytesIO,
        report: "ExcelSourceReport",
    ) -> None:
        """Remember the inputs; the workbook itself is read by ``load_workbook``.

        Args:
            filename: Name used in report messages.
            data: Raw workbook bytes.
            report: Sink for dropped-file/worksheet events and warnings.
        """
        self.filename = filename
        self.data = data
        self.report = report
        self.sheet_list = []
        self.active_sheet = ""
        self.properties = {}

    def load_workbook(self) -> bool:
        """Parse the workbook and cache its properties and sheet names.

        Returns:
            True on success. On any failure the file is reported as dropped,
            a warning is recorded, and False is returned.
        """
        try:
            # data_only=True asks openpyxl for cached cell values rather than
            # formula text.
            self.wb = openpyxl.load_workbook(self.data, data_only=True)
            self.properties = self.read_excel_properties(self.wb)
            self.sheet_list = self.wb.sheetnames
            self.active_sheet = self.wb.active.title
            return True
        except Exception as e:
            # Deliberate catch-all: one unreadable file must not abort the run.
            self.report.report_file_dropped(self.filename)
            self.report.warning(
                message="Error reading Excel file",
                context=f"Filename={self.filename}",
                exc=e,
            )
            return False

    @property
    def sheet_names(self) -> List[str]:
        """Names of all sheets, as cached by ``load_workbook``."""
        return self.sheet_list

    @property
    def active_sheet_name(self) -> str:
        """Title of the workbook's active sheet, as cached by ``load_workbook``."""
        return self.active_sheet

    @property
    def workbook_properties(self) -> Dict[str, Any]:
        """Workbook-level properties, as cached by ``load_workbook``."""
        return self.properties

    def get_tables(self, active_only: Optional[bool] = False) -> Iterator["ExcelTable"]:
        """Yield the table found on each worksheet.

        Args:
            active_only: When truthy, only the active sheet is inspected.

        Yields:
            One ``ExcelTable`` per worksheet that contains a table. Worksheets
            without one are reported as dropped, with a warning.
        """
        sheet_list = [self.active_sheet] if active_only else self.sheet_list
        for sheet in sheet_list:
            table = self.get_table(sheet)
            if table is not None:
                yield table
            else:
                self.report.report_worksheet_dropped(sheet)
                self.report.warning(
                    message="Worksheet does not contain a table",
                    context=f"Worksheet=[{self.filename}]{sheet}",
                )

    def get_table(self, sheet_name: str) -> Optional["ExcelTable"]:
        """Extract the table on ``sheet_name``.

        Args:
            sheet_name: Name of a worksheet in the loaded workbook.

        Returns:
            The extracted table, or None when no header row can be found
            (including when the worksheet is empty).
        """
        sheet = self.wb[sheet_name]

        # Materialise every row as a plain list of cell values.
        rows = [[cell.value for cell in row] for row in sheet.rows]

        header_row_idx = self.find_header_row(rows)
        if header_row_idx is None:
            return None

        footer_start_idx = self.find_footer_start(rows, header_row_idx)

        # Key/value rows above the header and from the footer onwards; slicing
        # past the end yields no rows and therefore no footer metadata.
        header_metadata = self.extract_metadata(rows[:header_row_idx])
        footer_metadata = self.extract_metadata(rows[footer_start_idx:])

        # Workbook properties win on key clashes; clashing sheet-level keys
        # are kept under a numbered suffix.
        metadata: Dict[str, Any] = dict(self.properties)
        self._merge_metadata(metadata, header_metadata)
        self._merge_metadata(metadata, footer_metadata)

        # Drop empty trailing header cells so the table is no wider than its
        # last titled column.
        header_row = rows[header_row_idx]
        last_non_empty_idx = -1
        for i in range(len(header_row) - 1, -1, -1):
            if not self._is_blank_cell(header_row[i]):
                last_non_empty_idx = i
                break
        if last_non_empty_idx >= 0:
            header_row = header_row[: last_non_empty_idx + 1]

        column_names = self._build_column_names(header_row)

        # Data rows sit between the header and the footer, cut to table width.
        width = len(column_names)
        data_rows = [row[:width] for row in rows[header_row_idx + 1 : footer_start_idx]]

        df = pd.DataFrame(data_rows, columns=column_names)

        return ExcelTable(
            df=df,
            header_row=header_row_idx + 1,
            footer_row=footer_start_idx,
            row_count=df.shape[0],
            column_count=df.shape[1],
            metadata=metadata,
            sheet_name=sheet.title.strip(),
        )

    @staticmethod
    def _merge_metadata(target: Dict[str, Any], extra: Dict[str, Any]) -> None:
        """Copy ``extra`` into ``target`` without overwriting existing keys.

        A clashing key is stored as ``<key>_1``, or the next free number, so
        that header and footer entries with the same key are both kept.
        """
        for key, value in extra.items():
            name = key
            suffix = 0
            while name in target:
                suffix += 1
                name = f"{key}_{suffix}"
            target[name] = value

    @staticmethod
    def _build_column_names(header_row: List[Any]) -> List[str]:
        """Turn header cells into unique column names.

        Blank cells become ``Unnamed_<position>``. A name that has already
        been emitted gets ``_<n>`` appended, using the first ``n`` that is
        still free, so generated names never collide with literal ones.
        """
        column_names: List[str] = []
        used_names: set = set()
        for i, col in enumerate(header_row):
            if ExcelFile._is_blank_cell(col):
                base_name = f"Unnamed_{i}"
            else:
                base_name = str(col).strip()

            col_name = base_name
            suffix = 0
            while col_name in used_names:
                suffix += 1
                col_name = f"{base_name}_{suffix}"

            used_names.add(col_name)
            column_names.append(col_name)
        return column_names

    def find_header_row(self, rows: List[List[Any]]) -> Optional[int]:
        """Pick the row most likely to be the table header.

        Args:
            rows: All worksheet rows as lists of cell values.

        Returns:
            0-based index of the header row, or None when no row has content
            or no candidate scores above zero. When there are too few rows
            after the first non-empty row to score candidates, that first
            non-empty row is returned as the header.
        """
        # Number of rows that must follow a candidate for it to be scored.
        min_rows_required = 2

        start_idx = self._find_first_non_empty_row(rows)

        # _find_first_non_empty_row falls back to 0 when nothing has content,
        # so check that the row it points at really is non-empty.
        if not rows or not self._is_non_empty_row(rows[start_idx]):
            return None

        # Too few rows to score: take the first row that has content.
        if len(rows) < start_idx + min_rows_required + 1:
            return start_idx

        max_score = -1
        header_idx = start_idx
        for i in range(start_idx, len(rows) - min_rows_required):
            current_row = rows[i]
            if not self._is_non_empty_row(current_row):
                continue

            # Look ahead at up to three following rows.
            next_rows = rows[i + 1 : i + 4]
            score = self._calculate_header_row_score(i, current_row, next_rows, rows)

            # Strict comparison keeps the earliest row on ties.
            if score > max_score:
                max_score = score
                header_idx = i

        if max_score <= 0:
            return None
        return header_idx

    def _find_first_non_empty_row(self, rows: List[List[Any]]) -> int:
        """Return the index of the first row with content, or 0 if there is none."""
        for i, row in enumerate(rows):
            if self._is_non_empty_row(row):
                return i
        return 0

    @staticmethod
    def _is_blank_cell(cell: Any) -> bool:
        """True when the cell is None or its text form is empty/whitespace."""
        return cell is None or str(cell).strip() == ""

    @staticmethod
    def _count_populated(row: List[Any]) -> int:
        """Number of non-blank cells in ``row``."""
        return sum(1 for cell in row if not ExcelFile._is_blank_cell(cell))

    @staticmethod
    def _is_non_empty_row(row: List[Any]) -> bool:
        """True when at least one cell in ``row`` is non-blank."""
        return any(not ExcelFile._is_blank_cell(cell) for cell in row)

    def _calculate_header_row_score(
        self,
        row_idx: int,
        current_row: List[Any],
        next_rows: List[List[Any]],
        all_rows: List[List[Any]],
    ) -> int:
        """Sum the heuristic scores for ``current_row`` as a header candidate.

        A row containing any numeric cell is rejected immediately with a
        negative score; otherwise the remaining heuristics are added up.
        """
        score = self._score_row_with_numeric_cells(current_row)
        if score < 0:
            return score
        score += self._score_non_empty_cells(row_idx, current_row, all_rows)
        score += self._score_header_like_text(current_row)
        score += self._score_text_followed_by_numeric(current_row, next_rows)
        score += self._score_column_type_consistency(current_row, next_rows)
        score += self._score_metadata_patterns(row_idx, current_row, all_rows)
        return score

    @staticmethod
    def _score_non_empty_cells(
        row_idx: int, current_row: List[Any], all_rows: List[List[Any]]
    ) -> int:
        """+2 when the row has more populated cells than the row above it."""
        if row_idx <= 0:
            return 0
        current_count = ExcelFile._count_populated(current_row)
        previous_count = ExcelFile._count_populated(all_rows[row_idx - 1])
        return 2 if current_count > previous_count else 0

    @staticmethod
    def _score_header_like_text(row: List[Any]) -> int:
        """+1 per string cell made only of letters/whitespace with a leading capital."""
        return sum(
            1
            for cell in row
            if isinstance(cell, str) and ExcelFile._HEADER_TEXT_RE.match(cell.strip())
        )

    @staticmethod
    def _score_row_with_numeric_cells(row: List[Any]) -> int:
        """-1 per int/float cell (bool counts too, being an int subclass)."""
        return -sum(1 for cell in row if isinstance(cell, (int, float)))

    @staticmethod
    def _score_text_followed_by_numeric(
        current_row: List[Any], next_rows: List[List[Any]]
    ) -> int:
        """Reward a text row that sits above rows containing numbers.

        Returns 6 plus the number of look-ahead rows holding at least one
        numeric value (a number, or a string spelling one), provided the
        candidate has at least one string cell; otherwise 0.
        """
        if not next_rows:
            return 0

        has_text = any(isinstance(cell, str) for cell in current_row)

        def is_numeric(cell: Any) -> bool:
            if isinstance(cell, (int, float)):
                return True
            return isinstance(cell, str) and bool(
                ExcelFile._NUMERIC_TEXT_RE.match(cell.strip())
            )

        rows_with_numbers = sum(
            1 for row in next_rows if any(is_numeric(cell) for cell in row)
        )

        if has_text and rows_with_numbers > 0:
            return 6 + rows_with_numbers
        return 0

    @staticmethod
    def _score_column_type_consistency(
        current_row: List[Any], next_rows: List[List[Any]]
    ) -> int:
        """+3 when at least two titled columns have data in the look-ahead rows.

        Despite the name, cell types are not compared: a column qualifies as
        soon as any look-ahead row has a non-None value in it. Needs at least
        two look-ahead rows.
        """
        if len(next_rows) < 2:
            return 0

        columns_with_data = 0
        for col_idx, header_cell in enumerate(current_row):
            if header_cell is None:
                continue
            if any(
                col_idx < len(row) and row[col_idx] is not None for row in next_rows
            ):
                columns_with_data += 1

        return 3 if columns_with_data >= 2 else 0

    @staticmethod
    def _score_metadata_patterns(
        row_idx: int, current_row: List[Any], all_rows: List[List[Any]]
    ) -> int:
        """Penalise rows that look like key/value metadata rather than a header."""
        score = 0

        # A first row that is at most two cells wide.
        if row_idx == 0 and len(current_row) <= 2:
            score -= 1

        # Any row except the last whose first two cells hold no non-string values.
        if row_idx < len(all_rows) - 1 and len(current_row) >= 2:
            if all(
                isinstance(cell, str) for cell in current_row[:2] if cell is not None
            ):
                score -= 2

        return score

    @staticmethod
    def find_footer_start(rows: List[List[Any]], header_row_idx: int) -> int:
        """Find where the footer begins below the header.

        A row starts the footer when it is the first one that:

        * is blank and followed by a sparse row or a row with long text;
        * is sparse compared to the first data rows (checked only from the
          fourth row after the header);
        * contains a footer keyword without being wider than the header;
        * contains a text cell longer than 50 characters; or
        * disagrees with the prevailing cell type in more than half of the
          header's populated-cell count (same fourth-row threshold).

        Args:
            rows: All worksheet rows as lists of cell values.
            header_row_idx: 0-based index of the header row.

        Returns:
            0-based index of the first footer row, or ``len(rows)`` when every
            row after the header is data.
        """
        first_data_idx = header_row_idx + 1
        if first_data_idx >= len(rows):
            return len(rows)

        count_populated = ExcelFile._count_populated

        # Table width is the number of populated header cells.
        header_row = rows[header_row_idx]
        table_width = count_populated(header_row)

        # Up to five rows after the header set the expected row density.
        sample_end = min(header_row_idx + 5, len(rows) - 1)
        sample_rows = rows[first_data_idx : sample_end + 1]
        avg_populated_cells = sum(count_populated(row) for row in sample_rows) / len(
            sample_rows
        )

        # Running tally of cell types per titled column, covering every row
        # between the header and the current row. Updating it incrementally
        # avoids rescanning all earlier rows for each new row.
        column_type_counts: Dict[int, Counter] = {
            j: Counter() for j, cell in enumerate(header_row) if cell is not None
        }

        for i in range(first_data_idx, len(rows)):
            if i > first_data_idx:
                previous_row = rows[i - 1]
                for j, counts in column_type_counts.items():
                    if j < len(previous_row) and previous_row[j] is not None:
                        counts[type(previous_row[j]).__name__] += 1

            current_row = rows[i]

            # A blank row starts the footer only if what follows does not
            # look like data; otherwise it is skipped.
            if not ExcelFile._is_non_empty_row(current_row):
                if i + 1 < len(rows):
                    next_row = rows[i + 1]
                    next_is_sparse = (
                        count_populated(next_row) < avg_populated_cells * 0.5
                    )
                    next_has_long_text = any(
                        isinstance(cell, str) and len(cell) > 20 for cell in next_row
                    )
                    if next_is_sparse or next_has_long_text:
                        return i
                continue

            populated_cells = count_populated(current_row)

            has_footer_text = any(
                isinstance(cell, str)
                and any(
                    indicator in cell.lower()
                    for indicator in ExcelFile._FOOTER_INDICATORS
                )
                for cell in current_row
            )
            looks_like_summary = has_footer_text and populated_cells <= table_width

            # Notes or sources tend to be long text.
            has_long_text = any(
                isinstance(cell, str) and len(cell) > 50 for cell in current_row
            )

            is_sparse = (
                populated_cells < avg_populated_cells * 0.7 and i > header_row_idx + 3
            )
            if is_sparse or looks_like_summary or has_long_text:
                return i

            # Once a few data rows have been seen, compare cell types with
            # the prevailing type of each column.
            if i > header_row_idx + 3:
                data_type_mismatch = 0
                for j, cell in enumerate(current_row):
                    counts = column_type_counts.get(j)
                    if counts and cell is not None:
                        most_common_type = counts.most_common(1)[0][0]
                        if type(cell).__name__ != most_common_type:
                            data_type_mismatch += 1

                if data_type_mismatch > table_width * 0.5:
                    return i

        return len(rows)

    @staticmethod
    def extract_metadata(rows: List[List[Any]]) -> Dict[str, Any]:
        """Collect key/value pairs from rows that use only their first two cells.

        A row qualifies when it has at least two cells, every cell after the
        second is None, and both of the first two cells are not None. The key
        is stripped of surrounding whitespace and trailing ``:``/``=``; the
        value is converted to stripped text. Later rows overwrite earlier ones
        with the same key.
        """
        metadata = {}

        for row in rows:
            if len(row) >= 2 and all(item is None for item in row[2:]):
                key, value = row[:2]
                if key is not None and value is not None:
                    clean_key = str(key).strip().rstrip(":=").rstrip()
                    metadata[clean_key] = str(value).strip()

        return metadata

    @staticmethod
    def read_excel_properties(wb: "Workbook") -> Dict[str, Any]:
        """Read core and custom document properties from ``wb``.

        Core properties that are None are omitted. Custom properties with a
        falsy value are skipped; a custom property whose name clashes with an
        existing entry is stored as ``custom.<name>``.
        """
        core_props = wb.properties
        properties = {
            "title": core_props.title,
            "author": core_props.creator,
            "subject": core_props.subject,
            "description": core_props.description,
            "keywords": core_props.keywords,
            "category": core_props.category,
            "last_modified_by": core_props.lastModifiedBy,
            "created": core_props.created,
            "modified": core_props.modified,
            "status": core_props.contentStatus,
            "revision": core_props.revision,
            "version": core_props.version,
            "language": core_props.language,
            "identifier": core_props.identifier,
        }

        # Remove None values
        properties = {k: v for k, v in properties.items() if v is not None}

        # NOTE(review): the hasattr guard presumably covers openpyxl versions
        # that lack custom_doc_props - confirm the minimum supported version.
        if hasattr(wb, "custom_doc_props"):
            for prop in wb.custom_doc_props.props:
                if prop.value:
                    if prop.name in properties:
                        prop_name = f"custom.{prop.name}"
                    else:
                        prop_name = prop.name
                    properties[prop_name] = prop.value

        return properties