acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (226)
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +2 -1
  7. datahub/api/entities/external/__init__.py +0 -0
  8. datahub/api/entities/external/external_entities.py +239 -0
  9. datahub/api/entities/external/external_tag.py +145 -0
  10. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  11. datahub/api/entities/external/restricted_text.py +247 -0
  12. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  13. datahub/cli/check_cli.py +88 -7
  14. datahub/cli/cli_utils.py +63 -0
  15. datahub/cli/container_cli.py +5 -0
  16. datahub/cli/delete_cli.py +124 -27
  17. datahub/cli/docker_check.py +107 -12
  18. datahub/cli/docker_cli.py +149 -227
  19. datahub/cli/exists_cli.py +0 -2
  20. datahub/cli/get_cli.py +0 -2
  21. datahub/cli/iceberg_cli.py +5 -0
  22. datahub/cli/ingest_cli.py +12 -16
  23. datahub/cli/migrate.py +2 -0
  24. datahub/cli/put_cli.py +1 -4
  25. datahub/cli/quickstart_versioning.py +50 -7
  26. datahub/cli/specific/assertions_cli.py +0 -4
  27. datahub/cli/specific/datacontract_cli.py +0 -3
  28. datahub/cli/specific/dataproduct_cli.py +0 -11
  29. datahub/cli/specific/dataset_cli.py +1 -8
  30. datahub/cli/specific/forms_cli.py +0 -4
  31. datahub/cli/specific/group_cli.py +0 -2
  32. datahub/cli/specific/structuredproperties_cli.py +1 -4
  33. datahub/cli/specific/user_cli.py +0 -2
  34. datahub/cli/state_cli.py +0 -2
  35. datahub/cli/timeline_cli.py +0 -2
  36. datahub/emitter/response_helper.py +86 -1
  37. datahub/emitter/rest_emitter.py +71 -13
  38. datahub/entrypoints.py +4 -3
  39. datahub/ingestion/api/decorators.py +15 -3
  40. datahub/ingestion/api/report.py +332 -3
  41. datahub/ingestion/api/sink.py +3 -0
  42. datahub/ingestion/api/source.py +48 -44
  43. datahub/ingestion/autogenerated/__init__.py +0 -0
  44. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  45. datahub/ingestion/autogenerated/lineage.json +401 -0
  46. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  47. datahub/ingestion/extractor/schema_util.py +13 -4
  48. datahub/ingestion/glossary/classification_mixin.py +5 -0
  49. datahub/ingestion/graph/client.py +100 -15
  50. datahub/ingestion/graph/config.py +1 -0
  51. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  52. datahub/ingestion/run/pipeline.py +54 -2
  53. datahub/ingestion/sink/datahub_rest.py +13 -0
  54. datahub/ingestion/source/abs/source.py +1 -1
  55. datahub/ingestion/source/aws/aws_common.py +4 -0
  56. datahub/ingestion/source/aws/glue.py +489 -244
  57. datahub/ingestion/source/aws/tag_entities.py +292 -0
  58. datahub/ingestion/source/azure/azure_common.py +2 -2
  59. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  60. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  61. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  62. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  63. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  64. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  65. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  66. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  67. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  68. datahub/ingestion/source/common/subtypes.py +45 -0
  69. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  70. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  71. datahub/ingestion/source/datahub/config.py +11 -0
  72. datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
  73. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  74. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  75. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  76. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  77. datahub/ingestion/source/debug/__init__.py +0 -0
  78. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  79. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  80. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  81. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  82. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  83. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  84. datahub/ingestion/source/file.py +3 -0
  85. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  86. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  87. datahub/ingestion/source/ge_data_profiler.py +76 -28
  88. datahub/ingestion/source/ge_profiling_config.py +11 -0
  89. datahub/ingestion/source/hex/api.py +26 -1
  90. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  91. datahub/ingestion/source/identity/azure_ad.py +1 -1
  92. datahub/ingestion/source/identity/okta.py +1 -14
  93. datahub/ingestion/source/kafka/kafka.py +16 -0
  94. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  95. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  96. datahub/ingestion/source/looker/looker_source.py +1 -0
  97. datahub/ingestion/source/mlflow.py +11 -1
  98. datahub/ingestion/source/mock_data/__init__.py +0 -0
  99. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  100. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  101. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  102. datahub/ingestion/source/nifi.py +1 -1
  103. datahub/ingestion/source/openapi.py +12 -0
  104. datahub/ingestion/source/openapi_parser.py +56 -37
  105. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  106. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  108. datahub/ingestion/source/preset.py +2 -2
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  110. datahub/ingestion/source/redshift/redshift.py +21 -1
  111. datahub/ingestion/source/redshift/usage.py +4 -3
  112. datahub/ingestion/source/s3/report.py +4 -2
  113. datahub/ingestion/source/s3/source.py +367 -115
  114. datahub/ingestion/source/sac/sac.py +3 -1
  115. datahub/ingestion/source/salesforce.py +6 -3
  116. datahub/ingestion/source/sigma/sigma.py +7 -1
  117. datahub/ingestion/source/slack/slack.py +2 -1
  118. datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
  119. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  120. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  121. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  122. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  123. datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
  124. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  125. datahub/ingestion/source/sql/athena.py +119 -11
  126. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  127. datahub/ingestion/source/sql/clickhouse.py +3 -1
  128. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  129. datahub/ingestion/source/sql/hana.py +3 -1
  130. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  131. datahub/ingestion/source/sql/mariadb.py +0 -1
  132. datahub/ingestion/source/sql/mssql/source.py +239 -34
  133. datahub/ingestion/source/sql/mysql.py +0 -1
  134. datahub/ingestion/source/sql/oracle.py +1 -1
  135. datahub/ingestion/source/sql/postgres.py +0 -1
  136. datahub/ingestion/source/sql/sql_common.py +121 -34
  137. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  138. datahub/ingestion/source/sql/teradata.py +997 -235
  139. datahub/ingestion/source/sql/vertica.py +10 -6
  140. datahub/ingestion/source/sql_queries.py +2 -2
  141. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  142. datahub/ingestion/source/superset.py +58 -3
  143. datahub/ingestion/source/tableau/tableau.py +58 -37
  144. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  145. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  146. datahub/ingestion/source/unity/config.py +5 -0
  147. datahub/ingestion/source/unity/proxy.py +118 -0
  148. datahub/ingestion/source/unity/source.py +195 -17
  149. datahub/ingestion/source/unity/tag_entities.py +295 -0
  150. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  151. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  152. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  153. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  154. datahub/metadata/_internal_schema_classes.py +1446 -559
  155. datahub/metadata/_urns/urn_defs.py +1721 -1553
  156. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  158. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  159. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  160. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  161. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  162. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  163. datahub/metadata/schema.avsc +18055 -17802
  164. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  165. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  166. datahub/metadata/schemas/Applications.avsc +38 -0
  167. datahub/metadata/schemas/ChartKey.avsc +1 -0
  168. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  169. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  170. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  171. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  172. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  175. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  176. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  177. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  178. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  179. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  180. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  181. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  182. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  183. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  184. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  185. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  186. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  187. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  188. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  189. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  190. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  191. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  192. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  193. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  194. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  195. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  196. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  197. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  198. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  199. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  200. datahub/sdk/__init__.py +6 -0
  201. datahub/sdk/_all_entities.py +11 -0
  202. datahub/sdk/_shared.py +118 -1
  203. datahub/sdk/chart.py +315 -0
  204. datahub/sdk/container.py +7 -0
  205. datahub/sdk/dashboard.py +432 -0
  206. datahub/sdk/dataflow.py +309 -0
  207. datahub/sdk/datajob.py +367 -0
  208. datahub/sdk/dataset.py +8 -2
  209. datahub/sdk/entity_client.py +90 -2
  210. datahub/sdk/lineage_client.py +683 -82
  211. datahub/sdk/main_client.py +46 -16
  212. datahub/sdk/mlmodel.py +101 -38
  213. datahub/sdk/mlmodelgroup.py +7 -0
  214. datahub/sdk/search_client.py +4 -3
  215. datahub/specific/chart.py +1 -1
  216. datahub/specific/dataproduct.py +4 -0
  217. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  218. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  219. datahub/telemetry/telemetry.py +17 -11
  220. datahub/testing/sdk_v2_helpers.py +7 -1
  221. datahub/upgrade/upgrade.py +46 -13
  222. datahub/utilities/server_config_util.py +8 -0
  223. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  224. datahub/utilities/stats_collections.py +4 -0
  225. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  226. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mock_data/datahub_mock_data.py (new file)
@@ -0,0 +1,472 @@
+import logging
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from pydantic import Field
+
+from datahub.configuration.common import ConfigModel
+from datahub.emitter.mce_builder import make_dataset_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+    SupportStatus,
+    config_class,
+    platform_name,
+    support_status,
+)
+from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
+    DataHubMockDataReport,
+)
+from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper
+from datahub.metadata.schema_classes import (
+    CalendarIntervalClass,
+    DatasetLineageTypeClass,
+    DatasetProfileClass,
+    DatasetUsageStatisticsClass,
+    StatusClass,
+    SubTypesClass,
+    TimeWindowSizeClass,
+    UpstreamClass,
+    UpstreamLineageClass,
+)
+from datahub.utilities.str_enum import StrEnum
+
+logger = logging.getLogger(__name__)
+
+
+class SubTypePattern(StrEnum):
+    ALTERNATING = "alternating"
+    ALL_TABLE = "all_table"
+    ALL_VIEW = "all_view"
+    LEVEL_BASED = "level_based"
+
+
+class LineageConfigGen1(ConfigModel):
+    """
+    Configuration for generating mock lineage data for testing purposes.
+
+    This configuration controls how the mock data source generates a hierarchical
+    lineage graph with multiple levels of upstream/downstream relationships.
+
+    The lineage graph is structured as follows:
+    - Level 0: 1 table (root)
+    - Level 1: lineage_fan_out tables (each connected to the root)
+    - Level 2+: If lineage_fan_out_after_first_hop is set, uses that value;
+      otherwise uses lineage_fan_out^level tables (each connected to a level 1 table)
+    - ... and so on for lineage_hops levels
+
+    Examples:
+    - With lineage_fan_out=2, lineage_hops=1: Creates 3 tables total
+      (1 root + 2 downstream) with 2 lineage relationships
+    - With lineage_fan_out=3, lineage_hops=2: Creates 13 tables total
+      (1 + 3 + 9) with 12 lineage relationships
+    - With lineage_fan_out=4, lineage_hops=1: Creates 5 tables total
+      (1 + 4) with 4 lineage relationships
+    - With lineage_fan_out=3, lineage_hops=3, lineage_fan_out_after_first_hop=2:
+      Creates 1 + 3 + 6 + 12 = 22 tables total (prevents exponential growth)
+
+    Table naming convention: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+    """
+
+    enabled: bool = Field(
+        default=False,
+        description="Whether this source is enabled",
+    )
+
+    emit_lineage: bool = Field(
+        default=True,
+        description="Whether to emit lineage data for testing purposes. When False, no lineage data is generated regardless of other settings.",
+    )
+    emit_usage: bool = Field(
+        default=True,
+        description="Whether to emit usage data for testing purposes. When False, no usage data is generated regardless of other settings.",
+    )
+
+    lineage_fan_out: int = Field(
+        default=3,
+        description="Number of downstream tables that each upstream table connects to. This controls the 'width' of the lineage graph. Higher values create more parallel downstream tables per level.",
+    )
+
+    lineage_hops: int = Field(
+        default=2,
+        description="Number of hops (levels) in the lineage graph. This controls the 'depth' of the lineage graph. Level 0 is the root table, and each subsequent level contains downstream tables. Higher values create deeper lineage chains.",
+    )
+
+    lineage_fan_out_after_first_hop: Optional[int] = Field(
+        default=None,
+        description="Optional limit on fanout for hops after the first hop. When set, prevents exponential growth by limiting the number of downstream tables per upstream table at levels 2 and beyond. When None, uses the standard exponential growth (lineage_fan_out^level).",
+    )
+
+    subtype_pattern: SubTypePattern = Field(
+        default=SubTypePattern.ALTERNATING,
+        description="Pattern for determining SubTypes. Options: 'alternating', 'all_table', 'all_view', 'level_based'",
+    )
+
+    level_subtypes: Dict[int, str] = Field(
+        default={0: "Table", 1: "View", 2: "Table"},
+        description="Mapping of level to subtype for level_based pattern",
+    )
+
+
+class DataHubMockDataConfig(ConfigModel):
+    enabled: bool = Field(
+        default=True,
+        description="Whether this source is enabled",
+    )
+    throw_uncaught_exceptions: bool = Field(
+        default=False,
+        description="Whether to throw an uncaught exception for testing",
+    )
+    num_errors: int = Field(
+        default=0,
+        description="Number of errors to add in report for testing",
+    )
+    num_warnings: int = Field(
+        default=0,
+        description="Number of warnings to add in report for testing",
+    )
+
+    gen_1: LineageConfigGen1 = Field(
+        default_factory=LineageConfigGen1,
+        description="Configuration for lineage data generation",
+    )
+
+
+@platform_name("DataHubMockData")
+@config_class(DataHubMockDataConfig)
+@support_status(SupportStatus.TESTING)
+class DataHubMockDataSource(Source):
+    """
+    This source is for generating mock data for testing purposes.
+    Expect breaking changes as we iterate on the mock data source.
+    """
+
+    def __init__(self, ctx: PipelineContext, config: DataHubMockDataConfig):
+        self.ctx = ctx
+        self.config = config
+        self.report = DataHubMockDataReport()
+
+    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+        if self.config.throw_uncaught_exceptions:
+            raise Exception("This is a test exception")
+
+        if self.config.num_errors > 0:
+            for i in range(self.config.num_errors):
+                self.report.failure(
+                    message="This is test error message",
+                    title="Test Error",
+                    context=f"This is test error {i}",
+                )
+
+        if self.config.num_warnings > 0:
+            for i in range(self.config.num_warnings):
+                self.report.warning(
+                    message="This is test warning",
+                    title="Test Warning",
+                    context=f"This is test warning {i}",
+                )
+
+        # We don't want any implicit aspects to be produced
+        # so we are not using get_workunits_internal
+        if self.config.gen_1.enabled:
+            for wu in self._data_gen_1():
+                if self.report.first_urn_seen is None:
+                    self.report.first_urn_seen = wu.get_urn()
+                self.report.report_workunit(wu)
+                yield wu
+
+        yield from []
+
+    def _calculate_lineage_tables(
+        self, fan_out: int, hops: int, fan_out_after_first: Optional[int] = None
+    ) -> Tuple[int, List[int]]:
+        """
+        Calculate the total number of tables and tables at each level for lineage generation.
+
+        Args:
+            fan_out: Number of downstream tables per upstream table at level 1
+            hops: Number of hops (levels) in the lineage graph
+            fan_out_after_first: Optional limit on fanout for hops after the first hop
+
+        Returns:
+            Tuple of (total_tables, tables_at_levels) where tables_at_levels is a list
+            containing the number of tables at each level (index 0 = level 0, etc.)
+        """
+        tables_to_be_created = 0
+        tables_at_levels: List[int] = []
+
+        for i in range(hops + 1):
+            if i == 0:
+                # Level 0: always 1 table
+                tables_at_level = 1
+            elif i == 1:
+                # Level 1: uses lineage_fan_out
+                tables_at_level = fan_out
+            else:
+                # Level 2+: use fan_out_after_first_hop if set, otherwise exponential growth
+                if fan_out_after_first is not None:
+                    # Each table at previous level creates fan_out_after_first tables
+                    tables_at_level = tables_at_levels[i - 1] * fan_out_after_first
+                else:
+                    # Original exponential behavior
+                    tables_at_level = fan_out**i
+
+            tables_at_levels.append(tables_at_level)
+            tables_to_be_created += tables_at_level
+
+        return tables_to_be_created, tables_at_levels
+
+    def _calculate_fanout_for_level(
+        self, level: int, fan_out: int, fan_out_after_first: Optional[int] = None
+    ) -> int:
+        """
+        Calculate the fanout (number of downstream tables) for a specific level.
+
+        Args:
+            level: The current level (0-based)
+            fan_out: Number of downstream tables per upstream table at level 1
+            fan_out_after_first: Optional limit on fanout for hops after the first hop
+
+        Returns:
+            The number of downstream tables that each table at this level should connect to
+        """
+        if level == 0:
+            # Level 0: uses the standard fan_out
+            return fan_out
+        else:
+            # Level 1+: use fan_out_after_first if set, otherwise use fan_out
+            return fan_out_after_first if fan_out_after_first is not None else fan_out
+
+    def _determine_subtype(
+        self, table_name: str, table_level: int, table_index: int
+    ) -> str:
+        """
+        Determine subtype based on configured pattern.
+
+        Args:
+            table_name: Name of the table
+            table_level: Level of the table in the lineage graph
+            table_index: Index of the table within its level
+
+        Returns:
+            The determined subtype ("Table" or "View")
+        """
+        pattern = self.config.gen_1.subtype_pattern
+
+        if pattern == SubTypePattern.ALTERNATING:
+            return (
+                DatasetSubTypes.TABLE if table_index % 2 == 0 else DatasetSubTypes.VIEW
+            )
+        elif pattern == SubTypePattern.LEVEL_BASED:
+            return self.config.gen_1.level_subtypes.get(
+                table_level, DatasetSubTypes.TABLE
+            )
+        elif pattern == SubTypePattern.ALL_TABLE:
+            return DatasetSubTypes.TABLE
+        elif pattern == SubTypePattern.ALL_VIEW:
+            return DatasetSubTypes.VIEW
+        else:
+            return DatasetSubTypes.TABLE  # default
+
+    def _get_subtypes_aspect(
+        self, table_name: str, table_level: int, table_index: int
+    ) -> MetadataWorkUnit:
+        """
+        Create a SubTypes aspect for a table based on deterministic pattern.
+
+        Args:
+            table_name: Name of the table
+            table_level: Level of the table in the lineage graph
+            table_index: Index of the table within its level
+
+        Returns:
+            MetadataWorkUnit containing the SubTypes aspect
+        """
+        # Determine subtype based on pattern
+        subtype = self._determine_subtype(table_name, table_level, table_index)
+
+        urn = make_dataset_urn(platform="fake", name=table_name)
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            entityType="dataset",
+            aspect=SubTypesClass(typeNames=[subtype]),
+        )
+        return mcp.as_workunit()
+
+    def _data_gen_1(self) -> Iterable[MetadataWorkUnit]:
+        """Generate mock lineage data for testing purposes."""
+        gen_1 = self.config.gen_1
+        fan_out = gen_1.lineage_fan_out
+        hops = gen_1.lineage_hops
+        fan_out_after_first = gen_1.lineage_fan_out_after_first_hop
+
+        logger.info(
+            f"Generating lineage data with fan_out={fan_out}, hops={hops}, fan_out_after_first={fan_out_after_first}"
+        )
+
+        tables_to_be_created, tables_at_levels = self._calculate_lineage_tables(
+            fan_out, hops, fan_out_after_first
+        )
+
+        logger.info(f"About to create {tables_to_be_created} datasets mock data")
+
+        for i in range(hops + 1):
+            tables_at_level = tables_at_levels[i]
+
+            for j in range(tables_at_level):
+                table_name = TableNamingHelper.generate_table_name(hops, fan_out, i, j)
+
+                yield self._get_status_aspect(table_name)
+
+                yield self._get_subtypes_aspect(table_name, i, j)
+
+                yield self._get_profile_aspect(table_name)
+
+                if self.config.gen_1.emit_usage:
+                    yield self._get_usage_aspect(table_name)
+
+                if self.config.gen_1.emit_lineage:
+                    yield from self._generate_lineage_for_table(
+                        table_name=table_name,
+                        table_level=i,
+                        table_index=j,
+                        hops=hops,
+                        fan_out=fan_out,
+                        fan_out_after_first=fan_out_after_first,
+                        tables_at_levels=tables_at_levels,
+                    )

+    def _generate_lineage_for_table(
+        self,
+        table_name: str,
+        table_level: int,
+        table_index: int,
+        hops: int,
+        fan_out: int,
+        fan_out_after_first: Optional[int],
+        tables_at_levels: List[int],
+    ) -> Iterable[MetadataWorkUnit]:
+        """Generate lineage relationships for a specific table."""
+        # Only generate lineage if there are downstream levels
+        if table_level + 1 > hops:
+            return
+
+        current_fan_out = self._calculate_fanout_for_level(
+            table_level, fan_out, fan_out_after_first
+        )
+
+        yield from self._generate_downstream_lineage(
+            upstream_table_name=table_name,
+            upstream_table_index=table_index,
+            upstream_table_level=table_level,
+            current_fan_out=current_fan_out,
+            hops=hops,
+            fan_out=fan_out,
+            tables_at_levels=tables_at_levels,
+        )
+
+    def _generate_downstream_lineage(
+        self,
+        upstream_table_name: str,
+        upstream_table_index: int,
+        upstream_table_level: int,
+        current_fan_out: int,
+        hops: int,
+        fan_out: int,
+        tables_at_levels: List[int],
+    ) -> Iterable[MetadataWorkUnit]:
+        """Generate lineage relationships to downstream tables."""
+        downstream_level = upstream_table_level + 1
+        downstream_tables_count = tables_at_levels[downstream_level]
+
+        # Calculate range of downstream tables this upstream table connects to
+        start_downstream = upstream_table_index * current_fan_out
+        end_downstream = min(
+            (upstream_table_index + 1) * current_fan_out, downstream_tables_count
+        )
+
+        for downstream_index in range(start_downstream, end_downstream):
+            downstream_table_name = TableNamingHelper.generate_table_name(
+                hops, fan_out, downstream_level, downstream_index
+            )
+            yield self._get_upstream_aspect(
+                upstream_table=upstream_table_name,
+                downstream_table=downstream_table_name,
+            )
+
+    def _get_status_aspect(self, table: str) -> MetadataWorkUnit:
+        urn = make_dataset_urn(
+            platform="fake",
+            name=table,
+        )
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            entityType="dataset",
+            aspect=StatusClass(removed=False),
+        )
+        return mcp.as_workunit()
+
+    def _get_upstream_aspect(
+        self, upstream_table: str, downstream_table: str
+    ) -> MetadataWorkUnit:
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn(
+                platform="fake",
+                name=downstream_table,
+            ),
+            entityType="dataset",
+            aspect=UpstreamLineageClass(
+                upstreams=[
+                    UpstreamClass(
+                        dataset=make_dataset_urn(
+                            platform="fake",
+                            name=upstream_table,
+                        ),
+                        type=DatasetLineageTypeClass.TRANSFORMED,
+                    )
+                ],
+            ),
+        )
+        return mcp.as_workunit()
+
+    def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
+        urn = make_dataset_urn(
+            platform="fake",
+            name=table,
+        )
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            entityType="dataset",
+            aspect=DatasetProfileClass(
+                timestampMillis=0,
+                rowCount=100,
+                columnCount=10,
+                sizeInBytes=1000,
+            ),
+        )
+        return mcp.as_workunit()
+
+    def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
+        urn = make_dataset_urn(
+            platform="fake",
+            name=table,
+        )
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            entityType="dataset",
+            aspect=DatasetUsageStatisticsClass(
+                timestampMillis=0,
+                eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.DAY),
+                uniqueUserCount=0,
+                totalSqlQueries=0,
+                topSqlQueries=[],
+                userCounts=[],
+                fieldCounts=[],
+            ),
+        )
+        return mcp.as_workunit()
+
+    def get_report(self) -> SourceReport:
+        return self.report
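Note: as a quick sanity check on the fan-out arithmetic described in the LineageConfigGen1 docstring above, the short sketch below reproduces the per-level table counts independently of the source class. The helper name expected_tables is ours, not part of the package; the logic mirrors _calculate_lineage_tables.

from typing import List, Optional, Tuple


def expected_tables(
    fan_out: int, hops: int, fan_out_after_first: Optional[int] = None
) -> Tuple[int, List[int]]:
    """Count mock tables per level and in total, mirroring _calculate_lineage_tables."""
    levels: List[int] = []
    for i in range(hops + 1):
        if i == 0:
            levels.append(1)  # single root table
        elif i == 1:
            levels.append(fan_out)  # first hop uses lineage_fan_out
        elif fan_out_after_first is not None:
            levels.append(levels[i - 1] * fan_out_after_first)  # capped growth
        else:
            levels.append(fan_out**i)  # exponential growth
    return sum(levels), levels


assert expected_tables(2, 1) == (3, [1, 2])
assert expected_tables(3, 2) == (13, [1, 3, 9])
assert expected_tables(3, 3, fan_out_after_first=2) == (22, [1, 3, 6, 12])

These match the documented examples (3, 13, and 22 tables respectively).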
datahub/ingestion/source/mock_data/datahub_mock_data_report.py (new file)
@@ -0,0 +1,12 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from datahub.ingestion.api.source import SourceReport
+
+
+@dataclass
+class DataHubMockDataReport(SourceReport):
+    first_urn_seen: Optional[str] = field(
+        default=None,
+        metadata={"description": "The first URN encountered during ingestion"},
+    )
datahub/ingestion/source/mock_data/table_naming_helper.py (new file)
@@ -0,0 +1,91 @@
+from typing import Dict
+
+
+class TableNamingHelper:
+    """
+    Helper class for managing table naming conventions in mock data generation.
+
+    Table naming pattern: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+    """
+
+    @staticmethod
+    def generate_table_name(
+        lineage_hops: int, lineage_fan_out: int, level: int, table_index: int
+    ) -> str:
+        """
+        Generate a table name following the standard naming convention.
+
+        Args:
+            lineage_hops: Total number of hops in the lineage graph
+            lineage_fan_out: Number of downstream tables per upstream table
+            level: Level of the table in the lineage graph (0-based)
+            table_index: Index of the table within its level (0-based)
+
+        Returns:
+            Table name following the pattern: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+        """
+        return f"hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+
+    @staticmethod
+    def parse_table_name(table_name: str) -> Dict[str, int]:
+        """
+        Parse a table name to extract its components.
+
+        Args:
+            table_name: Table name following the standard naming convention
+
+        Returns:
+            Dictionary containing parsed components:
+            - lineage_hops: Total number of hops in the lineage graph
+            - lineage_fan_out: Number of downstream tables per upstream table
+            - level: Level of the table in the lineage graph (0-based)
+            - table_index: Index of the table within its level (0-based)
+
+        Raises:
+            ValueError: If the table name doesn't follow the expected pattern
+        """
+        try:
+            # Expected pattern: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+            parts = table_name.split("_")
+
+            if (
+                len(parts) != 6
+                or parts[0] != "hops"
+                or parts[2] != "f"
+                or not parts[4].startswith("h")
+                or not parts[5].startswith("t")
+            ):
+                raise ValueError(f"Invalid table name format: {table_name}")
+
+            lineage_hops = int(parts[1])
+            lineage_fan_out = int(parts[3])  # lineage_fan_out is at index 3
+            level = int(parts[4][1:])  # Remove 'h' prefix from parts[4]
+            table_index = int(parts[5][1:])  # Remove 't' prefix from parts[5]
+
+            return {
+                "lineage_hops": lineage_hops,
+                "lineage_fan_out": lineage_fan_out,
+                "level": level,
+                "table_index": table_index,
+            }
+        except (ValueError, IndexError) as e:
+            raise ValueError(
+                f"Failed to parse table name '{table_name}': {str(e)}"
+            ) from e
+
+    @staticmethod
+    def is_valid_table_name(table_name: str) -> bool:
+        """
+        Check if a table name follows the expected naming convention.
+
+        Args:
+            table_name: Table name to validate
+
+        Returns:
+            True if the table name follows the expected pattern, False otherwise
+        """
+        try:
+            TableNamingHelper.parse_table_name(table_name)
+            return True
+        except ValueError:
+            return False
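Note: for reference, a minimal round-trip through the helper above (the parameter values are arbitrary; the module path is taken from the file list):

from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper

name = TableNamingHelper.generate_table_name(
    lineage_hops=2, lineage_fan_out=3, level=1, table_index=0
)
assert name == "hops_2_f_3_h1_t0"
assert TableNamingHelper.parse_table_name(name) == {
    "lineage_hops": 2,
    "lineage_fan_out": 3,
    "level": 1,
    "table_index": 0,
}
assert not TableNamingHelper.is_valid_table_name("not_a_mock_table")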
datahub/ingestion/source/nifi.py
@@ -72,7 +72,7 @@ NIFI = "nifi"
 # and here - https://github.com/psf/requests/issues/1573
 class SSLAdapter(HTTPAdapter):
     def __init__(self, certfile, keyfile, password=None):
-        self.context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
+        self.context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
         self.context.load_cert_chain(
             certfile=certfile, keyfile=keyfile, password=password
         )
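Note: the CLIENT_AUTH to SERVER_AUTH switch matters because this adapter runs on the client side and should build a context suitable for authenticating the NiFi server while presenting a client certificate. The hunk only shows __init__; the sketch below illustrates how such an adapter is typically wired into requests. The class name, the init_poolmanager override, and the file paths are illustrative assumptions, not copied from nifi.py.

import ssl

import requests
from requests.adapters import HTTPAdapter


class ClientCertAdapter(HTTPAdapter):
    """Hypothetical adapter: present a client certificate while verifying the server."""

    def __init__(self, certfile, keyfile, password=None, **kwargs):
        # SERVER_AUTH: we are the client and want to authenticate the server.
        self.context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        self.context.load_cert_chain(
            certfile=certfile, keyfile=keyfile, password=password
        )
        super().__init__(**kwargs)

    def init_poolmanager(self, *args, **kwargs):
        # Hand the prepared SSL context to urllib3's pool manager.
        kwargs["ssl_context"] = self.context
        return super().init_poolmanager(*args, **kwargs)


session = requests.Session()
# Placeholder host and certificate paths.
session.mount("https://nifi.example.com", ClientCertAdapter("client.crt", "client.key"))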
datahub/ingestion/source/openapi.py
@@ -82,6 +82,9 @@ class OpenApiConfig(ConfigModel):
     get_token: dict = Field(
         default={}, description="Retrieving a token from the endpoint."
     )
+    verify_ssl: bool = Field(
+        default=True, description="Enable SSL certificate verification"
+    )

     @validator("bearer_token", always=True)
     def ensure_only_one_token(
@@ -129,12 +132,14 @@ class OpenApiConfig(ConfigModel):
                 tok_url=url4req,
                 method=self.get_token["request_type"],
                 proxies=self.proxies,
+                verify_ssl=self.verify_ssl,
             )
             sw_dict = get_swag_json(
                 self.url,
                 token=self.token,
                 swagger_file=self.swagger_file,
                 proxies=self.proxies,
+                verify_ssl=self.verify_ssl,
             )  # load the swagger file

         else:  # using basic auth for accessing endpoints
@@ -144,6 +149,7 @@ class OpenApiConfig(ConfigModel):
                 password=self.password,
                 swagger_file=self.swagger_file,
                 proxies=self.proxies,
+                verify_ssl=self.verify_ssl,
             )
         return sw_dict

@@ -343,6 +349,7 @@ class APISource(Source, ABC):
                        tot_url,
                        token=config.token,
                        proxies=config.proxies,
+                        verify_ssl=config.verify_ssl,
                    )
                else:
                    response = request_call(
@@ -350,6 +357,7 @@ class APISource(Source, ABC):
                        username=config.username,
                        password=config.password,
                        proxies=config.proxies,
+                        verify_ssl=config.verify_ssl,
                    )
                if response.status_code == 200:
                    fields2add, root_dataset_samples[dataset_name] = extract_fields(
@@ -380,6 +388,7 @@ class APISource(Source, ABC):
                        tot_url,
                        token=config.token,
                        proxies=config.proxies,
+                        verify_ssl=config.verify_ssl,
                    )
                else:
                    response = request_call(
@@ -387,6 +396,7 @@ class APISource(Source, ABC):
                        username=config.username,
                        password=config.password,
                        proxies=config.proxies,
+                        verify_ssl=config.verify_ssl,
                    )
                if response.status_code == 200:
                    fields2add, _ = extract_fields(response, dataset_name)
@@ -415,6 +425,7 @@ class APISource(Source, ABC):
                        tot_url,
                        token=config.token,
                        proxies=config.proxies,
+                        verify_ssl=config.verify_ssl,
                    )
                else:
                    response = request_call(
@@ -422,6 +433,7 @@ class APISource(Source, ABC):
                        username=config.username,
                        password=config.password,
                        proxies=config.proxies,
+                        verify_ssl=config.verify_ssl,
                    )
                if response.status_code == 200:
                    fields2add, _ = extract_fields(response, dataset_name)
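Note: the net effect of the openapi.py hunks is a single new config flag, verify_ssl (default True), threaded through every get_token, get_swag_json, and request_call invocation. A hedged sketch of setting it in a programmatically-loaded recipe is shown below; verify_ssl comes from this diff, while the other config fields (name, url, swagger_file) follow the documented openapi recipe shape and are placeholders. Pipeline.create is the standard datahub SDK entry point for running a recipe from Python; adjust fields to your endpoint before running.

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "openapi",
            "config": {
                "name": "my_api",  # placeholder
                "url": "https://api.example.com/",  # placeholder
                "swagger_file": "openapi.json",  # placeholder
                "verify_ssl": False,  # option added in this release; True by default
            },
        },
        "sink": {"type": "console", "config": {}},
    }
)
pipeline.run()
pipeline.pretty_print_summary()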