acryl-datahub 1.2.0.3rc1__py3-none-any.whl → 1.2.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (74)
  1. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/METADATA +2609 -2608
  2. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/RECORD +74 -73
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/dataset/dataset.py +3 -3
  5. datahub/api/entities/external/external_tag.py +6 -4
  6. datahub/api/entities/external/lake_formation_external_entites.py +50 -49
  7. datahub/api/entities/external/restricted_text.py +105 -180
  8. datahub/api/entities/external/unity_catalog_external_entites.py +51 -52
  9. datahub/api/entities/forms/forms.py +3 -3
  10. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  11. datahub/cli/quickstart_versioning.py +1 -1
  12. datahub/cli/specific/assertions_cli.py +37 -2
  13. datahub/cli/specific/datacontract_cli.py +54 -4
  14. datahub/emitter/rest_emitter.py +18 -5
  15. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +1 -1
  16. datahub/ingestion/api/report.py +21 -2
  17. datahub/ingestion/api/source.py +81 -7
  18. datahub/ingestion/autogenerated/capability_summary.json +47 -19
  19. datahub/ingestion/graph/client.py +19 -3
  20. datahub/ingestion/sink/datahub_rest.py +2 -0
  21. datahub/ingestion/source/abs/config.py +1 -1
  22. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  23. datahub/ingestion/source/abs/source.py +9 -0
  24. datahub/ingestion/source/aws/glue.py +18 -2
  25. datahub/ingestion/source/aws/tag_entities.py +4 -4
  26. datahub/ingestion/source/data_lake_common/path_spec.py +6 -3
  27. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  28. datahub/ingestion/source/dbt/dbt_cloud.py +6 -3
  29. datahub/ingestion/source/dbt/dbt_common.py +10 -0
  30. datahub/ingestion/source/delta_lake/source.py +8 -1
  31. datahub/ingestion/source/dremio/dremio_source.py +19 -2
  32. datahub/ingestion/source/fivetran/fivetran.py +9 -3
  33. datahub/ingestion/source/fivetran/fivetran_log_api.py +4 -3
  34. datahub/ingestion/source/ge_data_profiler.py +8 -0
  35. datahub/ingestion/source/grafana/models.py +6 -0
  36. datahub/ingestion/source/hex/hex.py +1 -1
  37. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  38. datahub/ingestion/source/iceberg/iceberg.py +4 -4
  39. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  40. datahub/ingestion/source/mock_data/datahub_mock_data.py +26 -10
  41. datahub/ingestion/source/powerbi/powerbi.py +4 -1
  42. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  43. datahub/ingestion/source/redshift/datashares.py +1 -1
  44. datahub/ingestion/source/redshift/redshift.py +1 -0
  45. datahub/ingestion/source/salesforce.py +8 -0
  46. datahub/ingestion/source/slack/slack.py +7 -14
  47. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -4
  48. datahub/ingestion/source/sql/athena_properties_extractor.py +2 -2
  49. datahub/ingestion/source/sql/hive_metastore.py +8 -0
  50. datahub/ingestion/source/sql/teradata.py +8 -1
  51. datahub/ingestion/source/sql/trino.py +9 -0
  52. datahub/ingestion/source/tableau/tableau.py +1 -1
  53. datahub/ingestion/source/unity/config.py +36 -1
  54. datahub/ingestion/source/unity/proxy.py +332 -46
  55. datahub/ingestion/source/unity/proxy_types.py +12 -2
  56. datahub/ingestion/source/unity/source.py +91 -34
  57. datahub/ingestion/source/unity/tag_entities.py +5 -5
  58. datahub/ingestion/source/usage/starburst_trino_usage.py +2 -2
  59. datahub/ingestion/transformer/base_transformer.py +8 -5
  60. datahub/metadata/_internal_schema_classes.py +513 -513
  61. datahub/metadata/_urns/urn_defs.py +1684 -1684
  62. datahub/metadata/schema.avsc +16745 -16348
  63. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  64. datahub/sdk/entity_client.py +22 -7
  65. datahub/sdk/search_client.py +3 -0
  66. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  67. datahub/specific/datajob.py +15 -1
  68. datahub/specific/dataset.py +37 -59
  69. datahub/utilities/mapping.py +29 -2
  70. datahub/utilities/server_config_util.py +2 -1
  71. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/WHEEL +0 -0
  72. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/entry_points.txt +0 -0
  73. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/licenses/LICENSE +0 -0
  74. {acryl_datahub-1.2.0.3rc1.dist-info → acryl_datahub-1.2.0.4.dist-info}/top_level.txt +0 -0
@@ -72,7 +72,7 @@ class DataLakeProfilerConfig(ConfigModel):
  description="Whether to profile for the sample values for all columns.",
  )

- @pydantic.root_validator()
+ @pydantic.root_validator(skip_on_failure=True)
  def ensure_field_level_settings_are_normalized(
  cls: "DataLakeProfilerConfig", values: Dict[str, Any]
  ) -> Dict[str, Any]:
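
The skip_on_failure=True flag added above is what pydantic 2's v1-compatibility layer requires: a bare @root_validator() with the default pre=False is rejected when the class is defined, and the flag also means the validator only runs once field validation has succeeded. A minimal standalone sketch, assuming pydantic 2 is installed; the model names are illustrative, not from the DataHub codebase:

import pydantic
from pydantic import PydanticUserError

try:
    class BrokenConfig(pydantic.BaseModel):  # illustrative model, not real DataHub code
        include: str = "*"

        @pydantic.root_validator()  # pre=False, skip_on_failure=False -> rejected under pydantic 2
        def normalize(cls, values):
            return values
except PydanticUserError as err:
    print("rejected:", err.code)

class WorkingConfig(pydantic.BaseModel):  # illustrative model, not real DataHub code
    include: str = "*"

    @pydantic.root_validator(skip_on_failure=True)  # runs only when field validation succeeded
    def normalize(cls, values):
        return values
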
@@ -44,6 +44,7 @@ from datahub.ingestion.source.azure.abs_utils import (
  get_key_prefix,
  strip_abs_prefix,
  )
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
  from datahub.ingestion.source.data_lake_common.data_lake_utils import (
  ContainerWUCreator,
  add_partition_columns_to_schema,
@@ -128,6 +129,14 @@ class TableData:
  @support_status(SupportStatus.INCUBATING)
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
  @capability(SourceCapability.TAGS, "Can extract ABS object/container tags if enabled")
+ @capability(
+ SourceCapability.CONTAINERS,
+ "Extract ABS containers and folders",
+ subtype_modifier=[
+ SourceCapabilityModifier.FOLDER,
+ SourceCapabilityModifier.ABS_CONTAINER,
+ ],
+ )
  class ABSSource(StatefulIngestionSourceBase):
  source_config: DataLakeSourceConfig
  report: DataLakeSourceReport
@@ -395,7 +395,7 @@ class GlueSource(StatefulIngestionSourceBase):
  t = LakeFormationTag(
  key=tag_key,
  value=tag_value,
- catalog_id=catalog_id,
+ catalog=catalog_id,
  )
  tags.append(t)
  return tags
@@ -438,7 +438,7 @@ class GlueSource(StatefulIngestionSourceBase):
  t = LakeFormationTag(
  key=tag_key,
  value=tag_value,
- catalog_id=catalog_id,
+ catalog=catalog_id,
  )
  tags.append(t)
  return tags
@@ -522,6 +522,14 @@ class GlueSource(StatefulIngestionSourceBase):
  bucket = url.netloc
  key = url.path[1:]

+ # validate that we have a non-empty key
+ if not key:
+ self.report.num_job_script_location_invalid += 1
+ logger.warning(
+ f"Error parsing DAG for Glue job. The script {script_path} is not a valid S3 path for flow urn: {flow_urn}."
+ )
+ return None
+
  # download the script contents
  # see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object
  try:
@@ -533,6 +541,14 @@ class GlueSource(StatefulIngestionSourceBase):
  )
  self.report.num_job_script_failed_download += 1
  return None
+ except botocore.exceptions.ParamValidationError as e:
+ self.report_warning(
+ flow_urn,
+ f"Invalid S3 path for Glue job script {script_path}: {e}",
+ )
+ self.report.num_job_script_location_invalid += 1
+ return None
+
  script = obj["Body"].read().decode("utf-8")

  try:
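
For context on the new guard in the hunks above: the script location is split into url.netloc (the bucket) and url.path[1:] (the key), so an s3:// URL that points at a bucket root produces an empty key, which previously fell through to a failing download. A small standalone illustration using urllib.parse and made-up paths:

from urllib.parse import urlparse

for script_path in ("s3://my-bucket/scripts/etl_job.py", "s3://my-bucket", "s3://my-bucket/"):
    url = urlparse(script_path)
    bucket, key = url.netloc, url.path[1:]
    # an empty key is the case now counted as num_job_script_location_invalid
    print(f"{script_path!r}: bucket={bucket!r} key={key!r} -> {'ok' if key else 'skipped'}")
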
@@ -37,7 +37,7 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):

  tag_key: str
  tag_value: Optional[str] = None
- platform_instance: Optional[str]
+ platform_instance: Optional[str] = None
  catalog: Optional[str] = None
  exists_in_lake_formation: bool = False
  persisted: bool = False
@@ -88,8 +88,8 @@ class LakeFormationTagPlatformResourceId(BaseModel, ExternalEntityId):
  return existing_platform_resource

  return LakeFormationTagPlatformResourceId(
- tag_key=tag.key,
- tag_value=tag.value if tag.value is not None else None,
+ tag_key=str(tag.key),
+ tag_value=str(tag.value) if tag.value is not None else None,
  platform_instance=platform_instance,
  exists_in_lake_formation=exists_in_lake_formation,
  catalog=catalog,
@@ -227,7 +227,7 @@ class LakeFormationTagPlatformResource(BaseModel, ExternalEntity):
  datahub_urns: LinkedResourceSet
  managed_by_datahub: bool
  id: LakeFormationTagPlatformResourceId
- allowed_values: Optional[List[str]]
+ allowed_values: Optional[List[str]] = None

  def get_id(self) -> ExternalEntityId:
  return self.id
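
The "= None" defaults added in these hunks (and in the redshift datashares model further down) track a pydantic behavior change: under pydantic 2 an Optional annotation without a default is a required field, whereas pydantic 1 treated it as implicitly defaulting to None. A minimal sketch, assuming pydantic 2 and using illustrative models rather than the real ones:

from typing import Optional

import pydantic

class WithDefault(pydantic.BaseModel):  # illustrative
    platform_instance: Optional[str] = None  # optional under both pydantic 1 and 2

class WithoutDefault(pydantic.BaseModel):  # illustrative
    platform_instance: Optional[str]  # pydantic 2 treats this as required

print(WithDefault())  # platform_instance=None
try:
    WithoutDefault()
except pydantic.ValidationError as err:
    print("pydantic 2 reports", err.error_count(), "missing-field error(s)")
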
@@ -62,7 +62,6 @@ class SortKey(ConfigModel):

  date_format: Optional[str] = Field(
  default=None,
- type=str,
  description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
  )

@@ -260,7 +259,7 @@ class PathSpec(ConfigModel):
  ) -> Union[None, parse.Result, parse.Match]:
  return self.compiled_folder_include.parse(path)

- @pydantic.root_validator()
+ @pydantic.root_validator(skip_on_failure=True)
  def validate_no_double_stars(cls, values: Dict) -> Dict:
  if "include" not in values:
  return values
@@ -456,7 +455,11 @@ class PathSpec(ConfigModel):
  partition = partition.rsplit("/", 1)[0]
  for partition_key in partition.split("/"):
  if partition_key.find("=") != -1:
- partition_keys.append(tuple(partition_key.split("=")))
+ key_value = partition_key.split(
+ "=", 1
+ )  # Split into at most 2 parts
+ if len(key_value) == 2:
+ partition_keys.append((key_value[0], key_value[1]))
  else:
  partition_split = partition.rsplit("/", 1)
  if len(partition_split) == 1:
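
The maxsplit change above matters when a Hive-style partition value itself contains an equals sign: splitting on every "=" used to yield a tuple with more than two elements. A quick illustration with a made-up partition string:

partition_key = "run_ts=2024-01-01T00=30=00"

old = tuple(partition_key.split("="))     # ('run_ts', '2024-01-01T00', '30', '00')
new = tuple(partition_key.split("=", 1))  # ('run_ts', '2024-01-01T00=30=00')
print(old)
print(new)
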
@@ -19,6 +19,7 @@ from datahub.ingestion.api.source_helpers import (
  auto_workunit_reporter,
  )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
  from datahub.ingestion.source.datahub.config import DataHubSourceConfig
  from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader
  from datahub.ingestion.source.datahub.datahub_database_reader import (
@@ -39,7 +40,13 @@ logger = logging.getLogger(__name__)
  @platform_name("DataHub")
  @config_class(DataHubSourceConfig)
  @support_status(SupportStatus.TESTING)
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
+ @capability(
+ SourceCapability.CONTAINERS,
+ "Enabled by default",
+ subtype_modifier=[
+ SourceCapabilityModifier.DATABASE,
+ ],
+ )
  class DataHubSource(StatefulIngestionSourceBase):
  platform: str = "datahub"

@@ -370,9 +370,12 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
  name = node["alias"]

  comment = node.get("comment", "")
- description = node["description"]
- if node.get("sourceDescription"):
- description = node["sourceDescription"]
+
+ # In dbt sources, there are two types of descriptions:
+ # - description: table-level description (specific to the source table)
+ # - sourceDescription: schema-level description (describes the overall source schema)
+ # The table-level description should take precedence since it's more specific.
+ description = node["description"] or node.get("sourceDescription", "")

  if node["resourceType"] == "model":
  materialization = node["materializedType"]
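
The rewritten assignment flips the precedence: previously sourceDescription overrode the table-level description, now the table-level description wins and the schema-level sourceDescription is only a fallback. The same expression evaluated standalone, with made-up node dicts:

for node in (
    {"description": "Accounts table", "sourceDescription": "Raw Salesforce schema"},
    {"description": "", "sourceDescription": "Raw Salesforce schema"},
):
    description = node["description"] or node.get("sourceDescription", "")
    print(description)
# prints "Accounts table", then "Raw Salesforce schema"
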
@@ -120,6 +120,7 @@ logger = logging.getLogger(__name__)
  DBT_PLATFORM = "dbt"

  _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
+ _DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024  # 1MB


  @dataclass
@@ -1684,6 +1685,12 @@ class DBTSourceBase(StatefulIngestionSourceBase):
  def get_external_url(self, node: DBTNode) -> Optional[str]:
  pass

+ @staticmethod
+ def _truncate_code(code: str, max_length: int) -> str:
+ if len(code) > max_length:
+ return code[:max_length] + "..."
+ return code
+
  def _create_view_properties_aspect(
  self, node: DBTNode
  ) -> Optional[ViewPropertiesClass]:
@@ -1695,6 +1702,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
  compiled_code = try_format_query(
  node.compiled_code, platform=self.config.target_platform
  )
+ compiled_code = self._truncate_code(
+ compiled_code, _DBT_MAX_COMPILED_CODE_LENGTH
+ )

  materialized = node.materialization in {"table", "incremental", "snapshot"}
  view_properties = ViewPropertiesClass(
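
The helper above caps compiled SQL at 1,048,576 characters (1 MiB) and appends an ellipsis before the text is written into the view-properties aspect. The same logic, exercised standalone:

_DBT_MAX_COMPILED_CODE_LENGTH = 1 * 1024 * 1024  # 1MB

def _truncate_code(code: str, max_length: int) -> str:
    if len(code) > max_length:
        return code[:max_length] + "..."
    return code

big_sql = "SELECT 1 UNION ALL " * 100_000  # roughly 1.9 million characters
print(len(_truncate_code(big_sql, _DBT_MAX_COMPILED_CODE_LENGTH)))  # 1048579, i.e. the cap plus "..."
print(_truncate_code("SELECT 1", _DBT_MAX_COMPILED_CODE_LENGTH))    # short code passes through unchanged
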
@@ -29,6 +29,7 @@ from datahub.ingestion.source.aws.s3_util import (
  get_key_prefix,
  strip_s3_prefix,
  )
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
  from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
  from datahub.ingestion.source.delta_lake.config import DeltaLakeSourceConfig
  from datahub.ingestion.source.delta_lake.delta_lake_utils import (
@@ -85,7 +86,13 @@ OPERATION_STATEMENT_TYPES = {
  @config_class(DeltaLakeSourceConfig)
  @support_status(SupportStatus.INCUBATING)
  @capability(SourceCapability.TAGS, "Can extract S3 object/bucket tags if enabled")
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
+ @capability(
+ SourceCapability.CONTAINERS,
+ "Enabled by default",
+ subtype_modifier=[
+ SourceCapabilityModifier.FOLDER,
+ ],
+ )
  class DeltaLakeSource(StatefulIngestionSourceBase):
  """
  This plugin extracts:
@@ -22,6 +22,7 @@ from datahub.ingestion.api.source import (
  SourceReport,
  )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
  from datahub.ingestion.source.dremio.dremio_api import (
  DremioAPIOperations,
  DremioEdition,
@@ -86,11 +87,27 @@ class DremioSourceMapEntry:
  @platform_name("Dremio")
  @config_class(DremioSourceConfig)
  @support_status(SupportStatus.CERTIFIED)
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
+ @capability(
+ SourceCapability.CONTAINERS,
+ "Enabled by default",
+ )
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
- @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
+ @capability(
+ SourceCapability.LINEAGE_COARSE,
+ "Enabled by default",
+ subtype_modifier=[
+ SourceCapabilityModifier.TABLE,
+ ],
+ )
+ @capability(
+ SourceCapability.LINEAGE_FINE,
+ "Extract column-level lineage",
+ subtype_modifier=[
+ SourceCapabilityModifier.TABLE,
+ ],
+ )
  @capability(SourceCapability.OWNERSHIP, "Enabled by default")
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
  @capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
@@ -16,7 +16,11 @@ from datahub.ingestion.api.decorators import (
  platform_name,
  support_status,
  )
- from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+ from datahub.ingestion.api.source import (
+ MetadataWorkUnitProcessor,
+ SourceReport,
+ StructuredLogCategory,
+ )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.fivetran.config import (
  KNOWN_DATA_PLATFORM_MAPPING,
@@ -96,8 +100,10 @@ class FivetranSource(StatefulIngestionSourceBase):
  self.report.info(
  title="Guessing source platform for lineage",
  message="We encountered a connector type that we don't fully support yet. "
- "We will attempt to guess the platform based on the connector type.",
- context=f"{connector.connector_name} (connector_id: {connector.connector_id}, connector_type: {connector.connector_type})",
+ "We will attempt to guess the platform based on the connector type. "
+ "Note that we use connector_id as the key not connector_name which you may see in the UI of Fivetran. ",
+ context=f"connector_name: {connector.connector_name} (connector_id: {connector.connector_id}, connector_type: {connector.connector_type})",
+ log_category=StructuredLogCategory.LINEAGE,
  )
  source_details.platform = connector.connector_type

@@ -69,9 +69,10 @@ class FivetranLogAPI:
  fivetran_log_query.set_schema(bigquery_destination_config.dataset)

  # The "database" should be the BigQuery project name.
- fivetran_log_database = engine.execute(
- "SELECT @@project_id"
- ).fetchone()[0]
+ result = engine.execute("SELECT @@project_id").fetchone()
+ if result is None:
+ raise ValueError("Failed to retrieve BigQuery project ID")
+ fivetran_log_database = result[0]
  else:
  raise ConfigurationError(
  f"Destination platform '{destination_platform}' is not yet supported."
@@ -216,6 +216,14 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> int:
  )
  ).scalar()
  )
+ elif self.engine.dialect.name.lower() == DATABRICKS:
+ return convert_to_json_serializable(
+ self.engine.execute(
+ sa.select(sa.func.approx_count_distinct(sa.column(column))).select_from(
+ self._table
+ )
+ ).scalar()
+ )
  return convert_to_json_serializable(
  self.engine.execute(
  sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(
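
The new branch swaps the exact COUNT(DISTINCT ...) for Databricks' approximate variant. The SQL that the SQLAlchemy construct compiles to can be inspected without a live connection; a sketch assuming SQLAlchemy 1.4 or newer, with made-up column and table names:

import sqlalchemy as sa

stmt = sa.select(sa.func.approx_count_distinct(sa.column("user_id"))).select_from(
    sa.table("events")
)
print(stmt)
# prints roughly:
#   SELECT approx_count_distinct(user_id) AS approx_count_distinct_1
#   FROM events
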
@@ -12,6 +12,7 @@ from typing import Any, Dict, List, Optional

  from pydantic import BaseModel, Field

+ from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
  from datahub.emitter.mcp_builder import ContainerKey

  # Grafana-specific type definitions for better type safety
@@ -106,6 +107,11 @@ class Folder(BaseModel):
  title: str
  description: Optional[str] = ""

+ if PYDANTIC_VERSION_2:
+ from pydantic import ConfigDict
+
+ model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
+

  class FolderKey(ContainerKey):
  """Key for identifying a Grafana folder."""
@@ -69,7 +69,7 @@ class HexSourceConfig(
  )
  include_components: bool = Field(
  default=True,
- desciption="Include Hex Components in the ingestion",
+ description="Include Hex Components in the ingestion",
  )
  page_size: int = Field(
  default=HEX_API_PAGE_SIZE_DEFAULT,
@@ -97,7 +97,7 @@ class HexQueryFetcher:
  if not query_urns or not entities_by_urn:
  self.report.warning(
  title="No Queries found with Hex as origin",
- message="No lineage because of no Queries found with Hex as origin in the given time range; you may consider extending the time range to fetch more queries.",
+ message="No lineage because of no Queries found with Hex as origin in the given time range. You may need to set use_queries_v2: true on your warehouse ingestion or you may consider extending the time range to fetch more queries.",
  context=str(
  dict(
  workspace_name=self.workspace_name,
@@ -524,11 +524,11 @@ class IcebergSource(StatefulIngestionSourceBase):
  custom_properties["format-version"] = str(table.metadata.format_version)
  custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
  last_modified: Optional[int] = table.metadata.last_updated_ms
- if table.current_snapshot():
- custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
- custom_properties["manifest-list"] = table.current_snapshot().manifest_list
+ if current_snapshot := table.current_snapshot():
+ custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id)
+ custom_properties["manifest-list"] = current_snapshot.manifest_list
  if not last_modified:
- last_modified = int(table.current_snapshot().timestamp_ms)
+ last_modified = int(current_snapshot.timestamp_ms)
  if "created-at" in custom_properties:
  try:
  dt = dateutil_parser.isoparse(custom_properties["created-at"])
@@ -1,5 +1,5 @@
  from functools import lru_cache
- from typing import ClassVar, Optional, TextIO
+ from typing import ClassVar, Optional, TextIO, Type

  from liquid import Environment
  from liquid.ast import Node
@@ -20,16 +20,27 @@ class CustomTagException(Exception):
  class ConditionNode(Node):
  def __init__(self, tok: Token, sql_or_lookml_reference: str, filter_name: str):
  self.tok = tok
-
  self.sql_or_lookml_reference = sql_or_lookml_reference
-
  self.filter_name = filter_name

  def render_to_output(self, context: Context, buffer: TextIO) -> Optional[bool]:
  # This implementation will make sure that sql parse work correctly if looker condition tag
  # is used in lookml sql field
  buffer.write(f"{self.sql_or_lookml_reference}='dummy_value'")
+ return True

+
+ class IncrementConditionNode(Node):
+ def __init__(self, tok: Token, sql_or_lookml_reference: str):
+ self.tok = tok
+ self.sql_or_lookml_reference = sql_or_lookml_reference
+
+ def render_to_output(self, context: Context, buffer: TextIO) -> Optional[bool]:
+ # For incrementcondition, we need to generate a condition that would be used
+ # in incremental PDT updates. This typically involves date/time comparisons.
+ # We'll render it as a date comparison with a placeholder value
+ # See details in Looker documentation for incrementcondition tag -> cloud.google.com/looker/docs/reference/param-view-increment-key
+ buffer.write(f"{self.sql_or_lookml_reference} > '2023-01-01'")
  return True


@@ -44,7 +55,6 @@ class ConditionTag(Tag):
  This class render the below tag as order.region='ap-south-1' if order_region is provided in config.liquid_variables
  as order_region: 'ap-south-1'
  {% condition order_region %} order.region {% endcondition %}
-
  """

  TAG_START: ClassVar[str] = "condition"
@@ -79,7 +89,48 @@ class ConditionTag(Tag):
  )


- custom_tags = [ConditionTag]
+ class IncrementConditionTag(Tag):
+ """
+ IncrementConditionTag is the equivalent implementation of looker's custom liquid tag "incrementcondition".
+ Refer doc: https://cloud.google.com/looker/docs/incremental-pdts#using_the_incrementcondition_tag
+
+ This tag is used for incremental PDTs to determine which records should be updated.
+ It typically works with date/time fields to filter data that has changed since the last update.
+
+ Example usage in Looker:
+ {% incrementcondition created_at %} order.created_at {% endincrementcondition %}
+
+ This would generate SQL like: order.created_at > '2023-01-01 00:00:00'
+ """
+
+ TAG_START: ClassVar[str] = "incrementcondition"
+ TAG_END: ClassVar[str] = "endincrementcondition"
+ name: str = "incrementcondition"
+
+ def __init__(self, env: Environment):
+ super().__init__(env)
+ self.parser = get_parser(self.env)
+
+ def parse(self, stream: TokenStream) -> Node:
+ expect(stream, TOKEN_TAG, value=IncrementConditionTag.TAG_START)
+
+ start_token = stream.current
+
+ stream.next_token()
+ expect(stream, TOKEN_LITERAL)
+ sql_or_lookml_reference: str = stream.current.value.strip()
+
+ stream.next_token()
+ expect(stream, TOKEN_TAG, value=IncrementConditionTag.TAG_END)
+
+ return IncrementConditionNode(
+ tok=start_token,
+ sql_or_lookml_reference=sql_or_lookml_reference,
+ )
+
+
+ # Updated custom_tags list to include both tags
+ custom_tags: list[Type[Tag]] = [ConditionTag, IncrementConditionTag]


  @string_filter
@@ -13,7 +13,7 @@ from datahub.ingestion.api.decorators import (
  platform_name,
  support_status,
  )
- from datahub.ingestion.api.source import Source, SourceReport
+ from datahub.ingestion.api.source import Source, SourceReport, StructuredLogCategory
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
  from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
@@ -35,6 +35,8 @@ from datahub.utilities.str_enum import StrEnum

  logger = logging.getLogger(__name__)

+ PLATFORM_NAME = "fake"
+

  class SubTypePattern(StrEnum):
  ALTERNATING = "alternating"
@@ -137,6 +139,10 @@ class DataHubMockDataConfig(ConfigModel):
  default=0,
  description="Number of warnings to add in report for testing",
  )
+ num_info: int = Field(
+ default=0,
+ description="Number of info to add in report for testing",
+ )

  gen_1: LineageConfigGen1 = Field(
  default_factory=LineageConfigGen1,
@@ -144,7 +150,7 @@ class DataHubMockDataConfig(ConfigModel):
  )


- @platform_name("DataHubMockData")
+ @platform_name(PLATFORM_NAME)
  @config_class(DataHubMockDataConfig)
  @support_status(SupportStatus.TESTING)
  class DataHubMockDataSource(Source):
@@ -159,6 +165,9 @@ class DataHubMockDataSource(Source):
  self.report = DataHubMockDataReport()

  def get_workunits(self) -> Iterable[MetadataWorkUnit]:
+ # We don't want any implicit aspects to be produced
+ # so we are not using get_workunits_internal
+
  if self.config.throw_uncaught_exceptions:
  raise Exception("This is a test exception")

@@ -176,10 +185,17 @@ class DataHubMockDataSource(Source):
  message="This is test warning",
  title="Test Warning",
  context=f"This is test warning {i}",
+ log_category=StructuredLogCategory.LINEAGE,
+ )
+
+ if self.config.num_info > 0:
+ for i in range(self.config.num_info):
+ self.report.info(
+ message="This is test info",
+ title="Test Info",
+ context=f"This is test info {i}",
  )

- # We don't want any implicit aspects to be produced
- # so we are not using get_workunits_internal
  if self.config.gen_1.enabled:
  for wu in self._data_gen_1():
  if self.report.first_urn_seen is None:
@@ -309,7 +325,7 @@ class DataHubMockDataSource(Source):
  table_level, table_index, subtype_pattern, subtype_types, level_subtypes
  )

- urn = make_dataset_urn(platform="fake", name=table_name)
+ urn = make_dataset_urn(platform=PLATFORM_NAME, name=table_name)
  mcp = MetadataChangeProposalWrapper(
  entityUrn=urn,
  entityType="dataset",
@@ -433,7 +449,7 @@ class DataHubMockDataSource(Source):

  def _get_status_aspect(self, table: str) -> MetadataWorkUnit:
  urn = make_dataset_urn(
- platform="fake",
+ platform=PLATFORM_NAME,
  name=table,
  )
  mcp = MetadataChangeProposalWrapper(
@@ -448,7 +464,7 @@ class DataHubMockDataSource(Source):
  ) -> MetadataWorkUnit:
  mcp = MetadataChangeProposalWrapper(
  entityUrn=make_dataset_urn(
- platform="fake",
+ platform=PLATFORM_NAME,
  name=downstream_table,
  ),
  entityType="dataset",
@@ -456,7 +472,7 @@ class DataHubMockDataSource(Source):
  upstreams=[
  UpstreamClass(
  dataset=make_dataset_urn(
- platform="fake",
+ platform=PLATFORM_NAME,
  name=upstream_table,
  ),
  type=DatasetLineageTypeClass.TRANSFORMED,
@@ -468,7 +484,7 @@ class DataHubMockDataSource(Source):

  def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
  urn = make_dataset_urn(
- platform="fake",
+ platform=PLATFORM_NAME,
  name=table,
  )
  mcp = MetadataChangeProposalWrapper(
@@ -485,7 +501,7 @@ class DataHubMockDataSource(Source):

  def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
  urn = make_dataset_urn(
- platform="fake",
+ platform=PLATFORM_NAME,
  name=table,
  )
  mcp = MetadataChangeProposalWrapper(
@@ -1226,7 +1226,10 @@ class Mapper:
  @platform_name("PowerBI")
  @config_class(PowerBiDashboardSourceConfig)
  @support_status(SupportStatus.CERTIFIED)
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
+ @capability(
+ SourceCapability.CONTAINERS,
+ "Enabled by default",
+ )
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
  @capability(SourceCapability.OWNERSHIP, "Enabled by default")
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
@@ -27,10 +27,8 @@ class CatalogItem(BaseModel):
  is_favorite: bool = Field(alias="IsFavorite")
  user_info: Any = Field(None, alias="UserInfo")
  display_name: Optional[str] = Field(None, alias="DisplayName")
- has_data_sources: bool = Field(default=False, alias="HasDataSources")
- data_sources: Optional[List["DataSource"]] = Field(
- default_factory=list, alias="DataSources"
- )
+ has_data_sources: bool = Field(False, alias="HasDataSources")
+ data_sources: Optional[List["DataSource"]] = Field(None, alias="DataSources")

  @validator("display_name", always=True)
  def validate_diplay_name(cls, value, values):
@@ -26,7 +26,7 @@ from datahub.utilities.search_utils import LogicalOperator

  class OutboundSharePlatformResource(BaseModel):
  namespace: str
- platform_instance: Optional[str]
+ platform_instance: Optional[str] = None
  env: str
  source_database: str
  share_name: str
@@ -132,6 +132,7 @@ logger: logging.Logger = logging.getLogger(__name__)
  "Enabled by default",
  subtype_modifier=[
  SourceCapabilityModifier.DATABASE,
+ SourceCapabilityModifier.SCHEMA,
  ],
  )
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")