acryl-datahub-airflow-plugin 1.3.1.3rc2__py3-none-any.whl → 1.3.1.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +303 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +65 -0
  3. datahub_airflow_plugin/_airflow_compat.py +32 -0
  4. datahub_airflow_plugin/_airflow_shims.py +64 -31
  5. datahub_airflow_plugin/_airflow_version_specific.py +184 -0
  6. datahub_airflow_plugin/_config.py +97 -19
  7. datahub_airflow_plugin/_constants.py +16 -0
  8. datahub_airflow_plugin/_datahub_ol_adapter.py +14 -2
  9. datahub_airflow_plugin/_version.py +1 -1
  10. datahub_airflow_plugin/airflow2/__init__.py +6 -0
  11. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +402 -0
  12. datahub_airflow_plugin/airflow2/_airflow_compat.py +95 -0
  13. datahub_airflow_plugin/airflow2/_extractors.py +477 -0
  14. datahub_airflow_plugin/airflow2/_legacy_shims.py +20 -0
  15. datahub_airflow_plugin/airflow2/_openlineage_compat.py +123 -0
  16. datahub_airflow_plugin/airflow2/_provider_shims.py +29 -0
  17. datahub_airflow_plugin/airflow2/_shims.py +88 -0
  18. datahub_airflow_plugin/airflow2/datahub_listener.py +1072 -0
  19. datahub_airflow_plugin/airflow3/__init__.py +6 -0
  20. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +408 -0
  21. datahub_airflow_plugin/airflow3/_airflow_compat.py +108 -0
  22. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +153 -0
  23. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +273 -0
  24. datahub_airflow_plugin/airflow3/_shims.py +82 -0
  25. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +88 -0
  26. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +308 -0
  27. datahub_airflow_plugin/airflow3/datahub_listener.py +1452 -0
  28. datahub_airflow_plugin/client/airflow_generator.py +147 -43
  29. datahub_airflow_plugin/datahub_listener.py +19 -790
  30. datahub_airflow_plugin/example_dags/__init__.py +32 -0
  31. datahub_airflow_plugin/example_dags/airflow2/__init__.py +8 -0
  32. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +54 -0
  33. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +43 -0
  34. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +69 -0
  35. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +69 -0
  36. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +81 -0
  37. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +68 -0
  38. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +99 -0
  39. datahub_airflow_plugin/example_dags/airflow3/__init__.py +8 -0
  40. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +51 -0
  41. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +51 -0
  42. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +89 -0
  43. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +12 -4
  44. datahub_airflow_plugin/hooks/datahub.py +11 -2
  45. datahub_airflow_plugin/operators/datahub.py +20 -3
  46. acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/METADATA +0 -90
  47. acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info/RECORD +0 -33
  48. datahub_airflow_plugin/_extractors.py +0 -336
  49. {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.3rc2.dist-info → acryl_datahub_airflow_plugin-1.3.1.5.dist-info}/top_level.txt +0 -0
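
The headline change in this release is the split of the listener into version-specific packages: the top-level datahub_airflow_plugin/datahub_listener.py shrinks to a thin shim (+19 -790) while full implementations land in airflow2/datahub_listener.py and airflow3/datahub_listener.py. A minimal sketch of the dispatch this layout implies, assuming the shim selects a module by Airflow major version (the real selection logic lives in _airflow_version_specific.py and _airflow_shims.py and may differ; the get_airflow_plugin_listener entry-point name is assumed here):

    # Hypothetical sketch of the Airflow-major-version dispatch implied by the
    # new layout; the actual logic lives in _airflow_version_specific.py and
    # may differ from this.
    import airflow

    _MAJOR = int(airflow.__version__.split(".")[0])

    if _MAJOR >= 3:
        # Airflow 3.x path: uses the airflow3 shims and OpenLineage patches
        from datahub_airflow_plugin.airflow3.datahub_listener import (  # noqa: F401
            get_airflow_plugin_listener,  # assumed entry-point name
        )
    else:
        # Airflow 2.x path: legacy OpenLineage or the provider package
        from datahub_airflow_plugin.airflow2.datahub_listener import (  # noqa: F401
            get_airflow_plugin_listener,  # assumed entry-point name
        )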
datahub_airflow_plugin/airflow2/_extractors.py (new file)
@@ -0,0 +1,477 @@
+ import contextlib
+ import logging
+ import sys
+ import unittest.mock
+ from typing import TYPE_CHECKING, Any, Dict, Optional
+
+ from openlineage.client.facet import (
+     ExtractionError,
+     ExtractionErrorRunFacet,
+     SqlJobFacet,
+ )
+
+ import datahub.emitter.mce_builder as builder
+ from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
+     get_platform_from_sqlalchemy_uri,
+ )
+ from datahub.sql_parsing.sqlglot_lineage import (
+     SqlParsingResult,
+     create_lineage_sql_parsed_result,
+ )
+ from datahub_airflow_plugin._constants import SQL_PARSING_RESULT_KEY
+ from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
+ from datahub_airflow_plugin.airflow2._openlineage_compat import (
+     USE_OPENLINEAGE_PROVIDER,
+     BaseExtractor,
+     OLExtractorManager,
+     OperatorLineage,
+     SnowflakeExtractor,
+     SqlExtractor,
+     TaskMetadata,
+     get_operator_class,
+     try_import_from_string,
+ )
+ from datahub_airflow_plugin.airflow2._shims import Operator
+
+ if TYPE_CHECKING:
+     from airflow.models import DagRun, TaskInstance
+
+     from datahub.ingestion.graph.client import DataHubGraph
+
+     # For type checking, define a union type that covers both versions
+     if sys.version_info >= (3, 10):
+         from typing import TypeAlias
+     else:
+         from typing_extensions import TypeAlias
+
+     # Define proper type aliases for the union type
+     # Note: BaseExtractor, OLExtractorManager, etc. are already imported above at runtime
+     from typing import Union
+
+     ExtractResult: TypeAlias = Union[
+         Any, Any
+     ]  # Will be TaskMetadata or OperatorLineage at runtime
+
+ logger = logging.getLogger(__name__)
+ _DATAHUB_GRAPH_CONTEXT_KEY = "datahub_graph"
+
+ # Runtime type alias for the return type of extract() methods
+ if not TYPE_CHECKING:
+     if USE_OPENLINEAGE_PROVIDER:
+         ExtractResult = OperatorLineage
+     else:
+         ExtractResult = TaskMetadata
+
+
+ class ExtractorManager(OLExtractorManager):
+     # TODO: On Airflow 2.7, the OLExtractorManager is part of the built-in Airflow API.
+     # When available, we should use that instead. The same goes for most of the OL
+     # extractors.
+
+     def __init__(
+         self,
+         patch_sql_parser: bool = True,
+         patch_snowflake_schema: bool = True,
+         extract_athena_operator: bool = True,
+         extract_bigquery_insert_job_operator: bool = True,
+         extract_teradata_operator: bool = True,
+     ):
+         super().__init__()
+
+         # Store patch/extractor configuration
+         self._patch_sql_parser = patch_sql_parser
+         self._patch_snowflake_schema = patch_snowflake_schema
+         self._extract_athena_operator = extract_athena_operator
+         self._extract_bigquery_insert_job_operator = (
+             extract_bigquery_insert_job_operator
+         )
+         self._extract_teradata_operator = extract_teradata_operator
+
+         # Legacy OpenLineage has a task_to_extractor attribute; the OpenLineage Provider doesn't.
+         # Register custom extractors only for Legacy OpenLineage (the Provider has its own).
+         if not USE_OPENLINEAGE_PROVIDER:
+             _sql_operator_overrides = [
+                 # The OL BigQuery extractor has some complex logic to detect
+                 # the BigQuery job_id and fetch lineage from there. However, it can't
+                 # generate CLL, so we disable it and use our own extractor instead.
+                 "BigQueryOperator",
+                 "BigQueryExecuteQueryOperator",
+                 # Athena also does something similar.
+                 "AWSAthenaOperator",
+                 # Additional types that OL doesn't support. This is only necessary because
+                 # on older versions of Airflow, these operators don't inherit from SQLExecuteQueryOperator.
+                 "SqliteOperator",
+             ]
+             for operator in _sql_operator_overrides:
+                 self.task_to_extractor.extractors[operator] = GenericSqlExtractor  # type: ignore[attr-defined]
+
+         # Register custom extractors based on configuration
+         if self._extract_athena_operator:
+             self.task_to_extractor.extractors["AthenaOperator"] = (  # type: ignore[attr-defined]
+                 AthenaOperatorExtractor
+             )
+
+         if self._extract_bigquery_insert_job_operator:
+             self.task_to_extractor.extractors["BigQueryInsertJobOperator"] = (  # type: ignore[attr-defined]
+                 BigQueryInsertJobOperatorExtractor
+             )
+
+         if self._extract_teradata_operator:
+             self.task_to_extractor.extractors["TeradataOperator"] = (
+                 TeradataOperatorExtractor
+             )
+
+         self._graph: Optional["DataHubGraph"] = None
+
+     @contextlib.contextmanager
+     def _patch_extractors(self):
+         with contextlib.ExitStack() as stack:
+             # Patch the SqlExtractor.extract() method if configured and available
+             if self._patch_sql_parser and SqlExtractor is not None:
+                 stack.enter_context(
+                     unittest.mock.patch.object(
+                         SqlExtractor,
+                         "extract",
+                         _sql_extractor_extract,
+                     )
+                 )
+
+             # Patch the SnowflakeExtractor.default_schema property if configured and available
+             if self._patch_snowflake_schema and SnowflakeExtractor is not None:
+                 stack.enter_context(
+                     unittest.mock.patch.object(
+                         SnowflakeExtractor,
+                         "default_schema",
+                         property(_snowflake_default_schema),
+                     )
+                 )
+
+             yield
+
+     def extract_metadata(  # type: ignore[override]
+         self,
+         dagrun: "DagRun",
+         task: "Operator",
+         complete: bool = False,
+         task_instance: Optional["TaskInstance"] = None,
+         task_uuid: Optional[str] = None,
+         graph: Optional["DataHubGraph"] = None,
+     ) -> ExtractResult:
+         self._graph = graph
+         with self._patch_extractors():
+             if USE_OPENLINEAGE_PROVIDER:
+                 # OpenLineage Provider: does not have a task_uuid parameter.
+                 # In Airflow 3.x, the 'complete' parameter type changed from bool to TaskInstanceState.
+                 return super().extract_metadata(dagrun, task, complete, task_instance)  # type: ignore[call-arg,arg-type]
+             else:
+                 # Legacy OpenLineage: has a task_uuid parameter.
+                 return super().extract_metadata(  # type: ignore[call-arg,arg-type]
+                     dagrun,
+                     task,
+                     complete,  # type: ignore[arg-type]
+                     task_instance,
+                     task_uuid,
+                 )
+
+     def _get_extractor(self, task: "Operator") -> Optional[BaseExtractor]:
+         # For Legacy OpenLineage: register GenericSqlExtractor as a fallback for
+         # any operator that inherits from SQLExecuteQueryOperator.
+         # For the OpenLineage Provider: rely on the SQLParser patch approach instead.
+         if not USE_OPENLINEAGE_PROVIDER:
+             clazz = get_operator_class(task)  # type: ignore[arg-type]
+             SQLExecuteQueryOperator = try_import_from_string(
+                 "airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator"
+             )
+             if SQLExecuteQueryOperator and issubclass(clazz, SQLExecuteQueryOperator):
+                 # Legacy OpenLineage: register GenericSqlExtractor in task_to_extractor.extractors
+                 self.task_to_extractor.extractors.setdefault(  # type: ignore[attr-defined]
+                     clazz.__name__, GenericSqlExtractor
+                 )
+
+         extractor = super()._get_extractor(task)
+
+         # For the OpenLineage Provider: if no extractor was found, check whether this is a SQL
+         # operator that should use GenericSqlExtractor (e.g. SqliteOperator, which the provider
+         # doesn't support).
+         if (
+             USE_OPENLINEAGE_PROVIDER
+             and extractor is None
+             and GenericSqlExtractor is not None
+         ):
+             clazz = get_operator_class(task)  # type: ignore[arg-type]
+             # Check if this is SqliteOperator (the provider doesn't have an extractor for it)
+             if clazz.__name__ == "SqliteOperator":
+                 # Create a GenericSqlExtractor instance for this operator
+                 extractor = GenericSqlExtractor(task)  # type: ignore[call-arg]
+
+         if extractor and not USE_OPENLINEAGE_PROVIDER:
+             # set_context only exists in Legacy OpenLineage
+             extractor.set_context(_DATAHUB_GRAPH_CONTEXT_KEY, self._graph)  # type: ignore[attr-defined]
+         return extractor
+
+
+ if SqlExtractor is not None:
+
+     class GenericSqlExtractor(SqlExtractor):  # type: ignore
+         # Note that the extract() method is patched elsewhere.
+
+         @property
+         def default_schema(self):
+             return super().default_schema
+
+         def _get_scheme(self) -> Optional[str]:
+             # Best-effort conversion to DataHub platform names.
+
+             with contextlib.suppress(Exception):
+                 if self.hook:
+                     if hasattr(self.hook, "get_uri"):
+                         uri = self.hook.get_uri()
+                         return get_platform_from_sqlalchemy_uri(uri)
+
+             return self.conn.conn_type or super().dialect
+
+         def _get_database(self) -> Optional[str]:
+             if self.conn:
+                 # For BigQuery, the "database" is the project name.
+                 if hasattr(self.conn, "project_id"):
+                     return self.conn.project_id
+
+                 return self.conn.schema
+             return None
+
+ else:
+     # SqlExtractor is not available (OpenLineage Provider package)
+     GenericSqlExtractor = None  # type: ignore
+
+
+ def _sql_extractor_extract(self: "SqlExtractor") -> Optional[ExtractResult]:
+     # Why not override the OL sql_parse method directly, instead of overriding
+     # extract()? A few reasons:
+     #
+     # 1. We would want to pass the default_db and graph instance into our sql parser
+     #    method. The OL code doesn't pass the default_db (despite having it available),
+     #    and it's not clear how to get the graph instance into that method.
+     # 2. OL has some janky logic to fetch table schemas as part of the sql extractor.
+     #    We don't want that behavior and this lets us disable it.
+     # 3. Our SqlParsingResult already has DataHub urns, whereas using SqlMeta would
+     #    require us to convert those urns to OL uris, just for them to get converted
+     #    back to urns later on in our processing.
+
+     task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
+     sql = self.operator.sql
+
+     default_database = getattr(self.operator, "database", None)
+     if not default_database:
+         default_database = self.database
+     default_schema = self.default_schema
+
+     # TODO: Add better handling for sql being a list of statements.
+     if isinstance(sql, list):
+         logger.info(f"Got list of SQL statements for {task_name}. Using first one.")
+         sql = sql[0]
+
+     # Run the SQL parser.
+     scheme = self.scheme
+     platform = OL_SCHEME_TWEAKS.get(scheme, scheme)
+
+     return _parse_sql_into_task_metadata(
+         self,
+         sql,
+         platform=platform,
+         default_database=default_database,
+         default_schema=default_schema,
+     )
+
+
+ def _normalize_sql(sql: str) -> str:
+     """Normalize SQL for logging (strip extra whitespace)."""
+     if SqlExtractor is not None and hasattr(SqlExtractor, "_normalize_sql"):
+         return SqlExtractor._normalize_sql(sql)
+     # Fallback normalization
+     return " ".join(sql.split())
+
+
+ def _create_lineage_metadata(
+     task_name: str,
+     run_facets: Dict[str, Any],
+     job_facets: Dict[str, Any],
+ ) -> Optional[ExtractResult]:
+     """Create TaskMetadata (Legacy OpenLineage) or OperatorLineage (OpenLineage Provider)."""
+     if USE_OPENLINEAGE_PROVIDER:
+         # OpenLineage Provider: return OperatorLineage (no name field)
+         return OperatorLineage(  # type: ignore
+             inputs=[],
+             outputs=[],
+             run_facets=run_facets,
+             job_facets=job_facets,
+         )
+     else:
+         # Legacy OpenLineage: return TaskMetadata (with a name field)
+         return TaskMetadata(  # type: ignore
+             name=task_name,
+             inputs=[],
+             outputs=[],
+             run_facets=run_facets,
+             job_facets=job_facets,
+         )
+
+
+ def _parse_sql_into_task_metadata(
+     self: "BaseExtractor",
+     sql: str,
+     platform: str,
+     default_database: Optional[str],
+     default_schema: Optional[str],
+ ) -> Optional[ExtractResult]:
+     task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
+
+     run_facets = {}
+     job_facets = {"sql": SqlJobFacet(query=_normalize_sql(sql))}
+
+     # Get the graph from the context (Legacy OpenLineage only)
+     graph = None
+     if hasattr(self, "context"):
+         graph = self.context.get(_DATAHUB_GRAPH_CONTEXT_KEY, None)  # type: ignore[attr-defined]
+
+     self.log.debug(
+         "Running the SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
+         "with graph client" if graph else "in offline mode",
+         platform,
+         default_database,
+         default_schema,
+         sql,
+     )
+     sql_parsing_result: SqlParsingResult = create_lineage_sql_parsed_result(
+         query=sql,
+         graph=graph,
+         platform=platform,
+         platform_instance=None,
+         env=builder.DEFAULT_ENV,
+         default_db=default_database,
+         default_schema=default_schema,
+     )
+     self.log.debug(f"Got sql lineage {sql_parsing_result}")
+
+     if sql_parsing_result.debug_info.error:
+         error = sql_parsing_result.debug_info.error
+         run_facets["extractionError"] = ExtractionErrorRunFacet(
+             totalTasks=1,
+             failedTasks=1,
+             errors=[
+                 ExtractionError(
+                     errorMessage=str(error),
+                     stackTrace=None,
+                     task="datahub_sql_parser",
+                     taskNumber=None,
+                 )
+             ],
+         )
+
+     # Save the sql_parsing_result to the facets dict. It is removed from the
+     # facet dict in the extractor's processing logic.
+     run_facets[SQL_PARSING_RESULT_KEY] = sql_parsing_result  # type: ignore
+
+     return _create_lineage_metadata(task_name, run_facets, job_facets)
+
+
+ class BigQueryInsertJobOperatorExtractor(BaseExtractor):
+     def extract(self) -> Optional[ExtractResult]:
+         from airflow.providers.google.cloud.operators.bigquery import (
+             BigQueryInsertJobOperator,  # type: ignore
+         )
+
+         operator: "BigQueryInsertJobOperator" = self.operator
+         sql = operator.configuration.get("query", {}).get("query")
+         if not sql:
+             self.log.warning("No query found in BigQueryInsertJobOperator")
+             return None
+
+         destination_table = operator.configuration.get("query", {}).get(
+             "destinationTable"
+         )
+         destination_table_urn = None
+         if destination_table:
+             project_id = destination_table.get("projectId")
+             dataset_id = destination_table.get("datasetId")
+             table_id = destination_table.get("tableId")
+
+             if project_id and dataset_id and table_id:
+                 destination_table_urn = builder.make_dataset_urn(
+                     platform="bigquery",
+                     name=f"{project_id}.{dataset_id}.{table_id}",
+                     env=builder.DEFAULT_ENV,
+                 )
+
+         task_metadata = _parse_sql_into_task_metadata(
+             self,
+             sql,
+             platform="bigquery",
+             default_database=operator.project_id,
+             default_schema=None,
+         )
+
+         if destination_table_urn and task_metadata:
+             sql_parsing_result = task_metadata.run_facets.get(SQL_PARSING_RESULT_KEY)
+             if sql_parsing_result and isinstance(sql_parsing_result, SqlParsingResult):
+                 sql_parsing_result.out_tables.append(destination_table_urn)
+
+         return task_metadata
+
+
+ class AthenaOperatorExtractor(BaseExtractor):
+     def extract(self) -> Optional[ExtractResult]:
+         from airflow.providers.amazon.aws.operators.athena import (
+             AthenaOperator,  # type: ignore
+         )
+
+         operator: "AthenaOperator" = self.operator
+         sql = operator.query
+         if not sql:
+             self.log.warning("No query found in AthenaOperator")
+             return None
+
+         return _parse_sql_into_task_metadata(
+             self,
+             sql,
+             platform="athena",
+             default_database=None,
+             default_schema=self.operator.database,
+         )
+
+
+ def _snowflake_default_schema(self: "SnowflakeExtractor") -> Optional[str]:
+     if hasattr(self.operator, "schema") and self.operator.schema is not None:
+         return self.operator.schema
+     return (
+         self.conn.extra_dejson.get("extra__snowflake__schema", "")
+         or self.conn.extra_dejson.get("schema", "")
+         or self.conn.schema
+     )
+     # TODO: Should we try a fallback of:
+     # execute_query_on_hook(self.hook, "SELECT current_schema();")[0][0]
+
+
+ class TeradataOperatorExtractor(BaseExtractor):
+     """Extractor for Teradata SQL operations.
+
+     Extracts lineage from TeradataOperator tasks by parsing the SQL queries
+     and understanding Teradata's two-tier database.table naming convention.
+     """
+
+     def extract(self) -> Optional[ExtractResult]:
+         from airflow.providers.teradata.operators.teradata import TeradataOperator
+
+         operator: "TeradataOperator" = self.operator
+         sql = operator.sql
+         if not sql:
+             self.log.warning("No query found in TeradataOperator")
+             return None
+
+         return _parse_sql_into_task_metadata(
+             self,
+             sql,
+             platform="teradata",
+             default_database=None,
+             default_schema=None,
+         )
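
Everything in the hunk above funnels into DataHub's sqlglot-based parser via create_lineage_sql_parsed_result, with the same arguments the extractors pass. A standalone sketch of that call against the acryl-datahub package; the SQL statement and the default_db/default_schema values are illustrative:

    # Illustrative call to the parser the extractors above delegate to.
    # graph=None runs the parser in offline mode; passing a DataHubGraph client
    # instead lets it resolve table schemas for column-level lineage.
    import datahub.emitter.mce_builder as builder
    from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

    result = create_lineage_sql_parsed_result(
        query="INSERT INTO sales.daily SELECT id, amount FROM sales.raw_orders",
        graph=None,
        platform="snowflake",
        platform_instance=None,
        env=builder.DEFAULT_ENV,
        default_db="ANALYTICS",   # illustrative default database
        default_schema="PUBLIC",  # illustrative default schema
    )
    print(result.in_tables, result.out_tables)  # lists of DataHub dataset urns
    if result.debug_info.error:
        print(f"SQL parse error: {result.debug_info.error}")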
datahub_airflow_plugin/airflow2/_legacy_shims.py (new file)
@@ -0,0 +1,20 @@
+ """
+ Shims for the legacy openlineage-airflow package.
+ This module is used when openlineage-airflow is installed (Airflow 2.x with legacy OpenLineage).
+ """
+
+ from openlineage.airflow.listener import TaskHolder
+ from openlineage.airflow.plugin import OpenLineagePlugin
+ from openlineage.airflow.utils import (
+     get_operator_class,
+     redact_with_exclusions,
+     try_import_from_string,
+ )
+
+ __all__ = [
+     "TaskHolder",
+     "OpenLineagePlugin",
+     "get_operator_class",
+     "redact_with_exclusions",
+     "try_import_from_string",
+ ]
datahub_airflow_plugin/airflow2/_openlineage_compat.py (new file)
@@ -0,0 +1,123 @@
+ """
+ Compatibility layer for OpenLineage imports in Airflow 2.x.
+
+ This module handles two different OpenLineage variants that can be used with Airflow 2.x:
+ 1. Legacy OpenLineage (openlineage-airflow package) - used in Airflow 2.5-2.6
+ 2. OpenLineage Provider (apache-airflow-providers-openlineage) - used in Airflow 2.7+
+
+ The module detects which variant is installed and imports the appropriate classes.
+
+ Note: This file is only used for Airflow 2.x. Airflow 3.x has its own separate module.
+ """
+
+ from typing import TYPE_CHECKING, Any
+
+ if TYPE_CHECKING:
+     # For type checking, use proper types based on what's available.
+     # Try the OpenLineage Provider first, fall back to Legacy OpenLineage.
+     try:
+         from airflow.providers.openlineage.extractors.base import (
+             BaseExtractor,
+             OperatorLineage,
+         )
+         from airflow.providers.openlineage.extractors.manager import (
+             ExtractorManager as OLExtractorManager,
+         )
+         from airflow.providers.openlineage.extractors.snowflake import (
+             SnowflakeExtractor,
+         )
+         from airflow.providers.openlineage.extractors.sql import SqlExtractor
+         from airflow.providers.openlineage.utils.utils import (
+             get_operator_class,
+             try_import_from_string,
+         )
+         from openlineage.airflow.extractors import TaskMetadata
+
+         USE_OPENLINEAGE_PROVIDER: bool = True
+     except ImportError:
+         # Legacy OpenLineage types
+         from openlineage.airflow.extractors import (  # type: ignore[no-redef]
+             BaseExtractor,
+             ExtractorManager as OLExtractorManager,
+             TaskMetadata,
+         )
+         from openlineage.airflow.extractors.snowflake_extractor import (  # type: ignore[no-redef]
+             SnowflakeExtractor,
+         )
+         from openlineage.airflow.extractors.sql_extractor import (
+             SqlExtractor,  # type: ignore[no-redef]
+         )
+         from openlineage.airflow.utils import (  # type: ignore[no-redef]
+             get_operator_class,
+             try_import_from_string,
+         )
+
+         OperatorLineage: Any  # type: ignore[no-redef]  # Doesn't exist in Legacy OpenLineage
+         USE_OPENLINEAGE_PROVIDER: bool = False  # type: ignore[no-redef]
+
+ else:
+     # Runtime imports - detect which OpenLineage variant is installed
+     USE_OPENLINEAGE_PROVIDER = False
+
+     try:
+         # Try the OpenLineage Provider (apache-airflow-providers-openlineage).
+         # Available in Airflow 2.7+ when installed with the [airflow2-provider] extra.
+         from airflow.providers.openlineage.extractors.base import (
+             BaseExtractor,
+             OperatorLineage,
+         )
+         from airflow.providers.openlineage.extractors.manager import (
+             ExtractorManager as OLExtractorManager,
+         )
+         from airflow.providers.openlineage.utils.utils import (
+             get_operator_class,
+             try_import_from_string,
+         )
+
+         USE_OPENLINEAGE_PROVIDER = True
+
+         try:
+             from airflow.providers.openlineage.extractors.snowflake import (
+                 SnowflakeExtractor,
+             )
+         except ImportError:
+             SnowflakeExtractor = None  # type: ignore
+
+         try:
+             from airflow.providers.openlineage.extractors.sql import SqlExtractor
+         except ImportError:
+             SqlExtractor = None  # type: ignore
+
+         # The OpenLineage Provider uses OperatorLineage, not TaskMetadata
+         TaskMetadata = None  # type: ignore
+
+     except (ImportError, ModuleNotFoundError):
+         # Fall back to Legacy OpenLineage (openlineage-airflow package).
+         # Used in Airflow 2.5-2.6 or when installed with the [airflow2] extra.
+         from openlineage.airflow.extractors import (
+             BaseExtractor,
+             ExtractorManager as OLExtractorManager,
+             TaskMetadata,
+         )
+         from openlineage.airflow.extractors.snowflake_extractor import (
+             SnowflakeExtractor,
+         )
+         from openlineage.airflow.extractors.sql_extractor import SqlExtractor
+         from openlineage.airflow.utils import get_operator_class, try_import_from_string
+
+         # Legacy OpenLineage uses TaskMetadata, not OperatorLineage
+         OperatorLineage = None  # type: ignore
+
+
+ # Export all symbols
+ __all__ = [
+     "USE_OPENLINEAGE_PROVIDER",
+     "BaseExtractor",
+     "OperatorLineage",
+     "TaskMetadata",
+     "OLExtractorManager",
+     "get_operator_class",
+     "try_import_from_string",
+     "SnowflakeExtractor",
+     "SqlExtractor",
+ ]
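
A small consumer sketch for the compat layer above, mirroring the _create_lineage_metadata branch in _extractors.py earlier in this diff; nothing here is new API, it just shows the intended use of the exported symbols:

    # Sketch: build the lineage object appropriate to whichever OpenLineage
    # variant the compat layer detected. Mirrors _create_lineage_metadata above.
    from datahub_airflow_plugin.airflow2._openlineage_compat import (
        USE_OPENLINEAGE_PROVIDER,
        OperatorLineage,
        TaskMetadata,
    )

    def make_empty_lineage(task_name: str):
        if USE_OPENLINEAGE_PROVIDER:
            # Provider variant: OperatorLineage has no name field
            return OperatorLineage(inputs=[], outputs=[], run_facets={}, job_facets={})
        # Legacy variant: TaskMetadata carries the task name
        return TaskMetadata(
            name=task_name, inputs=[], outputs=[], run_facets={}, job_facets={}
        )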
datahub_airflow_plugin/airflow2/_provider_shims.py (new file)
@@ -0,0 +1,29 @@
+ """
+ Shims for the apache-airflow-providers-openlineage package.
+ This module is used when apache-airflow-providers-openlineage is installed
+ (Airflow 2.10+ with the native OpenLineage provider).
+ """
+
+ from airflow.providers.openlineage.plugins.openlineage import (
+     OpenLineageProviderPlugin as OpenLineagePlugin,
+ )
+ from airflow.providers.openlineage.utils.utils import (
+     get_operator_class,
+     try_import_from_string,
+ )
+
+ # The provider package doesn't have TaskHolder - not needed with modern Airflow.
+ # The task_instance.task attribute is directly available in Airflow 2.10+.
+ TaskHolder = None  # type: ignore[misc,assignment]
+
+ # The provider package doesn't have redact_with_exclusions - not needed.
+ # This was only used for logging/debugging in the legacy package.
+ redact_with_exclusions = None  # type: ignore[misc,assignment]
+
+ __all__ = [
+     "TaskHolder",
+     "OpenLineagePlugin",
+     "get_operator_class",
+     "redact_with_exclusions",
+     "try_import_from_string",
+ ]
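
_legacy_shims.py and _provider_shims.py export the same five names, which suggests a caller that probes for one variant and falls back to the other - presumably airflow2/_shims.py (+88 in the file list, not shown in this diff). A hedged sketch of that selection, which may differ from the real module:

    # Hypothetical selection between the two shim modules above, along the lines
    # of what datahub_airflow_plugin/airflow2/_shims.py presumably does; the
    # actual module is not shown in this diff and may differ.
    try:
        # Prefer the provider shims when apache-airflow-providers-openlineage is installed
        from datahub_airflow_plugin.airflow2._provider_shims import (
            OpenLineagePlugin,
            TaskHolder,
            get_operator_class,
            redact_with_exclusions,
            try_import_from_string,
        )
    except ImportError:
        # Fall back to the legacy openlineage-airflow shims
        from datahub_airflow_plugin.airflow2._legacy_shims import (
            OpenLineagePlugin,
            TaskHolder,
            get_operator_class,
            redact_with_exclusions,
            try_import_from_string,
        )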