acryl-datahub-airflow-plugin 1.3.1.5__py3-none-any.whl → 1.3.1.5rc1__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (51)
  1. acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/METADATA +91 -0
  2. acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info/RECORD +33 -0
  3. datahub_airflow_plugin/_airflow_shims.py +31 -64
  4. datahub_airflow_plugin/_config.py +19 -97
  5. datahub_airflow_plugin/_datahub_ol_adapter.py +2 -14
  6. datahub_airflow_plugin/_extractors.py +365 -0
  7. datahub_airflow_plugin/_version.py +1 -1
  8. datahub_airflow_plugin/client/airflow_generator.py +43 -147
  9. datahub_airflow_plugin/datahub_listener.py +790 -19
  10. datahub_airflow_plugin/example_dags/__init__.py +0 -32
  11. datahub_airflow_plugin/example_dags/graph_usage_sample_dag.py +4 -12
  12. datahub_airflow_plugin/hooks/datahub.py +2 -11
  13. datahub_airflow_plugin/operators/datahub.py +3 -20
  14. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/METADATA +0 -303
  15. acryl_datahub_airflow_plugin-1.3.1.5.dist-info/RECORD +0 -65
  16. datahub_airflow_plugin/_airflow_compat.py +0 -32
  17. datahub_airflow_plugin/_airflow_version_specific.py +0 -184
  18. datahub_airflow_plugin/_constants.py +0 -16
  19. datahub_airflow_plugin/airflow2/__init__.py +0 -6
  20. datahub_airflow_plugin/airflow2/_airflow2_sql_parser_patch.py +0 -402
  21. datahub_airflow_plugin/airflow2/_airflow_compat.py +0 -95
  22. datahub_airflow_plugin/airflow2/_extractors.py +0 -477
  23. datahub_airflow_plugin/airflow2/_legacy_shims.py +0 -20
  24. datahub_airflow_plugin/airflow2/_openlineage_compat.py +0 -123
  25. datahub_airflow_plugin/airflow2/_provider_shims.py +0 -29
  26. datahub_airflow_plugin/airflow2/_shims.py +0 -88
  27. datahub_airflow_plugin/airflow2/datahub_listener.py +0 -1072
  28. datahub_airflow_plugin/airflow3/__init__.py +0 -6
  29. datahub_airflow_plugin/airflow3/_airflow3_sql_parser_patch.py +0 -408
  30. datahub_airflow_plugin/airflow3/_airflow_compat.py +0 -108
  31. datahub_airflow_plugin/airflow3/_athena_openlineage_patch.py +0 -153
  32. datahub_airflow_plugin/airflow3/_bigquery_openlineage_patch.py +0 -273
  33. datahub_airflow_plugin/airflow3/_shims.py +0 -82
  34. datahub_airflow_plugin/airflow3/_sqlite_openlineage_patch.py +0 -88
  35. datahub_airflow_plugin/airflow3/_teradata_openlineage_patch.py +0 -308
  36. datahub_airflow_plugin/airflow3/datahub_listener.py +0 -1452
  37. datahub_airflow_plugin/example_dags/airflow2/__init__.py +0 -8
  38. datahub_airflow_plugin/example_dags/airflow2/generic_recipe_sample_dag.py +0 -54
  39. datahub_airflow_plugin/example_dags/airflow2/graph_usage_sample_dag.py +0 -43
  40. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_demo.py +0 -69
  41. datahub_airflow_plugin/example_dags/airflow2/lineage_backend_taskflow_demo.py +0 -69
  42. datahub_airflow_plugin/example_dags/airflow2/lineage_emission_dag.py +0 -81
  43. datahub_airflow_plugin/example_dags/airflow2/mysql_sample_dag.py +0 -68
  44. datahub_airflow_plugin/example_dags/airflow2/snowflake_sample_dag.py +0 -99
  45. datahub_airflow_plugin/example_dags/airflow3/__init__.py +0 -8
  46. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_demo.py +0 -51
  47. datahub_airflow_plugin/example_dags/airflow3/lineage_backend_taskflow_demo.py +0 -51
  48. datahub_airflow_plugin/example_dags/airflow3/snowflake_sample_dag.py +0 -89
  49. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info}/WHEEL +0 -0
  50. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info}/entry_points.txt +0 -0
  51. {acryl_datahub_airflow_plugin-1.3.1.5.dist-info → acryl_datahub_airflow_plugin-1.3.1.5rc1.dist-info}/top_level.txt +0 -0
datahub_airflow_plugin/airflow2/_airflow_compat.py
@@ -1,95 +0,0 @@
- # Airflow 2.x compatibility module
- # This module must be imported before any Airflow imports in any of our files.
-
- from datahub.utilities._markupsafe_compat import MARKUPSAFE_PATCHED
-
- # Critical safety check: Ensure MarkupSafe compatibility patch is applied
- # This must happen before importing Airflow to prevent MarkupSafe version conflicts
- # Using explicit exception instead of assert to ensure it runs even with python -O
- if not MARKUPSAFE_PATCHED:
-     raise RuntimeError(
-         "MarkupSafe compatibility patch must be applied before importing Airflow modules. "
-         "This is a critical safety check that cannot be disabled. "
-         "The patch ensures compatibility between different MarkupSafe versions used by "
-         "Airflow and DataHub dependencies."
-     )
-
- # Apply SQLParser patch for Airflow 2.10+ with apache-airflow-providers-openlineage
- # When using the provider package, SQL operators call SQLParser.generate_openlineage_metadata_from_sql()
- # directly (similar to Airflow 3.x), so we need to patch that method to use DataHub's SQL parser.
- #
- # For legacy openlineage-airflow package (Airflow 2.5-2.9), we use the extractor-based approach
- # in _extractors.py instead.
- import importlib.util
- import logging
-
- logger = logging.getLogger(__name__)
-
- # Check if OpenLineage provider package is available
- # Use try-except because find_spec can raise ModuleNotFoundError if parent module doesn't exist
- try:
-     has_openlineage_provider = (
-         importlib.util.find_spec("airflow.providers.openlineage.sqlparser") is not None
-     )
- except (ModuleNotFoundError, ImportError, ValueError):
-     # Parent module doesn't exist or other import error
-     has_openlineage_provider = False
-
- if has_openlineage_provider:
-     # Provider package detected - apply SQL parser patch
-     from datahub_airflow_plugin.airflow2._airflow2_sql_parser_patch import (
-         patch_sqlparser,
-     )
-
-     patch_sqlparser()
- else:
-     # Provider package not available - using legacy openlineage-airflow package
-     # No patching needed, extractors will handle SQL parsing
-     pass
-
- # Apply operator-specific patches for provider mode
- # These patches work for both Airflow 2.x and 3.x when using OpenLineage provider
- try:
-     from datahub_airflow_plugin._config import get_lineage_config
-
-     config = get_lineage_config()
-     enable_extractors = config.enable_extractors
-     extract_teradata_operator = config.extract_teradata_operator
- except Exception:
-     # If config loading fails, apply patches by default (backward compatibility)
-     enable_extractors = True
-     extract_teradata_operator = True
-
- if enable_extractors and extract_teradata_operator:
-     # TeradataOperator patch - works for both Airflow 2.x provider mode and Airflow 3.x
-     # The patch checks for method existence, so it's safe to import from airflow3 module
-     # Note: We defer the import to avoid potential issues with Airflow 3.x specific imports
-     # in Airflow 2.x environments. The patch function itself handles version compatibility.
-     import logging
-
-     logger = logging.getLogger(__name__)
-     try:
-         logger.debug("Attempting to import and apply TeradataOperator patch")
-         # Use importlib to safely import the patch module
-         import importlib.util
-
-         patch_module_path = (
-             "datahub_airflow_plugin.airflow3._teradata_openlineage_patch"
-         )
-         patch_module = importlib.import_module(patch_module_path)
-         patch_teradata_operator = patch_module.patch_teradata_operator
-
-         patch_teradata_operator()
-         logger.debug("TeradataOperator patch import and call completed")
-     except ImportError as e:
-         # Teradata provider not installed or patch not available
-         logger.debug(f"Could not import TeradataOperator patch: {e}")
-     except Exception as e:
-         # Log error but don't fail - this is optional functionality
-         logger.warning(f"Error applying TeradataOperator patch: {e}", exc_info=True)
-
- AIRFLOW_PATCHED = True
-
- __all__ = [
-     "AIRFLOW_PATCHED",
- ]
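The compatibility module removed above makes its patching decisions by probing for the OpenLineage provider with importlib.util.find_spec (wrapped in try/except, since find_spec itself can raise when a parent package is missing) and then importing the patch module lazily, so a missing optional dependency never breaks plugin startup. A minimal, self-contained sketch of that probe-and-lazy-patch pattern; the non-stdlib module and function names below are illustrative placeholders, not the plugin's actual entry points:

import importlib
import importlib.util
import logging

logger = logging.getLogger(__name__)


def module_available(name: str) -> bool:
    """Return True if `name` is importable, without actually importing it."""
    try:
        return importlib.util.find_spec(name) is not None
    except (ModuleNotFoundError, ImportError, ValueError):
        # find_spec raises if a parent package is missing or the name is malformed.
        return False


def apply_optional_patch(module_path: str, patch_fn_name: str) -> bool:
    """Lazily import `module_path` and call its patch function; never hard-fail."""
    try:
        patch_fn = getattr(importlib.import_module(module_path), patch_fn_name)
        patch_fn()
        return True
    except ImportError as e:
        logger.debug("Patch module not available: %s", e)
    except Exception as e:
        logger.warning("Error applying %s: %s", patch_fn_name, e, exc_info=True)
    return False


if module_available("airflow.providers.openlineage.sqlparser"):
    # Hypothetical patch hook, standing in for the plugin's patch_sqlparser().
    apply_optional_patch("my_plugin.sql_parser_patch", "patch_sqlparser")

Deferring the import until after the availability check keeps the module importable even when the optional provider (or the patch target) is absent, which matches the defensive style of the removed file.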
datahub_airflow_plugin/airflow2/_extractors.py
@@ -1,477 +0,0 @@
- import contextlib
- import logging
- import sys
- import unittest.mock
- from typing import TYPE_CHECKING, Any, Dict, Optional
-
- from openlineage.client.facet import (
-     ExtractionError,
-     ExtractionErrorRunFacet,
-     SqlJobFacet,
- )
-
- import datahub.emitter.mce_builder as builder
- from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-     get_platform_from_sqlalchemy_uri,
- )
- from datahub.sql_parsing.sqlglot_lineage import (
-     SqlParsingResult,
-     create_lineage_sql_parsed_result,
- )
- from datahub_airflow_plugin._constants import SQL_PARSING_RESULT_KEY
- from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
- from datahub_airflow_plugin.airflow2._openlineage_compat import (
-     USE_OPENLINEAGE_PROVIDER,
-     BaseExtractor,
-     OLExtractorManager,
-     OperatorLineage,
-     SnowflakeExtractor,
-     SqlExtractor,
-     TaskMetadata,
-     get_operator_class,
-     try_import_from_string,
- )
- from datahub_airflow_plugin.airflow2._shims import Operator
-
- if TYPE_CHECKING:
-     from airflow.models import DagRun, TaskInstance
-
-     from datahub.ingestion.graph.client import DataHubGraph
-
-     # For type checking, define a union type that covers both versions
-     if sys.version_info >= (3, 10):
-         from typing import TypeAlias
-     else:
-         from typing_extensions import TypeAlias
-
-     # Define proper type aliases for the union type
-     # Note: BaseExtractor, OLExtractorManager, etc. are already imported above at runtime
-     from typing import Union
-
-     ExtractResult: TypeAlias = Union[
-         Any, Any
-     ]  # Will be TaskMetadata or OperatorLineage at runtime
-
- logger = logging.getLogger(__name__)
- _DATAHUB_GRAPH_CONTEXT_KEY = "datahub_graph"
-
- # Runtime type alias for the return type of extract() methods
- if not TYPE_CHECKING:
-     if USE_OPENLINEAGE_PROVIDER:
-         ExtractResult = OperatorLineage
-     else:
-         ExtractResult = TaskMetadata
-
-
- class ExtractorManager(OLExtractorManager):
-     # TODO: On Airflow 2.7, the OLExtractorManager is part of the built-in Airflow API.
-     # When available, we should use that instead. The same goe for most of the OL
-     # extractors.
-
-     def __init__(
-         self,
-         patch_sql_parser: bool = True,
-         patch_snowflake_schema: bool = True,
-         extract_athena_operator: bool = True,
-         extract_bigquery_insert_job_operator: bool = True,
-         extract_teradata_operator: bool = True,
-     ):
-         super().__init__()
-
-         # Store patch/extractor configuration
-         self._patch_sql_parser = patch_sql_parser
-         self._patch_snowflake_schema = patch_snowflake_schema
-         self._extract_athena_operator = extract_athena_operator
-         self._extract_bigquery_insert_job_operator = (
-             extract_bigquery_insert_job_operator
-         )
-         self._extract_teradata_operator = extract_teradata_operator
-
-         # Legacy OpenLineage has task_to_extractor attribute, OpenLineage Provider doesn't
-         # Register custom extractors only for Legacy OpenLineage (Provider has its own)
-         if not USE_OPENLINEAGE_PROVIDER:
-             _sql_operator_overrides = [
-                 # The OL BigQuery extractor has some complex logic to fetch detect
-                 # the BigQuery job_id and fetch lineage from there. However, it can't
-                 # generate CLL, so we disable it and use our own extractor instead.
-                 "BigQueryOperator",
-                 "BigQueryExecuteQueryOperator",
-                 # Athena also does something similar.
-                 "AWSAthenaOperator",
-                 # Additional types that OL doesn't support. This is only necessary because
-                 # on older versions of Airflow, these operators don't inherit from SQLExecuteQueryOperator.
-                 "SqliteOperator",
-             ]
-             for operator in _sql_operator_overrides:
-                 self.task_to_extractor.extractors[operator] = GenericSqlExtractor  # type: ignore[attr-defined]
-
-         # Register custom extractors based on configuration
-         if self._extract_athena_operator:
-             self.task_to_extractor.extractors["AthenaOperator"] = (  # type: ignore[attr-defined]
-                 AthenaOperatorExtractor
-             )
-
-         if self._extract_bigquery_insert_job_operator:
-             self.task_to_extractor.extractors["BigQueryInsertJobOperator"] = (  # type: ignore[attr-defined]
-                 BigQueryInsertJobOperatorExtractor
-             )
-
-         if self._extract_teradata_operator:
-             self.task_to_extractor.extractors["TeradataOperator"] = (
-                 TeradataOperatorExtractor
-             )
-
-         self._graph: Optional["DataHubGraph"] = None
-
-     @contextlib.contextmanager
-     def _patch_extractors(self):
-         with contextlib.ExitStack() as stack:
-             # Patch the SqlExtractor.extract() method if configured and available
-             if self._patch_sql_parser and SqlExtractor is not None:
-                 stack.enter_context(
-                     unittest.mock.patch.object(
-                         SqlExtractor,
-                         "extract",
-                         _sql_extractor_extract,
-                     )
-                 )
-
-             # Patch the SnowflakeExtractor.default_schema property if configured and available
-             if self._patch_snowflake_schema and SnowflakeExtractor is not None:
-                 stack.enter_context(
-                     unittest.mock.patch.object(
-                         SnowflakeExtractor,
-                         "default_schema",
-                         property(_snowflake_default_schema),
-                     )
-                 )
-
-             yield
-
-     def extract_metadata(  # type: ignore[override]
-         self,
-         dagrun: "DagRun",
-         task: "Operator",
-         complete: bool = False,
-         task_instance: Optional["TaskInstance"] = None,
-         task_uuid: Optional[str] = None,
-         graph: Optional["DataHubGraph"] = None,
-     ) -> ExtractResult:
-         self._graph = graph
-         with self._patch_extractors():
-             if USE_OPENLINEAGE_PROVIDER:
-                 # OpenLineage Provider: Does not have task_uuid parameter
-                 # In Airflow 3.x, the 'complete' parameter type changed from bool to TaskInstanceState
-                 return super().extract_metadata(dagrun, task, complete, task_instance)  # type: ignore[call-arg,arg-type]
-             else:
-                 # Legacy OpenLineage: Has task_uuid parameter
-                 return super().extract_metadata(  # type: ignore[call-arg,arg-type]
-                     dagrun,
-                     task,
-                     complete,  # type: ignore[arg-type]
-                     task_instance,
-                     task_uuid,
-                 )
-
-     def _get_extractor(self, task: "Operator") -> Optional[BaseExtractor]:
-         # For Legacy OpenLineage: Register GenericSqlExtractor as fallback for
-         # any operator that inherits from SQLExecuteQueryOperator.
-         # For OpenLineage Provider: Rely on SQLParser patch approach instead.
-         if not USE_OPENLINEAGE_PROVIDER:
-             clazz = get_operator_class(task)  # type: ignore[arg-type]
-             SQLExecuteQueryOperator = try_import_from_string(
-                 "airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator"
-             )
-             if SQLExecuteQueryOperator and issubclass(clazz, SQLExecuteQueryOperator):
-                 # Legacy OpenLineage: Register GenericSqlExtractor in task_to_extractor.extractors
-                 self.task_to_extractor.extractors.setdefault(  # type: ignore[attr-defined]
-                     clazz.__name__, GenericSqlExtractor
-                 )
-
-         extractor = super()._get_extractor(task)
-
-         # For OpenLineage Provider: If no extractor was found, check if this is a SQL operator
-         # that should use GenericSqlExtractor (e.g., SqliteOperator which provider doesn't support)
-         if (
-             USE_OPENLINEAGE_PROVIDER
-             and extractor is None
-             and GenericSqlExtractor is not None
-         ):
-             clazz = get_operator_class(task)  # type: ignore[arg-type]
-             # Check if this is SqliteOperator (provider doesn't have an extractor for it)
-             if clazz.__name__ == "SqliteOperator":
-                 # Create a GenericSqlExtractor instance for this operator
-                 extractor = GenericSqlExtractor(task)  # type: ignore[call-arg]
-
-         if extractor and not USE_OPENLINEAGE_PROVIDER:
-             # set_context only exists in Legacy OpenLineage
-             extractor.set_context(_DATAHUB_GRAPH_CONTEXT_KEY, self._graph)  # type: ignore[attr-defined]
-         return extractor
-
-
- if SqlExtractor is not None:
-
-     class GenericSqlExtractor(SqlExtractor):  # type: ignore
-         # Note that the extract() method is patched elsewhere.
-
-         @property
-         def default_schema(self):
-             return super().default_schema
-
-         def _get_scheme(self) -> Optional[str]:
-             # Best effort conversion to DataHub platform names.
-
-             with contextlib.suppress(Exception):
-                 if self.hook:
-                     if hasattr(self.hook, "get_uri"):
-                         uri = self.hook.get_uri()
-                         return get_platform_from_sqlalchemy_uri(uri)
-
-             return self.conn.conn_type or super().dialect
-
-         def _get_database(self) -> Optional[str]:
-             if self.conn:
-                 # For BigQuery, the "database" is the project name.
-                 if hasattr(self.conn, "project_id"):
-                     return self.conn.project_id
-
-                 return self.conn.schema
-             return None
-
- else:
-     # SqlExtractor is not available (OpenLineage Provider package)
-     GenericSqlExtractor = None  # type: ignore
-
-
- def _sql_extractor_extract(self: "SqlExtractor") -> Optional[ExtractResult]:
-     # Why not override the OL sql_parse method directly, instead of overriding
-     # extract()? A few reasons:
-     #
-     # 1. We would want to pass the default_db and graph instance into our sql parser
-     # method. The OL code doesn't pass the default_db (despite having it available),
-     # and it's not clear how to get the graph instance into that method.
-     # 2. OL has some janky logic to fetch table schemas as part of the sql extractor.
-     # We don't want that behavior and this lets us disable it.
-     # 3. Our SqlParsingResult already has DataHub urns, whereas using SqlMeta would
-     # require us to convert those urns to OL uris, just for them to get converted
-     # back to urns later on in our processing.
-
-     task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
-     sql = self.operator.sql
-
-     default_database = getattr(self.operator, "database", None)
-     if not default_database:
-         default_database = self.database
-     default_schema = self.default_schema
-
-     # TODO: Add better handling for sql being a list of statements.
-     if isinstance(sql, list):
-         logger.info(f"Got list of SQL statements for {task_name}. Using first one.")
-         sql = sql[0]
-
-     # Run the SQL parser.
-     scheme = self.scheme
-     platform = OL_SCHEME_TWEAKS.get(scheme, scheme)
-
-     return _parse_sql_into_task_metadata(
-         self,
-         sql,
-         platform=platform,
-         default_database=default_database,
-         default_schema=default_schema,
-     )
-
-
- def _normalize_sql(sql: str) -> str:
-     """Normalize SQL for logging (strip extra whitespace)"""
-     if SqlExtractor is not None and hasattr(SqlExtractor, "_normalize_sql"):
-         return SqlExtractor._normalize_sql(sql)
-     # Fallback normalization
-     return " ".join(sql.split())
-
-
- def _create_lineage_metadata(
-     task_name: str,
-     run_facets: Dict[str, Any],
-     job_facets: Dict[str, Any],
- ) -> Optional[ExtractResult]:
-     """Create TaskMetadata (Legacy OpenLineage) or OperatorLineage (OpenLineage Provider)"""
-     if USE_OPENLINEAGE_PROVIDER:
-         # OpenLineage Provider: Return OperatorLineage (no name field)
-         return OperatorLineage(  # type: ignore
-             inputs=[],
-             outputs=[],
-             run_facets=run_facets,
-             job_facets=job_facets,
-         )
-     else:
-         # Legacy OpenLineage: Return TaskMetadata (with name field)
-         return TaskMetadata(  # type: ignore
-             name=task_name,
-             inputs=[],
-             outputs=[],
-             run_facets=run_facets,
-             job_facets=job_facets,
-         )
-
-
- def _parse_sql_into_task_metadata(
-     self: "BaseExtractor",
-     sql: str,
-     platform: str,
-     default_database: Optional[str],
-     default_schema: Optional[str],
- ) -> Optional[ExtractResult]:
-     task_name = f"{self.operator.dag_id}.{self.operator.task_id}"
-
-     run_facets = {}
-     job_facets = {"sql": SqlJobFacet(query=_normalize_sql(sql))}
-
-     # Get graph from context (Legacy OpenLineage only)
-     graph = None
-     if hasattr(self, "context"):
-         graph = self.context.get(_DATAHUB_GRAPH_CONTEXT_KEY, None)  # type: ignore[attr-defined]
-
-     self.log.debug(
-         "Running the SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
-         "with graph client" if graph else "in offline mode",
-         platform,
-         default_database,
-         default_schema,
-         sql,
-     )
-     sql_parsing_result: SqlParsingResult = create_lineage_sql_parsed_result(
-         query=sql,
-         graph=graph,
-         platform=platform,
-         platform_instance=None,
-         env=builder.DEFAULT_ENV,
-         default_db=default_database,
-         default_schema=default_schema,
-     )
-     self.log.debug(f"Got sql lineage {sql_parsing_result}")
-
-     if sql_parsing_result.debug_info.error:
-         error = sql_parsing_result.debug_info.error
-         run_facets["extractionError"] = ExtractionErrorRunFacet(
-             totalTasks=1,
-             failedTasks=1,
-             errors=[
-                 ExtractionError(
-                     errorMessage=str(error),
-                     stackTrace=None,
-                     task="datahub_sql_parser",
-                     taskNumber=None,
-                 )
-             ],
-         )
-
-     # Save sql_parsing_result to the facets dict. It is removed from the
-     # facet dict in the extractor's processing logic.
-     run_facets[SQL_PARSING_RESULT_KEY] = sql_parsing_result  # type: ignore
-
-     return _create_lineage_metadata(task_name, run_facets, job_facets)
-
-
- class BigQueryInsertJobOperatorExtractor(BaseExtractor):
-     def extract(self) -> Optional[ExtractResult]:
-         from airflow.providers.google.cloud.operators.bigquery import (
-             BigQueryInsertJobOperator,  # type: ignore
-         )
-
-         operator: "BigQueryInsertJobOperator" = self.operator
-         sql = operator.configuration.get("query", {}).get("query")
-         if not sql:
-             self.log.warning("No query found in BigQueryInsertJobOperator")
-             return None
-
-         destination_table = operator.configuration.get("query", {}).get(
-             "destinationTable"
-         )
-         destination_table_urn = None
-         if destination_table:
-             project_id = destination_table.get("projectId")
-             dataset_id = destination_table.get("datasetId")
-             table_id = destination_table.get("tableId")
-
-             if project_id and dataset_id and table_id:
-                 destination_table_urn = builder.make_dataset_urn(
-                     platform="bigquery",
-                     name=f"{project_id}.{dataset_id}.{table_id}",
-                     env=builder.DEFAULT_ENV,
-                 )
-
-         task_metadata = _parse_sql_into_task_metadata(
-             self,
-             sql,
-             platform="bigquery",
-             default_database=operator.project_id,
-             default_schema=None,
-         )
-
-         if destination_table_urn and task_metadata:
-             sql_parsing_result = task_metadata.run_facets.get(SQL_PARSING_RESULT_KEY)
-             if sql_parsing_result and isinstance(sql_parsing_result, SqlParsingResult):
-                 sql_parsing_result.out_tables.append(destination_table_urn)
-
-         return task_metadata
-
-
- class AthenaOperatorExtractor(BaseExtractor):
-     def extract(self) -> Optional[ExtractResult]:
-         from airflow.providers.amazon.aws.operators.athena import (
-             AthenaOperator,  # type: ignore
-         )
-
-         operator: "AthenaOperator" = self.operator
-         sql = operator.query
-         if not sql:
-             self.log.warning("No query found in AthenaOperator")
-             return None
-
-         return _parse_sql_into_task_metadata(
-             self,
-             sql,
-             platform="athena",
-             default_database=None,
-             default_schema=self.operator.database,
-         )
-
-
- def _snowflake_default_schema(self: "SnowflakeExtractor") -> Optional[str]:
-     if hasattr(self.operator, "schema") and self.operator.schema is not None:
-         return self.operator.schema
-     return (
-         self.conn.extra_dejson.get("extra__snowflake__schema", "")
-         or self.conn.extra_dejson.get("schema", "")
-         or self.conn.schema
-     )
-     # TODO: Should we try a fallback of:
-     # execute_query_on_hook(self.hook, "SELECT current_schema();")[0][0]
-
-     # execute_query_on_hook(self.hook, "SELECT current_schema();")
-
-
- class TeradataOperatorExtractor(BaseExtractor):
-     """Extractor for Teradata SQL operations.
-
-     Extracts lineage from TeradataOperator tasks by parsing the SQL queries
-     and understanding Teradata's two-tier database.table naming convention.
-     """
-
-     def extract(self) -> Optional[ExtractResult]:
-         from airflow.providers.teradata.operators.teradata import TeradataOperator
-
-         operator: "TeradataOperator" = self.operator
-         sql = operator.sql
-         if not sql:
-             self.log.warning("No query found in TeradataOperator")
-             return None
-
-         return _parse_sql_into_task_metadata(
-             self,
-             sql,
-             platform="teradata",
-             default_database=None,
-             default_schema=None,
-         )
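The _patch_extractors() context manager in the removed _extractors.py applies its overrides with contextlib.ExitStack and unittest.mock.patch.object, so the replacement methods are only in effect for the duration of a single extract_metadata() call and the originals are restored afterwards. A minimal stdlib-only sketch of that scoped monkey-patching pattern; DemoExtractor and the patched behavior are stand-ins, not the plugin's real classes:

import contextlib
import unittest.mock


class DemoExtractor:
    def extract(self) -> str:
        return "original"


def _patched_extract(self) -> str:
    # Stand-in for the DataHub-aware replacement bound in place of extract().
    return "patched"


@contextlib.contextmanager
def patched_extractors(patch_extract: bool = True):
    """Temporarily swap methods on extractor classes; restore them on exit."""
    with contextlib.ExitStack() as stack:
        if patch_extract:
            stack.enter_context(
                unittest.mock.patch.object(DemoExtractor, "extract", _patched_extract)
            )
        yield


if __name__ == "__main__":
    extractor = DemoExtractor()
    with patched_extractors():
        assert extractor.extract() == "patched"  # class-level patch affects existing instances
    assert extractor.extract() == "original"  # original method restored on exit

Patching the class rather than subclassing is what lets the override also reach extractor instances created inside the upstream manager code.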
datahub_airflow_plugin/airflow2/_legacy_shims.py
@@ -1,20 +0,0 @@
- """
- Shims for legacy openlineage-airflow package.
- This module is used when openlineage-airflow is installed (Airflow 2.x with legacy OpenLineage).
- """
-
- from openlineage.airflow.listener import TaskHolder
- from openlineage.airflow.plugin import OpenLineagePlugin
- from openlineage.airflow.utils import (
-     get_operator_class,
-     redact_with_exclusions,
-     try_import_from_string,
- )
-
- __all__ = [
-     "TaskHolder",
-     "OpenLineagePlugin",
-     "get_operator_class",
-     "redact_with_exclusions",
-     "try_import_from_string",
- ]
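The removed _legacy_shims.py is a thin re-export layer: the rest of the plugin imports OpenLineage symbols from one shim module, and sibling modules such as _openlineage_compat.py decide at import time whether those symbols come from the provider package or from legacy openlineage-airflow. A hedged sketch of that selection pattern, assuming the commonly used import paths of the two packages (exact paths can vary by version):

# Re-export BaseExtractor from whichever OpenLineage integration is installed,
# so the rest of the code base imports it from a single place.
try:
    # apache-airflow-providers-openlineage (assumed path; newer Airflow installs)
    from airflow.providers.openlineage.extractors.base import BaseExtractor

    USE_OPENLINEAGE_PROVIDER = True
except ImportError:
    # Legacy openlineage-airflow package (assumed path; Airflow 2.5-2.9 installs)
    from openlineage.airflow.extractors.base import BaseExtractor

    USE_OPENLINEAGE_PROVIDER = False

__all__ = ["BaseExtractor", "USE_OPENLINEAGE_PROVIDER"]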