apache-airflow-providers-yandex 3.11.0rc1__py3-none-any.whl → 3.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,14 +25,11 @@ from __future__ import annotations
25
25
 
26
26
  import packaging.version
27
27
 
28
- __all__ = ["__version__"]
28
+ from airflow import __version__ as airflow_version
29
29
 
30
- __version__ = "3.11.0"
30
+ __all__ = ["__version__"]
31
31
 
32
- try:
33
- from airflow import __version__ as airflow_version
34
- except ImportError:
35
- from airflow.version import version as airflow_version
32
+ __version__ = "3.11.1"
36
33
 
37
34
  if packaging.version.parse(packaging.version.parse(airflow_version).base_version) < packaging.version.parse(
38
35
  "2.7.0"
@@ -28,8 +28,9 @@ def get_provider_info():
28
28
  "name": "Yandex",
29
29
  "description": "This package is for Yandex, including:\n\n - `Yandex.Cloud <https://cloud.yandex.com/>`__\n",
30
30
  "state": "ready",
31
- "source-date-epoch": 1714477757,
31
+ "source-date-epoch": 1716289256,
32
32
  "versions": [
33
+ "3.11.1",
33
34
  "3.11.0",
34
35
  "3.10.0",
35
36
  "3.9.0",
@@ -52,7 +53,7 @@ def get_provider_info():
52
53
  "1.0.1",
53
54
  "1.0.0",
54
55
  ],
55
- "dependencies": ["apache-airflow>=2.7.0", "yandexcloud>=0.228.0", "yandex-query-client>=0.1.2"],
56
+ "dependencies": ["apache-airflow>=2.7.0", "yandexcloud>=0.278.0", "yandex-query-client>=0.1.4"],
56
57
  "integrations": [
57
58
  {
58
59
  "integration-name": "Yandex.Cloud",
@@ -63,14 +64,14 @@ def get_provider_info():
63
64
  {
64
65
  "integration-name": "Yandex.Cloud Dataproc",
65
66
  "external-doc-url": "https://cloud.yandex.com/dataproc",
66
- "how-to-guide": ["/docs/apache-airflow-providers-yandex/operators.rst"],
67
+ "how-to-guide": ["/docs/apache-airflow-providers-yandex/operators/dataproc.rst"],
67
68
  "logo": "/integration-logos/yandex/Yandex-Cloud.png",
68
69
  "tags": ["service"],
69
70
  },
70
71
  {
71
72
  "integration-name": "Yandex.Cloud YQ",
72
73
  "external-doc-url": "https://cloud.yandex.com/en/services/query",
73
- "how-to-guide": ["/docs/apache-airflow-providers-yandex/operators.rst"],
74
+ "how-to-guide": ["/docs/apache-airflow-providers-yandex/operators/yq.rst"],
74
75
  "logo": "/integration-logos/yandex/Yandex-Cloud.png",
75
76
  "tags": ["service"],
76
77
  },
@@ -78,7 +79,7 @@ def get_provider_info():
78
79
  "operators": [
79
80
  {
80
81
  "integration-name": "Yandex.Cloud Dataproc",
81
- "python-modules": ["airflow.providers.yandex.operators.yandexcloud_dataproc"],
82
+ "python-modules": ["airflow.providers.yandex.operators.dataproc"],
82
83
  },
83
84
  {
84
85
  "integration-name": "Yandex.Cloud YQ",
@@ -89,7 +90,7 @@ def get_provider_info():
89
90
  {"integration-name": "Yandex.Cloud", "python-modules": ["airflow.providers.yandex.hooks.yandex"]},
90
91
  {
91
92
  "integration-name": "Yandex.Cloud Dataproc",
92
- "python-modules": ["airflow.providers.yandex.hooks.yandexcloud_dataproc"],
93
+ "python-modules": ["airflow.providers.yandex.hooks.dataproc"],
93
94
  },
94
95
  {"integration-name": "Yandex.Cloud YQ", "python-modules": ["airflow.providers.yandex.hooks.yq"]},
95
96
  ],
@@ -0,0 +1,35 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+ from __future__ import annotations
18
+
19
+ from airflow.providers.yandex.hooks.yandex import YandexCloudBaseHook
20
+
21
+
22
+ class DataprocHook(YandexCloudBaseHook):
23
+ """
24
+ A base hook for Yandex.Cloud Data Proc.
25
+
26
+ :param yandex_conn_id: The connection ID to use when fetching connection info.
27
+ """
28
+
29
+ def __init__(self, *args, **kwargs) -> None:
30
+ super().__init__(*args, **kwargs)
31
+ self.cluster_id = None
32
+ self.client = self.sdk.wrappers.Dataproc(
33
+ default_folder_id=self.default_folder_id,
34
+ default_public_ssh_key=self.default_public_ssh_key,
35
+ )
@@ -14,22 +14,17 @@
14
14
  # KIND, either express or implied. See the License for the
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
- from __future__ import annotations
18
-
19
- from airflow.providers.yandex.hooks.yandex import YandexCloudBaseHook
17
+ """This module is deprecated. Please use :mod:`airflow.providers.yandex.hooks.dataproc` instead."""
20
18
 
19
+ from __future__ import annotations
21
20
 
22
- class DataprocHook(YandexCloudBaseHook):
23
- """
24
- A base hook for Yandex.Cloud Data Proc.
21
+ import warnings
25
22
 
26
- :param yandex_conn_id: The connection ID to use when fetching connection info.
27
- """
23
+ from airflow.exceptions import AirflowProviderDeprecationWarning
24
+ from airflow.providers.yandex.hooks.dataproc import * # noqa: F403
28
25
 
29
- def __init__(self, *args, **kwargs) -> None:
30
- super().__init__(*args, **kwargs)
31
- self.cluster_id = None
32
- self.client = self.sdk.wrappers.Dataproc(
33
- default_folder_id=self.default_folder_id,
34
- default_public_ssh_key=self.default_public_ssh_key,
35
- )
26
+ warnings.warn(
27
+ "This module is deprecated. Please use `airflow.providers.yandex.hooks.dataproc` instead.",
28
+ AirflowProviderDeprecationWarning,
29
+ stacklevel=2,
30
+ )
@@ -19,9 +19,7 @@ from __future__ import annotations
19
19
  from datetime import timedelta
20
20
  from typing import Any
21
21
 
22
- import yandexcloud
23
- import yandexcloud._auth_fabric as auth_fabric
24
- from yandex.cloud.iam.v1.iam_token_service_pb2_grpc import IamTokenServiceStub
22
+ import yandexcloud.auth as yc_auth
25
23
  from yandex_query_client import YQHttpClient, YQHttpClientConfig
26
24
 
27
25
  from airflow.providers.yandex.hooks.yandex import YandexCloudBaseHook
@@ -100,13 +98,4 @@ class YQHook(YandexCloudBaseHook):
100
98
  if iam_token is not None:
101
99
  return iam_token
102
100
 
103
- service_account_key = self.credentials.get("service_account_key")
104
- # if service_account_key is None metadata server will be used
105
- token_requester = auth_fabric.get_auth_token_requester(service_account_key=service_account_key)
106
-
107
- if service_account_key is None:
108
- return token_requester.get_token()
109
-
110
- sdk = yandexcloud.SDK()
111
- client = sdk.client(IamTokenServiceStub)
112
- return client.Create(token_requester.get_token_request()).iam_token
101
+ return yc_auth.get_auth_token(service_account_key=self.credentials.get("service_account_key"))
@@ -0,0 +1,535 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+ from __future__ import annotations
18
+
19
+ import warnings
20
+ from dataclasses import dataclass
21
+ from typing import TYPE_CHECKING, Iterable, Sequence
22
+
23
+ from airflow.exceptions import AirflowProviderDeprecationWarning
24
+ from airflow.models import BaseOperator
25
+ from airflow.providers.yandex.hooks.dataproc import DataprocHook
26
+
27
+ if TYPE_CHECKING:
28
+ from airflow.utils.context import Context
29
+
30
+
31
+ @dataclass
32
+ class InitializationAction:
33
+ """Data for initialization action to be run at start of DataProc cluster."""
34
+
35
+ uri: str # Uri of the executable file
36
+ args: Sequence[str] # Arguments to the initialization action
37
+ timeout: int # Execution timeout
38
+
39
+
40
+ class DataprocCreateClusterOperator(BaseOperator):
41
+ """Creates Yandex.Cloud Data Proc cluster.
42
+
43
+ :param folder_id: ID of the folder in which cluster should be created.
44
+ :param cluster_name: Cluster name. Must be unique inside the folder.
45
+ :param cluster_description: Cluster description.
46
+ :param cluster_image_version: Cluster image version. Use default.
47
+ :param ssh_public_keys: List of SSH public keys that will be deployed to created compute instances.
48
+ :param subnet_id: ID of the subnetwork. All Data Proc cluster nodes will use one subnetwork.
49
+ :param services: List of services that will be installed to the cluster. Possible options:
50
+ HDFS, YARN, MAPREDUCE, HIVE, TEZ, ZOOKEEPER, HBASE, SQOOP, FLUME, SPARK, ZEPPELIN, OOZIE
51
+ :param s3_bucket: Yandex.Cloud S3 bucket to store cluster logs.
52
+ Jobs will not work if the bucket is not specified.
53
+ :param zone: Availability zone to create cluster in.
54
+ Currently there are ru-central1-a, ru-central1-b and ru-central1-c.
55
+ :param service_account_id: Service account id for the cluster.
56
+ Service account can be created inside the folder.
57
+ :param masternode_resource_preset: Resources preset (CPU+RAM configuration)
58
+ for the primary node of the cluster.
59
+ :param masternode_disk_size: Masternode storage size in GiB.
60
+ :param masternode_disk_type: Masternode storage type. Possible options: network-ssd, network-hdd.
61
+ :param datanode_resource_preset: Resources preset (CPU+RAM configuration)
62
+ for the data nodes of the cluster.
63
+ :param datanode_disk_size: Datanodes storage size in GiB.
64
+ :param datanode_disk_type: Datanodes storage type. Possible options: network-ssd, network-hdd.
65
+ :param computenode_resource_preset: Resources preset (CPU+RAM configuration)
66
+ for the compute nodes of the cluster.
67
+ :param computenode_disk_size: Computenodes storage size in GiB.
68
+ :param computenode_disk_type: Computenodes storage type. Possible options: network-ssd, network-hdd.
69
+ :param connection_id: ID of the Yandex.Cloud Airflow connection.
70
+ :param computenode_max_hosts_count: Maximum number of nodes of compute autoscaling subcluster.
71
+ :param computenode_warmup_duration: The warmup time of the instance in seconds. During this time,
72
+ traffic is sent to the instance,
73
+ but instance metrics are not collected. In seconds.
74
+ :param computenode_stabilization_duration: Minimum amount of time in seconds for monitoring before
75
+ Instance Groups can reduce the number of instances in the group.
76
+ During this time, the group size doesn't decrease,
77
+ even if the new metric values indicate that it should. In seconds.
78
+ :param computenode_preemptible: Preemptible instances are stopped at least once every 24 hours,
79
+ and can be stopped at any time if their resources are needed by Compute.
80
+ :param computenode_cpu_utilization_target: Defines an autoscaling rule
81
+ based on the average CPU utilization of the instance group.
82
+ In percent, from 10 to 100.
83
+ By default is not set and default autoscaling strategy is used.
84
+ :param computenode_decommission_timeout: Timeout to gracefully decommission nodes during downscaling.
85
+ In seconds
86
+ :param properties: Properties passed to main node software.
87
+ Docs: https://cloud.yandex.com/docs/data-proc/concepts/settings-list
88
+ :param enable_ui_proxy: Enable UI Proxy feature for forwarding Hadoop components web interfaces
89
+ Docs: https://cloud.yandex.com/docs/data-proc/concepts/ui-proxy
90
+ :param host_group_ids: Dedicated host groups to place VMs of cluster on.
91
+ Docs: https://cloud.yandex.com/docs/compute/concepts/dedicated-host
92
+ :param security_group_ids: User security groups.
93
+ Docs: https://cloud.yandex.com/docs/data-proc/concepts/network#security-groups
94
+ :param log_group_id: Id of log group to write logs. By default logs will be sent to default log group.
95
+ To disable cloud log sending set cluster property dataproc:disable_cloud_logging = true
96
+ Docs: https://cloud.yandex.com/docs/data-proc/concepts/logs
97
+ :param initialization_actions: Set of init-actions to run when cluster starts.
98
+ Docs: https://cloud.yandex.com/docs/data-proc/concepts/init-action
99
+ :param labels: Cluster labels as key:value pairs. No more than 64 per resource.
100
+ Docs: https://cloud.yandex.com/docs/resource-manager/concepts/labels
101
+ """
102
+
103
+ def __init__(
104
+ self,
105
+ *,
106
+ folder_id: str | None = None,
107
+ cluster_name: str | None = None,
108
+ cluster_description: str | None = "",
109
+ cluster_image_version: str | None = None,
110
+ ssh_public_keys: str | Iterable[str] | None = None,
111
+ subnet_id: str | None = None,
112
+ services: Iterable[str] = ("HDFS", "YARN", "MAPREDUCE", "HIVE", "SPARK"),
113
+ s3_bucket: str | None = None,
114
+ zone: str = "ru-central1-b",
115
+ service_account_id: str | None = None,
116
+ masternode_resource_preset: str | None = None,
117
+ masternode_disk_size: int | None = None,
118
+ masternode_disk_type: str | None = None,
119
+ datanode_resource_preset: str | None = None,
120
+ datanode_disk_size: int | None = None,
121
+ datanode_disk_type: str | None = None,
122
+ datanode_count: int = 1,
123
+ computenode_resource_preset: str | None = None,
124
+ computenode_disk_size: int | None = None,
125
+ computenode_disk_type: str | None = None,
126
+ computenode_count: int = 0,
127
+ computenode_max_hosts_count: int | None = None,
128
+ computenode_measurement_duration: int | None = None,
129
+ computenode_warmup_duration: int | None = None,
130
+ computenode_stabilization_duration: int | None = None,
131
+ computenode_preemptible: bool = False,
132
+ computenode_cpu_utilization_target: int | None = None,
133
+ computenode_decommission_timeout: int | None = None,
134
+ connection_id: str | None = None,
135
+ properties: dict[str, str] | None = None,
136
+ enable_ui_proxy: bool = False,
137
+ host_group_ids: Iterable[str] | None = None,
138
+ security_group_ids: Iterable[str] | None = None,
139
+ log_group_id: str | None = None,
140
+ initialization_actions: Iterable[InitializationAction] | None = None,
141
+ labels: dict[str, str] | None = None,
142
+ **kwargs,
143
+ ) -> None:
144
+ super().__init__(**kwargs)
145
+ self.folder_id = folder_id
146
+ self.yandex_conn_id = connection_id
147
+ self.cluster_name = cluster_name
148
+ self.cluster_description = cluster_description
149
+ self.cluster_image_version = cluster_image_version
150
+ self.ssh_public_keys = ssh_public_keys
151
+ self.subnet_id = subnet_id
152
+ self.services = services
153
+ self.s3_bucket = s3_bucket
154
+ self.zone = zone
155
+ self.service_account_id = service_account_id
156
+ self.masternode_resource_preset = masternode_resource_preset
157
+ self.masternode_disk_size = masternode_disk_size
158
+ self.masternode_disk_type = masternode_disk_type
159
+ self.datanode_resource_preset = datanode_resource_preset
160
+ self.datanode_disk_size = datanode_disk_size
161
+ self.datanode_disk_type = datanode_disk_type
162
+ self.datanode_count = datanode_count
163
+ self.computenode_resource_preset = computenode_resource_preset
164
+ self.computenode_disk_size = computenode_disk_size
165
+ self.computenode_disk_type = computenode_disk_type
166
+ self.computenode_count = computenode_count
167
+ self.computenode_max_hosts_count = computenode_max_hosts_count
168
+ self.computenode_measurement_duration = computenode_measurement_duration
169
+ self.computenode_warmup_duration = computenode_warmup_duration
170
+ self.computenode_stabilization_duration = computenode_stabilization_duration
171
+ self.computenode_preemptible = computenode_preemptible
172
+ self.computenode_cpu_utilization_target = computenode_cpu_utilization_target
173
+ self.computenode_decommission_timeout = computenode_decommission_timeout
174
+ self.properties = properties
175
+ self.enable_ui_proxy = enable_ui_proxy
176
+ self.host_group_ids = host_group_ids
177
+ self.security_group_ids = security_group_ids
178
+ self.log_group_id = log_group_id
179
+ self.initialization_actions = initialization_actions
180
+ self.labels = labels
181
+
182
+ self.hook: DataprocHook | None = None
183
+
184
+ def execute(self, context: Context) -> dict:
185
+ self.hook = DataprocHook(
186
+ yandex_conn_id=self.yandex_conn_id,
187
+ )
188
+ operation_result = self.hook.client.create_cluster(
189
+ folder_id=self.folder_id,
190
+ cluster_name=self.cluster_name,
191
+ cluster_description=self.cluster_description,
192
+ cluster_image_version=self.cluster_image_version,
193
+ ssh_public_keys=self.ssh_public_keys,
194
+ subnet_id=self.subnet_id,
195
+ services=self.services,
196
+ s3_bucket=self.s3_bucket,
197
+ zone=self.zone,
198
+ service_account_id=self.service_account_id or self.hook.default_service_account_id,
199
+ masternode_resource_preset=self.masternode_resource_preset,
200
+ masternode_disk_size=self.masternode_disk_size,
201
+ masternode_disk_type=self.masternode_disk_type,
202
+ datanode_resource_preset=self.datanode_resource_preset,
203
+ datanode_disk_size=self.datanode_disk_size,
204
+ datanode_disk_type=self.datanode_disk_type,
205
+ datanode_count=self.datanode_count,
206
+ computenode_resource_preset=self.computenode_resource_preset,
207
+ computenode_disk_size=self.computenode_disk_size,
208
+ computenode_disk_type=self.computenode_disk_type,
209
+ computenode_count=self.computenode_count,
210
+ computenode_max_hosts_count=self.computenode_max_hosts_count,
211
+ computenode_measurement_duration=self.computenode_measurement_duration,
212
+ computenode_warmup_duration=self.computenode_warmup_duration,
213
+ computenode_stabilization_duration=self.computenode_stabilization_duration,
214
+ computenode_preemptible=self.computenode_preemptible,
215
+ computenode_cpu_utilization_target=self.computenode_cpu_utilization_target,
216
+ computenode_decommission_timeout=self.computenode_decommission_timeout,
217
+ properties=self.properties,
218
+ enable_ui_proxy=self.enable_ui_proxy,
219
+ host_group_ids=self.host_group_ids,
220
+ security_group_ids=self.security_group_ids,
221
+ log_group_id=self.log_group_id,
222
+ labels=self.labels,
223
+ initialization_actions=self.initialization_actions
224
+ and [
225
+ self.hook.sdk.wrappers.InitializationAction(
226
+ uri=init_action.uri,
227
+ args=init_action.args,
228
+ timeout=init_action.timeout,
229
+ )
230
+ for init_action in self.initialization_actions
231
+ ],
232
+ )
233
+ cluster_id = operation_result.response.id
234
+
235
+ context["task_instance"].xcom_push(key="cluster_id", value=cluster_id)
236
+ # Deprecated
237
+ context["task_instance"].xcom_push(key="yandexcloud_connection_id", value=self.yandex_conn_id)
238
+ return cluster_id
239
+
240
+ @property
241
+ def cluster_id(self):
242
+ return self.output
243
+
244
+
245
+ class DataprocBaseOperator(BaseOperator):
246
+ """Base class for DataProc operators working with given cluster.
247
+
248
+ :param yandex_conn_id: ID of the Yandex.Cloud Airflow connection.
249
+ :param cluster_id: ID of the cluster to work with. (templated)
250
+ """
251
+
252
+ template_fields: Sequence[str] = ("cluster_id",)
253
+
254
+ def __init__(self, *, yandex_conn_id: str | None = None, cluster_id: str | None = None, **kwargs) -> None:
255
+ super().__init__(**kwargs)
256
+ self.cluster_id = cluster_id
257
+ self.yandex_conn_id = yandex_conn_id
258
+
259
+ def _setup(self, context: Context) -> DataprocHook:
260
+ if self.cluster_id is None:
261
+ self.cluster_id = context["task_instance"].xcom_pull(key="cluster_id")
262
+ if self.yandex_conn_id is None:
263
+ xcom_yandex_conn_id = context["task_instance"].xcom_pull(key="yandexcloud_connection_id")
264
+ if xcom_yandex_conn_id:
265
+ warnings.warn(
266
+ "Implicit pass of `yandex_conn_id` is deprecated, please pass it explicitly",
267
+ AirflowProviderDeprecationWarning,
268
+ stacklevel=2,
269
+ )
270
+ self.yandex_conn_id = xcom_yandex_conn_id
271
+
272
+ return DataprocHook(yandex_conn_id=self.yandex_conn_id)
273
+
274
+ def execute(self, context: Context):
275
+ raise NotImplementedError()
276
+
277
+
278
+ class DataprocDeleteClusterOperator(DataprocBaseOperator):
279
+ """Deletes Yandex.Cloud Data Proc cluster.
280
+
281
+ :param connection_id: ID of the Yandex.Cloud Airflow connection.
282
+ :param cluster_id: ID of the cluster to remove. (templated)
283
+ """
284
+
285
+ def __init__(self, *, connection_id: str | None = None, cluster_id: str | None = None, **kwargs) -> None:
286
+ super().__init__(yandex_conn_id=connection_id, cluster_id=cluster_id, **kwargs)
287
+
288
+ def execute(self, context: Context) -> None:
289
+ hook = self._setup(context)
290
+ hook.client.delete_cluster(self.cluster_id)
291
+
292
+
293
+ class DataprocCreateHiveJobOperator(DataprocBaseOperator):
294
+ """Runs Hive job in Data Proc cluster.
295
+
296
+ :param query: Hive query.
297
+ :param query_file_uri: URI of the script that contains Hive queries. Can be placed in HDFS or S3.
298
+ :param properties: A mapping of property names to values, used to configure Hive.
299
+ :param script_variables: Mapping of query variable names to values.
300
+ :param continue_on_failure: Whether to continue executing queries if a query fails.
301
+ :param name: Name of the job. Used for labeling.
302
+ :param cluster_id: ID of the cluster to run job in.
303
+ If not specified, the ID is pulled from XCom under the key ``cluster_id``. (templated)
304
+ :param connection_id: ID of the Yandex.Cloud Airflow connection.
305
+ """
306
+
307
+ def __init__(
308
+ self,
309
+ *,
310
+ query: str | None = None,
311
+ query_file_uri: str | None = None,
312
+ script_variables: dict[str, str] | None = None,
313
+ continue_on_failure: bool = False,
314
+ properties: dict[str, str] | None = None,
315
+ name: str = "Hive job",
316
+ cluster_id: str | None = None,
317
+ connection_id: str | None = None,
318
+ **kwargs,
319
+ ) -> None:
320
+ super().__init__(yandex_conn_id=connection_id, cluster_id=cluster_id, **kwargs)
321
+ self.query = query
322
+ self.query_file_uri = query_file_uri
323
+ self.script_variables = script_variables
324
+ self.continue_on_failure = continue_on_failure
325
+ self.properties = properties
326
+ self.name = name
327
+
328
+ def execute(self, context: Context) -> None:
329
+ hook = self._setup(context)
330
+ hook.client.create_hive_job(
331
+ query=self.query,
332
+ query_file_uri=self.query_file_uri,
333
+ script_variables=self.script_variables,
334
+ continue_on_failure=self.continue_on_failure,
335
+ properties=self.properties,
336
+ name=self.name,
337
+ cluster_id=self.cluster_id,
338
+ )
339
+
340
+
341
+ class DataprocCreateMapReduceJobOperator(DataprocBaseOperator):
342
+ """Runs Mapreduce job in Data Proc cluster.
343
+
344
+ :param main_jar_file_uri: URI of jar file with job.
345
+ Can be placed in HDFS or S3. Can be specified instead of main_class.
346
+ :param main_class: Name of the main class of the job. Can be specified instead of main_jar_file_uri.
347
+ :param file_uris: URIs of files used in the job. Can be placed in HDFS or S3.
348
+ :param archive_uris: URIs of archive files used in the job. Can be placed in HDFS or S3.
349
+ :param jar_file_uris: URIs of JAR files used in the job. Can be placed in HDFS or S3.
350
+ :param properties: Properties for the job.
351
+ :param args: Arguments to be passed to the job.
352
+ :param name: Name of the job. Used for labeling.
353
+ :param cluster_id: ID of the cluster to run job in.
354
+ If not specified, the ID is pulled from XCom under the key ``cluster_id``. (templated)
355
+ :param connection_id: ID of the Yandex.Cloud Airflow connection.
356
+ """
357
+
358
+ def __init__(
359
+ self,
360
+ *,
361
+ main_class: str | None = None,
362
+ main_jar_file_uri: str | None = None,
363
+ jar_file_uris: Iterable[str] | None = None,
364
+ archive_uris: Iterable[str] | None = None,
365
+ file_uris: Iterable[str] | None = None,
366
+ args: Iterable[str] | None = None,
367
+ properties: dict[str, str] | None = None,
368
+ name: str = "Mapreduce job",
369
+ cluster_id: str | None = None,
370
+ connection_id: str | None = None,
371
+ **kwargs,
372
+ ) -> None:
373
+ super().__init__(yandex_conn_id=connection_id, cluster_id=cluster_id, **kwargs)
374
+ self.main_class = main_class
375
+ self.main_jar_file_uri = main_jar_file_uri
376
+ self.jar_file_uris = jar_file_uris
377
+ self.archive_uris = archive_uris
378
+ self.file_uris = file_uris
379
+ self.args = args
380
+ self.properties = properties
381
+ self.name = name
382
+
383
+ def execute(self, context: Context) -> None:
384
+ hook = self._setup(context)
385
+ hook.client.create_mapreduce_job(
386
+ main_class=self.main_class,
387
+ main_jar_file_uri=self.main_jar_file_uri,
388
+ jar_file_uris=self.jar_file_uris,
389
+ archive_uris=self.archive_uris,
390
+ file_uris=self.file_uris,
391
+ args=self.args,
392
+ properties=self.properties,
393
+ name=self.name,
394
+ cluster_id=self.cluster_id,
395
+ )
396
+
397
+
398
+ class DataprocCreateSparkJobOperator(DataprocBaseOperator):
399
+ """Runs Spark job in Data Proc cluster.
400
+
401
+ :param main_jar_file_uri: URI of jar file with job. Can be placed in HDFS or S3.
402
+ :param main_class: Name of the main class of the job.
403
+ :param file_uris: URIs of files used in the job. Can be placed in HDFS or S3.
404
+ :param archive_uris: URIs of archive files used in the job. Can be placed in HDFS or S3.
405
+ :param jar_file_uris: URIs of JAR files used in the job. Can be placed in HDFS or S3.
406
+ :param properties: Properties for the job.
407
+ :param args: Arguments to be passed to the job.
408
+ :param name: Name of the job. Used for labeling.
409
+ :param cluster_id: ID of the cluster to run job in.
410
+ If not specified, the ID is pulled from XCom under the key ``cluster_id``. (templated)
411
+ :param connection_id: ID of the Yandex.Cloud Airflow connection.
412
+ :param packages: List of maven coordinates of jars to include on the driver and executor classpaths.
413
+ :param repositories: List of additional remote repositories to search for the maven coordinates
414
+ given with --packages.
415
+ :param exclude_packages: List of groupId:artifactId, to exclude while resolving the dependencies
416
+ provided in --packages to avoid dependency conflicts.
417
+ """
418
+
419
+ def __init__(
420
+ self,
421
+ *,
422
+ main_class: str | None = None,
423
+ main_jar_file_uri: str | None = None,
424
+ jar_file_uris: Iterable[str] | None = None,
425
+ archive_uris: Iterable[str] | None = None,
426
+ file_uris: Iterable[str] | None = None,
427
+ args: Iterable[str] | None = None,
428
+ properties: dict[str, str] | None = None,
429
+ name: str = "Spark job",
430
+ cluster_id: str | None = None,
431
+ connection_id: str | None = None,
432
+ packages: Iterable[str] | None = None,
433
+ repositories: Iterable[str] | None = None,
434
+ exclude_packages: Iterable[str] | None = None,
435
+ **kwargs,
436
+ ) -> None:
437
+ super().__init__(yandex_conn_id=connection_id, cluster_id=cluster_id, **kwargs)
438
+ self.main_class = main_class
439
+ self.main_jar_file_uri = main_jar_file_uri
440
+ self.jar_file_uris = jar_file_uris
441
+ self.archive_uris = archive_uris
442
+ self.file_uris = file_uris
443
+ self.args = args
444
+ self.properties = properties
445
+ self.name = name
446
+ self.packages = packages
447
+ self.repositories = repositories
448
+ self.exclude_packages = exclude_packages
449
+
450
+ def execute(self, context: Context) -> None:
451
+ hook = self._setup(context)
452
+ hook.client.create_spark_job(
453
+ main_class=self.main_class,
454
+ main_jar_file_uri=self.main_jar_file_uri,
455
+ jar_file_uris=self.jar_file_uris,
456
+ archive_uris=self.archive_uris,
457
+ file_uris=self.file_uris,
458
+ args=self.args,
459
+ properties=self.properties,
460
+ packages=self.packages,
461
+ repositories=self.repositories,
462
+ exclude_packages=self.exclude_packages,
463
+ name=self.name,
464
+ cluster_id=self.cluster_id,
465
+ )
466
+
467
+
468
+ class DataprocCreatePysparkJobOperator(DataprocBaseOperator):
469
+ """Runs Pyspark job in Data Proc cluster.
470
+
471
+ :param main_python_file_uri: URI of python file with job. Can be placed in HDFS or S3.
472
+ :param python_file_uris: URIs of python files used in the job. Can be placed in HDFS or S3.
473
+ :param file_uris: URIs of files used in the job. Can be placed in HDFS or S3.
474
+ :param archive_uris: URIs of archive files used in the job. Can be placed in HDFS or S3.
475
+ :param jar_file_uris: URIs of JAR files used in the job. Can be placed in HDFS or S3.
476
+ :param properties: Properties for the job.
477
+ :param args: Arguments to be passed to the job.
478
+ :param name: Name of the job. Used for labeling.
479
+ :param cluster_id: ID of the cluster to run job in.
480
+ If not specified, the ID is pulled from XCom under the key ``cluster_id``. (templated)
481
+ :param connection_id: ID of the Yandex.Cloud Airflow connection.
482
+ :param packages: List of maven coordinates of jars to include on the driver and executor classpaths.
483
+ :param repositories: List of additional remote repositories to search for the maven coordinates
484
+ given with --packages.
485
+ :param exclude_packages: List of groupId:artifactId, to exclude while resolving the dependencies
486
+ provided in --packages to avoid dependency conflicts.
487
+ """
488
+
489
+ def __init__(
490
+ self,
491
+ *,
492
+ main_python_file_uri: str | None = None,
493
+ python_file_uris: Iterable[str] | None = None,
494
+ jar_file_uris: Iterable[str] | None = None,
495
+ archive_uris: Iterable[str] | None = None,
496
+ file_uris: Iterable[str] | None = None,
497
+ args: Iterable[str] | None = None,
498
+ properties: dict[str, str] | None = None,
499
+ name: str = "Pyspark job",
500
+ cluster_id: str | None = None,
501
+ connection_id: str | None = None,
502
+ packages: Iterable[str] | None = None,
503
+ repositories: Iterable[str] | None = None,
504
+ exclude_packages: Iterable[str] | None = None,
505
+ **kwargs,
506
+ ) -> None:
507
+ super().__init__(yandex_conn_id=connection_id, cluster_id=cluster_id, **kwargs)
508
+ self.main_python_file_uri = main_python_file_uri
509
+ self.python_file_uris = python_file_uris
510
+ self.jar_file_uris = jar_file_uris
511
+ self.archive_uris = archive_uris
512
+ self.file_uris = file_uris
513
+ self.args = args
514
+ self.properties = properties
515
+ self.name = name
516
+ self.packages = packages
517
+ self.repositories = repositories
518
+ self.exclude_packages = exclude_packages
519
+
520
+ def execute(self, context: Context) -> None:
521
+ hook = self._setup(context)
522
+ hook.client.create_pyspark_job(
523
+ main_python_file_uri=self.main_python_file_uri,
524
+ python_file_uris=self.python_file_uris,
525
+ jar_file_uris=self.jar_file_uris,
526
+ archive_uris=self.archive_uris,
527
+ file_uris=self.file_uris,
528
+ args=self.args,
529
+ properties=self.properties,
530
+ packages=self.packages,
531
+ repositories=self.repositories,
532
+ exclude_packages=self.exclude_packages,
533
+ name=self.name,
534
+ cluster_id=self.cluster_id,
535
+ )
@@ -14,522 +14,17 @@
14
14
  # KIND, either express or implied. See the License for the
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
+ """This module is deprecated. Please use :mod:`airflow.providers.yandex.operators.dataproc` instead."""
18
+
17
19
  from __future__ import annotations
18
20
 
19
21
  import warnings
20
- from dataclasses import dataclass
21
- from typing import TYPE_CHECKING, Iterable, Sequence
22
22
 
23
23
  from airflow.exceptions import AirflowProviderDeprecationWarning
24
- from airflow.models import BaseOperator
25
- from airflow.providers.yandex.hooks.yandexcloud_dataproc import DataprocHook
26
-
27
- if TYPE_CHECKING:
28
- from airflow.utils.context import Context
29
-
30
-
31
- @dataclass
32
- class InitializationAction:
33
- """Data for initialization action to be run at start of DataProc cluster."""
34
-
35
- uri: str # Uri of the executable file
36
- args: Sequence[str] # Arguments to the initialization action
37
- timeout: int # Execution timeout
38
-
39
-
40
- class DataprocCreateClusterOperator(BaseOperator):
41
- """Creates Yandex.Cloud Data Proc cluster.
42
-
43
- :param folder_id: ID of the folder in which cluster should be created.
44
- :param cluster_name: Cluster name. Must be unique inside the folder.
45
- :param cluster_description: Cluster description.
46
- :param cluster_image_version: Cluster image version. Use default.
47
- :param ssh_public_keys: List of SSH public keys that will be deployed to created compute instances.
48
- :param subnet_id: ID of the subnetwork. All Data Proc cluster nodes will use one subnetwork.
49
- :param services: List of services that will be installed to the cluster. Possible options:
50
- HDFS, YARN, MAPREDUCE, HIVE, TEZ, ZOOKEEPER, HBASE, SQOOP, FLUME, SPARK, SPARK, ZEPPELIN, OOZIE
51
- :param s3_bucket: Yandex.Cloud S3 bucket to store cluster logs.
52
- Jobs will not work if the bucket is not specified.
53
- :param zone: Availability zone to create cluster in.
54
- Currently there are ru-central1-a, ru-central1-b and ru-central1-c.
55
- :param service_account_id: Service account id for the cluster.
56
- Service account can be created inside the folder.
57
- :param masternode_resource_preset: Resources preset (CPU+RAM configuration)
58
- for the primary node of the cluster.
59
- :param masternode_disk_size: Masternode storage size in GiB.
60
- :param masternode_disk_type: Masternode storage type. Possible options: network-ssd, network-hdd.
61
- :param datanode_resource_preset: Resources preset (CPU+RAM configuration)
62
- for the data nodes of the cluster.
63
- :param datanode_disk_size: Datanodes storage size in GiB.
64
- :param datanode_disk_type: Datanodes storage type. Possible options: network-ssd, network-hdd.
65
- :param computenode_resource_preset: Resources preset (CPU+RAM configuration)
66
- for the compute nodes of the cluster.
67
- :param computenode_disk_size: Computenodes storage size in GiB.
68
- :param computenode_disk_type: Computenodes storage type. Possible options: network-ssd, network-hdd.
69
- :param connection_id: ID of the Yandex.Cloud Airflow connection.
70
- :param computenode_max_count: Maximum number of nodes of compute autoscaling subcluster.
71
- :param computenode_warmup_duration: The warmup time of the instance in seconds. During this time,
72
- traffic is sent to the instance,
73
- but instance metrics are not collected. In seconds.
74
- :param computenode_stabilization_duration: Minimum amount of time in seconds for monitoring before
75
- Instance Groups can reduce the number of instances in the group.
76
- During this time, the group size doesn't decrease,
77
- even if the new metric values indicate that it should. In seconds.
78
- :param computenode_preemptible: Preemptible instances are stopped at least once every 24 hours,
79
- and can be stopped at any time if their resources are needed by Compute.
80
- :param computenode_cpu_utilization_target: Defines an autoscaling rule
81
- based on the average CPU utilization of the instance group.
82
- in percents. 10-100.
83
- By default is not set and default autoscaling strategy is used.
84
- :param computenode_decommission_timeout: Timeout to gracefully decommission nodes during downscaling.
85
- In seconds
86
- :param properties: Properties passed to main node software.
87
- Docs: https://cloud.yandex.com/docs/data-proc/concepts/settings-list
88
- :param enable_ui_proxy: Enable UI Proxy feature for forwarding Hadoop components web interfaces
89
- Docs: https://cloud.yandex.com/docs/data-proc/concepts/ui-proxy
90
- :param host_group_ids: Dedicated host groups to place VMs of cluster on.
91
- Docs: https://cloud.yandex.com/docs/compute/concepts/dedicated-host
92
- :param security_group_ids: User security groups.
93
- Docs: https://cloud.yandex.com/docs/data-proc/concepts/network#security-groups
94
- :param log_group_id: Id of log group to write logs. By default logs will be sent to default log group.
95
- To disable cloud log sending set cluster property dataproc:disable_cloud_logging = true
96
- Docs: https://cloud.yandex.com/docs/data-proc/concepts/logs
97
- :param initialization_actions: Set of init-actions to run when cluster starts.
98
- Docs: https://cloud.yandex.com/docs/data-proc/concepts/init-action
99
- :param labels: Cluster labels as key:value pairs. No more than 64 per resource.
100
- Docs: https://cloud.yandex.com/docs/resource-manager/concepts/labels
101
- """
102
-
103
- def __init__(
104
- self,
105
- *,
106
- folder_id: str | None = None,
107
- cluster_name: str | None = None,
108
- cluster_description: str | None = "",
109
- cluster_image_version: str | None = None,
110
- ssh_public_keys: str | Iterable[str] | None = None,
111
- subnet_id: str | None = None,
112
- services: Iterable[str] = ("HDFS", "YARN", "MAPREDUCE", "HIVE", "SPARK"),
113
- s3_bucket: str | None = None,
114
- zone: str = "ru-central1-b",
115
- service_account_id: str | None = None,
116
- masternode_resource_preset: str | None = None,
117
- masternode_disk_size: int | None = None,
118
- masternode_disk_type: str | None = None,
119
- datanode_resource_preset: str | None = None,
120
- datanode_disk_size: int | None = None,
121
- datanode_disk_type: str | None = None,
122
- datanode_count: int = 1,
123
- computenode_resource_preset: str | None = None,
124
- computenode_disk_size: int | None = None,
125
- computenode_disk_type: str | None = None,
126
- computenode_count: int = 0,
127
- computenode_max_hosts_count: int | None = None,
128
- computenode_measurement_duration: int | None = None,
129
- computenode_warmup_duration: int | None = None,
130
- computenode_stabilization_duration: int | None = None,
131
- computenode_preemptible: bool = False,
132
- computenode_cpu_utilization_target: int | None = None,
133
- computenode_decommission_timeout: int | None = None,
134
- connection_id: str | None = None,
135
- properties: dict[str, str] | None = None,
136
- enable_ui_proxy: bool = False,
137
- host_group_ids: Iterable[str] | None = None,
138
- security_group_ids: Iterable[str] | None = None,
139
- log_group_id: str | None = None,
140
- initialization_actions: Iterable[InitializationAction] | None = None,
141
- labels: dict[str, str] | None = None,
142
- **kwargs,
143
- ) -> None:
144
- super().__init__(**kwargs)
145
- self.folder_id = folder_id
146
- self.yandex_conn_id = connection_id
147
- self.cluster_name = cluster_name
148
- self.cluster_description = cluster_description
149
- self.cluster_image_version = cluster_image_version
150
- self.ssh_public_keys = ssh_public_keys
151
- self.subnet_id = subnet_id
152
- self.services = services
153
- self.s3_bucket = s3_bucket
154
- self.zone = zone
155
- self.service_account_id = service_account_id
156
- self.masternode_resource_preset = masternode_resource_preset
157
- self.masternode_disk_size = masternode_disk_size
158
- self.masternode_disk_type = masternode_disk_type
159
- self.datanode_resource_preset = datanode_resource_preset
160
- self.datanode_disk_size = datanode_disk_size
161
- self.datanode_disk_type = datanode_disk_type
162
- self.datanode_count = datanode_count
163
- self.computenode_resource_preset = computenode_resource_preset
164
- self.computenode_disk_size = computenode_disk_size
165
- self.computenode_disk_type = computenode_disk_type
166
- self.computenode_count = computenode_count
167
- self.computenode_max_hosts_count = computenode_max_hosts_count
168
- self.computenode_measurement_duration = computenode_measurement_duration
169
- self.computenode_warmup_duration = computenode_warmup_duration
170
- self.computenode_stabilization_duration = computenode_stabilization_duration
171
- self.computenode_preemptible = computenode_preemptible
172
- self.computenode_cpu_utilization_target = computenode_cpu_utilization_target
173
- self.computenode_decommission_timeout = computenode_decommission_timeout
174
- self.properties = properties
175
- self.enable_ui_proxy = enable_ui_proxy
176
- self.host_group_ids = host_group_ids
177
- self.security_group_ids = security_group_ids
178
- self.log_group_id = log_group_id
179
- self.initialization_actions = initialization_actions
180
- self.labels = labels
181
-
182
- self.hook: DataprocHook | None = None
183
-
184
- def execute(self, context: Context) -> dict:
185
- self.hook = DataprocHook(
186
- yandex_conn_id=self.yandex_conn_id,
187
- )
188
- operation_result = self.hook.client.create_cluster(
189
- folder_id=self.folder_id,
190
- cluster_name=self.cluster_name,
191
- cluster_description=self.cluster_description,
192
- cluster_image_version=self.cluster_image_version,
193
- ssh_public_keys=self.ssh_public_keys,
194
- subnet_id=self.subnet_id,
195
- services=self.services,
196
- s3_bucket=self.s3_bucket,
197
- zone=self.zone,
198
- service_account_id=self.service_account_id or self.hook.default_service_account_id,
199
- masternode_resource_preset=self.masternode_resource_preset,
200
- masternode_disk_size=self.masternode_disk_size,
201
- masternode_disk_type=self.masternode_disk_type,
202
- datanode_resource_preset=self.datanode_resource_preset,
203
- datanode_disk_size=self.datanode_disk_size,
204
- datanode_disk_type=self.datanode_disk_type,
205
- datanode_count=self.datanode_count,
206
- computenode_resource_preset=self.computenode_resource_preset,
207
- computenode_disk_size=self.computenode_disk_size,
208
- computenode_disk_type=self.computenode_disk_type,
209
- computenode_count=self.computenode_count,
210
- computenode_max_hosts_count=self.computenode_max_hosts_count,
211
- computenode_measurement_duration=self.computenode_measurement_duration,
212
- computenode_warmup_duration=self.computenode_warmup_duration,
213
- computenode_stabilization_duration=self.computenode_stabilization_duration,
214
- computenode_preemptible=self.computenode_preemptible,
215
- computenode_cpu_utilization_target=self.computenode_cpu_utilization_target,
216
- computenode_decommission_timeout=self.computenode_decommission_timeout,
217
- properties=self.properties,
218
- enable_ui_proxy=self.enable_ui_proxy,
219
- host_group_ids=self.host_group_ids,
220
- security_group_ids=self.security_group_ids,
221
- log_group_id=self.log_group_id,
222
- labels=self.labels,
223
- initialization_actions=self.initialization_actions
224
- and [
225
- self.hook.sdk.wrappers.InitializationAction(
226
- uri=init_action.uri,
227
- args=init_action.args,
228
- timeout=init_action.timeout,
229
- )
230
- for init_action in self.initialization_actions
231
- ],
232
- )
233
- cluster_id = operation_result.response.id
234
-
235
- context["task_instance"].xcom_push(key="cluster_id", value=cluster_id)
236
- # Deprecated
237
- context["task_instance"].xcom_push(key="yandexcloud_connection_id", value=self.yandex_conn_id)
238
- return cluster_id
239
-
240
- @property
241
- def cluster_id(self):
242
- return self.output
243
-
244
-
245
- class DataprocBaseOperator(BaseOperator):
246
- """Base class for DataProc operators working with given cluster.
247
-
248
- :param connection_id: ID of the Yandex.Cloud Airflow connection.
249
- :param cluster_id: ID of the cluster to remove. (templated)
250
- """
251
-
252
- template_fields: Sequence[str] = ("cluster_id",)
253
-
254
- def __init__(self, *, yandex_conn_id: str | None = None, cluster_id: str | None = None, **kwargs) -> None:
255
- super().__init__(**kwargs)
256
- self.cluster_id = cluster_id
257
- self.yandex_conn_id = yandex_conn_id
258
-
259
- def _setup(self, context: Context) -> DataprocHook:
260
- if self.cluster_id is None:
261
- self.cluster_id = context["task_instance"].xcom_pull(key="cluster_id")
262
- if self.yandex_conn_id is None:
263
- xcom_yandex_conn_id = context["task_instance"].xcom_pull(key="yandexcloud_connection_id")
264
- if xcom_yandex_conn_id:
265
- warnings.warn(
266
- "Implicit pass of `yandex_conn_id` is deprecated, please pass it explicitly",
267
- AirflowProviderDeprecationWarning,
268
- stacklevel=2,
269
- )
270
- self.yandex_conn_id = xcom_yandex_conn_id
271
-
272
- return DataprocHook(yandex_conn_id=self.yandex_conn_id)
273
-
274
- def execute(self, context: Context):
275
- raise NotImplementedError()
276
-
277
-
278
- class DataprocDeleteClusterOperator(DataprocBaseOperator):
279
- """Deletes Yandex.Cloud Data Proc cluster.
280
-
281
- :param connection_id: ID of the Yandex.Cloud Airflow connection.
282
- :param cluster_id: ID of the cluster to remove. (templated)
283
- """
284
-
285
- def __init__(self, *, connection_id: str | None = None, cluster_id: str | None = None, **kwargs) -> None:
286
- super().__init__(yandex_conn_id=connection_id, cluster_id=cluster_id, **kwargs)
287
-
288
- def execute(self, context: Context) -> None:
289
- hook = self._setup(context)
290
- hook.client.delete_cluster(self.cluster_id)
291
-
292
-
293
- class DataprocCreateHiveJobOperator(DataprocBaseOperator):
294
- """Runs Hive job in Data Proc cluster.
295
-
296
- :param query: Hive query.
297
- :param query_file_uri: URI of the script that contains Hive queries. Can be placed in HDFS or S3.
298
- :param properties: A mapping of property names to values, used to configure Hive.
299
- :param script_variables: Mapping of query variable names to values.
300
- :param continue_on_failure: Whether to continue executing queries if a query fails.
301
- :param name: Name of the job. Used for labeling.
302
- :param cluster_id: ID of the cluster to run job in.
303
- Will try to take the ID from Dataproc Hook object if it's specified. (templated)
304
- :param connection_id: ID of the Yandex.Cloud Airflow connection.
305
- """
306
-
307
- def __init__(
308
- self,
309
- *,
310
- query: str | None = None,
311
- query_file_uri: str | None = None,
312
- script_variables: dict[str, str] | None = None,
313
- continue_on_failure: bool = False,
314
- properties: dict[str, str] | None = None,
315
- name: str = "Hive job",
316
- cluster_id: str | None = None,
317
- connection_id: str | None = None,
318
- **kwargs,
319
- ) -> None:
320
- super().__init__(yandex_conn_id=connection_id, cluster_id=cluster_id, **kwargs)
321
- self.query = query
322
- self.query_file_uri = query_file_uri
323
- self.script_variables = script_variables
324
- self.continue_on_failure = continue_on_failure
325
- self.properties = properties
326
- self.name = name
327
-
328
- def execute(self, context: Context) -> None:
329
- hook = self._setup(context)
330
- hook.client.create_hive_job(
331
- query=self.query,
332
- query_file_uri=self.query_file_uri,
333
- script_variables=self.script_variables,
334
- continue_on_failure=self.continue_on_failure,
335
- properties=self.properties,
336
- name=self.name,
337
- cluster_id=self.cluster_id,
338
- )
339
-
340
-
341
- class DataprocCreateMapReduceJobOperator(DataprocBaseOperator):
342
- """Runs Mapreduce job in Data Proc cluster.
343
-
344
- :param main_jar_file_uri: URI of jar file with job.
345
- Can be placed in HDFS or S3. Can be specified instead of main_class.
346
- :param main_class: Name of the main class of the job. Can be specified instead of main_jar_file_uri.
347
- :param file_uris: URIs of files used in the job. Can be placed in HDFS or S3.
348
- :param archive_uris: URIs of archive files used in the job. Can be placed in HDFS or S3.
349
- :param jar_file_uris: URIs of JAR files used in the job. Can be placed in HDFS or S3.
350
- :param properties: Properties for the job.
351
- :param args: Arguments to be passed to the job.
352
- :param name: Name of the job. Used for labeling.
353
- :param cluster_id: ID of the cluster to run job in.
354
- Will try to take the ID from Dataproc Hook object if it's specified. (templated)
355
- :param connection_id: ID of the Yandex.Cloud Airflow connection.
356
- """
357
-
358
- def __init__(
359
- self,
360
- *,
361
- main_class: str | None = None,
362
- main_jar_file_uri: str | None = None,
363
- jar_file_uris: Iterable[str] | None = None,
364
- archive_uris: Iterable[str] | None = None,
365
- file_uris: Iterable[str] | None = None,
366
- args: Iterable[str] | None = None,
367
- properties: dict[str, str] | None = None,
368
- name: str = "Mapreduce job",
369
- cluster_id: str | None = None,
370
- connection_id: str | None = None,
371
- **kwargs,
372
- ) -> None:
373
- super().__init__(yandex_conn_id=connection_id, cluster_id=cluster_id, **kwargs)
374
- self.main_class = main_class
375
- self.main_jar_file_uri = main_jar_file_uri
376
- self.jar_file_uris = jar_file_uris
377
- self.archive_uris = archive_uris
378
- self.file_uris = file_uris
379
- self.args = args
380
- self.properties = properties
381
- self.name = name
382
-
383
- def execute(self, context: Context) -> None:
384
- hook = self._setup(context)
385
- hook.client.create_mapreduce_job(
386
- main_class=self.main_class,
387
- main_jar_file_uri=self.main_jar_file_uri,
388
- jar_file_uris=self.jar_file_uris,
389
- archive_uris=self.archive_uris,
390
- file_uris=self.file_uris,
391
- args=self.args,
392
- properties=self.properties,
393
- name=self.name,
394
- cluster_id=self.cluster_id,
395
- )
396
-
397
-
398
- class DataprocCreateSparkJobOperator(DataprocBaseOperator):
399
- """Runs Spark job in Data Proc cluster.
400
-
401
- :param main_jar_file_uri: URI of jar file with job. Can be placed in HDFS or S3.
402
- :param main_class: Name of the main class of the job.
403
- :param file_uris: URIs of files used in the job. Can be placed in HDFS or S3.
404
- :param archive_uris: URIs of archive files used in the job. Can be placed in HDFS or S3.
405
- :param jar_file_uris: URIs of JAR files used in the job. Can be placed in HDFS or S3.
406
- :param properties: Properties for the job.
407
- :param args: Arguments to be passed to the job.
408
- :param name: Name of the job. Used for labeling.
409
- :param cluster_id: ID of the cluster to run job in.
410
- Will try to take the ID from Dataproc Hook object if it's specified. (templated)
411
- :param connection_id: ID of the Yandex.Cloud Airflow connection.
412
- :param packages: List of maven coordinates of jars to include on the driver and executor classpaths.
413
- :param repositories: List of additional remote repositories to search for the maven coordinates
414
- given with --packages.
415
- :param exclude_packages: List of groupId:artifactId, to exclude while resolving the dependencies
416
- provided in --packages to avoid dependency conflicts.
417
- """
418
-
419
- def __init__(
420
- self,
421
- *,
422
- main_class: str | None = None,
423
- main_jar_file_uri: str | None = None,
424
- jar_file_uris: Iterable[str] | None = None,
425
- archive_uris: Iterable[str] | None = None,
426
- file_uris: Iterable[str] | None = None,
427
- args: Iterable[str] | None = None,
428
- properties: dict[str, str] | None = None,
429
- name: str = "Spark job",
430
- cluster_id: str | None = None,
431
- connection_id: str | None = None,
432
- packages: Iterable[str] | None = None,
433
- repositories: Iterable[str] | None = None,
434
- exclude_packages: Iterable[str] | None = None,
435
- **kwargs,
436
- ) -> None:
437
- super().__init__(yandex_conn_id=connection_id, cluster_id=cluster_id, **kwargs)
438
- self.main_class = main_class
439
- self.main_jar_file_uri = main_jar_file_uri
440
- self.jar_file_uris = jar_file_uris
441
- self.archive_uris = archive_uris
442
- self.file_uris = file_uris
443
- self.args = args
444
- self.properties = properties
445
- self.name = name
446
- self.packages = packages
447
- self.repositories = repositories
448
- self.exclude_packages = exclude_packages
449
-
450
- def execute(self, context: Context) -> None:
451
- hook = self._setup(context)
452
- hook.client.create_spark_job(
453
- main_class=self.main_class,
454
- main_jar_file_uri=self.main_jar_file_uri,
455
- jar_file_uris=self.jar_file_uris,
456
- archive_uris=self.archive_uris,
457
- file_uris=self.file_uris,
458
- args=self.args,
459
- properties=self.properties,
460
- packages=self.packages,
461
- repositories=self.repositories,
462
- exclude_packages=self.exclude_packages,
463
- name=self.name,
464
- cluster_id=self.cluster_id,
465
- )
466
-
467
-
468
- class DataprocCreatePysparkJobOperator(DataprocBaseOperator):
469
- """Runs Pyspark job in Data Proc cluster.
470
-
471
- :param main_python_file_uri: URI of python file with job. Can be placed in HDFS or S3.
472
- :param python_file_uris: URIs of python files used in the job. Can be placed in HDFS or S3.
473
- :param file_uris: URIs of files used in the job. Can be placed in HDFS or S3.
474
- :param archive_uris: URIs of archive files used in the job. Can be placed in HDFS or S3.
475
- :param jar_file_uris: URIs of JAR files used in the job. Can be placed in HDFS or S3.
476
- :param properties: Properties for the job.
477
- :param args: Arguments to be passed to the job.
478
- :param name: Name of the job. Used for labeling.
479
- :param cluster_id: ID of the cluster to run job in.
480
- Will try to take the ID from Dataproc Hook object if it's specified. (templated)
481
- :param connection_id: ID of the Yandex.Cloud Airflow connection.
482
- :param packages: List of maven coordinates of jars to include on the driver and executor classpaths.
483
- :param repositories: List of additional remote repositories to search for the maven coordinates
484
- given with --packages.
485
- :param exclude_packages: List of groupId:artifactId, to exclude while resolving the dependencies
486
- provided in --packages to avoid dependency conflicts.
487
- """
488
-
489
- def __init__(
490
- self,
491
- *,
492
- main_python_file_uri: str | None = None,
493
- python_file_uris: Iterable[str] | None = None,
494
- jar_file_uris: Iterable[str] | None = None,
495
- archive_uris: Iterable[str] | None = None,
496
- file_uris: Iterable[str] | None = None,
497
- args: Iterable[str] | None = None,
498
- properties: dict[str, str] | None = None,
499
- name: str = "Pyspark job",
500
- cluster_id: str | None = None,
501
- connection_id: str | None = None,
502
- packages: Iterable[str] | None = None,
503
- repositories: Iterable[str] | None = None,
504
- exclude_packages: Iterable[str] | None = None,
505
- **kwargs,
506
- ) -> None:
507
- super().__init__(yandex_conn_id=connection_id, cluster_id=cluster_id, **kwargs)
508
- self.main_python_file_uri = main_python_file_uri
509
- self.python_file_uris = python_file_uris
510
- self.jar_file_uris = jar_file_uris
511
- self.archive_uris = archive_uris
512
- self.file_uris = file_uris
513
- self.args = args
514
- self.properties = properties
515
- self.name = name
516
- self.packages = packages
517
- self.repositories = repositories
518
- self.exclude_packages = exclude_packages
24
+ from airflow.providers.yandex.operators.dataproc import * # noqa: F403
519
25
 
520
- def execute(self, context: Context) -> None:
521
- hook = self._setup(context)
522
- hook.client.create_pyspark_job(
523
- main_python_file_uri=self.main_python_file_uri,
524
- python_file_uris=self.python_file_uris,
525
- jar_file_uris=self.jar_file_uris,
526
- archive_uris=self.archive_uris,
527
- file_uris=self.file_uris,
528
- args=self.args,
529
- properties=self.properties,
530
- packages=self.packages,
531
- repositories=self.repositories,
532
- exclude_packages=self.exclude_packages,
533
- name=self.name,
534
- cluster_id=self.cluster_id,
535
- )
26
+ warnings.warn(
27
+ "This module is deprecated. Please use `airflow.providers.yandex.operators.dataproc` instead.",
28
+ AirflowProviderDeprecationWarning,
29
+ stacklevel=2,
30
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: apache-airflow-providers-yandex
3
- Version: 3.11.0rc1
3
+ Version: 3.11.1
4
4
  Summary: Provider package apache-airflow-providers-yandex for Apache Airflow
5
5
  Keywords: airflow-provider,yandex,airflow,integration
6
6
  Author-email: Apache Software Foundation <dev@airflow.apache.org>
@@ -21,12 +21,12 @@ Classifier: Programming Language :: Python :: 3.10
21
21
  Classifier: Programming Language :: Python :: 3.11
22
22
  Classifier: Programming Language :: Python :: 3.12
23
23
  Classifier: Topic :: System :: Monitoring
24
- Requires-Dist: apache-airflow>=2.7.0rc0
25
- Requires-Dist: yandex-query-client>=0.1.2
26
- Requires-Dist: yandexcloud>=0.228.0
24
+ Requires-Dist: apache-airflow>=2.7.0
25
+ Requires-Dist: yandex-query-client>=0.1.4
26
+ Requires-Dist: yandexcloud>=0.278.0
27
27
  Project-URL: Bug Tracker, https://github.com/apache/airflow/issues
28
- Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-yandex/3.11.0/changelog.html
29
- Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-yandex/3.11.0
28
+ Project-URL: Changelog, https://airflow.apache.org/docs/apache-airflow-providers-yandex/3.11.1/changelog.html
29
+ Project-URL: Documentation, https://airflow.apache.org/docs/apache-airflow-providers-yandex/3.11.1
30
30
  Project-URL: Slack Chat, https://s.apache.org/airflow-slack
31
31
  Project-URL: Source Code, https://github.com/apache/airflow
32
32
  Project-URL: Twitter, https://twitter.com/ApacheAirflow
@@ -76,7 +76,7 @@ Project-URL: YouTube, https://www.youtube.com/channel/UCSXwxpWZQ7XZ1WL3wqevChA/
76
76
 
77
77
  Package ``apache-airflow-providers-yandex``
78
78
 
79
- Release: ``3.11.0.rc1``
79
+ Release: ``3.11.1``
80
80
 
81
81
 
82
82
  This package is for Yandex, including:
@@ -91,7 +91,7 @@ This is a provider package for ``yandex`` provider. All classes for this provide
91
91
  are in ``airflow.providers.yandex`` python package.
92
92
 
93
93
  You can find package information and changelog for the provider
94
- in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-yandex/3.11.0/>`_.
94
+ in the `documentation <https://airflow.apache.org/docs/apache-airflow-providers-yandex/3.11.1/>`_.
95
95
 
96
96
  Installation
97
97
  ------------
@@ -109,9 +109,9 @@ Requirements
109
109
  PIP package Version required
110
110
  ======================= ==================
111
111
  ``apache-airflow`` ``>=2.7.0``
112
- ``yandexcloud`` ``>=0.228.0``
113
- ``yandex-query-client`` ``>=0.1.2``
112
+ ``yandexcloud`` ``>=0.278.0``
113
+ ``yandex-query-client`` ``>=0.1.4``
114
114
  ======================= ==================
115
115
 
116
116
  The changelog for the provider package can be found in the
117
- `changelog <https://airflow.apache.org/docs/apache-airflow-providers-yandex/3.11.0/changelog.html>`_.
117
+ `changelog <https://airflow.apache.org/docs/apache-airflow-providers-yandex/3.11.1/changelog.html>`_.
@@ -1,14 +1,16 @@
1
1
  airflow/providers/yandex/LICENSE,sha256=ywUBpKZc7Jb96rVt5I3IDbg7dIJAbUSHkuoDcF3jbH4,13569
2
- airflow/providers/yandex/__init__.py,sha256=Yt71vCxPPsxO_2P9J0vcegEAAkCHBPzVGqnrAqlc1sM,1582
3
- airflow/providers/yandex/get_provider_info.py,sha256=UphQCZS6MagPObXUU2wMT4hyjz-0Ct148R3I8mYKkFk,4707
2
+ airflow/providers/yandex/__init__.py,sha256=YDLtjjlu9HOYq1ZymV8A6FrVMGCaF4EmVDyWYaMaZMo,1494
3
+ airflow/providers/yandex/get_provider_info.py,sha256=z_jO2yskCqIQzWSD_O0ZQ1uWbCR2AV9Ve2mFcGQWbjg,4717
4
4
  airflow/providers/yandex/hooks/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
5
+ airflow/providers/yandex/hooks/dataproc.py,sha256=1UdqxDMI7uL6fNkG6oU6l2tFITF_nHXiV1VUgRqF7KY,1379
5
6
  airflow/providers/yandex/hooks/yandex.py,sha256=xJMUzGo0sNpb5-LQvgq6jDxWHK3XkNzlpoeEELREeow,7097
6
- airflow/providers/yandex/hooks/yandexcloud_dataproc.py,sha256=1UdqxDMI7uL6fNkG6oU6l2tFITF_nHXiV1VUgRqF7KY,1379
7
- airflow/providers/yandex/hooks/yq.py,sha256=WpKL_Ic1BkqLU4JX8Lv8oPRk5RVXmHLMmL34AxTo_BU,3978
7
+ airflow/providers/yandex/hooks/yandexcloud_dataproc.py,sha256=-JVJm3YLkDbJZKauCR1oCnWNkdLUJa1Fj_5HmZq1f44,1243
8
+ airflow/providers/yandex/hooks/yq.py,sha256=a1J5y-LocaG89cy-A9hgbzLmVbmWrpPwjRgss-KaYVg,3477
8
9
  airflow/providers/yandex/links/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
9
10
  airflow/providers/yandex/links/yq.py,sha256=jsy3liqQFk1eSSdK9YDbor0Epp7ng_q2ueVIwsD2i-8,1578
10
11
  airflow/providers/yandex/operators/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
11
- airflow/providers/yandex/operators/yandexcloud_dataproc.py,sha256=0-g6AzP0KiQ6pJFLMFXHtB1YFaUPkl_4FQJZyH0ce9E,25957
12
+ airflow/providers/yandex/operators/dataproc.py,sha256=QJc7UvBNPhAUBsuYQ4H8Wf0LpZP_-kCw7RdI0n3P_Bs,25945
13
+ airflow/providers/yandex/operators/yandexcloud_dataproc.py,sha256=bDLMwevS5spRfVEtixdKhQTC9gqDMm9himLrRohJwKQ,1255
12
14
  airflow/providers/yandex/operators/yq.py,sha256=lGqbogakylV4s5D5movQRL4v3IU2Qt1JHH8ygo3Hd2Q,3223
13
15
  airflow/providers/yandex/secrets/__init__.py,sha256=9hdXHABrVpkbpjZgUft39kOFL2xSGeG4GEua0Hmelus,785
14
16
  airflow/providers/yandex/secrets/lockbox.py,sha256=9Vi95RXd6VT36Rh1PVMUfFzm42oyrlMl77DoL9ivxVc,12161
@@ -17,7 +19,7 @@ airflow/providers/yandex/utils/credentials.py,sha256=6McJIitAuTROJRUSKTdWChfcZ9o
17
19
  airflow/providers/yandex/utils/defaults.py,sha256=CXt75MhGJe8echoDpl1vR4VG5bEvYDDjIHmFqckDh2w,950
18
20
  airflow/providers/yandex/utils/fields.py,sha256=1D8SDWH8h0djj5Hnk50w6BpPeNJyP-689Qfjpkr-yCg,1728
19
21
  airflow/providers/yandex/utils/user_agent.py,sha256=AC-WEzhjxkgUYOy4LdX2-nnUZdMhKRRUCJ2_TjfNm6k,1839
20
- apache_airflow_providers_yandex-3.11.0rc1.dist-info/entry_points.txt,sha256=ApXKRkvdgU2QNSQovjewC0b-LptwfBGBnJB3LTgBNx8,102
21
- apache_airflow_providers_yandex-3.11.0rc1.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
22
- apache_airflow_providers_yandex-3.11.0rc1.dist-info/METADATA,sha256=YrzHIxxUERD5JzvDmyXV6NRjyJlhNydlYaedzq5u7Pc,4919
23
- apache_airflow_providers_yandex-3.11.0rc1.dist-info/RECORD,,
22
+ apache_airflow_providers_yandex-3.11.1.dist-info/entry_points.txt,sha256=ApXKRkvdgU2QNSQovjewC0b-LptwfBGBnJB3LTgBNx8,102
23
+ apache_airflow_providers_yandex-3.11.1.dist-info/WHEEL,sha256=EZbGkh7Ie4PoZfRQ8I0ZuP9VklN_TvcZ6DSE5Uar4z4,81
24
+ apache_airflow_providers_yandex-3.11.1.dist-info/METADATA,sha256=Teoe7nxOttI_AsbkeAB_hlWBkF972Mm8KyyX1iFWcrs,4909
25
+ apache_airflow_providers_yandex-3.11.1.dist-info/RECORD,,