apache-airflow-providers-google 11.0.0rc1__py3-none-any.whl → 12.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airflow/providers/google/__init__.py +3 -3
- airflow/providers/google/assets/gcs.py +1 -7
- airflow/providers/google/cloud/hooks/alloy_db.py +289 -0
- airflow/providers/google/cloud/hooks/cloud_batch.py +13 -5
- airflow/providers/google/cloud/hooks/dataproc.py +7 -3
- airflow/providers/google/cloud/hooks/dataproc_metastore.py +41 -22
- airflow/providers/google/cloud/hooks/kubernetes_engine.py +7 -38
- airflow/providers/google/cloud/hooks/translate.py +355 -0
- airflow/providers/google/cloud/hooks/vertex_ai/feature_store.py +147 -0
- airflow/providers/google/cloud/hooks/vertex_ai/generative_model.py +10 -0
- airflow/providers/google/cloud/links/alloy_db.py +55 -0
- airflow/providers/google/cloud/links/translate.py +98 -0
- airflow/providers/google/cloud/log/stackdriver_task_handler.py +1 -5
- airflow/providers/google/cloud/openlineage/mixins.py +4 -12
- airflow/providers/google/cloud/openlineage/utils.py +200 -22
- airflow/providers/google/cloud/operators/alloy_db.py +459 -0
- airflow/providers/google/cloud/operators/automl.py +55 -44
- airflow/providers/google/cloud/operators/bigquery.py +60 -15
- airflow/providers/google/cloud/operators/dataproc.py +12 -0
- airflow/providers/google/cloud/operators/gcs.py +5 -14
- airflow/providers/google/cloud/operators/kubernetes_engine.py +377 -705
- airflow/providers/google/cloud/operators/mlengine.py +41 -31
- airflow/providers/google/cloud/operators/translate.py +586 -1
- airflow/providers/google/cloud/operators/vertex_ai/feature_store.py +163 -0
- airflow/providers/google/cloud/operators/vertex_ai/generative_model.py +5 -0
- airflow/providers/google/cloud/sensors/dataproc.py +2 -2
- airflow/providers/google/cloud/sensors/vertex_ai/__init__.py +16 -0
- airflow/providers/google/cloud/sensors/vertex_ai/feature_store.py +112 -0
- airflow/providers/google/cloud/transfers/bigquery_to_gcs.py +6 -11
- airflow/providers/google/cloud/transfers/bigquery_to_mssql.py +3 -0
- airflow/providers/google/cloud/transfers/bigquery_to_mysql.py +3 -0
- airflow/providers/google/cloud/transfers/gcs_to_bigquery.py +5 -10
- airflow/providers/google/cloud/transfers/gcs_to_gcs.py +3 -15
- airflow/providers/google/cloud/transfers/gcs_to_local.py +9 -0
- airflow/providers/google/cloud/transfers/local_to_gcs.py +41 -6
- airflow/providers/google/cloud/transfers/s3_to_gcs.py +15 -0
- airflow/providers/google/get_provider_info.py +30 -18
- airflow/providers/google/version_compat.py +36 -0
- {apache_airflow_providers_google-11.0.0rc1.dist-info → apache_airflow_providers_google-12.0.0rc1.dist-info}/METADATA +16 -18
- {apache_airflow_providers_google-11.0.0rc1.dist-info → apache_airflow_providers_google-12.0.0rc1.dist-info}/RECORD +42 -37
- airflow/providers/google/cloud/hooks/datapipeline.py +0 -71
- airflow/providers/google/cloud/openlineage/BigQueryErrorRunFacet.json +0 -30
- airflow/providers/google/cloud/operators/datapipeline.py +0 -63
- {apache_airflow_providers_google-11.0.0rc1.dist-info → apache_airflow_providers_google-12.0.0rc1.dist-info}/WHEEL +0 -0
- {apache_airflow_providers_google-11.0.0rc1.dist-info → apache_airflow_providers_google-12.0.0rc1.dist-info}/entry_points.txt +0 -0
@@ -20,7 +20,7 @@
|
|
20
20
|
from __future__ import annotations
|
21
21
|
|
22
22
|
from collections.abc import MutableMapping, MutableSequence, Sequence
|
23
|
-
from typing import TYPE_CHECKING
|
23
|
+
from typing import TYPE_CHECKING, cast
|
24
24
|
|
25
25
|
from google.api_core.exceptions import GoogleAPICallError
|
26
26
|
from google.api_core.gapic_v1.method import DEFAULT, _MethodDefault
|
@@ -28,8 +28,11 @@ from google.api_core.gapic_v1.method import DEFAULT, _MethodDefault
|
|
28
28
|
from airflow.exceptions import AirflowException
|
29
29
|
from airflow.providers.google.cloud.hooks.translate import CloudTranslateHook, TranslateHook
|
30
30
|
from airflow.providers.google.cloud.links.translate import (
|
31
|
+
TranslateResultByOutputConfigLink,
|
31
32
|
TranslateTextBatchLink,
|
32
33
|
TranslationDatasetsListLink,
|
34
|
+
TranslationModelLink,
|
35
|
+
TranslationModelsListLink,
|
33
36
|
TranslationNativeDatasetLink,
|
34
37
|
)
|
35
38
|
from airflow.providers.google.cloud.operators.cloud_base import GoogleCloudBaseOperator
|
@@ -38,7 +41,11 @@ from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID
|
|
38
41
|
if TYPE_CHECKING:
|
39
42
|
from google.api_core.retry import Retry
|
40
43
|
from google.cloud.translate_v3.types import (
|
44
|
+
BatchDocumentInputConfig,
|
45
|
+
BatchDocumentOutputConfig,
|
41
46
|
DatasetInputConfig,
|
47
|
+
DocumentInputConfig,
|
48
|
+
DocumentOutputConfig,
|
42
49
|
InputConfig,
|
43
50
|
OutputConfig,
|
44
51
|
TranslateTextGlossaryConfig,
|
@@ -723,3 +730,581 @@ class TranslateDeleteDatasetOperator(GoogleCloudBaseOperator):
|
|
723
730
|
)
|
724
731
|
hook.wait_for_operation_done(operation=operation, timeout=self.timeout)
|
725
732
|
self.log.info("Dataset deletion complete!")
|
733
|
+
|
734
|
+
|
735
|
+
class TranslateCreateModelOperator(GoogleCloudBaseOperator):
    """
    Creates a Google Cloud Translate model.

    Creates a `native` translation model, using API V3.
    For more information on how to use this operator, take a look at the guide:
    :ref:`howto/operator:TranslateCreateModelOperator`.

    :param dataset_id: The dataset id used for model training.
    :param display_name: The display name forwarded to the hook for the model being created.
    :param project_id: ID of the Google Cloud project where dataset is located.
        If not provided default project_id is used.
    :param location: The location of the project.
    :param retry: Designation of what errors, if any, should be retried.
    :param timeout: The timeout for this request.
    :param metadata: Strings which should be sent along with the request as metadata.
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    """

    template_fields: Sequence[str] = (
        "dataset_id",
        "location",
        "project_id",
        "gcp_conn_id",
        "impersonation_chain",
    )

    operator_extra_links = (TranslationModelLink(),)

    def __init__(
        self,
        *,
        project_id: str = PROVIDE_PROJECT_ID,
        location: str,
        dataset_id: str,
        display_name: str,
        timeout: float | None = None,
        retry: Retry | _MethodDefault = DEFAULT,
        gcp_conn_id: str = "google_cloud_default",
        metadata: Sequence[tuple[str, str]] = (),
        impersonation_chain: str | Sequence[str] | None = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.project_id = project_id
        self.location = location
        self.dataset_id = dataset_id
        self.display_name = display_name
        self.metadata = metadata
        self.timeout = timeout
        self.retry = retry
        self.gcp_conn_id = gcp_conn_id
        self.impersonation_chain = impersonation_chain

    def execute(self, context: Context) -> dict:
        """
        Submit the model creation, wait for the operation to finish and return the model.

        The id of the created model is pushed to XCom under the ``model_id`` key and
        the model link is persisted for the UI.

        :param context: The task execution context.
        :return: The operation result converted to a dict.
        :raises AirflowException: If submitting the ``create_model`` operation fails
            with a ``GoogleAPICallError``.
        """
        hook = TranslateHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )
        self.log.info("Model creation started, dataset_id %s...", self.dataset_id)
        try:
            result_operation = hook.create_model(
                dataset_id=self.dataset_id,
                display_name=self.display_name,
                location=self.location,
                project_id=self.project_id,
                retry=self.retry,
                timeout=self.timeout,
                metadata=self.metadata,
            )
        except GoogleAPICallError as e:
            self.log.error("Error submitting create_model operation ")
            # Chain the original API error so the root cause stays in the traceback.
            raise AirflowException(e) from e

        self.log.info("Training has started")
        hook.wait_for_operation_done(operation=result_operation)
        result = hook.wait_for_operation_result(operation=result_operation)
        # Convert the returned message into a plain dict before it is returned.
        result = type(result).to_dict(result)
        model_id = hook.extract_object_id(result)
        self.xcom_push(context, key="model_id", value=model_id)
        self.log.info("Model creation complete. The model_id: %s.", model_id)

        project_id = self.project_id or hook.project_id
        TranslationModelLink.persist(
            context=context,
            task_instance=self,
            dataset_id=self.dataset_id,
            model_id=model_id,
            project_id=project_id,
        )
        return result
|
833
|
+
|
834
|
+
|
835
|
+
class TranslateModelsListOperator(GoogleCloudBaseOperator):
    """
    List the native Google Cloud Translation models of a project.

    Fetches the project's `native` translation models, using API V3.
    For more information on how to use this operator, take a look at the guide:
    :ref:`howto/operator:TranslateModelsListOperator`.

    :param project_id: ID of the Google Cloud project to list models for.
        If not provided default project_id is used.
    :param location: The location of the project.
    :param retry: Designation of what errors, if any, should be retried.
    :param timeout: The timeout for this request.
    :param metadata: Strings which should be sent along with the request as metadata.
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    """

    template_fields: Sequence[str] = (
        "location",
        "project_id",
        "gcp_conn_id",
        "impersonation_chain",
    )

    operator_extra_links = (TranslationModelsListLink(),)

    def __init__(
        self,
        *,
        project_id: str = PROVIDE_PROJECT_ID,
        location: str,
        metadata: Sequence[tuple[str, str]] = (),
        timeout: float | _MethodDefault = DEFAULT,
        retry: Retry | _MethodDefault = DEFAULT,
        gcp_conn_id: str = "google_cloud_default",
        impersonation_chain: str | Sequence[str] | None = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Connection settings.
        self.gcp_conn_id = gcp_conn_id
        self.impersonation_chain = impersonation_chain
        # Request target.
        self.project_id = project_id
        self.location = location
        # Per-call options forwarded to the hook.
        self.retry = retry
        self.timeout = timeout
        self.metadata = metadata

    def execute(self, context: Context):
        """
        Persist the models-list link, then return the ids of all listed models.

        :param context: The task execution context.
        :return: A list with one id per model yielded by the hook's pager.
        """
        translate_hook = TranslateHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )
        TranslationModelsListLink.persist(
            context=context,
            task_instance=self,
            project_id=self.project_id or translate_hook.project_id,
        )
        self.log.info("Requesting models list")
        models_pager = translate_hook.list_models(
            location=self.location,
            project_id=self.project_id,
            retry=self.retry,
            timeout=self.timeout,
            metadata=self.metadata,
        )
        # Each pager item is converted to a dict first, then reduced to its id.
        model_ids = [translate_hook.extract_object_id(type(model).to_dict(model)) for model in models_pager]
        self.log.info("Fetching the models list complete. Model id-s: %s", model_ids)
        return model_ids
|
916
|
+
|
917
|
+
|
918
|
+
class TranslateDeleteModelOperator(GoogleCloudBaseOperator):
    """
    Delete translation model and all of its contents.

    Deletes the translation model and its data, using API V3.
    For more information on how to use this operator, take a look at the guide:
    :ref:`howto/operator:TranslateDeleteModelOperator`.

    :param model_id: The model_id of target native model to be deleted.
    :param location: The location of the project.
    :param project_id: ID of the Google Cloud project, forwarded to the hook's ``delete_model`` call.
    :param retry: Designation of what errors, if any, should be retried.
    :param timeout: The timeout for this request; it is also used when waiting for
        the delete operation to finish.
    :param metadata: Strings which should be sent along with the request as metadata.
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    """

    template_fields: Sequence[str] = (
        "model_id",
        "location",
        "project_id",
        "gcp_conn_id",
        "impersonation_chain",
    )

    def __init__(
        self,
        *,
        model_id: str,
        location: str,
        project_id: str = PROVIDE_PROJECT_ID,
        metadata: Sequence[tuple[str, str]] = (),
        timeout: float | None = None,
        retry: Retry | _MethodDefault = DEFAULT,
        gcp_conn_id: str = "google_cloud_default",
        impersonation_chain: str | Sequence[str] | None = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Connection settings.
        self.gcp_conn_id = gcp_conn_id
        self.impersonation_chain = impersonation_chain
        # Deletion target.
        self.project_id = project_id
        self.location = location
        self.model_id = model_id
        # Per-call options forwarded to the hook.
        self.retry = retry
        self.timeout = timeout
        self.metadata = metadata

    def execute(self, context: Context):
        """
        Request deletion of the model and block until the operation is done.

        :param context: The task execution context (not used by this method).
        """
        translate_hook = TranslateHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )
        self.log.info("Deleting the model %s...", self.model_id)
        delete_operation = translate_hook.delete_model(
            model_id=self.model_id,
            location=self.location,
            project_id=self.project_id,
            retry=self.retry,
            timeout=self.timeout,
            metadata=self.metadata,
        )
        translate_hook.wait_for_operation_done(operation=delete_operation, timeout=self.timeout)
        self.log.info("Model deletion complete!")
|
986
|
+
|
987
|
+
|
988
|
+
class TranslateDocumentOperator(GoogleCloudBaseOperator):
    """
    Translate document provided.

    Wraps the Google cloud Translate Text (Advanced) functionality.
    Supports wide range of input/output file types, please visit the
    https://cloud.google.com/translate/docs/advanced/translate-documents for more details.

    For more information on how to use this operator, take a look at the guide:
    :ref:`howto/operator:TranslateDocumentOperator`.

    :param project_id: Optional. The ID of the Google Cloud project that the
        service belongs to. If not specified the hook project_id will be used.
    :param source_language_code: Optional. The ISO-639 language code of the
        input document text if known. If the source language isn't specified,
        the API attempts to identify the source language automatically and returns
        the source language within the response.
    :param target_language_code: Required. The ISO-639 language code to use
        for translation of the input document text.
    :param location: Optional. Project or location to make a call. Must refer to a caller's project.
        If not specified, 'global' is used.
        Non-global location is required for requests using AutoML models or custom glossaries.
        Models and glossaries must be within the same region (have the same location-id).
    :param document_input_config: A document translation request input config.
    :param document_output_config: Optional. A document translation request output config.
        If not provided the translated file will only be returned through a byte-stream
        and its output mime type will be the same as the input file's mime type.
    :param customized_attribution: Optional. This flag is to support user customized
        attribution. If not provided, the default is ``Machine Translated by Google``.
        Customized attribution should follow rules in
        https://cloud.google.com/translate/attribution#attribution_and_logos
    :param is_translate_native_pdf_only: Optional. Param for external customers.
        If true, the page limit of online native PDF translation is 300 and only native PDF pages
        will be translated.
    :param enable_shadow_removal_native_pdf: Optional. If true, use the text removal server to remove the
        shadow text on background image for native PDF translation.
        Shadow removal feature can only be enabled when both ``is_translate_native_pdf_only``,
        ``pdf_native_only`` are False.
    :param enable_rotation_correction: Optional. If true, enable auto rotation
        correction in DVS.
    :param model: Optional. The ``model`` type requested for this translation.
        If not provided, the default Google model (NMT) will be used.
        The format depends on model type:

        - AutoML Translation models:
          ``projects/{project-number-or-id}/locations/{location-id}/models/{model-id}``
        - General (built-in) models:
          ``projects/{project-number-or-id}/locations/{location-id}/models/general/nmt``

        If not provided, the default Google model (NMT) will be used
        for translation.
    :param glossary_config: Optional. Glossary to be applied.
    :param labels: Optional. Labels forwarded unchanged to the hook's ``translate_document`` call.
    :param retry: Designation of what errors, if any, should be retried.
    :param timeout: The timeout for this request.
    :param metadata: Strings which should be sent along with the request as metadata.
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    """

    operator_extra_links = (TranslateResultByOutputConfigLink(),)

    template_fields: Sequence[str] = (
        "source_language_code",
        "target_language_code",
        "document_input_config",
        "document_output_config",
        "model",
        "gcp_conn_id",
        "impersonation_chain",
    )

    def __init__(
        self,
        *,
        location: str | None = None,
        project_id: str = PROVIDE_PROJECT_ID,
        source_language_code: str | None = None,
        target_language_code: str,
        document_input_config: DocumentInputConfig | dict,
        # Documented as optional, so it now defaults to None (keyword-only: backward-compatible).
        document_output_config: DocumentOutputConfig | dict | None = None,
        customized_attribution: str | None = None,
        is_translate_native_pdf_only: bool = False,
        enable_shadow_removal_native_pdf: bool = False,
        enable_rotation_correction: bool = False,
        model: str | None = None,
        glossary_config: TranslateTextGlossaryConfig | None = None,
        labels: str | None = None,
        timeout: float | _MethodDefault = DEFAULT,
        retry: Retry | _MethodDefault | None = DEFAULT,
        metadata: Sequence[tuple[str, str]] = (),
        gcp_conn_id: str = "google_cloud_default",
        impersonation_chain: str | Sequence[str] | None = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.project_id = project_id
        self.source_language_code = source_language_code
        self.target_language_code = target_language_code
        self.document_input_config = document_input_config
        self.document_output_config = document_output_config
        self.customized_attribution = customized_attribution
        self.is_translate_native_pdf_only = is_translate_native_pdf_only
        self.enable_shadow_removal_native_pdf = enable_shadow_removal_native_pdf
        self.enable_rotation_correction = enable_rotation_correction
        self.location = location
        self.labels = labels
        self.model = model
        self.glossary_config = glossary_config
        # Was misspelled ``metadate``; the other operators in this module use ``metadata``.
        self.metadata = metadata
        self.timeout = timeout
        self.retry = retry
        self.gcp_conn_id = gcp_conn_id
        self.impersonation_chain = impersonation_chain

    def execute(self, context: Context) -> dict:
        """
        Translate the configured document and return the response as a dict.

        When an output config was supplied, a link to the result location is persisted.

        :param context: The task execution context.
        :return: The translation response converted to a dict.
        :raises AirflowException: If the ``translate_document`` call fails
            with a ``GoogleAPICallError``.
        """
        hook = TranslateHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )
        try:
            self.log.info("Starting the document translation")
            doc_translation_result = hook.translate_document(
                # Forward the configured project so the request and the result link
                # below refer to the same project.
                project_id=self.project_id,
                source_language_code=self.source_language_code,
                target_language_code=self.target_language_code,
                document_input_config=self.document_input_config,
                document_output_config=self.document_output_config,
                customized_attribution=self.customized_attribution,
                is_translate_native_pdf_only=self.is_translate_native_pdf_only,
                enable_shadow_removal_native_pdf=self.enable_shadow_removal_native_pdf,
                enable_rotation_correction=self.enable_rotation_correction,
                location=self.location,
                labels=self.labels,
                model=self.model,
                glossary_config=self.glossary_config,
                timeout=self.timeout,
                retry=self.retry,
                metadata=self.metadata,
            )
            self.log.info("Document translation completed")
        except GoogleAPICallError as e:
            self.log.error("An error occurred executing translate_document method: \n%s", e)
            # Chain the original API error so the root cause stays in the traceback.
            raise AirflowException(e) from e
        # Without an output config there is no output location to link to.
        if self.document_output_config:
            TranslateResultByOutputConfigLink.persist(
                context=context,
                task_instance=self,
                project_id=self.project_id or hook.project_id,
                output_config=self.document_output_config,
            )
        return cast(dict, type(doc_translation_result).to_dict(doc_translation_result))
|
1146
|
+
|
1147
|
+
|
1148
|
+
class TranslateDocumentBatchOperator(GoogleCloudBaseOperator):
    """
    Translate documents provided via input and output configurations.

    Up to 10 target languages per operation supported.
    Wraps the Google cloud Translate Text (Advanced) functionality.
    See https://cloud.google.com/translate/docs/advanced/batch-translation.

    For more information on how to use this operator, take a look at the guide:
    :ref:`howto/operator:TranslateDocumentBatchOperator`.

    :param project_id: Optional. The ID of the Google Cloud project that the service belongs to.
        If not specified the hook project_id is used for the result link.
    :param source_language_code: Required. The ISO-639 language code of the input documents.
    :param target_language_codes: The ISO-639 language codes to use
        for translation of the input document. Specify up to 10 language codes here.
        Defaults to ``None`` in this signature; the value is forwarded to the hook unchanged.
        NOTE(review): the API presumably needs at least one target language — confirm.
    :param location: Optional. Project or location to make a call. Must refer to
        a caller's project. If not specified, 'global' is used.
        Non-global location is required for requests using AutoML models or custom glossaries.
        Models and glossaries must be within the same region (have the same location-id).
    :param input_configs: Input configurations. The total number of files matched should be <=
        100. The total content size to translate should be <= 100M Unicode codepoints.
        The files must use UTF-8 encoding.
    :param output_config: Output configuration. If 2 input configs match to the same file (that
        is, same input path), no output for duplicate inputs will be generated.
    :param format_conversions: Optional. The file format conversion map that is applied to
        all input files. The map key is the original mime_type.
        The map value is the target mime_type of translated documents.
        Supported file format conversion includes:

        - ``application/pdf`` to
          ``application/vnd.openxmlformats-officedocument.wordprocessingml.document``

        If nothing specified, output files will be in the same format as the original file.
    :param customized_attribution: Optional. This flag is to support user customized
        attribution. If not provided, the default is ``Machine Translated by Google``.
        Customized attribution should follow rules in
        https://cloud.google.com/translate/attribution#attribution_and_logos
    :param enable_shadow_removal_native_pdf: Optional. If true, use the text removal server to remove the
        shadow text on background image for native PDF translation.
        Shadow removal feature can only be enabled when both ``is_translate_native_pdf_only``,
        ``pdf_native_only`` are False.
    :param enable_rotation_correction: Optional. If true, enable auto rotation
        correction in DVS.
    :param models: Optional. The models to use for translation. Map's key is
        target language code. Map's value is the model name. Value
        can be a built-in general model, or an AutoML Translation model.
        The value format depends on model type:

        - AutoML Translation models:
          ``projects/{project-number-or-id}/locations/{location-id}/models/{model-id}``

        - General (built-in) models:
          ``projects/{project-number-or-id}/locations/{location-id}/models/general/nmt``,

        If the map is empty or a specific model is not requested for
        a language pair, then default google model (NMT) is used.
    :param glossaries: Glossaries to be applied. It's keyed by target language code.
    :param retry: Designation of what errors, if any, should be retried.
    :param timeout: The timeout for this request.
    :param metadata: Strings which should be sent along with the request as metadata.
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    """

    operator_extra_links = (TranslateResultByOutputConfigLink(),)

    template_fields: Sequence[str] = (
        "input_configs",
        "output_config",
        "target_language_codes",
        "source_language_code",
        "models",
        "glossaries",
        "gcp_conn_id",
        "impersonation_chain",
    )

    def __init__(
        self,
        *,
        project_id: str = PROVIDE_PROJECT_ID,
        source_language_code: str,
        target_language_codes: MutableSequence[str] | None = None,
        location: str | None = None,
        input_configs: MutableSequence[BatchDocumentInputConfig | dict],
        output_config: BatchDocumentOutputConfig | dict,
        customized_attribution: str | None = None,
        format_conversions: MutableMapping[str, str] | None = None,
        enable_shadow_removal_native_pdf: bool = False,
        enable_rotation_correction: bool = False,
        models: MutableMapping[str, str] | None = None,
        glossaries: MutableMapping[str, TranslateTextGlossaryConfig] | None = None,
        metadata: Sequence[tuple[str, str]] = (),
        timeout: float | _MethodDefault = DEFAULT,
        retry: Retry | _MethodDefault | None = DEFAULT,
        gcp_conn_id: str = "google_cloud_default",
        impersonation_chain: str | Sequence[str] | None = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.project_id = project_id
        self.location = location
        self.target_language_codes = target_language_codes
        self.source_language_code = source_language_code
        self.input_configs = input_configs
        self.output_config = output_config
        self.customized_attribution = customized_attribution
        self.format_conversions = format_conversions
        self.enable_shadow_removal_native_pdf = enable_shadow_removal_native_pdf
        self.enable_rotation_correction = enable_rotation_correction
        self.models = models
        self.glossaries = glossaries
        self.metadata = metadata
        self.timeout = timeout
        self.retry = retry
        self.gcp_conn_id = gcp_conn_id
        self.impersonation_chain = impersonation_chain

    def execute(self, context: Context) -> dict:
        """
        Start the batch document translation, wait for it and return the result as a dict.

        The link to the output location is persisted as soon as the operation has been
        submitted, before waiting for its result.

        :param context: The task execution context.
        :return: The operation result converted to a dict.
        :raises AirflowException: If submitting the ``batch_translate_document`` call fails
            with a ``GoogleAPICallError``.
        """
        hook = TranslateHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )
        try:
            batch_document_translate_operation = hook.batch_translate_document(
                project_id=self.project_id,
                location=self.location,
                target_language_codes=self.target_language_codes,
                source_language_code=self.source_language_code,
                input_configs=self.input_configs,
                output_config=self.output_config,
                customized_attribution=self.customized_attribution,
                format_conversions=self.format_conversions,
                enable_shadow_removal_native_pdf=self.enable_shadow_removal_native_pdf,
                enable_rotation_correction=self.enable_rotation_correction,
                models=self.models,
                glossaries=self.glossaries,
                metadata=self.metadata,
                timeout=self.timeout,
                retry=self.retry,
            )
        except GoogleAPICallError as e:
            self.log.error("An error occurred executing batch_translate_document method: \n%s", e)
            # Chain the original API error so the root cause stays in the traceback.
            raise AirflowException(e) from e
        self.log.info("Batch document translation job started.")
        TranslateResultByOutputConfigLink.persist(
            context=context,
            task_instance=self,
            project_id=self.project_id or hook.project_id,
            output_config=self.output_config,
        )
        result = hook.wait_for_operation_result(batch_document_translate_operation)
        self.log.info("Batch document translation job finished")
        return cast(dict, type(result).to_dict(result))
|