awx-zipline-ai 0.2.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +251 -0
- ai/chronon/api/__init__.py +1 -0
- ai/chronon/api/common/__init__.py +1 -0
- ai/chronon/api/common/constants.py +15 -0
- ai/chronon/api/common/ttypes.py +1844 -0
- ai/chronon/api/constants.py +15 -0
- ai/chronon/api/ttypes.py +3624 -0
- ai/chronon/cli/compile/column_hashing.py +313 -0
- ai/chronon/cli/compile/compile_context.py +177 -0
- ai/chronon/cli/compile/compiler.py +160 -0
- ai/chronon/cli/compile/conf_validator.py +590 -0
- ai/chronon/cli/compile/display/class_tracker.py +112 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +46 -0
- ai/chronon/cli/compile/fill_templates.py +40 -0
- ai/chronon/cli/compile/parse_configs.py +141 -0
- ai/chronon/cli/compile/parse_teams.py +238 -0
- ai/chronon/cli/compile/serializer.py +115 -0
- ai/chronon/cli/git_utils.py +156 -0
- ai/chronon/cli/logger.py +61 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/eval/__init__.py +122 -0
- ai/chronon/eval/query_parsing.py +19 -0
- ai/chronon/eval/sample_tables.py +100 -0
- ai/chronon/eval/table_scan.py +186 -0
- ai/chronon/fetcher/__init__.py +1 -0
- ai/chronon/fetcher/constants.py +15 -0
- ai/chronon/fetcher/ttypes.py +127 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/hub/__init__.py +1 -0
- ai/chronon/hub/constants.py +15 -0
- ai/chronon/hub/ttypes.py +1228 -0
- ai/chronon/join.py +566 -0
- ai/chronon/logger.py +24 -0
- ai/chronon/model.py +35 -0
- ai/chronon/observability/__init__.py +1 -0
- ai/chronon/observability/constants.py +15 -0
- ai/chronon/observability/ttypes.py +2192 -0
- ai/chronon/orchestration/__init__.py +1 -0
- ai/chronon/orchestration/constants.py +15 -0
- ai/chronon/orchestration/ttypes.py +4406 -0
- ai/chronon/planner/__init__.py +1 -0
- ai/chronon/planner/constants.py +15 -0
- ai/chronon/planner/ttypes.py +1686 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +40 -0
- ai/chronon/repo/aws.py +298 -0
- ai/chronon/repo/cluster.py +65 -0
- ai/chronon/repo/compile.py +56 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +291 -0
- ai/chronon/repo/explore.py +421 -0
- ai/chronon/repo/extract_objects.py +137 -0
- ai/chronon/repo/gcp.py +585 -0
- ai/chronon/repo/gitpython_utils.py +14 -0
- ai/chronon/repo/hub_runner.py +171 -0
- ai/chronon/repo/hub_uploader.py +108 -0
- ai/chronon/repo/init.py +53 -0
- ai/chronon/repo/join_backfill.py +105 -0
- ai/chronon/repo/run.py +293 -0
- ai/chronon/repo/serializer.py +141 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +472 -0
- ai/chronon/repo/zipline.py +51 -0
- ai/chronon/repo/zipline_hub.py +105 -0
- ai/chronon/resources/gcp/README.md +174 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +30 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +23 -0
- ai/chronon/resources/gcp/teams.py +70 -0
- ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
- ai/chronon/source.py +88 -0
- ai/chronon/staging_query.py +185 -0
- ai/chronon/types.py +57 -0
- ai/chronon/utils.py +557 -0
- ai/chronon/windows.py +50 -0
- awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
- awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
- awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
- awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
- jars/__init__.py +0 -0
ai/chronon/repo/gcp.py
ADDED
@@ -0,0 +1,585 @@
import base64
import json
import multiprocessing
import os
import time
import uuid
from urllib.parse import urlparse

import crcmod
from google.cloud import storage

from ai.chronon.logger import get_logger
from ai.chronon.repo.constants import ROUTES, ZIPLINE_DIRECTORY
from ai.chronon.repo.default_runner import Runner
from ai.chronon.repo.utils import (
    JobType,
    check_call,
    check_output,
    extract_filename_from_path,
    get_customer_warehouse_bucket,
    get_environ_arg,
    retry_decorator,
    split_date_range,
)

LOG = get_logger()

# GCP DATAPROC SPECIFIC CONSTANTS
DATAPROC_ENTRY = "ai.chronon.integrations.cloud_gcp.DataprocSubmitter"
ZIPLINE_GCP_JAR_DEFAULT = "cloud_gcp_lib_deploy.jar"
ZIPLINE_GCP_ONLINE_CLASS_DEFAULT = "ai.chronon.integrations.cloud_gcp.GcpApiImpl"
ZIPLINE_GCP_FLINK_JAR_DEFAULT = "flink_assembly_deploy.jar"
ZIPLINE_GCP_FLINK_PUBSUB_JAR_DEFAULT = "connectors_pubsub_deploy.jar"
ZIPLINE_GCP_SERVICE_JAR = "service_assembly_deploy.jar"


class GcpRunner(Runner):
    def __init__(self, args):
        self._remote_artifact_prefix = args.get("artifact_prefix")
        if not self._remote_artifact_prefix:
            raise ValueError(
                "GCP artifact prefix not set."
            )

        self._version = args.get("version")
        gcp_jar_path = GcpRunner.download_zipline_dataproc_jar(
            self._remote_artifact_prefix,
            ZIPLINE_DIRECTORY,
            self._version,
            ZIPLINE_GCP_JAR_DEFAULT,
        )
        service_jar_path = GcpRunner.download_zipline_dataproc_jar(
            self._remote_artifact_prefix,
            ZIPLINE_DIRECTORY,
            self._version,
            ZIPLINE_GCP_SERVICE_JAR,
        )
        jar_path = (
            f"{service_jar_path}:{gcp_jar_path}"
            if args["mode"] == "fetch"
            else gcp_jar_path
        )

        self._args = args
        self.job_id = str(uuid.uuid4())

        super().__init__(args, os.path.expanduser(jar_path))


    @staticmethod
    def get_gcp_project_id() -> str:
        return get_environ_arg("GCP_PROJECT_ID")

    @staticmethod
    def get_gcp_bigtable_instance_id() -> str:
        return get_environ_arg("GCP_BIGTABLE_INSTANCE_ID")

    @staticmethod
    def is_pubsub_enabled() -> bool:
        pubsub_enabled_env = get_environ_arg("ENABLE_PUBSUB", ignoreError=True)
        return pubsub_enabled_env and pubsub_enabled_env.lower() == "true"

    @staticmethod
    def get_gcp_region_id() -> str:
        return get_environ_arg("GCP_REGION")

    @staticmethod
    @retry_decorator(retries=2, backoff=5)
    def download_gcs_to_text(remote_file_name: str):
        """Download from the bucket using path."""
        parsed = urlparse(remote_file_name)
        bucket_name = parsed.netloc
        source_blob_name = parsed.path.lstrip("/")
        try:
            storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(source_blob_name)

            if not blob.exists():
                return blob.exists(), None
            else:
                return blob.exists(), blob.download_as_text()
        except Exception as e:
            raise RuntimeError(
                f"Failed to download {source_blob_name}: {str(e)}"
            ) from e

    @staticmethod
    @retry_decorator(retries=2, backoff=5)
    def download_gcs_file(remote_file_name: str, destination_file_name: str):
        """Download from the bucket using path."""
        parsed = urlparse(remote_file_name)
        bucket_name = parsed.netloc
        source_blob_name = parsed.path.lstrip("/")
        try:
            storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(source_blob_name)

            if not blob.exists():
                raise FileNotFoundError(
                    f"Blob {source_blob_name} not found in bucket {bucket_name}"
                )
            blob.download_to_filename(destination_file_name)
            LOG.info(
                "Downloaded storage object {} from bucket {} to local file {}.".format(
                    source_blob_name, bucket_name, destination_file_name
                )
            )
        except Exception as e:
            raise RuntimeError(
                f"Failed to download {source_blob_name}: {str(e)}"
            ) from e

    @staticmethod
    @retry_decorator(retries=2, backoff=5)
    def upload_gcs_blob(bucket_name, source_file_name, destination_blob_name):
        """Uploads a file to the bucket."""

        try:
            storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(destination_blob_name)
            blob.upload_from_filename(source_file_name)

            LOG.info(
                f"File {source_file_name} uploaded to {destination_blob_name} in bucket {bucket_name}."
            )
            return f"gs://{bucket_name}/{destination_blob_name}"
        except Exception as e:
            raise RuntimeError(f"Failed to upload {source_file_name}: {str(e)}") from e

    @staticmethod
    def get_gcs_file_hash(remote_file_path: str) -> str:
        """
        Get the hash of a file stored in Google Cloud Storage.
        """
        parsed = urlparse(remote_file_path)
        storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
        bucket_name = parsed.netloc
        blob_name = parsed.path.lstrip("/")
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.get_blob(blob_name)

        if not blob:
            raise FileNotFoundError(
                f"File {blob_name} not found in bucket {bucket_name}"
            )

        return blob.crc32c

    @staticmethod
    def get_local_file_hash(file_path: str) -> str:
        """
        Calculate CRC32C hash of a local file.

        Args:
            file_path: Path to the local file

        Returns:
            Base64-encoded string of the file's CRC32C hash
        """
        crc32c_hash = crcmod.predefined.Crc("crc-32c")

        with open(file_path, "rb") as f:
            # Read the file in chunks to handle large files efficiently
            for chunk in iter(lambda: f.read(4096), b""):
                crc32c_hash.update(chunk)

        # Convert to base64 to match GCS format
        return base64.b64encode(crc32c_hash.digest()).decode("utf-8")

    @staticmethod
    def compare_gcs_and_local_file_hashes(
        remote_file_path: str, local_file_path: str
    ) -> bool:
        """
        Compare hashes of a GCS file and a local file to check if they're identical.

        Args:
            remote_file_path: URI of the remote object in GCS
            local_file_path: Path to the local file to compare

        Returns:
            True if files are identical, False otherwise
        """
        try:
            gcs_hash = GcpRunner.get_gcs_file_hash(remote_file_path)
            local_hash = GcpRunner.get_local_file_hash(local_file_path)

            LOG.info(
                f"Local hash of {local_file_path}: {local_hash}. GCS file {remote_file_path} hash: {gcs_hash}"
            )

            return gcs_hash == local_hash

        except Exception as e:
            LOG.info(f"Error comparing files: {str(e)}")
            return False

    @staticmethod
    def download_zipline_dataproc_jar(remote_file_path: str, local_file_path: str, version: str, jar_name: str
    ):
        source_path = os.path.join(remote_file_path, "release", version, "jars", jar_name)
        dest_path = os.path.join(local_file_path, jar_name)

        are_identical = (
            GcpRunner.compare_gcs_and_local_file_hashes(
                source_path, dest_path
            )
            if os.path.exists(dest_path)
            else False
        )

        if are_identical:
            LOG.info(f"{dest_path} matches GCS {source_path}")
        else:
            LOG.info(
                f"{dest_path} does NOT match GCS {source_path}"
            )
            LOG.info(f"Downloading {jar_name} from GCS...")

            GcpRunner.download_gcs_file(source_path, dest_path)
        return dest_path

    def generate_dataproc_submitter_args(
        self,
        user_args: str,
        version: str,
        customer_artifact_prefix: str,
        job_type: JobType = JobType.SPARK,
        metadata_conf_path: str = None,
    ):

        parsed = urlparse(customer_artifact_prefix)
        source_blob_name = parsed.path.lstrip("/")

        gcs_files = []

        # upload to `metadata` folder
        if metadata_conf_path:
            destination_file_path = os.path.join(
                source_blob_name,
                "metadata",
                self.job_id,
                f"{extract_filename_from_path(metadata_conf_path)}"
            )
            gcs_files.append(
                GcpRunner.upload_gcs_blob(
                    get_customer_warehouse_bucket(), metadata_conf_path, destination_file_path
                )
            )

        gcs_file_args = ",".join(gcs_files)
        release_prefix = os.path.join(customer_artifact_prefix, "release", version, "jars")

        # include jar uri. should also already be in the bucket
        jar_uri = os.path.join(release_prefix, f"{ZIPLINE_GCP_JAR_DEFAULT}")

        final_args = "{user_args} --jar-uri={jar_uri} --job-type={job_type} --main-class={main_class} --zipline-version={zipline_version} --job-id={job_id}"

        if job_type == JobType.FLINK:
            main_class = "ai.chronon.flink.FlinkJob"
            flink_jar_uri = os.path.join(release_prefix, f"{ZIPLINE_GCP_FLINK_JAR_DEFAULT}")
            enable_pubsub = GcpRunner.is_pubsub_enabled()
            flink_pubsub_connector_jar_uri = os.path.join(release_prefix, f"{ZIPLINE_GCP_FLINK_PUBSUB_JAR_DEFAULT}")
            base_formatted_args = final_args.format(
                user_args=user_args,
                jar_uri=jar_uri,
                job_type=job_type.value,
                main_class=main_class,
                zipline_version=self._version,
                job_id=self.job_id,
            ) + f" --flink-main-jar-uri={flink_jar_uri}"
            if enable_pubsub:
                base_formatted_args += f" --flink-pubsub-jar-uri={flink_pubsub_connector_jar_uri}"
            return base_formatted_args

        elif job_type == JobType.SPARK:
            main_class = "ai.chronon.spark.Driver"
            return " ".join([
                final_args.format(
                    user_args=user_args,
                    jar_uri=jar_uri,
                    job_type=job_type.value,
                    main_class=main_class,
                    zipline_version=self._version,
                    job_id=self.job_id,
                ), "--is-gcp",
                f"--gcp-project-id={GcpRunner.get_gcp_project_id()}",
                f"--gcp-bigtable-instance-id={GcpRunner.get_gcp_bigtable_instance_id()}",
                f"--files={gcs_file_args}" if gcs_file_args else "",
            ]
            )
        else:
            raise ValueError(f"Invalid job type: {job_type}")

    @staticmethod
    def get_state_dataproc_job(job_id):
        jobs_info_str = check_output(
            f"gcloud dataproc jobs describe {job_id} --region={GcpRunner.get_gcp_region_id()} "
            f"--project={GcpRunner.get_gcp_project_id()} --format=json"
        ).decode("utf-8")
        job_info = json.loads(jobs_info_str)
        return job_info.get("status", {}).get("state", "")

    def run_dataproc_flink_streaming(self):
        user_args = {
            "--groupby-name": self.conf_metadata_name,
            "--kafka-bootstrap": self.kafka_bootstrap,
            "--online-class": ZIPLINE_GCP_ONLINE_CLASS_DEFAULT,
            "-ZGCP_PROJECT_ID": GcpRunner.get_gcp_project_id(),
            "-ZGCP_BIGTABLE_INSTANCE_ID": GcpRunner.get_gcp_bigtable_instance_id(),
            "--validate-rows": self.validate_rows,
            "--streaming-manifest-path": self.streaming_manifest_path,
            "--streaming-checkpoint-path": self.streaming_checkpoint_path,
            "--local-zipline-version": self._version,

            # Need these for extracting metadata name in submitter
            "--local-conf-path": self.local_abs_conf_path,
            "--original-mode": self.mode,
            "--conf-type": self.conf_type,
        }

        args = self._args.get("args")
        if "check-if-job-is-running" in args:
            user_args["--streaming-mode"] = "check-if-job-is-running"
        elif "deploy" in args:
            user_args["--streaming-mode"] = "deploy"

        flag_args = {"--validate": self.validate, "--enable-debug": self.enable_debug}

        # Set the savepoint deploy strategy
        if self.latest_savepoint:
            flag_args["--latest-savepoint"] = self.latest_savepoint
        elif self.custom_savepoint:
            user_args["--custom-savepoint"] = self.custom_savepoint
        else:
            flag_args["--no-savepoint"] = self.no_savepoint

        # Set version check deploy
        if self.version_check:
            flag_args["--version-check"] = self.version_check

        # Set additional jars
        if self.additional_jars:
            user_args["--additional-jars"] = self.additional_jars

        user_args_str = " ".join(f"{key}={value}" for key, value in user_args.items() if value)
        # if online args are set we add them to the user_args_str
        if self.online_args:
            user_args_str += " " + self.online_args

        flag_args_str = " ".join(key for key, value in flag_args.items() if value)
        dataproc_args = self.generate_dataproc_submitter_args(
            job_type=JobType.FLINK,
            version=self._version,
            customer_artifact_prefix=self._remote_artifact_prefix,
            user_args=" ".join([user_args_str, flag_args_str]),
        )
        command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"

        return command

    def run(self):
        command_list = []
        if self.mode == "info":
            command_list.append(
                "python3 {script} --conf {conf} --ds {ds} --repo {repo}".format(
                    script=self.render_info, conf=self.conf, ds=self.ds, repo=self.repo
                )
            )
        elif self.sub_help or self.mode == "fetch":
            entrypoint = "ai.chronon.online.fetcher.FetcherMain"
            command_list.append(
                "java -cp {jar} {entrypoint} {subcommand} {args}".format(
                    jar=self.jar_path,
                    entrypoint=entrypoint,
                    args="--help" if self.sub_help else self._gen_final_args(),
                    subcommand=ROUTES[self.conf_type][self.mode],
                )
            )
        elif self.mode == "metastore":
            # We could presumably support other metastore options but
            # for now only poking for a particular partition is supported.
            args = self._args.get("args")
            supported_subcommands = ["check-partitions"]
            assert (
                "check-partitions" in args
            ), f"Must specify one of the following subcommands: {supported_subcommands}"
            assert (
                "--partition-names" in args
            ), "Must specify a list of `--partition-names=schema.table/pk1=pv1/pk2=pv2"

            dataproc_args = self.generate_dataproc_submitter_args(
                # for now, self.conf is the only local file that requires uploading to gcs
                user_args=self._gen_final_args(),
                version=self._version,
                customer_artifact_prefix=self._remote_artifact_prefix,
                metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None
            )
            command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
            command_list.append(command)
        elif self.mode in ["streaming", "streaming-client"]:
            args = self._args.get("args")
            # streaming mode
            command = self.run_dataproc_flink_streaming()
            command_list.append(command)
        else:
            if self.parallelism > 1:
                assert self.start_ds is not None and self.ds is not None, (
                    "To use parallelism, please specify --start-ds and --end-ds to "
                    "break down into multiple backfill jobs"
                )
                date_ranges = split_date_range(self.start_ds, self.ds, self.parallelism)
                for start_ds, end_ds in date_ranges:
                    user_args = ("{subcommand} {args} {additional_args}").format(
                        subcommand=ROUTES[self.conf_type][self.mode],
                        args=self._gen_final_args(
                            start_ds=start_ds,
                            end_ds=end_ds,
                            # overriding the conf here because we only want the
                            # filename, not the full path. When we upload this to
                            # GCS, the full path does get reflected on GCS. But
                            # when we include the gcs file path as part of dataproc,
                            # the file is copied to root and not the complete path
                            # is copied.
                            override_conf_path=(
                                extract_filename_from_path(self.conf)
                                if self.conf
                                else None
                            ),
                        ),
                        additional_args=os.environ.get(
                            "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
                        ),
                    )

                    dataproc_args = self.generate_dataproc_submitter_args(
                        # for now, self.conf is the only local file that requires uploading to gcs
                        user_args=user_args,
                        version=self._version,
                        customer_artifact_prefix=self._remote_artifact_prefix,
                        metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None,
                    )
                    command = (
                        f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
                    )
                    command_list.append(command)
            else:
                user_args = ("{subcommand} {args} {additional_args}").format(
                    subcommand=ROUTES[self.conf_type][self.mode],
                    args=self._gen_final_args(
                        start_ds=self.start_ds,
                        # overriding the conf here because we only want the filename,
                        # not the full path. When we upload this to GCS, the full path
                        # does get reflected on GCS. But when we include the gcs file
                        # path as part of dataproc, the file is copied to root and
                        # not the complete path is copied.
                        override_conf_path=(
                            extract_filename_from_path(self.conf) if self.conf else None
                        ),
                    ),
                    additional_args=os.environ.get(
                        "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
                    ),
                )
                dataproc_args = self.generate_dataproc_submitter_args(
                    user_args=user_args,
                    version=self._version,
                    customer_artifact_prefix=self._remote_artifact_prefix,
                    metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None
                )
                command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
                command_list.append(command)

        if len(command_list) > 1:
            # parallel backfill mode
            with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
                LOG.info(
                    "Running args list {} with pool size {}".format(
                        command_list, self.parallelism
                    )
                )
                pool.map(check_call, command_list)
        elif len(command_list) == 1:
            output = check_output(command_list[0]).decode("utf-8").split("\n")
            print(*output, sep="\n")

            dataproc_submitter_id_str = "Dataproc submitter job id"

            dataproc_submitter_logs = [s for s in output if dataproc_submitter_id_str in s]

            submitted_job_id = None
            if dataproc_submitter_logs:
                log = dataproc_submitter_logs[0]
                submitted_job_id = (
                    log[log.index(dataproc_submitter_id_str) + len(dataproc_submitter_id_str) + 1 :]
                ).strip()

            if not self.disable_cloud_logging and submitted_job_id:
                LOG.info(
                    """
<-----------------------------------------------------------------------------------
------------------------------------------------------------------------------------
                                      DATAPROC LOGS
------------------------------------------------------------------------------------
------------------------------------------------------------------------------------>
                    """
                )
                check_call(
                    f"gcloud dataproc jobs wait {submitted_job_id} --region={GcpRunner.get_gcp_region_id()} "
                    f"--project={GcpRunner.get_gcp_project_id()}"
                )

                # Fetch the final job state
                job_state = GcpRunner.get_state_dataproc_job(submitted_job_id)

                LOG.info("<<<<<<<<<<<<<<<<-----------------JOB STATUS----------------->>>>>>>>>>>>>>>>>")
                if job_state != 'DONE':
                    LOG.info(f"Job {submitted_job_id} is not in DONE state. Current state: {job_state}")
                    raise RuntimeError(f"Job {submitted_job_id} failed.")
                else:
                    LOG.info(f"Job {submitted_job_id} is in DONE state.")
                    return

            # If streaming deploy job, poll and check for final
            if (submitted_job_id and self.mode in ["streaming", "streaming-client"]
                    and "deploy" in self._args.get("args")):
                # Poll the dataproc job id for 5 minutes until the job
                total_time_seconds = 5 * 60
                interval_seconds = 10
                start_time = time.time()
                while time.time() - start_time < total_time_seconds:
                    current_state = GcpRunner.get_state_dataproc_job(submitted_job_id)

                    non_terminal_states = ['SETUP_DONE', 'RUNNING', 'PENDING', 'STATE_UNSPECIFIED']
                    if current_state not in non_terminal_states:
                        raise RuntimeError(f"Flink job is not in {non_terminal_states}. "
                                           f"Current state: {current_state}")

                    manifest_path = os.path.join(self.streaming_manifest_path, self.conf_metadata_name, "manifest.txt")
                    manifest_exists, raw_manifest = self.download_gcs_to_text(str(manifest_path))

                    if manifest_exists:
                        manifest = raw_manifest.strip()
                        LOG.info(f"Checking Flink manifest to confirm deployment. Manifest: [{manifest}]")
                        manifest_tuples = manifest.split(",")

                        flink_job_id = [f.split("=")[1] for f in manifest_tuples if f.startswith("flinkJobId")][0]
                        parent_job_id = [f.split("=")[1] for f in manifest_tuples if f.startswith("parentJobId")][0]

                        if parent_job_id == submitted_job_id:
                            LOG.info(f"Flink job has been deployed successfully. Flink job ID = [{flink_job_id}]."
                                     f" Dataproc job ID = [{submitted_job_id}]")
                            break
                        else:
                            LOG.info(f"Flink manifest not updated with new Dataproc job id {submitted_job_id}.")
                    LOG.info(f"Sleeping for {interval_seconds} seconds...")
                    time.sleep(interval_seconds)
                else:
                    raise RuntimeError(
                        f"Failed to confirm Flink manifest for new deployment with Dataproc job id {submitted_job_id}."
                    )
ai/chronon/repo/gitpython_utils.py
ADDED
@@ -0,0 +1,14 @@
from typing import Optional

from git import Repo


def get_default_origin_branch(path, repo: Optional[Repo] = None):
    if not repo:
        repo = Repo(path, search_parent_directories=True)
    return repo.remotes.origin.refs.HEAD.reference.name.split('/')[-1]

def get_current_branch(path, repo: Optional[Repo] = None):
    if not repo:
        repo = Repo(path, search_parent_directories=True)
    return repo.active_branch.name