awx-zipline-ai 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +251 -0
- ai/chronon/api/__init__.py +1 -0
- ai/chronon/api/common/__init__.py +1 -0
- ai/chronon/api/common/constants.py +15 -0
- ai/chronon/api/common/ttypes.py +1844 -0
- ai/chronon/api/constants.py +15 -0
- ai/chronon/api/ttypes.py +3624 -0
- ai/chronon/cli/compile/column_hashing.py +313 -0
- ai/chronon/cli/compile/compile_context.py +177 -0
- ai/chronon/cli/compile/compiler.py +160 -0
- ai/chronon/cli/compile/conf_validator.py +590 -0
- ai/chronon/cli/compile/display/class_tracker.py +112 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +46 -0
- ai/chronon/cli/compile/fill_templates.py +40 -0
- ai/chronon/cli/compile/parse_configs.py +141 -0
- ai/chronon/cli/compile/parse_teams.py +238 -0
- ai/chronon/cli/compile/serializer.py +115 -0
- ai/chronon/cli/git_utils.py +156 -0
- ai/chronon/cli/logger.py +61 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/eval/__init__.py +122 -0
- ai/chronon/eval/query_parsing.py +19 -0
- ai/chronon/eval/sample_tables.py +100 -0
- ai/chronon/eval/table_scan.py +186 -0
- ai/chronon/fetcher/__init__.py +1 -0
- ai/chronon/fetcher/constants.py +15 -0
- ai/chronon/fetcher/ttypes.py +127 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/hub/__init__.py +1 -0
- ai/chronon/hub/constants.py +15 -0
- ai/chronon/hub/ttypes.py +1228 -0
- ai/chronon/join.py +566 -0
- ai/chronon/logger.py +24 -0
- ai/chronon/model.py +35 -0
- ai/chronon/observability/__init__.py +1 -0
- ai/chronon/observability/constants.py +15 -0
- ai/chronon/observability/ttypes.py +2192 -0
- ai/chronon/orchestration/__init__.py +1 -0
- ai/chronon/orchestration/constants.py +15 -0
- ai/chronon/orchestration/ttypes.py +4406 -0
- ai/chronon/planner/__init__.py +1 -0
- ai/chronon/planner/constants.py +15 -0
- ai/chronon/planner/ttypes.py +1686 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +40 -0
- ai/chronon/repo/aws.py +298 -0
- ai/chronon/repo/cluster.py +65 -0
- ai/chronon/repo/compile.py +56 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +291 -0
- ai/chronon/repo/explore.py +421 -0
- ai/chronon/repo/extract_objects.py +137 -0
- ai/chronon/repo/gcp.py +585 -0
- ai/chronon/repo/gitpython_utils.py +14 -0
- ai/chronon/repo/hub_runner.py +171 -0
- ai/chronon/repo/hub_uploader.py +108 -0
- ai/chronon/repo/init.py +53 -0
- ai/chronon/repo/join_backfill.py +105 -0
- ai/chronon/repo/run.py +293 -0
- ai/chronon/repo/serializer.py +141 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +472 -0
- ai/chronon/repo/zipline.py +51 -0
- ai/chronon/repo/zipline_hub.py +105 -0
- ai/chronon/resources/gcp/README.md +174 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +30 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +23 -0
- ai/chronon/resources/gcp/teams.py +70 -0
- ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
- ai/chronon/source.py +88 -0
- ai/chronon/staging_query.py +185 -0
- ai/chronon/types.py +57 -0
- ai/chronon/utils.py +557 -0
- ai/chronon/windows.py +50 -0
- awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
- awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
- awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
- awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
- jars/__init__.py +0 -0
ai/chronon/repo/utils.py
ADDED
@@ -0,0 +1,472 @@
import json
import os
import re
import subprocess
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from enum import Enum

from ai.chronon.cli.compile.parse_teams import EnvOrConfigAttribute
from ai.chronon.logger import get_logger
from ai.chronon.repo.constants import (
    APP_NAME_TEMPLATE,
    SCALA_VERSION_FOR_SPARK,
    SUPPORTED_SPARK,
)

LOG = get_logger()

class JobType(Enum):
    SPARK = "spark"
    FLINK = "flink"


def retry_decorator(retries=3, backoff=20):
    def wrapper(func):
        def wrapped(*args, **kwargs):
            attempt = 0
            while attempt <= retries:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    attempt += 1
                    LOG.exception(e)
                    sleep_time = attempt * backoff
                    LOG.info(
                        "[{}] Retry: {} out of {}/ Sleeping for {}".format(
                            func.__name__, attempt, retries, sleep_time
                        )
                    )
                    time.sleep(sleep_time)
            return func(*args, **kwargs)

        return wrapped

    return wrapper
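For context, a minimal usage sketch of the decorator above. The decorated function here is hypothetical; on each exception the wrapper logs, sleeps attempt * backoff seconds, and retries, with one final attempt after the loop whose exception propagates to the caller:

    @retry_decorator(retries=2, backoff=1)
    def flaky_fetch():
        # hypothetical transient operation, used only for illustration
        return check_output("curl -sI https://example.com")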


def get_environ_arg(env_name, ignoreError=False) -> str:
    value = os.environ.get(env_name)
    if not value and not ignoreError:
        raise ValueError(f"Please set {env_name} environment variable")
    return value


def get_customer_warehouse_bucket() -> str:
    return f"zipline-warehouse-{get_customer_id()}"


def get_customer_id() -> str:
    return get_environ_arg("CUSTOMER_ID")


def extract_filename_from_path(path):
    return path.split("/")[-1]


def check_call(cmd):
    LOG.info("Running command: " + cmd)
    return subprocess.check_call(cmd.split(), bufsize=0)


def check_output(cmd):
    LOG.info("Running command: " + cmd)
    return subprocess.check_output(cmd.split(), bufsize=0).strip()


def custom_json(conf):
    """Extract the json stored in customJson for a conf."""
    if conf.get("metaData", {}).get("customJson"):
        return json.loads(conf["metaData"]["customJson"])
    return {}
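To illustrate custom_json, a hedged example; the conf dict below is made up, but additional_args mirrors the key read later by set_runtime_env_teams_json:

    conf = {"metaData": {"customJson": '{"additional_args": ["--step-days=30"]}'}}
    assert custom_json(conf) == {"additional_args": ["--step-days=30"]}
    assert custom_json({"metaData": {}}) == {}  # no customJson -> empty dict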

def download_only_once(url, path, skip_download=False):
    if skip_download:
        LOG.info("Skipping download of " + path)
        return
    should_download = True
    path = path.strip()
    if os.path.exists(path):
        content_output = check_output("curl -sI " + url).decode("utf-8")
        content_length = re.search("(content-length:\\s)(\\d+)", content_output.lower())
        remote_size = int(content_length.group().split()[-1])
        local_size = int(check_output("wc -c " + path).split()[0])
        LOG.info(
            """File sizes of {url} vs. {path}
            Remote size: {remote_size}
            Local size : {local_size}""".format(
                **locals()
            )
        )
        if local_size == remote_size:
            LOG.info("Sizes match. Assuming it's already downloaded.")
            should_download = False
        if should_download:
            LOG.info(
                "Different file from remote at local: " + path + ". Re-downloading.."
            )
            check_call("curl {} -o {} --connect-timeout 10".format(url, path))
    else:
        LOG.info("No file at: " + path + ". Downloading..")
        check_call("curl {} -o {} --connect-timeout 10".format(url, path))


# NOTE: this is only for the open source chronon. For the internal zipline version, we have a different jar to download.
@retry_decorator(retries=3, backoff=50)
def download_jar(
    version,
    jar_type="uber",
    release_tag=None,
    spark_version="2.4.0",
    skip_download=False,
):
    assert spark_version in SUPPORTED_SPARK, (
        f"Received unsupported spark version {spark_version}. "
        f"Supported spark versions are {SUPPORTED_SPARK}"
    )
    scala_version = SCALA_VERSION_FOR_SPARK[spark_version]
    maven_url_prefix = os.environ.get("CHRONON_MAVEN_MIRROR_PREFIX", None)
    default_url_prefix = (
        "https://s01.oss.sonatype.org/service/local/repositories/public/content"
    )
    url_prefix = maven_url_prefix if maven_url_prefix else default_url_prefix
    base_url = "{}/ai/chronon/spark_{}_{}".format(url_prefix, jar_type, scala_version)
    LOG.info("Downloading jar from url: " + base_url)
    jar_path = os.environ.get("CHRONON_DRIVER_JAR", None)
    if jar_path is None:
        if version == "latest":
            version = None
        if version is None:
            metadata_content = check_output(
                "curl -s {}/maven-metadata.xml".format(base_url)
            )
            meta_tree = ET.fromstring(metadata_content)
            versions = [
                node.text
                for node in meta_tree.findall("./versioning/versions/")
                if re.search(
                    r"^\d+\.\d+\.\d+{}$".format(
                        r"\_{}\d*".format(release_tag) if release_tag else ""
                    ),
                    node.text,
                )
            ]
            version = versions[-1]
        jar_url = "{base_url}/{version}/spark_{jar_type}_{scala_version}-{version}-assembly.jar".format(
            base_url=base_url,
            version=version,
            scala_version=scala_version,
            jar_type=jar_type,
        )
        jar_path = os.path.join("/tmp", extract_filename_from_path(jar_url))
        download_only_once(jar_url, jar_path, skip_download)
    return jar_path
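A hedged sketch of calling download_jar; the resolved version and cache path are illustrative, and valid spark_version values depend on SUPPORTED_SPARK in ai/chronon/repo/constants:

    # resolves the newest matching version from maven-metadata.xml, caches the
    # assembly jar under /tmp via download_only_once, and returns the local path
    jar_path = download_jar("latest", jar_type="uber", spark_version="2.4.0")
    # setting CHRONON_DRIVER_JAR beforehand short-circuits the download entirely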
def get_teams_json_file_path(repo_path):
    return os.path.join(repo_path, "teams.json")


def get_teams_py_file_path(repo_path):
    return os.path.join(repo_path, "teams.py")


def set_runtime_env_v3(params, conf):
    effective_mode = params.get("mode")

    runtime_env = {"APP_NAME": params.get("app_name")}

    if params.get("repo") and conf and effective_mode:
        # get the conf file
        conf_path = os.path.join(params["repo"], conf)
        if os.path.isfile(conf_path):
            with open(conf_path, "r") as infile:
                conf_json = json.load(infile)
            metadata = conf_json.get("metaData", {}) or conf_json  # the user may pass just the metadata as the entire json
            env = metadata.get("executionInfo", {}).get("env", {})
            runtime_env.update(env.get(EnvOrConfigAttribute.ENV, {}).get(effective_mode, {}) or env.get("common", {}))
            # Also set APP_NAME
            try:
                _, conf_type, team, _ = conf.split("/")[-4:]
                if not team:
                    team = "default"
                # context is the environment in which the job is running; it is provided
                # from the args and defaults to dev.
                if params["env"]:
                    context = params["env"]
                else:
                    context = "dev"
                LOG.info(f"Context: {context} -- conf_type: {conf_type} -- team: {team}")

                runtime_env["APP_NAME"] = APP_NAME_TEMPLATE.format(
                    mode=effective_mode,
                    conf_type=conf_type,
                    context=context,
                    name=conf_json["metaData"]["name"],
                )
            except Exception:
                LOG.warn(
                    "Failed to set APP_NAME due to invalid conf path: {}, please ensure to supply the "
                    "relative path to zipline/ folder".format(conf)
                )
    else:
        if not params.get("app_name") and not os.environ.get("APP_NAME"):
            # Provide a basic app_name when no conf is defined.
            # Modes like metadata-upload and metadata-export can rely on conf-type or folder rather than a conf.
            runtime_env["APP_NAME"] = "_".join(
                [
                    k
                    for k in [
                        "chronon",
                        effective_mode.replace("-", "_"),
                    ]
                    if k is not None
                ]
            )
    for key, value in runtime_env.items():
        if key not in os.environ and value is not None:
            LOG.info(f"Setting to environment: {key}={value}")
            print(f"Setting to environment: {key}={value}")
            os.environ[key] = value
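A hedged sketch of driving set_runtime_env_v3; the params dict shape and conf path are illustrative (in the package they come from the CLI in ai/chronon/repo/run.py). Note how the conf path's last four segments yield conf_type and team:

    params = {"mode": "backfill", "app_name": None, "env": "production", "repo": "/path/to/zipline"}
    # "compiled/joins/my_team/my_join.v1" -> conf_type="joins", team="my_team"
    set_runtime_env_v3(params, "compiled/joins/my_team/my_join.v1")
    # APP_NAME is rendered from APP_NAME_TEMPLATE, and mode-specific vars from the
    # conf's executionInfo.env are exported unless already present in os.environ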

# TODO: delete this when we cutover
def set_runtime_env(params):
    """
    Set the runtime environment variables.
    These are extracted from the common env, the team env, and the conf env.
    In order to use the environment variables defined in the configs as overrides for the args in the cli, this method
    needs to be run before the runner and jar downloads.

    The order of priority is:
    - Environment variables existing already.
    - Environment variables derived from args (like app_name).
    - conf.metaData.modeToEnvMap for the mode (set on config).
    - Team's dev environment for each mode set on teams.json.
    - Team's prod environment for each mode set on teams.json.
    - Default team environment per context and mode set on teams.json.
    - Common environment set in teams.json.
    """

    environment = {
        "common_env": {},
        "conf_env": {},
        "default_env": {},
        "team_env": {},
        "production_team_env": {},
        "cli_args": {},
    }

    conf_type = None
    # Normalize modes that are effectively replacements of each other (streaming/local-streaming/streaming-client).
    effective_mode = params["mode"]
    if effective_mode and "streaming" in effective_mode:
        effective_mode = "streaming"
    if params["repo"]:

        # Bail out if both teams.json and teams.py exist.
        teams_json_file = get_teams_json_file_path(params["repo"])
        teams_py_file = get_teams_py_file_path(params["repo"])

        if os.path.exists(teams_json_file) and os.path.exists(teams_py_file):
            raise ValueError(
                "Both teams.json and teams.py exist. Please only use teams.py."
            )

        if os.path.exists(teams_json_file):
            set_runtime_env_teams_json(
                environment, params, effective_mode, teams_json_file
            )
    if params["app_name"]:
        environment["cli_args"]["APP_NAME"] = params["app_name"]
    else:
        if not params["app_name"] and not environment["cli_args"].get("APP_NAME"):
            # Provide a basic app_name when no conf is defined.
            # Modes like metadata-upload and metadata-export can rely on conf-type or folder rather than a conf.
            environment["cli_args"]["APP_NAME"] = "_".join(
                [
                    k
                    for k in [
                        "chronon",
                        conf_type,
                        (
                            params["mode"].replace("-", "_")
                            if params["mode"]
                            else None
                        ),
                    ]
                    if k is not None
                ]
            )

    # Adding these to make sure they are printed if provided by the environment.
    environment["cli_args"]["CHRONON_DRIVER_JAR"] = params["chronon_jar"]
    environment["cli_args"]["CHRONON_ONLINE_JAR"] = params["online_jar"]
    environment["cli_args"]["CHRONON_ONLINE_CLASS"] = params["online_class"]
    order = [
        "conf_env",
        "team_env",  # todo: team_env maybe should be below default/common_env
        "production_team_env",
        "default_env",
        "common_env",
        "cli_args",
    ]
    LOG.info("Setting env variables:")
    for key in os.environ:
        if any([key in (environment.get(set_key, {}) or {}) for set_key in order]):
            LOG.info(f"From <environment> found {key}={os.environ[key]}")
    for set_key in order:
        for key, value in (environment.get(set_key, {}) or {}).items():
            if key not in os.environ and value is not None:
                LOG.info(f"From <{set_key}> setting {key}={value}")
                os.environ[key] = value
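The first-writer-wins merge at the end of set_runtime_env can be shown in isolation; this standalone sketch uses hypothetical sources and variable names but mirrors the loop above:

    merged = {}
    sources = {
        "conf_env": {"EXECUTOR_MEMORY": "4G"},
        "common_env": {"EXECUTOR_MEMORY": "2G", "VERSION": "0.2.0"},
    }
    for set_key in ["conf_env", "team_env", "production_team_env", "default_env", "common_env", "cli_args"]:
        for key, value in sources.get(set_key, {}).items():
            merged.setdefault(key, value)  # earlier sources win, as os.environ does above
    assert merged == {"EXECUTOR_MEMORY": "4G", "VERSION": "0.2.0"}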

# TODO: delete this when we cutover
def set_runtime_env_teams_json(environment, params, effective_mode, teams_json_file):
    if os.path.exists(teams_json_file):
        with open(teams_json_file, "r") as infile:
            teams_json = json.load(infile)
        # we should have a fallback if the user wants to use something other than `default`
        environment["common_env"] = teams_json.get("default", {}).get("common_env", {})
        if params["conf"] and effective_mode:
            try:
                _, conf_type, team, _ = params["conf"].split("/")[-4:]
            except Exception as e:
                LOG.error(
                    "Invalid conf path: {}, please ensure to supply the relative path to zipline/ folder".format(
                        params["conf"]
                    )
                )
                raise e
            if not team:
                team = "default"
            # context is the environment in which the job is running; it is provided
            # from the args and defaults to dev.
            if params["env"]:
                context = params["env"]
            else:
                context = "dev"
            LOG.info(
                f"Context: {context} -- conf_type: {conf_type} -- team: {team}"
            )
            conf_path = os.path.join(params["repo"], params["conf"])
            if os.path.isfile(conf_path):
                with open(conf_path, "r") as conf_file:
                    conf_json = json.load(conf_file)

                new_env = (
                    conf_json.get("metaData")
                    .get("executionInfo", {})
                    .get("env", {})
                    .get(effective_mode, {})
                )

                old_env = (
                    conf_json.get("metaData")
                    .get("modeToEnvMap", {})
                    .get(effective_mode, {})
                )

                environment["conf_env"] = new_env if new_env else old_env

                # Load additional args used on backfill.
                if custom_json(conf_json) and effective_mode in [
                    "backfill",
                    "backfill-left",
                    "backfill-final",
                ]:
                    environment["conf_env"]["CHRONON_CONFIG_ADDITIONAL_ARGS"] = (
                        " ".join(custom_json(conf_json).get("additional_args", []))
                    )
                environment["cli_args"]["APP_NAME"] = APP_NAME_TEMPLATE.format(
                    mode=effective_mode,
                    conf_type=conf_type,
                    context=context,
                    name=conf_json["metaData"]["name"],
                )
            environment["team_env"] = (
                teams_json[team].get(context, {}).get(effective_mode, {})
            )
            # Fall back to the prod env even in dev mode when the dev env is undefined.
            environment["production_team_env"] = (
                teams_json[team].get("production", {}).get(effective_mode, {})
            )
            # By default use the production env.
            environment["default_env"] = (
                teams_json.get("default", {})
                .get("production", {})
                .get(effective_mode, {})
            )
            environment["cli_args"]["CHRONON_CONF_PATH"] = conf_path
    if params["app_name"]:
        environment["cli_args"]["APP_NAME"] = params["app_name"]
    else:
        if not params["app_name"] and not environment["cli_args"].get("APP_NAME"):
            # Provide a basic app_name when no conf is defined.
            # Modes like metadata-upload and metadata-export can rely on conf-type or folder rather than a conf.
            environment["cli_args"]["APP_NAME"] = "_".join(
                [
                    k
                    for k in [
                        "chronon",
                        conf_type,
                        params["mode"].replace("-", "_") if params["mode"] else None,
                    ]
                    if k is not None
                ]
            )

    # Adding these to make sure they are printed if provided by the environment.
    environment["cli_args"]["CHRONON_DRIVER_JAR"] = params["chronon_jar"]
    environment["cli_args"]["CHRONON_ONLINE_JAR"] = params["online_jar"]
    environment["cli_args"]["CHRONON_ONLINE_CLASS"] = params["online_class"]
    order = [
        "conf_env",
        "team_env",
        "production_team_env",
        "default_env",
        "common_env",
        "cli_args",
    ]
    LOG.info("Setting env variables:")
    for key in os.environ:
        if any([key in environment[set_key] for set_key in order]):
            LOG.info(f"From <environment> found {key}={os.environ[key]}")
    for set_key in order:
        for key, value in environment[set_key].items():
            if key not in os.environ and value is not None:
                LOG.info(f"From <{set_key}> setting {key}={value}")
                os.environ[key] = value
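For reference, a sketch of the teams.json shape this helper reads, expressed as the parsed dict; the team name, modes, and variable names are hypothetical:

    teams_json = {
        "default": {
            "common_env": {"VERSION": "latest"},
            "production": {"backfill": {"EXECUTOR_MEMORY": "4G"}},
        },
        "my_team": {
            "dev": {"backfill": {"EXECUTOR_MEMORY": "2G"}},
            "production": {"backfill": {"EXECUTOR_MEMORY": "8G"}},
        },
    }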

def split_date_range(start_date, end_date, parallelism):
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    if start_date > end_date:
        raise ValueError("Start date should be earlier than end date")
    total_days = (
        end_date - start_date
    ).days + 1  # +1 to include the end_date in the range

    # Check if parallelism is greater than total_days.
    if parallelism > total_days:
        raise ValueError("Parallelism should be less than or equal to total days")

    split_size = total_days // parallelism
    date_ranges = []

    for i in range(parallelism):
        split_start = start_date + timedelta(days=i * split_size)
        if i == parallelism - 1:
            split_end = end_date
        else:
            split_end = split_start + timedelta(days=split_size - 1)
        date_ranges.append(
            (split_start.strftime("%Y-%m-%d"), split_end.strftime("%Y-%m-%d"))
        )
    return date_ranges
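A worked example of split_date_range; when total_days is not divisible by parallelism, the final range absorbs the remainder:

    ranges = split_date_range("2024-01-01", "2024-01-10", 3)
    # total_days = 10, split_size = 10 // 3 = 3, last split stretches to end_date
    assert ranges == [
        ("2024-01-01", "2024-01-03"),
        ("2024-01-04", "2024-01-06"),
        ("2024-01-07", "2024-01-10"),
    ]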

def get_metadata_name_from_conf(repo_path, conf_path):
    with open(os.path.join(repo_path, conf_path), "r") as conf_file:
        data = json.load(conf_file)
    return data.get("metaData", {}).get("name", None)
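A one-line usage sketch; the repo and conf paths are illustrative:

    name = get_metadata_name_from_conf("/path/to/zipline", "compiled/joins/my_team/my_join.v1")
    # returns conf_json["metaData"]["name"], or None when the key is absent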
ai/chronon/repo/zipline.py
ADDED
@@ -0,0 +1,51 @@
from importlib.metadata import PackageNotFoundError
from importlib.metadata import version as ver

import click

from ai.chronon.cli.compile.display.console import console
from ai.chronon.repo.compile import compile
from ai.chronon.repo.hub_runner import hub
from ai.chronon.repo.init import main as init_main
from ai.chronon.repo.run import main as run_main

LOGO = """
=%%%@:-%%%@=:%%%@+ .%@%@@@@@@%%%%%%: .+#%*. -%%%= -#%#-
:@@@@#.@@@@%.%@@@@. .@@@@@@@@@@@@@@- -@@@@= =@@@+ @@@@@
:@@@@*.%@@@#.#@@@%. .#@@@@: :==: =@@@+ -=- :
=@@@@=-@@@@+:%@@@#. #@@@%. :--: .%%=:+#%@@@%#+- =@@@+ .-:-. *%= #%%* :=#%@@@@#*-
.#@@@#-+@@@%-=@@@@- .%@@@%. @@@@ .@@@@@@@@%%@@@@%= =@@@+ +@@@= *@@@+. %@@% :#@@@@%%%@@@@@=
+**+=-%@@@+-#@@@*----=. :@@@@# %@@@ .@@@@%=. .-#@@@* =@@@+ +@@@= *@@@@@*: %@@% -@@@%- .+@@@*
+@@@%-+@@@%-=@@@@+ :@@@@* @@@@ .@@@@. #@@@: =@@@+ +@@@= *@@@%@@@*: %@@% %@@@#++****+*@@@@-
-@@@@+:#@@@*:#@@@#. -@@@@* @@@@ .@@@@ *@@@- =@@@+ +@@@= *@@@.-%@@@#-%@@% @@@@****#****++++:
=@@@@--@@@@=:@@@@* =@@@@+ @@@@ .@@@@#. .+@@@% =@@@+ +@@@= *@@@ -#@@@@@@% =@@@*.
+@@@@--@@@@=:@@@@* +@@@@@#########+ @@@@ .@@@@@@%*+*#@@@@* =@@@+ +@@@= *@@@. :#@@@@% =@@@@% -==+-
:@@@@* @@@@# @@@@% *@@@@@@@@@@@@@@@% @@@@ .@@@@#@@@@@@@%+: =@@@+ +@@@= *@@@. :*@@% .=#@@@@@@@%*:
.@@@%
.@@@%
.@@@@
---:
"""


def _set_package_version():
    try:
        package_version = ver("zipline-ai")
    except PackageNotFoundError:
        console.print("No package found. Continuing with the latest version.")
        package_version = "latest"
    return package_version


@click.group(help="The Zipline CLI. A tool for authoring and running Zipline pipelines in the cloud. For more information, see: https://chronon.ai/")
@click.version_option(version=_set_package_version())
@click.pass_context
def zipline(ctx):
    ctx.ensure_object(dict)
    ctx.obj["version"] = _set_package_version()


zipline.add_command(compile)
zipline.add_command(run_main)
zipline.add_command(init_main)
zipline.add_command(hub)
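A hedged sketch of exercising the command group in a test via click's CliRunner; the subcommand names come from the add_command calls above, and the output is illustrative:

    from click.testing import CliRunner

    runner = CliRunner()
    result = runner.invoke(zipline, ["--version"])  # resolved by _set_package_version()
    print(result.output)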
ai/chronon/repo/zipline_hub.py
ADDED
@@ -0,0 +1,105 @@
import os
from typing import Optional

import google.auth
import requests
from google.auth.transport.requests import Request


class ZiplineHub:
    def __init__(self, base_url):
        if not base_url:
            raise ValueError("Base URL for ZiplineHub cannot be empty.")
        self.base_url = base_url
        if self.base_url.startswith("https") and self.base_url.endswith(".app"):
            print("\n 🔐 Using Google Cloud authentication for ZiplineHub.")

            # First try to get an ID token from the environment (GitHub Actions).
            self.id_token = os.getenv('GCP_ID_TOKEN')
            if self.id_token:
                print(" 🔑 Using ID token from environment")
            else:
                # Fall back to Google Cloud authentication.
                print(" 🔑 Generating ID token from default credentials")
                credentials, project_id = google.auth.default()
                credentials.refresh(Request())
                self.id_token = credentials.id_token

    def call_diff_api(self, names_to_hashes: dict[str, str]) -> Optional[list[str]]:
        url = f"{self.base_url}/upload/v1/diff"

        diff_request = {
            'namesToHashes': names_to_hashes
        }
        headers = {'Content-Type': 'application/json'}
        if hasattr(self, 'id_token'):
            headers['Authorization'] = f'Bearer {self.id_token}'
        try:
            response = requests.post(url, json=diff_request, headers=headers)
            response.raise_for_status()
            diff_response = response.json()
            return diff_response['diff']
        except requests.RequestException as e:
            print(f" ❌ Error calling diff API: {e}")
            raise e

    def call_upload_api(self, diff_confs, branch: str):
        url = f"{self.base_url}/upload/v1/confs"

        upload_request = {
            'diffConfs': diff_confs,
            'branch': branch,
        }
        headers = {'Content-Type': 'application/json'}
        if hasattr(self, 'id_token'):
            headers['Authorization'] = f'Bearer {self.id_token}'

        try:
            response = requests.post(url, json=upload_request, headers=headers)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            print(f" ❌ Error calling upload API: {e}")
            raise e

    def call_sync_api(self, branch: str, names_to_hashes: dict[str, str]) -> Optional[list[str]]:
        url = f"{self.base_url}/upload/v1/sync"

        sync_request = {
            "namesToHashes": names_to_hashes,
            "branch": branch,
        }
        headers = {'Content-Type': 'application/json'}
        if hasattr(self, 'id_token'):
            headers['Authorization'] = f'Bearer {self.id_token}'
        try:
            response = requests.post(url, json=sync_request, headers=headers)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            print(f" ❌ Error calling sync API: {e}")
            raise e

    def call_workflow_start_api(self, conf_name, mode, branch, user, start, end, conf_hash):
        url = f"{self.base_url}/workflow/start"

        workflow_request = {
            'confName': conf_name,
            'confHash': conf_hash,
            'mode': mode,
            'branch': branch,
            'user': user,
            'start': start,
            'end': end,
        }
        headers = {'Content-Type': 'application/json'}
        if hasattr(self, 'id_token'):
            headers['Authorization'] = f'Bearer {self.id_token}'

        try:
            response = requests.post(url, json=workflow_request, headers=headers)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            print(f" ❌ Error calling workflow start API: {e}")
            raise e
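Finally, a hedged usage sketch of ZiplineHub; the hub URL, conf names, and hashes are hypothetical. Note that id_token is only set in __init__ for base URLs that start with https and end with .app, so other endpoints send unauthenticated requests:

    hub = ZiplineHub("http://localhost:8080")  # hypothetical hub endpoint
    missing = hub.call_diff_api({"joins/my_team/my_join.v1": "abc123"})
    hub.call_workflow_start_api(
        conf_name="joins/my_team/my_join.v1", conf_hash="abc123",
        mode="backfill", branch="my-branch", user="jane",
        start="2024-01-01", end="2024-01-31",
    )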