awx-zipline-ai 0.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +248 -0
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +336 -0
- ai/chronon/cli/compile/compile_context.py +173 -0
- ai/chronon/cli/compile/compiler.py +183 -0
- ai/chronon/cli/compile/conf_validator.py +742 -0
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +102 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +111 -0
- ai/chronon/cli/compile/fill_templates.py +35 -0
- ai/chronon/cli/compile/parse_configs.py +134 -0
- ai/chronon/cli/compile/parse_teams.py +242 -0
- ai/chronon/cli/compile/serializer.py +109 -0
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +145 -0
- ai/chronon/cli/logger.py +59 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/join.py +580 -0
- ai/chronon/logger.py +23 -0
- ai/chronon/model.py +40 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +39 -0
- ai/chronon/repo/aws.py +284 -0
- ai/chronon/repo/cluster.py +136 -0
- ai/chronon/repo/compile.py +62 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +269 -0
- ai/chronon/repo/explore.py +418 -0
- ai/chronon/repo/extract_objects.py +134 -0
- ai/chronon/repo/gcp.py +586 -0
- ai/chronon/repo/gitpython_utils.py +15 -0
- ai/chronon/repo/hub_runner.py +261 -0
- ai/chronon/repo/hub_uploader.py +109 -0
- ai/chronon/repo/init.py +60 -0
- ai/chronon/repo/join_backfill.py +119 -0
- ai/chronon/repo/run.py +296 -0
- ai/chronon/repo/serializer.py +133 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +481 -0
- ai/chronon/repo/zipline.py +35 -0
- ai/chronon/repo/zipline_hub.py +277 -0
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +26 -0
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +26 -0
- ai/chronon/resources/gcp/teams.py +58 -0
- ai/chronon/source.py +86 -0
- ai/chronon/staging_query.py +226 -0
- ai/chronon/types.py +58 -0
- ai/chronon/utils.py +510 -0
- ai/chronon/windows.py +48 -0
- awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
- awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
- awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- gen_thrift/api/__init__.py +1 -0
- gen_thrift/api/constants.py +15 -0
- gen_thrift/api/ttypes.py +3754 -0
- gen_thrift/common/__init__.py +1 -0
- gen_thrift/common/constants.py +15 -0
- gen_thrift/common/ttypes.py +1814 -0
- gen_thrift/eval/__init__.py +1 -0
- gen_thrift/eval/constants.py +15 -0
- gen_thrift/eval/ttypes.py +660 -0
- gen_thrift/fetcher/__init__.py +1 -0
- gen_thrift/fetcher/constants.py +15 -0
- gen_thrift/fetcher/ttypes.py +127 -0
- gen_thrift/hub/__init__.py +1 -0
- gen_thrift/hub/constants.py +15 -0
- gen_thrift/hub/ttypes.py +1109 -0
- gen_thrift/observability/__init__.py +1 -0
- gen_thrift/observability/constants.py +15 -0
- gen_thrift/observability/ttypes.py +2355 -0
- gen_thrift/planner/__init__.py +1 -0
- gen_thrift/planner/constants.py +15 -0
- gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/query.py
ADDED
@@ -0,0 +1,126 @@
+# Copyright (C) 2023 The Chronon Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+from typing import Dict, List
+
+import gen_thrift.api.ttypes as api
+
+
+def Query(
+    selects: Dict[str, str] = None,
+    wheres: List[str] = None,
+    start_partition: str = None,
+    end_partition: str = None,
+    time_column: str = None,
+    setups: List[str] = None,
+    mutation_time_column: str = None,
+    reversal_column: str = None,
+    partition_column: str = None,
+    partition_format: str = None,
+    sub_partitions_to_wait_for: List[str] = None,
+) -> api.Query:
+    """
+    Create a query object that is used to scan data from various data sources.
+    This contains partition ranges, row-level transformations and filtering logic.
+    Additionally, we also require a time_column for TEMPORAL events, and mutation_time_column & reversal_column
+    for TEMPORAL entities.
+
+    :param selects: Spark SQL expressions with only arithmetic, function application & inline lambdas.
+        You can also apply UDFs - see the setups param below.::
+
+            Example: {
+                "alias": "built_in_function(col1) * my_udf(col2)",
+                "alias1": "aggregate(array_col, 0, (acc, x) -> acc + x)"
+            }
+
+        See: https://spark.apache.org/docs/latest/api/sql/#built-in-functions
+        When None, we assume that no transformations are needed and pick the columns necessary for aggregations.
+    :type selects: Dict[str, str], optional
+    :param wheres: Used for filtering. Same as above, but each expression must return a boolean.
+        Expressions are joined using AND.
+    :type wheres: List[str], optional
+    :param start_partition: The partition from which the source data is valid - inclusive.
+        When absent, we consider all available data usable.
+    :type start_partition: str, optional
+    :param end_partition: The partition until which the source data is valid - inclusive.
+        Not specified unless you know for a fact that a particular source has expired after a partition and you
+        should instead use another source after this partition.
+    :type end_partition: str, optional
+    :param time_column: A single expression that produces time as **milliseconds since epoch**.
+    :type time_column: str, optional
+    :param setups: You can register UDFs using setups:
+        ["ADD JAR YOUR_JAR", "create temporary function YOUR_UDF_NAME as YOUR_CLASS"]
+    :type setups: List[str], optional
+    :param mutation_time_column: For entities with real-time accuracy, you need to specify an expression that
+        represents mutation time. Time should be milliseconds since epoch.
+        This is not necessary for event sources; defaults to "mutation_ts".
+    :type mutation_time_column: str, optional
+    :param reversal_column: (defaults to "is_before")
+        For entities with real-time accuracy, we divide mutations into additions & reversals.
+        Updates have two rows - one with is_before = True (the old value) & one with is_before = False (the new value).
+        Inserts only have is_before = False (just the new value).
+        Deletes only have is_before = True (just the old value).
+        This is not necessary for event sources.
+    :type reversal_column: str, optional
+    :param partition_column:
+        Specify this to override spark.chronon.partition.column set in teams.py for this particular query.
+    :type partition_column: str, optional
+    :param sub_partitions_to_wait_for:
+        Additional partitions to be used in sensing that the source data has landed. Should be a full partition string, such as `hr=23:00`.
+    :type sub_partitions_to_wait_for: List[str], optional
+    :param partition_format:
+        Date format string that the partition values are expected to be in.
+    :type partition_format: str, optional
+    :return: A Query object that Chronon can use to scan just the necessary data efficiently.
+    """
+    return api.Query(
+        selects=selects,
+        wheres=wheres,
+        startPartition=start_partition,
+        endPartition=end_partition,
+        timeColumn=time_column,
+        setups=setups,
+        mutationTimeColumn=mutation_time_column,
+        reversalColumn=reversal_column,
+        partitionColumn=partition_column,
+        subPartitionsToWaitFor=sub_partitions_to_wait_for,
+        partitionFormat=partition_format,
+    )
+
+
+def selects(*args, **kwargs):
+    """
+    Create a dictionary required for the selects parameter of Query.
+
+    .. code-block:: python
+        selects(
+            "event_id",
+            user_id="user_id",
+        )
+
+    creates the following dictionary:
+
+    .. code-block:: python
+        {
+            "event_id": "event_id",
+            "user_id": "user_id"
+        }
+    """
+    result = OrderedDict()
+    for x in args:
+        result[x] = x
+    for k, v in kwargs.items():
+        result[k] = v
+    return result
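For orientation, the two helpers above compose as follows. This is a minimal sketch, not code from the package: the column names, expressions and partition date are invented for illustration.

from ai.chronon.query import Query, selects

# Hypothetical source columns: user_id, amount_cents, ts_millis.
query = Query(
    selects=selects(
        "user_id",                          # passthrough column -> {"user_id": "user_id"}
        amount_usd="amount_cents / 100.0",  # aliased Spark SQL expression
    ),
    wheres=["amount_cents > 0"],
    start_partition="2023-01-01",
    time_column="ts_millis",  # milliseconds since epoch
)
# query is a gen_thrift.api.ttypes.Query with the camelCase fields populated,
# e.g. query.startPartition == "2023-01-01".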
ai/chronon/repo/__init__.py
ADDED
@@ -0,0 +1,39 @@
+# Copyright (C) 2023 The Chronon Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from gen_thrift.api.ttypes import ConfType, GroupBy, Join, Model, StagingQuery
+
+JOIN_FOLDER_NAME = "joins"
+GROUP_BY_FOLDER_NAME = "group_bys"
+STAGING_QUERY_FOLDER_NAME = "staging_queries"
+MODEL_FOLDER_NAME = "models"
+# TODO - make team part of thrift API?
+TEAMS_FILE_PATH = "teams.json"
+OUTPUT_ROOT = "production"
+
+# This is set in the main function -
+# from command line or from env variable during invocation
+FOLDER_NAME_TO_CLASS = {
+    GROUP_BY_FOLDER_NAME: GroupBy,
+    JOIN_FOLDER_NAME: Join,
+    STAGING_QUERY_FOLDER_NAME: StagingQuery,
+    MODEL_FOLDER_NAME: Model,
+}
+
+FOLDER_NAME_TO_CONF_TYPE = {
+    GROUP_BY_FOLDER_NAME: ConfType.GROUP_BY,
+    JOIN_FOLDER_NAME: ConfType.JOIN,
+    STAGING_QUERY_FOLDER_NAME: ConfType.STAGING_QUERY,
+    MODEL_FOLDER_NAME: ConfType.MODEL,
+}
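A small sketch of how these folder mappings can be used. It is not code from the package, and it assumes the constants above are importable from ai.chronon.repo (as the +39 entry in the file listing suggests); the conf path is hypothetical.

from gen_thrift.api.ttypes import ConfType

from ai.chronon.repo import FOLDER_NAME_TO_CLASS, FOLDER_NAME_TO_CONF_TYPE


def classify_conf(path: str):
    """Map a compiled conf path to its thrift class and ConfType, or (None, None)."""
    for folder, clazz in FOLDER_NAME_TO_CLASS.items():
        if f"/{folder}/" in path:
            return clazz, FOLDER_NAME_TO_CONF_TYPE[folder]
    return None, None


# Hypothetical path under OUTPUT_ROOT ("production"):
clazz, conf_type = classify_conf("production/group_bys/my_team/my_feature.v1")
assert conf_type == ConfType.GROUP_BY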
ai/chronon/repo/aws.py
ADDED
@@ -0,0 +1,284 @@
+import json
+import multiprocessing
+import os
+from typing import List
+
+import boto3
+
+from ai.chronon.logger import get_logger
+from ai.chronon.repo.constants import ROUTES, ZIPLINE_DIRECTORY
+from ai.chronon.repo.default_runner import Runner
+from ai.chronon.repo.utils import (
+    JobType,
+    check_call,
+    extract_filename_from_path,
+    get_customer_id,
+    split_date_range,
+)
+
+LOG = get_logger()
+
+# AWS SPECIFIC CONSTANTS
+EMR_ENTRY = "ai.chronon.integrations.aws.EmrSubmitter"
+ZIPLINE_AWS_JAR_DEFAULT = "cloud_aws_lib_deploy.jar"
+ZIPLINE_AWS_ONLINE_CLASS_DEFAULT = "ai.chronon.integrations.aws.AwsApiImpl"
+ZIPLINE_AWS_FLINK_JAR_DEFAULT = "flink_assembly_deploy.jar"
+ZIPLINE_AWS_SERVICE_JAR = "service_assembly_deploy.jar"
+
+LOCAL_FILE_TO_ETAG_JSON = f"{ZIPLINE_DIRECTORY}/local_file_to_etag.json"
+
+EMR_MOUNT_FILE_PREFIX = "/mnt/zipline/"
+
+
+class AwsRunner(Runner):
+    def __init__(self, args):
+        aws_jar_path = AwsRunner.download_zipline_aws_jar(
+            ZIPLINE_DIRECTORY, get_customer_id(), args["version"], ZIPLINE_AWS_JAR_DEFAULT
+        )
+        service_jar_path = AwsRunner.download_zipline_aws_jar(
+            ZIPLINE_DIRECTORY, get_customer_id(), args["version"], ZIPLINE_AWS_SERVICE_JAR
+        )
+        jar_path = f"{service_jar_path}:{aws_jar_path}" if args["mode"] == "fetch" else aws_jar_path
+        self.version = args.get("version", "latest")
+
+        super().__init__(args, os.path.expanduser(jar_path))
+
+    @staticmethod
+    def upload_s3_file(bucket_name: str, source_file_name: str, destination_blob_name: str):
+        """Uploads a file to the bucket."""
+        obj = boto3.client("s3")
+        try:
+            obj.upload_file(source_file_name, bucket_name, destination_blob_name)
+            print(
+                f"File {source_file_name} uploaded to {destination_blob_name} in bucket {bucket_name}."
+            )
+            return f"s3://{bucket_name}/{destination_blob_name}"
+        except Exception as e:
+            raise RuntimeError(f"Failed to upload {source_file_name}: {str(e)}") from e
+
+    @staticmethod
+    def download_zipline_aws_jar(
+        destination_dir: str, customer_id: str, version: str, jar_name: str
+    ):
+        s3_client = boto3.client("s3")
+        destination_path = f"{destination_dir}/{jar_name}"
+        source_key_name = f"release/{version}/jars/{jar_name}"
+        bucket_name = f"zipline-artifacts-{customer_id}"
+
+        are_identical = (
+            AwsRunner.compare_s3_and_local_file_hashes(
+                bucket_name, source_key_name, destination_path
+            )
+            if os.path.exists(destination_path)
+            else False
+        )
+
+        if are_identical:
+            print(f"{destination_path} matches S3 {bucket_name}/{source_key_name}")
+        else:
+            print(f"{destination_path} does NOT match S3 {bucket_name}/{source_key_name}")
+            print(f"Downloading {jar_name} from S3...")
+
+            s3_client.download_file(
+                Filename=destination_path, Bucket=bucket_name, Key=source_key_name
+            )
+            # Persist ETag to prevent downloading the same file next time
+            etag = AwsRunner.get_s3_file_hash(bucket_name, source_key_name)
+            if os.path.exists(LOCAL_FILE_TO_ETAG_JSON):
+                with open(LOCAL_FILE_TO_ETAG_JSON, "r") as file:
+                    data = json.load(file)
+
+                # Add the new entry
+                data[destination_path] = etag
+
+                # Write the updated dictionary back to the file
+                with open(LOCAL_FILE_TO_ETAG_JSON, "w") as file:
+                    json.dump(data, file)
+            else:
+                with open(LOCAL_FILE_TO_ETAG_JSON, "w") as file:
+                    data = {destination_path: etag}
+                    json.dump(data, file)
+
+        return destination_path
+
+    @staticmethod
+    def get_s3_file_hash(bucket_name: str, file_name: str):
+        s3_client = boto3.client("s3")
+        response = s3_client.head_object(Bucket=bucket_name, Key=file_name)
+        return response["ETag"].strip('"')
+
+    @staticmethod
+    def get_local_file_hash(file_name: str):
+        # read in the json file
+        if os.path.exists(LOCAL_FILE_TO_ETAG_JSON):
+            with open(LOCAL_FILE_TO_ETAG_JSON, "r") as f:
+                data = json.load(f)
+                if file_name in data:
+                    return data[file_name]
+        return None
+
+    @staticmethod
+    def compare_s3_and_local_file_hashes(bucket_name: str, s3_file_path: str, local_file_path: str):
+        try:
+            s3_hash = AwsRunner.get_s3_file_hash(bucket_name, s3_file_path)
+            local_hash = AwsRunner.get_local_file_hash(local_file_path)
+            print(f"Local hash: {local_hash}, S3 hash: {s3_hash}")
+            return s3_hash == local_hash
+        except Exception as e:
+            print(f"Error comparing files: {str(e)}")
+            return False
+
+    def generate_emr_submitter_args(
+        self,
+        user_args: str,
+        job_type: JobType = JobType.SPARK,
+        local_files_to_upload: List[str] = None,
+    ):
+        customer_warehouse_bucket_name = f"zipline-warehouse-{get_customer_id()}"
+        s3_files = []
+        for source_file in local_files_to_upload:
+            # upload to `metadata` folder
+            destination_file_path = f"metadata/{extract_filename_from_path(source_file)}"
+            s3_files.append(
+                AwsRunner.upload_s3_file(
+                    customer_warehouse_bucket_name, source_file, destination_file_path
+                )
+            )
+
+        # we also want the additional-confs included here. it should already be in the bucket
+
+        zipline_artifacts_bucket_prefix = "s3://zipline-artifacts"
+
+        s3_files.append(
+            f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}/confs/additional-confs.yaml"
+        )
+
+        s3_file_args = ",".join(s3_files)
+
+        # include jar uri. should also already be in the bucket
+        jar_uri = (
+            f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}"
+            + f"/release/{self.version}/jars/{ZIPLINE_AWS_JAR_DEFAULT}"
+        )
+
+        final_args = (
+            "{user_args} --jar-uri={jar_uri} --job-type={job_type} --main-class={main_class}"
+        )
+
+        if job_type == JobType.FLINK:
+            main_class = "ai.chronon.flink.FlinkJob"
+            flink_jar_uri = (
+                f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}"
+                + f"/jars/{ZIPLINE_AWS_FLINK_JAR_DEFAULT}"
+            )
+            return (
+                final_args.format(
+                    user_args=user_args,
+                    jar_uri=jar_uri,
+                    job_type=job_type.value,
+                    main_class=main_class,
+                )
+                + f" --flink-main-jar-uri={flink_jar_uri}"
+            )
+
+        elif job_type == JobType.SPARK:
+            main_class = "ai.chronon.spark.Driver"
+            return (
+                final_args.format(
+                    user_args=user_args,
+                    jar_uri=jar_uri,
+                    job_type=job_type.value,
+                    main_class=main_class,
+                )
+                + f" --additional-conf-path={EMR_MOUNT_FILE_PREFIX}additional-confs.yaml"
+                f" --files={s3_file_args}"
+            )
+        else:
+            raise ValueError(f"Invalid job type: {job_type}")
+
+    def run(self):
+        command_list = []
+        if self.mode == "info":
+            command_list.append(
+                "python3 {script} --conf {conf} --ds {ds} --repo {repo}".format(
+                    script=self.render_info, conf=self.conf, ds=self.ds, repo=self.repo
+                )
+            )
+        elif self.sub_help or self.mode == "fetch":
+            entrypoint = "ai.chronon.online.fetcher.FetcherMain"
+            command_list.append(
+                "java -cp {jar} {entrypoint} {subcommand} {args}".format(
+                    jar=self.jar_path,
+                    entrypoint=entrypoint,
+                    args="--help" if self.sub_help else self._gen_final_args(),
+                    subcommand=ROUTES[self.conf_type][self.mode],
+                )
+            )
+        elif self.mode in ["streaming", "streaming-client"]:
+            raise ValueError("Streaming is not supported for AWS yet.")
+        else:
+            local_files_to_upload_to_aws = []
+            if self.conf:
+                local_files_to_upload_to_aws.append(os.path.join(self.repo, self.conf))
+            if self.parallelism > 1:
+                assert self.start_ds is not None and self.ds is not None, (
+                    "To use parallelism, please specify --start-ds and --end-ds to "
+                    "break down into multiple backfill jobs"
+                )
+                date_ranges = split_date_range(self.start_ds, self.ds, self.parallelism)
+                for start_ds, end_ds in date_ranges:
+                    user_args = "{subcommand} {args} {additional_args}".format(
+                        subcommand=ROUTES[self.conf_type][self.mode],
+                        args=self._gen_final_args(
+                            start_ds=start_ds,
+                            end_ds=end_ds,
+                            # when we download files from s3 to emr, they'll be mounted at /mnt/zipline
+                            override_conf_path=(
+                                EMR_MOUNT_FILE_PREFIX + extract_filename_from_path(self.conf)
+                                if self.conf
+                                else None
+                            ),
+                        ),
+                        additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
+                    )
+
+                    emr_args = self.generate_emr_submitter_args(
+                        local_files_to_upload=local_files_to_upload_to_aws,
+                        # for now, self.conf is the only local file that requires uploading to S3
+                        user_args=user_args,
+                    )
+                    command = f"java -cp {self.jar_path} {EMR_ENTRY} {emr_args}"
+                    command_list.append(command)
+            else:
+                user_args = ("{subcommand} {args} {additional_args}").format(
+                    subcommand=ROUTES[self.conf_type][self.mode],
+                    args=self._gen_final_args(
+                        start_ds=self.start_ds,
+                        # when we download files from s3 to emr, they'll be mounted at /mnt/zipline
+                        override_conf_path=(
+                            EMR_MOUNT_FILE_PREFIX + extract_filename_from_path(self.conf)
+                            if self.conf
+                            else None
+                        ),
+                    ),
+                    additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
+                )
+
+                emr_args = self.generate_emr_submitter_args(
+                    # for now, self.conf is the only local file that requires uploading
+                    local_files_to_upload=local_files_to_upload_to_aws,
+                    user_args=user_args,
+                )
+                command = f"java -cp {self.jar_path} {EMR_ENTRY} {emr_args}"
+                command_list.append(command)
+
+        if len(command_list) > 1:
+            # parallel backfill mode
+            with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
+                LOG.info(
+                    "Running args list {} with pool size {}".format(command_list, self.parallelism)
+                )
+                pool.map(check_call, command_list)
+        elif len(command_list) == 1:
+            # TODO: add log tailing
+            check_call(command_list[0])
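The static hash helpers above can be exercised on their own to see the ETag-based skip-download check. A hedged sketch, assuming valid AWS credentials; the bucket, key, and local path are placeholders, not values shipped with the package.

from ai.chronon.repo.aws import AwsRunner

# Placeholders; real buckets follow the zipline-artifacts-<customer_id> convention above.
bucket = "zipline-artifacts-example"
key = "release/latest/jars/cloud_aws_lib_deploy.jar"
local_jar = "/tmp/zipline/cloud_aws_lib_deploy.jar"

remote_etag = AwsRunner.get_s3_file_hash(bucket, key)    # ETag from S3 head_object
cached_etag = AwsRunner.get_local_file_hash(local_jar)   # last ETag recorded in local_file_to_etag.json, if any
print("jar up to date" if remote_etag == cached_etag else "jar stale - download_zipline_aws_jar would re-fetch it")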
ai/chronon/repo/cluster.py
ADDED
@@ -0,0 +1,136 @@
+import json
+
+
+def generate_dataproc_cluster_config(
+    num_workers,
+    project_id,
+    artifact_prefix,
+    master_host_type="n2-highmem-64",
+    worker_host_type="n2-highmem-16",
+    subnetwork="default",
+    idle_timeout="7200s",
+    initialization_actions=None,
+    tags=None,
+):
+    """
+    Create a configuration for a Dataproc cluster.
+    :return: A json string representing the configuration.
+    """
+    if initialization_actions is None:
+        initialization_actions = []
+    return json.dumps(
+        {
+            "gceClusterConfig": {
+                "subnetworkUri": subnetwork,
+                "serviceAccount": "dataproc@" + project_id + ".iam.gserviceaccount.com",
+                "serviceAccountScopes": [
+                    "https://www.googleapis.com/auth/cloud-platform",
+                    "https://www.googleapis.com/auth/monitoring",
+                    "https://www.googleapis.com/auth/cloud.useraccounts.readonly",
+                    "https://www.googleapis.com/auth/devstorage.read_write",
+                    "https://www.googleapis.com/auth/logging.write",
+                ],
+                "metadata": {
+                    "hive-version": "3.1.2",
+                    "SPARK_BQ_CONNECTOR_URL": "gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar",
+                    "artifact_prefix": artifact_prefix.rstrip("/"),
+                },
+                "tags": tags or [],
+            },
+            "masterConfig": {
+                "numInstances": 1,
+                "machineTypeUri": master_host_type,
+                "diskConfig": {"bootDiskType": "pd-standard", "bootDiskSizeGb": 1024},
+            },
+            "workerConfig": {
+                "numInstances": num_workers,
+                "machineTypeUri": worker_host_type,
+                "diskConfig": {
+                    "bootDiskType": "pd-standard",
+                    "bootDiskSizeGb": 64,
+                    "numLocalSsds": 2,
+                },
+            },
+            "softwareConfig": {
+                "imageVersion": "2.2.66-debian12",
+                "optionalComponents": [
+                    "FLINK",
+                    "JUPYTER",
+                ],
+                "properties": {
+                    "dataproc:dataproc.logging.stackdriver.enable": "true",
+                    "dataproc:jobs.file-backed-output.enable": "true",
+                    "dataproc:dataproc.logging.stackdriver.job.driver.enable": "true",
+                    "dataproc:dataproc.logging.stackdriver.job.yarn.container.enable": "true",
+                },
+            },
+            "initializationActions": [
+                {"executable_file": initialization_action}
+                for initialization_action in (
+                    (initialization_actions or [])
+                    + [artifact_prefix.rstrip("/") + "/scripts/copy_java_security.sh"]
+                )
+            ],
+            "endpointConfig": {
+                "enableHttpPortAccess": True,
+            },
+            "lifecycleConfig": {
+                "idleDeleteTtl": idle_timeout,
+            },
+        }
+    )
+
+
+def fixed_cluster(
+    size,
+    project_id,
+    artifact_prefix,
+    subnetwork="default",
+    initialization_actions=None,
+    tags=None,
+):
+    """
+    Create a Dataproc cluster configuration based on t-shirt sizes.
+
+    :param size: T-shirt size - 'small', 'medium', or 'large'
+    :param project_id: GCP project ID
+    :param artifact_prefix: Artifact prefix for initialization scripts
+    :param subnetwork: Subnetwork for the cluster
+    :param initialization_actions: List of initialization actions
+    :param tags: List of tags for the cluster
+    :return: A json string representing the cluster configuration
+    """
+    size_configs = {
+        "small": {
+            "num_workers": 20,
+            "worker_host_type": "n2-highmem-4",  # 32GB, 4 vCPUs
+            "master_host_type": "n2-highmem-4",  # Same as worker for consistency
+        },
+        "medium": {
+            "num_workers": 50,
+            "worker_host_type": "n2-highmem-16",  # 128GB, 16 vCPUs
+            "master_host_type": "n2-highmem-16",  # Same as worker for consistency
+        },
+        "large": {
+            "num_workers": 250,
+            "worker_host_type": "n2-highmem-16",  # 128GB, 16 vCPUs
+            "master_host_type": "n2-highmem-16",  # Same as worker for consistency
+        },
+    }
+
+    if size not in size_configs:
+        raise ValueError(f"Invalid size '{size}'. Must be one of: {list(size_configs.keys())}")
+
+    config = size_configs[size]
+
+    return generate_dataproc_cluster_config(
+        num_workers=config["num_workers"],
+        project_id=project_id,
+        artifact_prefix=artifact_prefix,
+        master_host_type=config["master_host_type"],
+        worker_host_type=config["worker_host_type"],
+        subnetwork=subnetwork,
+        idle_timeout="3600s",  # 1 hour of inactivity
+        initialization_actions=initialization_actions,
+        tags=tags,
+    )
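As a usage sketch (not from the package), the t-shirt helper and the fully explicit variant can be called like this; the project id and artifact prefix are placeholders. Both calls return the JSON string that the submitter expects.

from ai.chronon.repo.cluster import fixed_cluster, generate_dataproc_cluster_config

# Placeholders - substitute your GCP project and artifact bucket prefix.
small_cluster_json = fixed_cluster(
    size="small",  # 20 n2-highmem-4 workers, 1h idle delete (see size_configs above)
    project_id="my-gcp-project",
    artifact_prefix="gs://zipline-artifacts-example",
    tags=["zipline"],
)

custom_cluster_json = generate_dataproc_cluster_config(
    num_workers=5,
    project_id="my-gcp-project",
    artifact_prefix="gs://zipline-artifacts-example",
    worker_host_type="n2-standard-8",
    idle_timeout="3600s",
)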
ai/chronon/repo/compile.py
ADDED
@@ -0,0 +1,62 @@
+import os
+import sys
+
+import click
+
+from ai.chronon.cli.compile.compile_context import CompileContext
+from ai.chronon.cli.compile.compiler import Compiler
+from ai.chronon.cli.compile.display.console import console
+
+
+@click.command(name="compile")
+@click.option(
+    "--chronon-root",
+    envvar="CHRONON_ROOT",
+    help="Path to the root chronon folder",
+    default=os.getcwd(),
+)
+@click.option(
+    "--ignore-python-errors",
+    is_flag=True,
+    default=False,
+    help="Allow compilation to proceed even with Python errors (useful for testing)",
+)
+def compile(chronon_root, ignore_python_errors):
+    print()
+
+    if chronon_root is None or chronon_root == "":
+        chronon_root = os.getcwd()
+
+    if chronon_root not in sys.path:
+        console.print(
+            f"Adding [cyan italic]{chronon_root}[/cyan italic] to python path, during compile."
+        )
+        sys.path.append(chronon_root)
+    else:
+        console.print(f"[cyan italic]{chronon_root}[/cyan italic] already on python path.")
+
+    return __compile(chronon_root, ignore_python_errors)
+
+
+def __compile(chronon_root, ignore_python_errors=False):
+    if chronon_root:
+        chronon_root_path = os.path.expanduser(chronon_root)
+        os.chdir(chronon_root_path)
+
+    # check that a teams.py or teams.json file exists in the current directory
+    if not (os.path.exists("teams.py") or os.path.exists("teams.json")):
+        raise click.ClickException(
+            (
+                "teams.py or teams.json file not found in current directory."
+                " Please run from the top level of conf directory."
+            )
+        )
+
+    compile_context = CompileContext(ignore_python_errors=ignore_python_errors)
+    compiler = Compiler(compile_context)
+    results = compiler.compile()
+    return results
+
+
+if __name__ == "__main__":
+    compile()
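Because compile is a plain click command (and the module has a __main__ guard), it can be driven programmatically as well as from the shell. A minimal sketch using click's test runner; the conf directory is a placeholder and must contain teams.py or teams.json at its top level.

from click.testing import CliRunner

from ai.chronon.repo.compile import compile as compile_cmd

# Placeholder path to a Chronon conf repo.
result = CliRunner().invoke(compile_cmd, ["--chronon-root", "/path/to/chronon/confs"])
print(result.exit_code, result.output)

From the shell, roughly the same invocation is python -m ai.chronon.repo.compile --chronon-root /path/to/chronon/confs, relying on the __main__ guard above.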