awx-zipline-ai 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/ttypes.py +6 -6
- ai/chronon/airflow_helpers.py +20 -23
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +40 -17
- ai/chronon/cli/compile/compile_context.py +13 -17
- ai/chronon/cli/compile/compiler.py +59 -36
- ai/chronon/cli/compile/conf_validator.py +251 -99
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +6 -16
- ai/chronon/cli/compile/display/compile_status.py +10 -10
- ai/chronon/cli/compile/display/diff_result.py +79 -14
- ai/chronon/cli/compile/fill_templates.py +3 -8
- ai/chronon/cli/compile/parse_configs.py +10 -17
- ai/chronon/cli/compile/parse_teams.py +38 -34
- ai/chronon/cli/compile/serializer.py +3 -9
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +2 -13
- ai/chronon/cli/logger.py +0 -2
- ai/chronon/constants.py +1 -1
- ai/chronon/group_by.py +47 -47
- ai/chronon/join.py +46 -32
- ai/chronon/logger.py +1 -2
- ai/chronon/model.py +9 -4
- ai/chronon/query.py +2 -2
- ai/chronon/repo/__init__.py +1 -2
- ai/chronon/repo/aws.py +17 -31
- ai/chronon/repo/cluster.py +121 -50
- ai/chronon/repo/compile.py +14 -8
- ai/chronon/repo/constants.py +1 -1
- ai/chronon/repo/default_runner.py +32 -54
- ai/chronon/repo/explore.py +70 -73
- ai/chronon/repo/extract_objects.py +6 -9
- ai/chronon/repo/gcp.py +89 -88
- ai/chronon/repo/gitpython_utils.py +3 -2
- ai/chronon/repo/hub_runner.py +145 -55
- ai/chronon/repo/hub_uploader.py +2 -1
- ai/chronon/repo/init.py +12 -5
- ai/chronon/repo/join_backfill.py +19 -5
- ai/chronon/repo/run.py +42 -39
- ai/chronon/repo/serializer.py +4 -12
- ai/chronon/repo/utils.py +72 -63
- ai/chronon/repo/zipline.py +3 -19
- ai/chronon/repo/zipline_hub.py +211 -39
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +13 -17
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +4 -8
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +9 -6
- ai/chronon/resources/gcp/teams.py +9 -21
- ai/chronon/source.py +2 -4
- ai/chronon/staging_query.py +60 -19
- ai/chronon/types.py +3 -2
- ai/chronon/utils.py +21 -68
- ai/chronon/windows.py +2 -4
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/METADATA +48 -24
- awx_zipline_ai-0.3.1.dist-info/RECORD +96 -0
- awx_zipline_ai-0.3.1.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- {ai/chronon → gen_thrift}/api/ttypes.py +327 -197
- {ai/chronon/api → gen_thrift}/common/ttypes.py +9 -39
- gen_thrift/eval/ttypes.py +660 -0
- {ai/chronon → gen_thrift}/hub/ttypes.py +12 -131
- {ai/chronon → gen_thrift}/observability/ttypes.py +343 -180
- {ai/chronon → gen_thrift}/planner/ttypes.py +326 -45
- ai/chronon/eval/__init__.py +0 -122
- ai/chronon/eval/query_parsing.py +0 -19
- ai/chronon/eval/sample_tables.py +0 -100
- ai/chronon/eval/table_scan.py +0 -186
- ai/chronon/orchestration/ttypes.py +0 -4406
- ai/chronon/resources/gcp/README.md +0 -174
- ai/chronon/resources/gcp/zipline-cli-install.sh +0 -54
- awx_zipline_ai-0.2.1.dist-info/RECORD +0 -93
- awx_zipline_ai-0.2.1.dist-info/licenses/LICENSE +0 -202
- awx_zipline_ai-0.2.1.dist-info/top_level.txt +0 -3
- /jars/__init__.py → /__init__.py +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/WHEEL +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/entry_points.txt +0 -0
- {ai/chronon → gen_thrift}/api/__init__.py +0 -0
- {ai/chronon/api/common → gen_thrift/api}/constants.py +0 -0
- {ai/chronon/api → gen_thrift}/common/__init__.py +0 -0
- {ai/chronon/api → gen_thrift/common}/constants.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/__init__.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/constants.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/__init__.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/constants.py +0 -0
- {ai/chronon → gen_thrift}/fetcher/ttypes.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/__init__.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/constants.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/__init__.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/constants.py +0 -0
- {ai/chronon → gen_thrift}/planner/__init__.py +0 -0
- {ai/chronon → gen_thrift}/planner/constants.py +0 -0
ai/chronon/repo/cluster.py
CHANGED
@@ -1,65 +1,136 @@
 import json
 
 
-def generate_dataproc_cluster_config(
+def generate_dataproc_cluster_config(
+    num_workers,
+    project_id,
+    artifact_prefix,
+    master_host_type="n2-highmem-64",
+    worker_host_type="n2-highmem-16",
+    subnetwork="default",
+    idle_timeout="7200s",
+    initialization_actions=None,
+    tags=None,
+):
     """
     Create a configuration for a Dataproc cluster.
     :return: A json string representing the configuration.
     """
     if initialization_actions is None:
         initialization_actions = []
-    return json.dumps(
+    return json.dumps(
+        {
+            "gceClusterConfig": {
+                "subnetworkUri": subnetwork,
+                "serviceAccount": "dataproc@" + project_id + ".iam.gserviceaccount.com",
+                "serviceAccountScopes": [
+                    "https://www.googleapis.com/auth/cloud-platform",
+                    "https://www.googleapis.com/auth/monitoring",
+                    "https://www.googleapis.com/auth/cloud.useraccounts.readonly",
+                    "https://www.googleapis.com/auth/devstorage.read_write",
+                    "https://www.googleapis.com/auth/logging.write",
+                ],
+                "metadata": {
+                    "hive-version": "3.1.2",
+                    "SPARK_BQ_CONNECTOR_URL": "gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar",
+                    "artifact_prefix": artifact_prefix.rstrip("/"),
+                },
+                "tags": tags or [],
+            },
+            "masterConfig": {
+                "numInstances": 1,
+                "machineTypeUri": master_host_type,
+                "diskConfig": {"bootDiskType": "pd-standard", "bootDiskSizeGb": 1024},
+            },
+            "workerConfig": {
+                "numInstances": num_workers,
+                "machineTypeUri": worker_host_type,
+                "diskConfig": {
+                    "bootDiskType": "pd-standard",
+                    "bootDiskSizeGb": 64,
+                    "numLocalSsds": 2,
+                },
+            },
+            "softwareConfig": {
+                "imageVersion": "2.2.66-debian12",
+                "optionalComponents": [
+                    "FLINK",
+                    "JUPYTER",
+                ],
+                "properties": {
+                    "dataproc:dataproc.logging.stackdriver.enable": "true",
+                    "dataproc:jobs.file-backed-output.enable": "true",
+                    "dataproc:dataproc.logging.stackdriver.job.driver.enable": "true",
+                    "dataproc:dataproc.logging.stackdriver.job.yarn.container.enable": "true",
+                },
+            },
+            "initializationActions": [
+                {"executable_file": initialization_action}
+                for initialization_action in (
+                    (initialization_actions or [])
+                    + [artifact_prefix.rstrip("/") + "/scripts/copy_java_security.sh"]
+                )
             ],
+            "endpointConfig": {
+                "enableHttpPortAccess": True,
+            },
+            "lifecycleConfig": {
+                "idleDeleteTtl": idle_timeout,
             },
+        }
+    )
+
+
+def fixed_cluster(
+    size,
+    project_id,
+    artifact_prefix,
+    subnetwork="default",
+    initialization_actions=None,
+    tags=None,
+):
+    """
+    Create a Dataproc cluster configuration based on t-shirt sizes.
+
+    :param size: T-shirt size - 'small', 'medium', or 'large'
+    :param project_id: GCP project ID
+    :param artifact_prefix: Artifact prefix for initialization scripts
+    :param subnetwork: Subnetwork for the cluster
+    :param initialization_actions: List of initialization actions
+    :param tags: List of tags for the cluster
+    :return: A json string representing the cluster configuration
+    """
+    size_configs = {
+        "small": {
+            "num_workers": 20,
+            "worker_host_type": "n2-highmem-4",  # 16GB, 4 cores
+            "master_host_type": "n2-highmem-4",  # Same as worker for consistency
         },
-            "bootDiskType": "pd-standard",
-            "bootDiskSizeGb": 1024
-        }
+        "medium": {
+            "num_workers": 50,
+            "worker_host_type": "n2-highmem-16",  # 32GB, 8 cores
+            "master_host_type": "n2-highmem-16",  # Same as worker for consistency
         },
-            "bootDiskType": "pd-standard",
-            "bootDiskSizeGb": 64,
-            "numLocalSsds": 2
-        }
+        "large": {
+            "num_workers": 250,
+            "worker_host_type": "n2-highmem-16",  # 64GB, 16 cores
+            "master_host_type": "n2-highmem-16",  # Same as worker for consistency
         },
-        "imageVersion": "2.2.50-debian12",
-        "optionalComponents": [
-            "FLINK",
-            "JUPYTER",
-        ],
-        "properties": {
+    }
 
-    }
+    if size not in size_configs:
+        raise ValueError(f"Invalid size '{size}'. Must be one of: {list(size_configs.keys())}")
+
+    config = size_configs[size]
+
+    return generate_dataproc_cluster_config(
+        num_workers=config["num_workers"],
+        project_id=project_id,
+        artifact_prefix=artifact_prefix,
+        master_host_type=config["master_host_type"],
+        worker_host_type=config["worker_host_type"],
+        subnetwork=subnetwork,
+        idle_timeout="3600s",  # 1 hour of inactivity
+        initialization_actions=initialization_actions,
+        tags=tags,
+    )
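
For reference, a minimal usage sketch of the two helpers added above (assumes awx-zipline-ai 0.3.1 is installed; the project ID and artifact prefix values are illustrative, not taken from this diff):

    import json

    from ai.chronon.repo.cluster import fixed_cluster, generate_dataproc_cluster_config

    # T-shirt sizing: "small" resolves to 20 n2-highmem-4 workers per size_configs above.
    small_cfg = json.loads(
        fixed_cluster(
            size="small",
            project_id="example-project",                   # illustrative
            artifact_prefix="gs://example-bucket/zipline",  # illustrative
        )
    )
    print(small_cfg["workerConfig"]["numInstances"])  # 20

    # Explicit sizing with the lower-level helper, overriding the 7200s idle default.
    custom_cfg_json = generate_dataproc_cluster_config(
        num_workers=100,
        project_id="example-project",
        artifact_prefix="gs://example-bucket/zipline",
        idle_timeout="3600s",
    )

Both functions return a JSON string, so the result can be passed straight to tooling that expects a serialized Dataproc cluster config or parsed back with json.loads for inspection.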
ai/chronon/repo/compile.py
CHANGED
@@ -15,24 +15,30 @@ from ai.chronon.cli.compile.display.console import console
     help="Path to the root chronon folder",
     default=os.getcwd(),
 )
+@click.option(
+    "--ignore-python-errors",
+    is_flag=True,
+    default=False,
+    help="Allow compilation to proceed even with Python errors (useful for testing)",
+)
+def compile(chronon_root, ignore_python_errors):
     print()
 
+    if chronon_root is None or chronon_root == "":
+        chronon_root = os.getcwd()
+
     if chronon_root not in sys.path:
         console.print(
             f"Adding [cyan italic]{chronon_root}[/cyan italic] to python path, during compile."
         )
         sys.path.append(chronon_root)
     else:
-        console.print(
-            f"[cyan italic]{chronon_root}[/cyan italic] already on python path."
-        )
+        console.print(f"[cyan italic]{chronon_root}[/cyan italic] already on python path.")
 
-    return __compile(chronon_root)
+    return __compile(chronon_root, ignore_python_errors)
 
 
-def __compile(chronon_root):
+def __compile(chronon_root, ignore_python_errors=False):
     if chronon_root:
         chronon_root_path = os.path.expanduser(chronon_root)
         os.chdir(chronon_root_path)

@@ -46,7 +52,7 @@ def __compile(chronon_root):
         )
     )
 
-    compile_context = CompileContext()
+    compile_context = CompileContext(ignore_python_errors=ignore_python_errors)
     compiler = Compiler(compile_context)
     results = compiler.compile()
     return results
ai/chronon/repo/constants.py
CHANGED
@@ -103,7 +103,7 @@ MODE_ARGS = {
     RunMode.SOURCE_JOB: OFFLINE_ARGS,
     RunMode.JOIN_PART_JOB: OFFLINE_ARGS,
     RunMode.MERGE_JOB: OFFLINE_ARGS,
-    RunMode.METASTORE: "",
+    RunMode.METASTORE: "",  # purposely left blank. we'll handle this specifically
     RunMode.INFO: "",
 }
 
ai/chronon/repo/default_runner.py
CHANGED

@@ -63,14 +63,13 @@ class Runner:
             and (args.get("online_jar_fetch"))
         ):
             print("Downloading online_jar")
-            self.online_jar = utils.check_output(
-            )
+            self.online_jar = utils.check_output("{}".format(args["online_jar_fetch"])).decode(
+                "utf-8"
+            )
             os.environ["CHRONON_ONLINE_JAR"] = self.online_jar
             print("Downloaded jar to {}".format(self.online_jar))
 
-        if (self.
-            and (self.mode != "metastore")): # TODO: don't check for metastore
+        if self.conf and (self.mode != "metastore"):  # TODO: don't check for metastore
             try:
                 self.context, self.conf_type, self.team, _ = self.conf.split("/")[-4:]
             except Exception as e:

@@ -81,20 +80,16 @@ class Runner:
             )
             raise e
         possible_modes = list(ROUTES[self.conf_type].keys()) + UNIVERSAL_ROUTES
-        assert (
+        assert args["mode"] in possible_modes, (
+            "Invalid mode:{} for conf:{} of type:{}, please choose from {}".format(
+                args["mode"], self.conf, self.conf_type, possible_modes
+            )
         )
 
         self.ds = args["end_ds"] if "end_ds" in args and args["end_ds"] else args["ds"]
-        self.start_ds = (
-            args["start_ds"] if "start_ds" in args and args["start_ds"] else None
-        )
+        self.start_ds = args["start_ds"] if "start_ds" in args and args["start_ds"] else None
         self.parallelism = (
-            int(args["parallelism"])
-            if "parallelism" in args and args["parallelism"]
-            else 1
+            int(args["parallelism"]) if "parallelism" in args and args["parallelism"] else 1
         )
         self.jar_path = jar_path

@@ -103,9 +98,9 @@ class Runner:
         if self.mode == "streaming":
             self.spark_submit = args["spark_streaming_submit_path"]
         elif self.mode == "info":
-            assert os.path.exists(
-                args["render_info"]
-            )
+            assert os.path.exists(args["render_info"]), (
+                "Invalid path for the render info script: {}".format(args["render_info"])
+            )
             self.render_info = args["render_info"]
         else:
             self.spark_submit = args["spark_submit_path"]

@@ -113,21 +108,16 @@ class Runner:
 
         self.disable_cloud_logging = args.get("disable_cloud_logging")
 
-
     def run_spark_streaming(self):
         # streaming mode
         self.app_name = self.app_name.replace(
             "_streaming-client_", "_streaming_"
         ) # If the job is running cluster mode we want to kill it.
         print(
-            "Checking to see if a streaming job by the name {} already exists".format(
-                self.app_name
-            )
+            "Checking to see if a streaming job by the name {} already exists".format(self.app_name)
         )
         running_apps = (
-            utils.check_output("{}".format(self.list_apps_cmd))
-            .decode("utf-8")
-            .split("\n")
+            utils.check_output("{}".format(self.list_apps_cmd)).decode("utf-8").split("\n")
         )
         running_app_map = {}
         for app in running_apps:

@@ -150,9 +140,7 @@ class Runner:
             )
         )
         if self.mode == "streaming":
-            assert (
-                len(filtered_apps) == 1
-            ), "More than one found, please kill them all"
+            assert len(filtered_apps) == 1, "More than one found, please kill them all"
             print("All good. No need to start a new app.")
             return
         elif self.mode == "streaming-client":

@@ -203,9 +191,7 @@ class Runner:
                     "To use parallelism, please specify --start-ds and --end-ds to "
                     "break down into multiple backfill jobs"
                 )
-            date_ranges = utils.split_date_range(
-                self.start_ds, self.ds, self.parallelism
-            )
+            date_ranges = utils.split_date_range(self.start_ds, self.ds, self.parallelism)
             for start_ds, end_ds in date_ranges:
                 command = (
                     "bash {script} --class ai.chronon.spark.Driver "

@@ -215,9 +201,7 @@ class Runner:
                     jar=self.jar_path,
                     subcommand=ROUTES[self.conf_type][self.mode],
                     args=self._gen_final_args(start_ds=start_ds, end_ds=end_ds),
-                    additional_args=os.environ.get(
-                        "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
-                    ),
+                    additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
                 )
                 command_list.append(command)
         else:

@@ -229,9 +213,7 @@ class Runner:
                 jar=self.jar_path,
                 subcommand=ROUTES[self.conf_type][self.mode],
                 args=self._gen_final_args(self.start_ds),
-                additional_args=os.environ.get(
-                    "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
-                ),
+                additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
             )
             command_list.append(command)

@@ -239,17 +221,13 @@ class Runner:
             # parallel backfill mode
             with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
                 logging.info(
-                    "Running args list {} with pool size {}".format(
-                        command_list, self.parallelism
-                    )
+                    "Running args list {} with pool size {}".format(command_list, self.parallelism)
                 )
                 pool.map(utils.check_call, command_list)
         elif len(command_list) == 1:
             utils.check_call(command_list[0])
 
-    def _gen_final_args(
-        self, start_ds=None, end_ds=None, override_conf_path=None, **kwargs
-    ):
+    def _gen_final_args(self, start_ds=None, end_ds=None, override_conf_path=None, **kwargs):
         base_args = MODE_ARGS.get(self.mode).format(
             conf_path=override_conf_path if override_conf_path else self.conf,
             ds=end_ds if end_ds else self.ds,

@@ -261,7 +239,7 @@ class Runner:
 
         if self.conf_type:
             submitter_args.append(f"--conf-type={self.conf_type}")
-
+
         if self.uploader:
             submitter_args.append(f"--uploader={self.uploader}")

@@ -269,23 +247,23 @@ class Runner:
             submitter_args.append(f"--additional-jars={self.additional_jars}")
 
         if self.mode != RunMode.FETCH:
-            submitter_args.append(" --local-conf-path={conf}".format(
-                conf=self.local_abs_conf_path
-            ))
+            submitter_args.append(" --local-conf-path={conf}".format(conf=self.local_abs_conf_path))
             submitter_args.append(" --original-mode={mode}".format(mode=self.mode))
 
-        override_start_partition_arg = (
-            "--start-partition-override=" + start_ds if start_ds else ""
-        )
+        override_start_partition_arg = "--start-partition-override=" + start_ds if start_ds else ""
 
         additional_args = " ".join(
-            f"--{key.replace('_', '-')}={value}"
-            for key, value in kwargs.items()
-            if value
+            f"--{key.replace('_', '-')}={value}" for key, value in kwargs.items() if value
        )
 
         final_args = " ".join(
-            [
+            [
+                base_args,
+                str(self.args),
+                override_start_partition_arg,
+                " ".join(submitter_args),
+                additional_args,
+            ]
         )
 
         return final_args