awx-zipline-ai 0.2.0 (awx_zipline_ai-0.2.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +251 -0
- ai/chronon/api/__init__.py +1 -0
- ai/chronon/api/common/__init__.py +1 -0
- ai/chronon/api/common/constants.py +15 -0
- ai/chronon/api/common/ttypes.py +1844 -0
- ai/chronon/api/constants.py +15 -0
- ai/chronon/api/ttypes.py +3624 -0
- ai/chronon/cli/compile/column_hashing.py +313 -0
- ai/chronon/cli/compile/compile_context.py +177 -0
- ai/chronon/cli/compile/compiler.py +160 -0
- ai/chronon/cli/compile/conf_validator.py +590 -0
- ai/chronon/cli/compile/display/class_tracker.py +112 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +46 -0
- ai/chronon/cli/compile/fill_templates.py +40 -0
- ai/chronon/cli/compile/parse_configs.py +141 -0
- ai/chronon/cli/compile/parse_teams.py +238 -0
- ai/chronon/cli/compile/serializer.py +115 -0
- ai/chronon/cli/git_utils.py +156 -0
- ai/chronon/cli/logger.py +61 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/eval/__init__.py +122 -0
- ai/chronon/eval/query_parsing.py +19 -0
- ai/chronon/eval/sample_tables.py +100 -0
- ai/chronon/eval/table_scan.py +186 -0
- ai/chronon/fetcher/__init__.py +1 -0
- ai/chronon/fetcher/constants.py +15 -0
- ai/chronon/fetcher/ttypes.py +127 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/hub/__init__.py +1 -0
- ai/chronon/hub/constants.py +15 -0
- ai/chronon/hub/ttypes.py +1228 -0
- ai/chronon/join.py +566 -0
- ai/chronon/logger.py +24 -0
- ai/chronon/model.py +35 -0
- ai/chronon/observability/__init__.py +1 -0
- ai/chronon/observability/constants.py +15 -0
- ai/chronon/observability/ttypes.py +2192 -0
- ai/chronon/orchestration/__init__.py +1 -0
- ai/chronon/orchestration/constants.py +15 -0
- ai/chronon/orchestration/ttypes.py +4406 -0
- ai/chronon/planner/__init__.py +1 -0
- ai/chronon/planner/constants.py +15 -0
- ai/chronon/planner/ttypes.py +1686 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +40 -0
- ai/chronon/repo/aws.py +298 -0
- ai/chronon/repo/cluster.py +65 -0
- ai/chronon/repo/compile.py +56 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +291 -0
- ai/chronon/repo/explore.py +421 -0
- ai/chronon/repo/extract_objects.py +137 -0
- ai/chronon/repo/gcp.py +585 -0
- ai/chronon/repo/gitpython_utils.py +14 -0
- ai/chronon/repo/hub_runner.py +171 -0
- ai/chronon/repo/hub_uploader.py +108 -0
- ai/chronon/repo/init.py +53 -0
- ai/chronon/repo/join_backfill.py +105 -0
- ai/chronon/repo/run.py +293 -0
- ai/chronon/repo/serializer.py +141 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +472 -0
- ai/chronon/repo/zipline.py +51 -0
- ai/chronon/repo/zipline_hub.py +105 -0
- ai/chronon/resources/gcp/README.md +174 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +30 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +23 -0
- ai/chronon/resources/gcp/teams.py +70 -0
- ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
- ai/chronon/source.py +88 -0
- ai/chronon/staging_query.py +185 -0
- ai/chronon/types.py +57 -0
- ai/chronon/utils.py +557 -0
- ai/chronon/windows.py +50 -0
- awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
- awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
- awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
- awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
- jars/__init__.py +0 -0
--- /dev/null
+++ b/ai/chronon/repo/constants.py
@@ -0,0 +1,164 @@
+from enum import Enum
+
+
+class RunMode(str, Enum):
+    def __str__(self):
+        return self.value
+
+    BACKFILL = "backfill"
+    BACKFILL_LEFT = "backfill-left"
+    BACKFILL_FINAL = "backfill-final"
+    DEPLOY = "deploy"
+    UPLOAD = "upload"
+    UPLOAD_TO_KV = "upload-to-kv"
+    STATS_SUMMARY = "stats-summary"
+    LOG_SUMMARY = "log-summary"
+    ANALYZE = "analyze"
+    STREAMING = "streaming"
+    METADATA_UPLOAD = "metadata-upload"
+    FETCH = "fetch"
+    CONSISTENCY_METRICS_COMPUTE = "consistency-metrics-compute"
+    BUILD_COMPARISON_TABLE = "build-comparison-table"
+    COMPARE = "compare"
+    LOCAL_STREAMING = "local-streaming"
+    LOG_FLATTENER = "log-flattener"
+    METADATA_EXPORT = "metadata-export"
+    LABEL_JOIN = "label-join"
+    STREAMING_CLIENT = "streaming-client"
+    SOURCE_JOB = "source-job"
+    JOIN_PART_JOB = "join-part-job"
+    MERGE_JOB = "merge-job"
+    METASTORE = "metastore"
+    INFO = "info"
+
+
+ONLINE_ARGS = "--online-jar={online_jar} --online-class={online_class} "
+OFFLINE_ARGS = "--conf-path={conf_path} --end-date={ds} "
+ONLINE_WRITE_ARGS = "--conf-path={conf_path} " + ONLINE_ARGS
+
+ONLINE_OFFLINE_WRITE_ARGS = OFFLINE_ARGS + ONLINE_ARGS
+ONLINE_MODES = [
+    RunMode.STREAMING,
+    RunMode.METADATA_UPLOAD,
+    RunMode.FETCH,
+    RunMode.LOCAL_STREAMING,
+    RunMode.STREAMING_CLIENT,
+]
+SPARK_MODES = [
+    RunMode.BACKFILL,
+    RunMode.BACKFILL_LEFT,
+    RunMode.BACKFILL_FINAL,
+    RunMode.UPLOAD,
+    RunMode.UPLOAD_TO_KV,
+    RunMode.STREAMING,
+    RunMode.STREAMING_CLIENT,
+    RunMode.CONSISTENCY_METRICS_COMPUTE,
+    RunMode.BUILD_COMPARISON_TABLE,
+    RunMode.COMPARE,
+    RunMode.ANALYZE,
+    RunMode.STATS_SUMMARY,
+    RunMode.LOG_SUMMARY,
+    RunMode.LOG_FLATTENER,
+    RunMode.METADATA_EXPORT,
+    RunMode.LABEL_JOIN,
+    RunMode.SOURCE_JOB,
+    RunMode.JOIN_PART_JOB,
+    RunMode.MERGE_JOB,
+]
+MODES_USING_EMBEDDED = [
+    RunMode.METADATA_UPLOAD,
+    RunMode.FETCH,
+    RunMode.LOCAL_STREAMING,
+]
+
+# Constants for supporting multiple spark versions.
+SUPPORTED_SPARK = ["2.4.0", "3.1.1", "3.2.1", "3.5.1"]
+SCALA_VERSION_FOR_SPARK = {
+    "2.4.0": "2.11",
+    "3.1.1": "2.12",
+    "3.2.1": "2.13",
+    "3.5.1": "2.12",
+}
+
+MODE_ARGS = {
+    RunMode.BACKFILL: OFFLINE_ARGS,
+    RunMode.BACKFILL_LEFT: OFFLINE_ARGS,
+    RunMode.BACKFILL_FINAL: OFFLINE_ARGS,
+    RunMode.UPLOAD: OFFLINE_ARGS,
+    RunMode.UPLOAD_TO_KV: ONLINE_OFFLINE_WRITE_ARGS,
+    RunMode.STATS_SUMMARY: OFFLINE_ARGS,
+    RunMode.LOG_SUMMARY: OFFLINE_ARGS,
+    RunMode.ANALYZE: OFFLINE_ARGS,
+    RunMode.STREAMING: ONLINE_WRITE_ARGS,
+    RunMode.METADATA_UPLOAD: ONLINE_WRITE_ARGS,
+    RunMode.FETCH: ONLINE_ARGS,
+    RunMode.CONSISTENCY_METRICS_COMPUTE: OFFLINE_ARGS,
+    RunMode.BUILD_COMPARISON_TABLE: OFFLINE_ARGS,
+    RunMode.COMPARE: OFFLINE_ARGS,
+    RunMode.LOCAL_STREAMING: ONLINE_WRITE_ARGS + " -d",
+    RunMode.LOG_FLATTENER: OFFLINE_ARGS,
+    RunMode.METADATA_EXPORT: OFFLINE_ARGS,
+    RunMode.LABEL_JOIN: OFFLINE_ARGS,
+    RunMode.STREAMING_CLIENT: ONLINE_WRITE_ARGS,
+    RunMode.SOURCE_JOB: OFFLINE_ARGS,
+    RunMode.JOIN_PART_JOB: OFFLINE_ARGS,
+    RunMode.MERGE_JOB: OFFLINE_ARGS,
+    RunMode.METASTORE: "",  # purposely left blank. we'll handle this specifically
+    RunMode.INFO: "",
+}
+
+ROUTES = {
+    "group_bys": {
+        RunMode.UPLOAD: "group-by-upload",
+        RunMode.UPLOAD_TO_KV: "group-by-upload-bulk-load",
+        RunMode.BACKFILL: "group-by-backfill",
+        RunMode.STREAMING: "group-by-streaming",
+        RunMode.METADATA_UPLOAD: "metadata-upload",
+        RunMode.LOCAL_STREAMING: "group-by-streaming",
+        RunMode.FETCH: "fetch",
+        RunMode.ANALYZE: "analyze",
+        RunMode.METADATA_EXPORT: "metadata-export",
+        RunMode.STREAMING_CLIENT: "group-by-streaming",
+    },
+    "joins": {
+        RunMode.BACKFILL: "join",
+        RunMode.BACKFILL_LEFT: "join-left",
+        RunMode.BACKFILL_FINAL: "join-final",
+        RunMode.METADATA_UPLOAD: "metadata-upload",
+        RunMode.FETCH: "fetch",
+        RunMode.CONSISTENCY_METRICS_COMPUTE: "consistency-metrics-compute",
+        RunMode.BUILD_COMPARISON_TABLE: "build-comparison-table",
+        RunMode.COMPARE: "compare-join-query",
+        RunMode.STATS_SUMMARY: "stats-summary",
+        RunMode.LOG_SUMMARY: "log-summary",
+        RunMode.ANALYZE: "analyze",
+        RunMode.LOG_FLATTENER: "log-flattener",
+        RunMode.METADATA_EXPORT: "metadata-export",
+        RunMode.LABEL_JOIN: "label-join",
+        RunMode.SOURCE_JOB: "source-job",
+        RunMode.JOIN_PART_JOB: "join-part-job",
+        RunMode.MERGE_JOB: "merge-job",
+    },
+    "staging_queries": {
+        RunMode.BACKFILL: "staging-query-backfill",
+        RunMode.METADATA_EXPORT: "metadata-export",
+    },
+}
+
+UNIVERSAL_ROUTES = ["info"]
+
+APP_NAME_TEMPLATE = "chronon_{conf_type}_{mode}_{context}_{name}"
+RENDER_INFO_DEFAULT_SCRIPT = "scripts/render_info.py"
+
+ZIPLINE_DIRECTORY = "/tmp/zipline"
+
+CLOUD_PROVIDER_KEYWORD = "CLOUD_PROVIDER"
+
+# cloud provider
+AWS = "AWS"
+GCP = "GCP"
+
+# arg keywords
+ONLINE_CLASS_ARG = "online_class"
+ONLINE_JAR_ARG = "online_jar"
+ONLINE_ARGS = "online_args"
--- /dev/null
+++ b/ai/chronon/repo/default_runner.py
@@ -0,0 +1,291 @@
+import json
+import logging
+import multiprocessing
+import os
+
+from ai.chronon.repo import utils
+from ai.chronon.repo.constants import (
+    MODE_ARGS,
+    ONLINE_ARGS,
+    ONLINE_CLASS_ARG,
+    ONLINE_JAR_ARG,
+    ONLINE_MODES,
+    ROUTES,
+    SPARK_MODES,
+    UNIVERSAL_ROUTES,
+    RunMode,
+)
+
+
+class Runner:
+    def __init__(self, args, jar_path):
+        self.repo = args["repo"]
+        self.conf = args["conf"]
+        self.local_abs_conf_path = os.path.realpath(os.path.join(self.repo, self.conf))
+        self.sub_help = args["sub_help"]
+        self.mode = args["mode"]
+        self.online_jar = args.get(ONLINE_JAR_ARG)
+        self.online_class = args.get(ONLINE_CLASS_ARG)
+        self.online_args = args.get(ONLINE_ARGS)
+
+        self.conf_type = (args.get("conf_type") or "").replace(
+            "-", "_"
+        )  # in case user sets dash instead of underscore
+
+        # streaming flink
+        self.conf_metadata_name = utils.get_metadata_name_from_conf(self.repo, self.conf)
+        self.kafka_bootstrap = args.get("kafka_bootstrap")
+        self.latest_savepoint = args.get("latest_savepoint")
+        self.custom_savepoint = args.get("custom_savepoint")
+        self.no_savepoint = args.get("no_savepoint")
+        self.version_check = args.get("version_check")
+        self.additional_jars = args.get("additional_jars")
+
+        flink_state_uri = args.get("flink_state_uri")
+        if flink_state_uri:
+            self.streaming_manifest_path = os.path.join(flink_state_uri, "manifests")
+            self.streaming_checkpoint_path = os.path.join(flink_state_uri, "checkpoints")
+
+        self.mock_source = args.get("mock_source")
+
+        self.validate = args.get("validate")
+        self.validate_rows = args.get("validate_rows")
+        self.enable_debug = args.get("enable_debug")
+        self.uploader = args.get("uploader")
+
+        valid_jar = args["online_jar"] and os.path.exists(args["online_jar"])
+
+        # fetch online jar if necessary
+        if (
+            (self.mode in ONLINE_MODES)
+            and (not args["sub_help"])
+            and not valid_jar
+            and (args.get("online_jar_fetch"))
+        ):
+            print("Downloading online_jar")
+            self.online_jar = utils.check_output(
+                "{}".format(args["online_jar_fetch"])
+            ).decode("utf-8")
+            os.environ["CHRONON_ONLINE_JAR"] = self.online_jar
+            print("Downloaded jar to {}".format(self.online_jar))
+
+        if (self.conf
+                and (self.mode != "metastore")):  # TODO: don't check for metastore
+            try:
+                self.context, self.conf_type, self.team, _ = self.conf.split("/")[-4:]
+            except Exception as e:
+                logging.error(
+                    "Invalid conf path: {}, please ensure to supply the relative path to zipline/ folder".format(
+                        self.conf
+                    )
+                )
+                raise e
+            possible_modes = list(ROUTES[self.conf_type].keys()) + UNIVERSAL_ROUTES
+            assert (
+                args["mode"] in possible_modes
+            ), "Invalid mode:{} for conf:{} of type:{}, please choose from {}".format(
+                args["mode"], self.conf, self.conf_type, possible_modes
+            )
+
+        self.ds = args["end_ds"] if "end_ds" in args and args["end_ds"] else args["ds"]
+        self.start_ds = (
+            args["start_ds"] if "start_ds" in args and args["start_ds"] else None
+        )
+        self.parallelism = (
+            int(args["parallelism"])
+            if "parallelism" in args and args["parallelism"]
+            else 1
+        )
+        self.jar_path = jar_path
+
+        self.args = args["args"] if args["args"] else ""
+        self.app_name = args["app_name"]
+        if self.mode == "streaming":
+            self.spark_submit = args["spark_streaming_submit_path"]
+        elif self.mode == "info":
+            assert os.path.exists(
+                args["render_info"]
+            ), "Invalid path for the render info script: {}".format(args["render_info"])
+            self.render_info = args["render_info"]
+        else:
+            self.spark_submit = args["spark_submit_path"]
+        self.list_apps_cmd = args["list_apps"]
+
+        self.disable_cloud_logging = args.get("disable_cloud_logging")
+
+
+    def run_spark_streaming(self):
+        # streaming mode
+        self.app_name = self.app_name.replace(
+            "_streaming-client_", "_streaming_"
+        )  # If the job is running cluster mode we want to kill it.
+        print(
+            "Checking to see if a streaming job by the name {} already exists".format(
+                self.app_name
+            )
+        )
+        running_apps = (
+            utils.check_output("{}".format(self.list_apps_cmd))
+            .decode("utf-8")
+            .split("\n")
+        )
+        running_app_map = {}
+        for app in running_apps:
+            try:
+                app_json = json.loads(app.strip())
+                app_name = app_json["app_name"].strip()
+                if app_name not in running_app_map:
+                    running_app_map[app_name] = []
+                running_app_map[app_name].append(app_json)
+            except Exception as ex:
+                print("failed to process line into app: " + app)
+                print(ex)
+
+        filtered_apps = running_app_map.get(self.app_name, [])
+        if len(filtered_apps) > 0:
+            print(
+                "Found running apps by the name {} in \n{}\n".format(
+                    self.app_name,
+                    "\n".join([str(app) for app in filtered_apps]),
+                )
+            )
+            if self.mode == "streaming":
+                assert (
+                    len(filtered_apps) == 1
+                ), "More than one found, please kill them all"
+                print("All good. No need to start a new app.")
+                return
+            elif self.mode == "streaming-client":
+                raise RuntimeError(
+                    "Attempting to submit an application in client mode, but there's already"
+                    " an existing one running."
+                )
+        command = (
+            "bash {script} --class ai.chronon.spark.Driver {jar} {subcommand} {args} {additional_args}"
+        ).format(
+            script=self.spark_submit,
+            jar=self.jar_path,
+            subcommand=ROUTES[self.conf_type][self.mode],
+            args=self._gen_final_args(),
+            additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
+        )
+        return command
+
+    def run(self):
+        command_list = []
+        if self.mode == "info":
+            command_list.append(
+                "python3 {script} --conf {conf} --ds {ds} --repo {repo}".format(
+                    script=self.render_info, conf=self.conf, ds=self.ds, repo=self.repo
+                )
+            )
+        elif self.sub_help or (self.mode not in SPARK_MODES):
+            if self.mode == "fetch":
+                entrypoint = "ai.chronon.online.fetcher.FetcherMain"
+            else:
+                entrypoint = "ai.chronon.spark.Driver"
+            command_list.append(
+                "java -cp {jar} {entrypoint} {subcommand} {args}".format(
+                    jar=self.jar_path,
+                    entrypoint=entrypoint,
+                    args="--help" if self.sub_help else self._gen_final_args(),
+                    subcommand=ROUTES[self.conf_type][self.mode],
+                )
+            )
+        else:
+            if self.mode in ["streaming", "streaming-client"]:
+                # streaming mode
+                command = self.run_spark_streaming()
+                command_list.append(command)
+            else:
+                if self.parallelism > 1:
+                    assert self.start_ds is not None and self.ds is not None, (
+                        "To use parallelism, please specify --start-ds and --end-ds to "
+                        "break down into multiple backfill jobs"
+                    )
+                    date_ranges = utils.split_date_range(
+                        self.start_ds, self.ds, self.parallelism
+                    )
+                    for start_ds, end_ds in date_ranges:
+                        command = (
+                            "bash {script} --class ai.chronon.spark.Driver "
+                            + "{jar} {subcommand} {args} {additional_args}"
+                        ).format(
+                            script=self.spark_submit,
+                            jar=self.jar_path,
+                            subcommand=ROUTES[self.conf_type][self.mode],
+                            args=self._gen_final_args(start_ds=start_ds, end_ds=end_ds),
+                            additional_args=os.environ.get(
+                                "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
+                            ),
+                        )
+                        command_list.append(command)
+                else:
+                    command = (
+                        "bash {script} --class ai.chronon.spark.Driver "
+                        + "{jar} {subcommand} {args} {additional_args}"
+                    ).format(
+                        script=self.spark_submit,
+                        jar=self.jar_path,
+                        subcommand=ROUTES[self.conf_type][self.mode],
+                        args=self._gen_final_args(self.start_ds),
+                        additional_args=os.environ.get(
+                            "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
+                        ),
+                    )
+                    command_list.append(command)
+
+        if len(command_list) > 1:
+            # parallel backfill mode
+            with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
+                logging.info(
+                    "Running args list {} with pool size {}".format(
+                        command_list, self.parallelism
+                    )
+                )
+                pool.map(utils.check_call, command_list)
+        elif len(command_list) == 1:
+            utils.check_call(command_list[0])
+
+    def _gen_final_args(
+        self, start_ds=None, end_ds=None, override_conf_path=None, **kwargs
+    ):
+        base_args = MODE_ARGS.get(self.mode).format(
+            conf_path=override_conf_path if override_conf_path else self.conf,
+            ds=end_ds if end_ds else self.ds,
+            online_jar=self.online_jar,
+            online_class=self.online_class,
+        )
+
+        submitter_args = []
+
+        if self.conf_type:
+            submitter_args.append(f"--conf-type={self.conf_type}")
+
+        if self.uploader:
+            submitter_args.append(f"--uploader={self.uploader}")
+
+        if self.additional_jars:
+            submitter_args.append(f"--additional-jars={self.additional_jars}")
+
+        if self.mode != RunMode.FETCH:
+            submitter_args.append(" --local-conf-path={conf}".format(
+                conf=self.local_abs_conf_path
+            ))
+            submitter_args.append(" --original-mode={mode}".format(mode=self.mode))
+
+        override_start_partition_arg = (
+            "--start-partition-override=" + start_ds if start_ds else ""
+        )
+
+        additional_args = " ".join(
+            f"--{key.replace('_', '-')}={value}"
+            for key, value in kwargs.items()
+            if value
+        )
+
+        final_args = " ".join(
+            [base_args, str(self.args), override_start_partition_arg, ' '.join(submitter_args), additional_args]
+        )
+
+        return final_args
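
For Spark modes, Runner.run() above reduces to formatting one spark-submit command per date range. The snippet below is a standalone sketch of that assembly and does not invoke the package or spark-submit: the template string and the CHRONON_CONFIG_ADDITIONAL_ARGS lookup mirror the diff above, while the script path, jar path, subcommand, and argument string are hypothetical placeholders.

import os

# Standalone sketch of the command template assembled in Runner.run().
spark_submit = "/path/to/spark_submit.sh"  # hypothetical submit wrapper script
jar_path = "/path/to/spark_assembly.jar"   # hypothetical Driver jar
subcommand = "join"                        # e.g. ROUTES["joins"][RunMode.BACKFILL]
args = "--conf-path=production/joins/team/example_join.v1 --end-date=2024-01-31"

command = (
    "bash {script} --class ai.chronon.spark.Driver "
    + "{jar} {subcommand} {args} {additional_args}"
).format(
    script=spark_submit,
    jar=jar_path,
    subcommand=subcommand,
    args=args,
    additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
)
print(command)
# bash /path/to/spark_submit.sh --class ai.chronon.spark.Driver /path/to/spark_assembly.jar join --conf-path=... --end-date=2024-01-31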