awx-zipline-ai 0.0.32 (awx_zipline_ai-0.0.32-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +248 -0
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +336 -0
- ai/chronon/cli/compile/compile_context.py +173 -0
- ai/chronon/cli/compile/compiler.py +183 -0
- ai/chronon/cli/compile/conf_validator.py +742 -0
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +102 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +111 -0
- ai/chronon/cli/compile/fill_templates.py +35 -0
- ai/chronon/cli/compile/parse_configs.py +134 -0
- ai/chronon/cli/compile/parse_teams.py +242 -0
- ai/chronon/cli/compile/serializer.py +109 -0
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +145 -0
- ai/chronon/cli/logger.py +59 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/join.py +580 -0
- ai/chronon/logger.py +23 -0
- ai/chronon/model.py +40 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +39 -0
- ai/chronon/repo/aws.py +284 -0
- ai/chronon/repo/cluster.py +136 -0
- ai/chronon/repo/compile.py +62 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +269 -0
- ai/chronon/repo/explore.py +418 -0
- ai/chronon/repo/extract_objects.py +134 -0
- ai/chronon/repo/gcp.py +586 -0
- ai/chronon/repo/gitpython_utils.py +15 -0
- ai/chronon/repo/hub_runner.py +261 -0
- ai/chronon/repo/hub_uploader.py +109 -0
- ai/chronon/repo/init.py +60 -0
- ai/chronon/repo/join_backfill.py +119 -0
- ai/chronon/repo/run.py +296 -0
- ai/chronon/repo/serializer.py +133 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +481 -0
- ai/chronon/repo/zipline.py +35 -0
- ai/chronon/repo/zipline_hub.py +277 -0
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +26 -0
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +26 -0
- ai/chronon/resources/gcp/teams.py +58 -0
- ai/chronon/source.py +86 -0
- ai/chronon/staging_query.py +226 -0
- ai/chronon/types.py +58 -0
- ai/chronon/utils.py +510 -0
- ai/chronon/windows.py +48 -0
- awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
- awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
- awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- gen_thrift/api/__init__.py +1 -0
- gen_thrift/api/constants.py +15 -0
- gen_thrift/api/ttypes.py +3754 -0
- gen_thrift/common/__init__.py +1 -0
- gen_thrift/common/constants.py +15 -0
- gen_thrift/common/ttypes.py +1814 -0
- gen_thrift/eval/__init__.py +1 -0
- gen_thrift/eval/constants.py +15 -0
- gen_thrift/eval/ttypes.py +660 -0
- gen_thrift/fetcher/__init__.py +1 -0
- gen_thrift/fetcher/constants.py +15 -0
- gen_thrift/fetcher/ttypes.py +127 -0
- gen_thrift/hub/__init__.py +1 -0
- gen_thrift/hub/constants.py +15 -0
- gen_thrift/hub/ttypes.py +1109 -0
- gen_thrift/observability/__init__.py +1 -0
- gen_thrift/observability/constants.py +15 -0
- gen_thrift/observability/ttypes.py +2355 -0
- gen_thrift/planner/__init__.py +1 -0
- gen_thrift/planner/constants.py +15 -0
- gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/repo/run.py
ADDED
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+"""
+run.py needs to only depend in python standard library to simplify execution requirements.
+"""
+
+# Copyright (C) 2023 The Chronon Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from datetime import datetime
+
+import click
+
+from ai.chronon.repo.aws import (
+    ZIPLINE_AWS_JAR_DEFAULT,
+    ZIPLINE_AWS_ONLINE_CLASS_DEFAULT,
+    AwsRunner,
+)
+from ai.chronon.repo.constants import (
+    APP_NAME_TEMPLATE,
+    AWS,
+    CLOUD_PROVIDER_KEYWORD,
+    GCP,
+    MODE_ARGS,
+    ONLINE_CLASS_ARG,
+    ONLINE_JAR_ARG,
+    ONLINE_MODES,
+    RENDER_INFO_DEFAULT_SCRIPT,
+    ZIPLINE_DIRECTORY,
+    RunMode,
+)
+from ai.chronon.repo.default_runner import Runner
+from ai.chronon.repo.gcp import (
+    ZIPLINE_GCP_JAR_DEFAULT,
+    ZIPLINE_GCP_ONLINE_CLASS_DEFAULT,
+    GcpRunner,
+)
+from ai.chronon.repo.utils import get_environ_arg, set_runtime_env_v3
+
+
+# TODO: @davidhan - we should move these to all be in the defaults of the choice args
+def set_defaults(ctx):
+    """Set default values based on environment."""
+    chronon_repo_path = os.environ.get("CHRONON_REPO_PATH", ".")
+    today = datetime.today().strftime("%Y-%m-%d")
+
+    obj = ctx.obj if ctx.obj is not None else dict()
+
+    defaults = {
+        "ds": today,  # TODO: this breaks if the partition column is not the same as yyyy-MM-dd.
+        "app_name": os.environ.get("APP_NAME"),
+        "online_jar": os.environ.get("CHRONON_ONLINE_JAR"),
+        "repo": chronon_repo_path,
+        "online_class": os.environ.get("CHRONON_ONLINE_CLASS"),
+        "version": os.environ.get("VERSION") or obj.get("version"),
+        "spark_version": os.environ.get("SPARK_VERSION", "2.4.0"),
+        "spark_submit_path": os.path.join(chronon_repo_path, "scripts/spark_submit.sh"),
+        "spark_streaming_submit_path": os.path.join(
+            chronon_repo_path, "scripts/spark_streaming.sh"
+        ),
+        # NOTE: We don't want to ever call the fetch_online_jar.py script since we're working
+        # on our internal zipline fork of the chronon repo
+        # "online_jar_fetch": os.path.join(chronon_repo_path, "scripts/fetch_online_jar.py"),
+        "online_args": os.environ.get("CHRONON_ONLINE_ARGS"),
+        "chronon_jar": os.environ.get("CHRONON_DRIVER_JAR"),
+        "list_apps": "python3 " + os.path.join(chronon_repo_path, "scripts/yarn_list.py"),
+        "render_info": os.path.join(chronon_repo_path, RENDER_INFO_DEFAULT_SCRIPT),
+        "project_conf": obj.get("project_conf"),
+        "artifact_prefix": os.environ.get("ARTIFACT_PREFIX"),
+        "flink_state_uri": os.environ.get("FLINK_STATE_URI"),
+    }
+    for key, value in defaults.items():
+        if ctx.params.get(key) is None and value is not None:
+            ctx.params[key] = value
+
+
+def validate_flink_state(ctx, param, value):
+    uri_schemes = ["gs://", "s3://"]
+    if value and not any(value.startswith(scheme) for scheme in uri_schemes):
+        raise click.BadParameter(f"Flink state uri must start with {uri_schemes}")
+    return value
+
+
+def validate_additional_jars(ctx, param, value):
+    if value:
+        jars = value.split(",")
+        for jar in jars:
+            if not jar.startswith(("gs://", "s3://")):
+                raise click.BadParameter(f"Additional jars must start with gs://, s3://: {jar}")
+    return value
+
+
+@click.command(
+    name="run",
+    context_settings=dict(allow_extra_args=True, ignore_unknown_options=True),
+)
+@click.option(
+    "--conf", required=True, help="Conf param - required for every mode"
+)  # TODO: @davidhan - we should be able to infer this in the future
+@click.option(
+    "--env",
+    required=False,
+    default="dev",
+    help="Running environment - default to be dev",
+)
+@click.option(
+    "--mode", type=click.Choice([str(k) for k in MODE_ARGS.keys()]), default=str(RunMode.BACKFILL)
+)
+@click.option("--ds", help="the end partition to backfill the data")
+@click.option("--app-name", help="app name. Default to {}".format(APP_NAME_TEMPLATE))
+@click.option(
+    "--start-ds",
+    help="override the original start partition for a range backfill. "
+    "It only supports staging query, group by backfill and join jobs. "
+    "It could leave holes in your final output table due to the override date range.",
+)
+@click.option("--end-ds", help="the end ds for a range backfill")
+@click.option(
+    "--parallelism",
+    help="break down the backfill range into this number of tasks in parallel. "
+    "Please use it along with --start-ds and --end-ds and only in manual mode",
+)
+@click.option("--repo", help="Path to chronon repo", default=".")
+@click.option(
+    "--online-jar",
+    help="Jar containing Online KvStore & Deserializer Impl. "
+    "Used for streaming and metadata-upload mode.",
+)
+@click.option(
+    "--online-class",
+    help="Class name of Online Impl. Used for streaming and metadata-upload mode.",
+)
+@click.option("--version", required=False, help="Chronon version to use.")
+@click.option("--spark-version", default="2.4.0", help="Spark version to use for downloading jar.")
+@click.option("--spark-submit-path", help="Path to spark-submit")
+@click.option("--spark-streaming-submit-path", help="Path to spark-submit for streaming")
+@click.option(
+    "--online-jar-fetch",
+    help="Path to script that can pull online jar. This will run only "
+    "when a file doesn't exist at location specified by online_jar",
+)
+@click.option("--sub-help", is_flag=True, help="print help command of the underlying jar and exit")
+@click.option(
+    "--conf-type",
+    help="related to sub-help - no need to set unless you are not working with a conf",
+)
+@click.option("--online-args", help="Basic arguments that need to be supplied to all online modes")
+@click.option("--chronon-jar", help="Path to chronon OS jar")
+@click.option("--release-tag", help="Use the latest jar for a particular tag.")
+@click.option("--list-apps", help="command/script to list running jobs on the scheduler")
+@click.option(
+    "--render-info",
+    help="Path to script rendering additional information of the given config. "
+    "Only applicable when mode is set to info",
+)
+@click.option("--kafka-bootstrap", help="Kafka bootstrap server in host:port format")
+@click.option(
+    "--latest-savepoint",
+    is_flag=True,
+    default=False,
+    help="Deploys streaming job with latest savepoint",
+)
+@click.option("--custom-savepoint", help="Savepoint to deploy streaming job with.")
+@click.option(
+    "--no-savepoint", is_flag=True, default=False, help="Deploys streaming job without a savepoint"
+)
+@click.option(
+    "--version-check",
+    is_flag=True,
+    default=False,
+    help="Checks if Zipline version of running streaming job is different from local version and deploys the job if they are different",
+)
+@click.option(
+    "--flink-state-uri",
+    help="Bucket for storing flink state checkpoints/savepoints and other internal pieces for orchestration.",
+    callback=validate_flink_state,
+)
+@click.option(
+    "--additional-jars",
+    help="Comma separated list of additional jar URIs to be included in the Flink job classpath (e.g. gs://bucket/jar1.jar,gs://bucket/jar2.jar).",
+    callback=validate_additional_jars,
+)
+@click.option(
+    "--validate",
+    is_flag=True,
+    help="Validate the catalyst util Spark expression evaluation logic",
+)
+@click.option("--validate-rows", default="10000", help="Number of rows to run the validation on")
+@click.option("--join-part-name", help="Name of the join part to use for join-part-job")
+@click.option(
+    "--artifact-prefix",
+    help="Remote artifact URI to install zipline client artifacts necessary for interacting with Zipline infrastructure.",
+)
+@click.option("--disable-cloud-logging", is_flag=True, default=False, help="Disables cloud logging")
+@click.option(
+    "--enable-debug",
+    is_flag=True,
+    default=False,
+    help="Enables verbose debug logging in run modes that support it",
+)
+@click.option(
+    "--uploader",
+    type=click.Choice(["spark", "bigquery"], case_sensitive=False),
+    help="Bulk put uploader to use when load data to kv store, applied to upload-to-kv mode",
+)
+@click.pass_context
+def main(
+    ctx,
+    conf,
+    env,
+    mode,
+    ds,
+    app_name,
+    start_ds,
+    end_ds,
+    parallelism,
+    repo,
+    online_jar,
+    online_class,
+    version,
+    spark_version,
+    spark_submit_path,
+    spark_streaming_submit_path,
+    online_jar_fetch,
+    sub_help,
+    conf_type,
+    online_args,
+    chronon_jar,
+    release_tag,
+    list_apps,
+    render_info,
+    kafka_bootstrap,
+    latest_savepoint,
+    custom_savepoint,
+    no_savepoint,
+    version_check,
+    flink_state_uri,
+    validate,
+    validate_rows,
+    join_part_name,
+    artifact_prefix,
+    disable_cloud_logging,
+    additional_jars,
+    enable_debug,
+    uploader,
+):
+    unknown_args = ctx.args
+    click.echo("Running with args: {}".format(ctx.params))
+
+    conf_path = os.path.join(repo, conf)
+    if not os.path.isfile(conf_path):
+        raise ValueError(f"Conf file {conf_path} does not exist.")
+
+    set_runtime_env_v3(ctx.params, conf)
+    set_defaults(ctx)
+    extra_args = (" " + online_args) if mode in ONLINE_MODES and online_args else ""
+    ctx.params["args"] = " ".join(unknown_args) + extra_args
+    os.makedirs(ZIPLINE_DIRECTORY, exist_ok=True)
+
+    cloud_provider = get_environ_arg(CLOUD_PROVIDER_KEYWORD, ignoreError=True)
+
+    print(f"Cloud provider: {cloud_provider}")
+
+    if not cloud_provider:
+        # Support open source chronon runs
+        if chronon_jar:
+            Runner(ctx.params, os.path.expanduser(chronon_jar)).run()
+        else:
+            raise ValueError("Jar path is not set.")
+    elif cloud_provider.upper() == GCP:
+        ctx.params[ONLINE_JAR_ARG] = ZIPLINE_GCP_JAR_DEFAULT
+        ctx.params[ONLINE_CLASS_ARG] = ZIPLINE_GCP_ONLINE_CLASS_DEFAULT
+        ctx.params[CLOUD_PROVIDER_KEYWORD] = cloud_provider
+        GcpRunner(ctx.params).run()
+    elif cloud_provider.upper() == AWS:
+        ctx.params[ONLINE_JAR_ARG] = ZIPLINE_AWS_JAR_DEFAULT
+        ctx.params[ONLINE_CLASS_ARG] = ZIPLINE_AWS_ONLINE_CLASS_DEFAULT
+        ctx.params[CLOUD_PROVIDER_KEYWORD] = cloud_provider
+        AwsRunner(ctx.params).run()
+    else:
+        raise ValueError(f"Unsupported cloud provider: {cloud_provider}")
+
+
+if __name__ == "__main__":
+    main()
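The `run` command above dispatches on a cloud-provider environment variable after validating the conf path, so it can be exercised end to end with Click's test runner. Below is a minimal sketch, not a verified invocation: the conf path is hypothetical, and it assumes `CLOUD_PROVIDER_KEYWORD` (defined in `ai/chronon/repo/constants.py`, not shown in this diff) resolves to a `CLOUD_PROVIDER` environment variable. Without that variable set, the same invocation would need `--chronon-jar` to reach the open-source `Runner` fallback.

from click.testing import CliRunner

from ai.chronon.repo.run import main

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "--conf", "production/joins/test/data.v1",  # hypothetical compiled conf, relative to --repo
        "--repo", "/path/to/chronon/repo",          # --mode defaults to backfill
    ],
    env={"CLOUD_PROVIDER": "GCP"},  # assumed env var name; routes into the GcpRunner branch
)
print(result.output)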
ai/chronon/repo/serializer.py
ADDED
@@ -0,0 +1,133 @@
+# Copyright (C) 2023 The Chronon Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+from thrift import TSerialization
+from thrift.protocol.TBinaryProtocol import TBinaryProtocolAccelerated
+from thrift.protocol.TJSONProtocol import (
+    TJSONProtocolFactory,
+    TSimpleJSONProtocolFactory,
+)
+from thrift.Thrift import TType
+from thrift.transport.TTransport import TMemoryBuffer
+
+from ai.chronon.utils import JsonDiffer
+
+
+class ThriftJSONDecoder(json.JSONDecoder):
+    def __init__(self, *args, **kwargs):
+        self._thrift_class = kwargs.pop("thrift_class")
+        super(ThriftJSONDecoder, self).__init__(*args, **kwargs)
+
+    def decode(self, json_str):
+        if isinstance(json_str, dict):
+            dct = json_str
+        else:
+            dct = super(ThriftJSONDecoder, self).decode(json_str)
+        return self._convert(
+            dct, TType.STRUCT, (self._thrift_class, self._thrift_class.thrift_spec)
+        )
+
+    def _convert(self, val, ttype, ttype_info):
+        if ttype == TType.STRUCT:
+            (thrift_class, thrift_spec) = ttype_info
+            ret = thrift_class()
+            for field in thrift_spec:
+                if field is None:
+                    continue
+                (_, field_ttype, field_name, field_ttype_info, dummy) = field
+                if field_name not in val:
+                    continue
+                converted_val = self._convert(val[field_name], field_ttype, field_ttype_info)
+                setattr(ret, field_name, converted_val)
+        elif ttype == TType.LIST:
+            (element_ttype, element_ttype_info, _) = ttype_info
+            ret = [self._convert(x, element_ttype, element_ttype_info) for x in val]
+        elif ttype == TType.SET:
+            (element_ttype, element_ttype_info) = ttype_info
+            ret = set([self._convert(x, element_ttype, element_ttype_info) for x in val])
+        elif ttype == TType.MAP:
+            (key_ttype, key_ttype_info, val_ttype, val_ttype_info, _) = ttype_info
+            ret = dict(
+                [
+                    (
+                        self._convert(k, key_ttype, key_ttype_info),
+                        self._convert(v, val_ttype, val_ttype_info),
+                    )
+                    for (k, v) in val.items()
+                ]
+            )
+        elif ttype == TType.STRING:
+            ret = str(val)
+        elif ttype == TType.DOUBLE:
+            ret = float(val)
+        elif ttype == TType.I64:
+            ret = int(val)
+        elif ttype == TType.I32 or ttype == TType.I16 or ttype == TType.BYTE:
+            ret = int(val)
+        elif ttype == TType.BOOL:
+            ret = bool(val)
+        else:
+            raise TypeError("Unrecognized thrift field type: %d" % ttype)
+        return ret
+
+
+def json2thrift(json_str, thrift_class):
+    return json.loads(json_str, cls=ThriftJSONDecoder, thrift_class=thrift_class)
+
+
+def json2binary(json_str, thrift_class):
+    thrift = json2thrift(json_str, thrift_class)
+    transport = TMemoryBuffer()
+    protocol = TBinaryProtocolAccelerated(transport)
+    thrift.write(protocol)
+    # Get the raw bytes representing the object in Thrift binary format
+    return transport.getvalue()
+
+
+def file2thrift(path, thrift_class):
+    try:
+        with open(path, "r") as file:
+            return json2thrift(file.read(), thrift_class)
+    except json.decoder.JSONDecodeError as e:
+        raise Exception(
+            f"Error decoding file into a {thrift_class.__name__}: {path}. "
+            + f"Please double check that {path} represents a valid {thrift_class.__name__}."
+        ) from e
+
+
+def thrift_json(obj):
+    return TSerialization.serialize(obj, protocol_factory=TJSONProtocolFactory())
+
+
+def thrift_simple_json(obj):
+    simple = TSerialization.serialize(obj, protocol_factory=TSimpleJSONProtocolFactory())
+    parsed = json.loads(simple)
+    return json.dumps(parsed, indent=2, sort_keys=True)
+
+
+def thrift_simple_json_protected(obj, obj_type) -> str:
+    serialized = thrift_simple_json(obj)
+    # ensure that reversal works - we will use this reversal during deployment
+    thrift_obj = json.loads(serialized, cls=ThriftJSONDecoder, thrift_class=obj_type)
+    actual = thrift_simple_json(thrift_obj)
+    differ = JsonDiffer()
+    diff = differ.diff(serialized, actual)
+    assert len(diff) == 0, f"""Serialization can't be reversed
+    diff: \n{diff}
+    original: \n{serialized}
+    """
+    differ.clean()
+    return serialized
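Two details of the decoder above are worth calling out: `_convert` iterates the struct's `thrift_spec` and skips anything absent from the JSON, so unknown or misspelled keys are silently dropped rather than rejected, and `thrift_simple_json_protected` exists precisely to catch that lossiness by re-serializing the decoded object and diffing it against the input. A minimal round-trip sketch, assuming a `GroupBy` struct with a `backfillStartDate` field exists in the bundled `gen_thrift.api.ttypes` module (an assumption; that generated module is listed above but not shown):

from ai.chronon.repo.serializer import json2binary, json2thrift, thrift_simple_json
from gen_thrift.api.ttypes import GroupBy  # assumed generated struct

# JSON -> thrift object, typed via the struct's thrift_spec.
gb = json2thrift('{"backfillStartDate": "2023-01-01"}', GroupBy)

# Canonical pretty form: 2-space indent, sorted keys.
print(thrift_simple_json(gb))

# Compact TBinaryProtocol bytes of the same payload.
blob = json2binary('{"backfillStartDate": "2023-01-01"}', GroupBy)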
ai/chronon/repo/team_json_utils.py
ADDED
@@ -0,0 +1,46 @@
+"""A module used for reading teams.json file."""
+
+# Copyright (C) 2023 The Chronon Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+# `default` team in teams.json contains default values.
+DEFAULT_CONF_TEAM = "default"
+
+loaded_jsons = {}
+
+
+def read_conf_json(json_path):
+    if json_path not in loaded_jsons:
+        with open(json_path) as w:
+            team_json = json.load(w)
+            loaded_jsons[json_path] = team_json
+    return loaded_jsons[json_path]
+
+
+def team_exists(json_path, team):
+    team_json = read_conf_json(json_path)
+    return team in team_json
+
+
+def get_team_conf(json_path, team, key):
+    team_json = read_conf_json(json_path)
+    if team not in team_json:
+        raise ValueError("team {} does not exist in {}".format(team, json_path))
+    team_dict = team_json[team]
+    if key in team_dict:
+        return team_dict[key]
+    else:
+        return team_json[DEFAULT_CONF_TEAM][key]
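Note that `read_conf_json` memoizes parsed files by path in the module-level `loaded_jsons` dict, so a teams.json modified mid-process will not be re-read. A minimal sketch of the `default`-team fallback, using a hypothetical teams.json; the `ml` team below has no `namespace` key, so the lookup falls through to the `default` team's value:

import json
import tempfile

from ai.chronon.repo.team_json_utils import get_team_conf, team_exists

teams = {
    "default": {"namespace": "chronon_default"},
    "search": {"namespace": "search_db"},
    "ml": {},
}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(teams, f)

print(team_exists(f.name, "search"))                 # True
print(get_team_conf(f.name, "search", "namespace"))  # search_db (team-level value)
print(get_team_conf(f.name, "ml", "namespace"))      # chronon_default (default fallback)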