awx-zipline-ai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. agent/__init__.py +1 -0
  2. agent/constants.py +15 -0
  3. agent/ttypes.py +1684 -0
  4. ai/__init__.py +0 -0
  5. ai/chronon/__init__.py +0 -0
  6. ai/chronon/airflow_helpers.py +251 -0
  7. ai/chronon/api/__init__.py +1 -0
  8. ai/chronon/api/common/__init__.py +1 -0
  9. ai/chronon/api/common/constants.py +15 -0
  10. ai/chronon/api/common/ttypes.py +1844 -0
  11. ai/chronon/api/constants.py +15 -0
  12. ai/chronon/api/ttypes.py +3624 -0
  13. ai/chronon/cli/compile/column_hashing.py +313 -0
  14. ai/chronon/cli/compile/compile_context.py +177 -0
  15. ai/chronon/cli/compile/compiler.py +160 -0
  16. ai/chronon/cli/compile/conf_validator.py +590 -0
  17. ai/chronon/cli/compile/display/class_tracker.py +112 -0
  18. ai/chronon/cli/compile/display/compile_status.py +95 -0
  19. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  20. ai/chronon/cli/compile/display/console.py +3 -0
  21. ai/chronon/cli/compile/display/diff_result.py +46 -0
  22. ai/chronon/cli/compile/fill_templates.py +40 -0
  23. ai/chronon/cli/compile/parse_configs.py +141 -0
  24. ai/chronon/cli/compile/parse_teams.py +238 -0
  25. ai/chronon/cli/compile/serializer.py +115 -0
  26. ai/chronon/cli/git_utils.py +156 -0
  27. ai/chronon/cli/logger.py +61 -0
  28. ai/chronon/constants.py +3 -0
  29. ai/chronon/eval/__init__.py +122 -0
  30. ai/chronon/eval/query_parsing.py +19 -0
  31. ai/chronon/eval/sample_tables.py +100 -0
  32. ai/chronon/eval/table_scan.py +186 -0
  33. ai/chronon/fetcher/__init__.py +1 -0
  34. ai/chronon/fetcher/constants.py +15 -0
  35. ai/chronon/fetcher/ttypes.py +127 -0
  36. ai/chronon/group_by.py +692 -0
  37. ai/chronon/hub/__init__.py +1 -0
  38. ai/chronon/hub/constants.py +15 -0
  39. ai/chronon/hub/ttypes.py +1228 -0
  40. ai/chronon/join.py +566 -0
  41. ai/chronon/logger.py +24 -0
  42. ai/chronon/model.py +35 -0
  43. ai/chronon/observability/__init__.py +1 -0
  44. ai/chronon/observability/constants.py +15 -0
  45. ai/chronon/observability/ttypes.py +2192 -0
  46. ai/chronon/orchestration/__init__.py +1 -0
  47. ai/chronon/orchestration/constants.py +15 -0
  48. ai/chronon/orchestration/ttypes.py +4406 -0
  49. ai/chronon/planner/__init__.py +1 -0
  50. ai/chronon/planner/constants.py +15 -0
  51. ai/chronon/planner/ttypes.py +1686 -0
  52. ai/chronon/query.py +126 -0
  53. ai/chronon/repo/__init__.py +40 -0
  54. ai/chronon/repo/aws.py +298 -0
  55. ai/chronon/repo/cluster.py +65 -0
  56. ai/chronon/repo/compile.py +56 -0
  57. ai/chronon/repo/constants.py +164 -0
  58. ai/chronon/repo/default_runner.py +291 -0
  59. ai/chronon/repo/explore.py +421 -0
  60. ai/chronon/repo/extract_objects.py +137 -0
  61. ai/chronon/repo/gcp.py +585 -0
  62. ai/chronon/repo/gitpython_utils.py +14 -0
  63. ai/chronon/repo/hub_runner.py +171 -0
  64. ai/chronon/repo/hub_uploader.py +108 -0
  65. ai/chronon/repo/init.py +53 -0
  66. ai/chronon/repo/join_backfill.py +105 -0
  67. ai/chronon/repo/run.py +293 -0
  68. ai/chronon/repo/serializer.py +141 -0
  69. ai/chronon/repo/team_json_utils.py +46 -0
  70. ai/chronon/repo/utils.py +472 -0
  71. ai/chronon/repo/zipline.py +51 -0
  72. ai/chronon/repo/zipline_hub.py +105 -0
  73. ai/chronon/resources/gcp/README.md +174 -0
  74. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  75. ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
  76. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  77. ai/chronon/resources/gcp/joins/test/data.py +30 -0
  78. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  79. ai/chronon/resources/gcp/sources/test/data.py +23 -0
  80. ai/chronon/resources/gcp/teams.py +70 -0
  81. ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
  82. ai/chronon/source.py +88 -0
  83. ai/chronon/staging_query.py +185 -0
  84. ai/chronon/types.py +57 -0
  85. ai/chronon/utils.py +557 -0
  86. ai/chronon/windows.py +50 -0
  87. awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
  88. awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
  89. awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
  90. awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
  91. awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
  92. awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
  93. jars/__init__.py +0 -0
@@ -0,0 +1,421 @@
1
+ #!/usr/bin/env python3
2
+
3
+
4
+ # Copyright (C) 2023 The Chronon Authors.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import argparse
19
+ import json
20
+ import os
21
+ import subprocess
22
+ from contextlib import contextmanager
23
+ from pathlib import Path
24
+
25
# Root from which relative conf paths are resolved; captured once at startup.
CWD = os.getcwd()
# Index spec for group_by confs: display column -> json paths to extract.
# Columns prefixed with `_` are internal (used by handlers, not keyword search).
GB_INDEX_SPEC = {
    "sources": [
        "sources[].events.table",
        "sources[].entities.snapshotTable",
        "sources[].entities.mutationTable",
        "sources[].entities.topic",
        "sources[].events.topic",
    ],
    "_event_tables": ["sources[].events.table"],
    "_event_topics": ["sources[].events.topic"],
    "aggregation": [
        "aggregations[].inputColumn"
    ],
    "keys": [
        "keyColumns"
    ],
    "name": [
        "metaData.name"
    ],
    "online": [
        "metaData.online"
    ],
    "output_namespace": [
        "metaData.outputNamespace"
    ],
}

# Index spec for join confs; `rightParts` paths presumably cover an older
# conf layout alongside `joinParts` - TODO confirm.
JOIN_INDEX_SPEC = {
    "input_table": [
        "left.entities.snapshotTable",
        "left.events.table",
    ],
    "_events_driver": ["left.events.table"],
    "group_bys": [
        "joinParts[].groupBy.metaData.name",
        "rightParts[].groupBy.name",
    ],
    "name": [
        "metaData.name"
    ],
    "output_namespace": [
        "metaData.outputNamespace"
    ],
    "_group_bys": [
        "joinParts[].groupBy",
        "rightParts[].groupBy"
    ]
}

# Entry fields that, when empty, are backfilled from the team's defaults
# (entry field name -> attribute name on the team object).
DEFAULTS_SPEC = {
    'outputNamespace': "namespace"
}

# Compiled conf locations relative to the repo root.
GB_REL_PATH = "production/group_bys"
JOIN_REL_PATH = "production/joins"
# Columns searched during keyword lookup and trimmed when displaying long lists.
FILTER_COLUMNS = ["aggregation", "keys", "name", "sources", "joins"]
# Entry fields that hold absolute paths (trimmed relative to root for display).
PATH_FIELDS = ['file', 'json_file']
# colors chosen to be visible clearly on BOTH black and white terminals
# change with caution
NORMAL = '\033[0m'
BOLD = '\033[1m'
ITALIC = '\033[3m'
UNDERLINE = '\033[4m'
RED = '\033[38;5;160m'
GREEN = '\033[38;5;28m'
ORANGE = '\033[38;5;130m'
BLUE = '\033[38;5;27m'
GREY = '\033[38;5;246m'
HIGHLIGHT = BOLD+ITALIC+RED
95
+
96
+
97
# walks the json nodes recursively collecting all values that match the path
# a trailing `[]` in a field in the path indicates that there is an array of
# objects in the corresponding node value.
def extract_json(json_path, conf_json):
    """Collect every value in ``conf_json`` matching the dotted ``json_path``.

    :param json_path: dot-separated path, e.g. ``"sources[].events.table"``;
        a segment ending in ``[]`` descends into each element of an array.
        ``None`` terminates the recursion and returns the node itself.
    :param conf_json: parsed json node to extract from.
    :return: the raw node when ``json_path`` is None; otherwise a (possibly
        empty) list of matched values.
    """
    if json_path is None:
        return conf_json
    # fix: the remainder was previously bound to a name shadowing builtin `next`
    key, _, remainder = json_path.partition(".")
    rest = remainder if remainder else None
    if key.endswith("[]"):
        key = key[:-2]
        if key in conf_json:
            result = []
            for value in conf_json[key]:
                result.extend(extract_json(rest, value))
            return result
    elif key in conf_json:
        final = extract_json(rest, conf_json[key])
        # scalar leaves are wrapped so callers always receive a list
        return final if isinstance(final, list) else [final]
    return []
121
+
122
+
123
def build_entry(conf, index_spec, conf_type, root=CWD, teams=None):
    """Build a single index entry for one compiled conf.

    :param conf: a parsed conf dict, or a path to a compiled json file.
    :param index_spec: column name -> list of json paths (see *_INDEX_SPEC).
    :param conf_type: "group_bys" or "joins"; used to derive source paths.
    :param root: repository root relative paths are anchored at.
    :param teams: team name -> team metadata for filling defaulted fields.
    :return: the entry dict, or None when the conf is unparseable or unnamed.
    """
    conf_dict = conf
    if isinstance(conf, str):
        with open(conf) as conf_file:
            try:
                conf_dict = json.load(conf_file)
            # fix: was `except BaseException`, which also swallowed
            # KeyboardInterrupt/SystemExit; also made the bare `return` explicit
            except Exception as ex:
                print(f"Failed to parse {conf} due to :: {ex}")
                return None
    entry = {"file": None}
    for column, paths in index_spec.items():
        result = []
        for path in paths:
            result.extend(extract_json(path, conf_dict))
        entry[column] = result

    # unnamed confs cannot be indexed
    if len(entry["name"]) == 0:
        return None

    # derive python file path from the name & conf_type
    (team, conf_module) = entry["name"][0].split(".", 1)
    # Update missing values with teams defaults.
    for field, mapped_field in DEFAULTS_SPEC.items():
        if field in entry and not entry[field]:
            # NOTE(review): assumes team objects expose attributes via __dict__;
            # a plain dict loaded from teams.json would raise here - confirm.
            team_dict = teams[team].__dict__
            entry[field] = [team_dict[mapped_field]]

    file_base = "/".join(conf_module.split(".")[:-1])
    py_file = file_base + ".py"
    init_file = file_base + "/__init__.py"
    py_path = os.path.join(root, conf_type, team, py_file)
    init_path = os.path.join(root, conf_type, team, init_file)
    # fall back to the package __init__.py when no standalone module exists
    conf_path = py_path if os.path.exists(py_path) else init_path
    entry["json_file"] = os.path.join(root, "production", conf_type, team, conf_module)
    entry["file"] = conf_path
    return entry
159
+
160
+
161
@contextmanager
def chdir(path):
    """Temporarily switch the working directory to *path*.

    Used so git subprocesses run from the repo root; the original
    directory is restored even when the body raises.
    """
    previous = Path().absolute()
    try:
        os.chdir(path)
        yield
    finally:
        os.chdir(previous)
172
+
173
+
174
# reuse `git log` results across calls: file path -> "date/author/email" line
git_info_cache = {}


# git_info is the most expensive part of the entire script - so we will have to parallelize
def git_info(file_paths, exclude=None, root=CWD):
    """Fetch the latest-commit info line for each path.

    Uncached lookups are fanned out as concurrent `git log` subprocesses
    before being reaped, since git is the slowest part of this script.

    :param file_paths: absolute paths (under *root*) to inspect.
    :param exclude: commit-message pattern skipped via --invert-grep.
    :param root: repo root to run git from.
    :return: dict of file path -> colored "date/author/email" string.
    """
    exclude_args = f"--invert-grep --grep={exclude}" if exclude else ''
    procs = []
    with chdir(root):
        for file_path in file_paths:
            if file_path in git_info_cache:
                procs.append((file_path, git_info_cache[file_path]))
            else:
                args = (
                    f"echo $(git log -n 2 --pretty='format:{BLUE} %as/%an/%ae' {exclude_args} -- "
                    f"{file_path.replace(root, '')})")
                procs.append((file_path, subprocess.Popen(args, stdout=subprocess.PIPE, shell=True)))

    result = {}
    for file_path, proc in procs:
        if isinstance(proc, subprocess.Popen):
            lines = [line.decode("utf-8").strip() for line in proc.stdout.readlines()]
            git_info_cache[file_path] = lines[0]
            result[file_path] = git_info_cache[file_path]
        else:
            # BUG FIX: cache hits were queued as plain strings but never
            # copied into `result`, so repeat lookups lost entries and
            # callers (e.g. display_entries) hit KeyError.
            result[file_path] = proc
    return result
200
+
201
+
202
def walk_files(path):
    """Yield the full path of every file under *path*, recursively."""
    for directory, _, filenames in os.walk(path):
        for filename in filenames:
            yield os.path.join(directory, filename)
206
+
207
+
208
def build_index(conf_type, index_spec, root=CWD, teams=None):
    """Index every compiled conf of *conf_type* under production/.

    :return: dict keyed by conf name (e.g. ``team.module.v1``) -> entry dict.
    """
    conf_dir = os.path.join(root, "production", conf_type)
    team_data = teams or {}
    index_table = {}
    for conf_path in walk_files(conf_dir):
        entry = build_entry(conf_path, index_spec, conf_type, root=root, teams=team_data)
        if entry is not None:
            index_table[entry["name"][0]] = entry
    return index_table
217
+
218
+
219
def find_string(text, word):
    """Yield every index at which *word* occurs in *text* (overlaps included)."""
    position = text.find(word)
    while position != -1:
        yield position
        position = text.find(word, position + 1)
224
+
225
+
226
def highlight(text, word):
    """Return *text* with every occurrence of *word* wrapped in highlight codes."""
    pieces = []
    cursor = 0
    for start in find_string(text, word):
        pieces.append(text[cursor:start])
        pieces.append(HIGHLIGHT + word + NORMAL)
        cursor = start + len(word)
    pieces.append(text[cursor:])
    return "".join(pieces)
234
+
235
+
236
def prettify_entry(entry, target, modification, show=10, root=CWD, trim_paths=False):
    """Render one index entry as a colorized, column-aligned text block.

    Oversized value lists in filterable columns are narrowed to values
    containing *target* and truncated to *show* items; the "file" column is
    annotated with the git *modification* line. Note: mutates *entry* in
    place when *trim_paths* is set.
    """
    rendered = []
    if trim_paths:
        for field in PATH_FIELDS:
            if field in entry:
                entry[field] = entry[field].replace(root, '')
    for column, values in entry.items():
        label = column.rjust(15)
        if column in FILTER_COLUMNS and len(values) > show:
            values = [value for value in set(values) if target in value]
        if len(values) > show:
            shown = ', '.join(values[:show])
            hidden = len(values) - show
            values = f"[{shown} ... {GREY}{UNDERLINE}{hidden} more{NORMAL}]"
        if column == "file":
            values = f"{BOLD}{values} {modification}{NORMAL}"
        else:
            values = highlight(str(values), target)
        rendered.append(f"{BOLD}{ORANGE}{label}{NORMAL} - {values}")
    return "\n" + "\n".join(rendered)
256
+
257
+
258
def find_in_index(index_table, target):
    """Return entries whose filterable columns contain the *target* substring."""
    def matches(entry):
        return any(
            target in value
            for column, values in entry.items()
            if column in FILTER_COLUMNS
            for value in values
        )
    return find_in_index_pred(index_table, matches)
267
+
268
+
269
def find_in_index_pred(index_table, valid_entry):
    """Return every entry of *index_table* that satisfies the predicate."""
    return [entry for entry in index_table.values() if valid_entry(entry)]
271
+
272
+
273
def display_entries(entries, target, root=CWD, trim_paths=False):
    """Pretty-print *entries*, ordered by their git modification info."""
    infos = git_info([entry["file"] for entry in entries], root=root)
    rendered = []
    for entry in entries:
        modification = infos[entry["file"]]
        text = prettify_entry(entry, target, modification, root=root, trim_paths=trim_paths)
        rendered.append((modification, text))
    for _, text in sorted(rendered):
        print(text)
283
+
284
+
285
def enrich_with_joins(gb_index, join_index, root=CWD, teams=None):
    """Cross-link group_bys and joins, mutating *gb_index* in place.

    First indexes the group_by definitions nested inside join confs, then
    builds the reverse lineage: each group_by entry gains a "joins" list
    (names of joins consuming it) and a "join_event_driver" list (left-side
    event tables of those joins).
    """
    # nested gb entries
    for join_entry in join_index.values():
        for gb in join_entry["_group_bys"]:
            entry = build_entry(gb, GB_INDEX_SPEC, "group_bys", root=root, teams=teams)
            # BUG FIX: build_entry returns None for unnamed/unparseable confs;
            # previously this crashed with TypeError on `entry["name"]`.
            if entry is not None:
                gb_index[entry["name"][0]] = entry
    # lineage -> reverse index from gb -> join
    for group_by in gb_index.values():
        group_by["joins"] = []
        group_by["join_event_driver"] = []
    for join in join_index.values():
        for gb_name in join["group_bys"]:
            if gb_name in gb_index:
                gb_index[gb_name]["joins"].append(join["name"][0])
                if len(join["_events_driver"]) > 0:
                    gb_index[gb_name]["join_event_driver"].append(join["_events_driver"][0])
301
+
302
+
303
# reuse `git log` command result: file path -> [author name, email]
file_to_author = {}
# extract information based on GB_INDEX_SPEC into this (conf name -> entry).
# FIX: initialized as a dict to match how build_index() rebinds it in
# __main__; it was previously a list, which breaks dict-style use
# (.items(), key lookup) by handlers such as events_without_topics.
gb_index = {}
# extract information based on JOIN_INDEX_SPEC into this (conf name -> entry)
join_index = {}
309
+
310
+
311
def author_name_email(file, exclude=None):
    """Return the (name, email) of *file*'s most recent author, via cache.

    Missing files yield a pair of empty strings; commits matching *exclude*
    are skipped during attribution.
    """
    if not os.path.exists(file):
        return ("", "")
    if file not in file_to_author:
        info_by_path = git_info([file], exclude)
        for filepath, auth_str in info_by_path.items():
            file_to_author[filepath] = auth_str.split("/")[-2:]
    return file_to_author[file]
318
+
319
+
320
def conf_file(conf_type, conf_name):
    """Map a conf name like ``team.module.v1`` to its production json path."""
    parts = ["production", conf_type, *conf_name.split(".", 1)]
    return os.path.join(*parts)
324
+
325
+
326
# args[0] is output tsv file
# args[1] is commit messages to exclude when extracting author and email information
def events_without_topics(output_file=None, exclude_commit_message=None):
    """Report group_bys built on event tables that have no streaming topic.

    Scans the global ``gb_index``; for each match, records the conf name,
    producing author, online flag, first event table, consuming joins and
    their authors. Writes a TSV to *output_file* when given, otherwise prints
    the rows plus the set of involved author emails.

    :param output_file: optional TSV output path (``~`` is expanded).
    :param exclude_commit_message: commit-message pattern ignored when
        attributing authorship via git.
    """
    result = []
    emails = set()

    def is_events_without_topics(entry):
        # event source present but no topic configured
        found = len(entry["_event_topics"]) == 0 and len(entry["_event_tables"]) > 0
        # NOTE(review): this only checks that metaData.online is *set*, not
        # that it is truthy - confirm intent.
        is_online = len(entry["online"]) > 0
        joins = ", ".join(entry["joins"]) if len(entry["joins"]) > 0 else "STANDALONE"
        if found:
            file = entry["json_file"] if os.path.exists(entry["json_file"]) else entry["file"]
            producer_name, producer_email = author_name_email(file, exclude_commit_message)
            emails.add(producer_email)
            consumers = set()
            for join in entry["joins"]:
                conf_file_path = conf_file("joins", join)
                consumer_name, consumer_email = author_name_email(conf_file_path, exclude_commit_message)
                consumers.add(consumer_name)
                emails.add(consumer_email)
            row = [
                entry["name"][0],
                producer_name,
                is_online,
                entry["_event_tables"][0],
                joins,
                ", ".join(consumers)
            ]
            result.append(row)
        return found

    # invoked purely for the side effect of populating `result`
    find_in_index_pred(gb_index, is_events_without_topics)
    if output_file:
        with open(os.path.expanduser(output_file), 'w') as tsv_file:
            for row in result:
                tsv_file.write('\t'.join(map(str, row))+'\n')
        # fix: message typo ("events us used" -> "events is used")
        print("wrote information about cases where events is used " +
              f"without topics set into file {os.path.expanduser(output_file)}")
    else:
        for row in result:
            print('\t'.join(map(str, row))+'\n')
        print(",".join(list(emails)))
368
+
369
+
370
def load_team_data(path='', teams_root=None):
    """Load team metadata from either a teams.json file or a teams.py module.

    For teams.json, every team's dict is overlaid on top of the 'default'
    entry. Otherwise parsing is delegated to the compile-time teams loader,
    which needs *teams_root*.
    """
    if 'teams.json' in path:
        with open(path, 'r') as infile:
            teams = json.load(infile)
        base_defaults = teams.get('default', {})
        return {team: dict(base_defaults, **values) for team, values in teams.items()}
    else:
        from ai.chronon.cli.compile import parse_teams
        assert teams_root is not None, "Need root to load teams.py"
        return parse_teams.load_teams(teams_root)
385
+
386
+
387
# register all handlers here
# keyword -> callable; a CLI keyword matching a key dispatches to its handler
# (with --handler-args) instead of running a keyword search over the index.
handlers = {
    "_events_without_topics": events_without_topics
}
391
+
392
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Explore tool for chronon")
    parser.add_argument("keyword", help="Keyword to look up keys")
    parser.add_argument("--conf-root", help="Conf root for the configs", default=CWD)
    # BUG FIX: default=[] - without it, omitting --handler-args left
    # args.handler_args as None and the loop below raised TypeError.
    parser.add_argument(
        "--handler-args", nargs="*", default=[],
        help="Special arguments for handler keywords of the form param=value")
    args = parser.parse_args()
    root = args.conf_root
    # warn (but proceed) when the root doesn't look like a chronon conf repo
    if not (root.endswith("chronon") or root.endswith("zipline")):
        print("This script needs to be run from chronon conf root - with folder named 'chronon' or 'zipline', found: "
              + root)
    teams = load_team_data(os.path.join(root, 'teams.json'), teams_root=root)
    gb_index = build_index("group_bys", GB_INDEX_SPEC, root=root, teams=teams)
    join_index = build_index("joins", JOIN_INDEX_SPEC, root=root, teams=teams)
    enrich_with_joins(gb_index, join_index, root=root, teams=teams)

    candidate = args.keyword
    if candidate in handlers:
        print(f"{candidate} is a registered handler")
        handler = handlers[candidate]
        handler_args = {}
        for arg in args.handler_args:
            splits = arg.split("=", 1)
            # fix: message typos ("for the form" / "Found and invalid")
            assert len(splits) == 2, f"need args to handler of the form, param=value. Found an invalid arg:{arg}"
            key, value = splits
            handler_args[key] = value
        handler(**handler_args)
    else:
        group_bys = find_in_index(gb_index, args.keyword)
        display_entries(group_bys, args.keyword, root=root, trim_paths=True)
@@ -0,0 +1,137 @@
1
+ # Copyright (C) 2023 The Chronon Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import glob
16
+ import importlib.machinery
17
+ import importlib.util
18
+ import logging
19
+ import os
20
+
21
+ from ai.chronon.logger import get_logger
22
+ from ai.chronon.repo import FOLDER_NAME_TO_CLASS
23
+
24
+
25
def from_folder(full_path: str, cls: type, log_level=logging.INFO):
    """Recursively scan *full_path* for python files and collect every
    object of type *cls* they define, keyed by object qualifier.

    Files that fail to import are logged and skipped.
    """
    if full_path.endswith("/"):
        full_path = full_path[:-1]

    collected = {}
    for py_file in glob.glob(os.path.join(full_path, "**/*.py"), recursive=True):
        try:
            collected.update(from_file(py_file, cls, log_level))
        except Exception as e:
            logging.error(f"Failed to extract: {py_file}")
            logging.exception(e)
    return collected
42
+
43
+
44
def from_folderV2(full_path: str, target_file: str, cls: type):
    """Recursively scan *full_path* for conf objects of type *cls*.

    :return: a triple of
        - results: object qualifier -> (object, defining file path)
        - errors: file path -> exception raised while importing it
        - target_file_error: the exception for *target_file*, or None
    """
    if full_path.endswith("/"):
        full_path = full_path[:-1]

    results, errors = {}, {}
    target_file_error = None
    for py_file in glob.glob(os.path.join(full_path, "**/*.py"), recursive=True):
        try:
            for qualifier, obj in from_file(py_file, cls, log_level=logging.NOTSET).items():
                results[qualifier] = (obj, py_file)
        except Exception as e:
            if py_file == target_file:
                target_file_error = e
            errors[py_file] = e
    return results, errors, target_file_error
66
+
67
+
68
def import_module_set_name(module, cls):
    """Assign qualified names to every *cls* instance defined in *module*.

    The name is `team_name.python_script_name.obj_name`, with a `__<version>`
    suffix when the object's metaData carries a version; the team is the
    second component of the module path.
    e.g. module.__name__=group_bys.user.avg_session_length, version=1
         -> obj.metaData.name=user.avg_session_length.v1__1, team=user
    """
    module_suffix = module.__name__.partition(".")[2]
    team_name = module.__name__.split(".")[1]
    for attr_name, obj in list(module.__dict__.items()):
        if not isinstance(obj, cls):
            continue
        qualified = module_suffix + "." + attr_name
        # append the version suffix only when a version is explicitly set
        version = getattr(obj.metaData, 'version', None)
        if version is not None:
            qualified = qualified + "__" + str(version)
        obj.metaData.name = qualified
        obj.metaData.team = team_name
    return module
87
+
88
+
89
def from_file(file_path: str, cls: type, log_level=logging.INFO):
    """Import the module at *file_path* and return its *cls* objects.

    The returned dict maps the fully qualified name
    (`team_name.python_script_name.obj_name`, e.g. psx.reservation_status.v1)
    to the object, after names are stamped by import_module_set_name.
    """
    logger = get_logger(log_level)
    logger.debug("Loading objects of type {cls} from {file_path}".format(**locals()))

    # module_path strips the repo prefix on the left and `.py` on the right,
    # then turns slashes into dots, e.g. `team_name.python_script_name`
    module_qualifier = module_path(file_path)
    mod = importlib.import_module(module_qualifier)

    # stamp metaData.name / metaData.team on every cls instance
    import_module_set_name(mod, cls)

    return {obj.metaData.name: obj
            for obj in mod.__dict__.values() if isinstance(obj, cls)}
110
+
111
+
112
def chronon_path(file_path: str) -> str:
    """Strip everything before the conf-type folder from *file_path*.

    e.g. `/repo/group_bys/team/x.py` -> `group_bys/team/x.py`.

    :raises AssertionError: when no known conf-type folder is in the path.
    """
    conf_types = FOLDER_NAME_TO_CLASS.keys()

    splits = file_path.split("/")
    conf_occurences = [splits.index(typ) for typ in conf_types if typ in splits]

    assert (
        len(conf_occurences) > 0
    ), f"Path: {file_path} doesn't contain folder with name among {conf_types}"

    # fix: reuse the occurrence list instead of scanning the path twice
    index = min(conf_occurences)
    rel_path = "/".join(splits[index:])

    return rel_path
127
+
128
+
129
def module_path(file_path: str) -> str:
    """Convert a conf file path into its importable module path.

    e.g. `group_bys/team/script.py` -> `group_bys.team.script`.

    :raises AssertionError: when the path does not end in `.py`.
    """
    rel_path = chronon_path(file_path)
    assert rel_path.endswith(".py"), f"Path: {file_path} doesn't end with '.py'"

    return rel_path[:-3].replace("/", ".")