awx-zipline-ai 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/ttypes.py +6 -6
- ai/chronon/airflow_helpers.py +20 -23
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +40 -17
- ai/chronon/cli/compile/compile_context.py +13 -17
- ai/chronon/cli/compile/compiler.py +59 -36
- ai/chronon/cli/compile/conf_validator.py +251 -99
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +6 -16
- ai/chronon/cli/compile/display/compile_status.py +10 -10
- ai/chronon/cli/compile/display/diff_result.py +79 -14
- ai/chronon/cli/compile/fill_templates.py +3 -8
- ai/chronon/cli/compile/parse_configs.py +10 -17
- ai/chronon/cli/compile/parse_teams.py +38 -34
- ai/chronon/cli/compile/serializer.py +3 -9
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +2 -13
- ai/chronon/cli/logger.py +0 -2
- ai/chronon/constants.py +1 -1
- ai/chronon/group_by.py +47 -47
- ai/chronon/join.py +46 -32
- ai/chronon/logger.py +1 -2
- ai/chronon/model.py +9 -4
- ai/chronon/query.py +2 -2
- ai/chronon/repo/__init__.py +1 -2
- ai/chronon/repo/aws.py +17 -31
- ai/chronon/repo/cluster.py +121 -50
- ai/chronon/repo/compile.py +14 -8
- ai/chronon/repo/constants.py +1 -1
- ai/chronon/repo/default_runner.py +32 -54
- ai/chronon/repo/explore.py +70 -73
- ai/chronon/repo/extract_objects.py +6 -9
- ai/chronon/repo/gcp.py +89 -88
- ai/chronon/repo/gitpython_utils.py +3 -2
- ai/chronon/repo/hub_runner.py +145 -55
- ai/chronon/repo/hub_uploader.py +2 -1
- ai/chronon/repo/init.py +12 -5
- ai/chronon/repo/join_backfill.py +19 -5
- ai/chronon/repo/run.py +42 -39
- ai/chronon/repo/serializer.py +4 -12
- ai/chronon/repo/utils.py +72 -63
- ai/chronon/repo/zipline.py +3 -19
- ai/chronon/repo/zipline_hub.py +211 -39
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +13 -17
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +4 -8
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +9 -6
- ai/chronon/resources/gcp/teams.py +9 -21
- ai/chronon/source.py +2 -4
- ai/chronon/staging_query.py +60 -19
- ai/chronon/types.py +3 -2
- ai/chronon/utils.py +21 -68
- ai/chronon/windows.py +2 -4
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/METADATA +48 -24
- awx_zipline_ai-0.3.1.dist-info/RECORD +96 -0
- awx_zipline_ai-0.3.1.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- {ai/chronon → gen_thrift}/api/ttypes.py +327 -197
- {ai/chronon/api → gen_thrift}/common/ttypes.py +9 -39
- gen_thrift/eval/ttypes.py +660 -0
- {ai/chronon → gen_thrift}/hub/ttypes.py +12 -131
- {ai/chronon → gen_thrift}/observability/ttypes.py +343 -180
- {ai/chronon → gen_thrift}/planner/ttypes.py +326 -45
- ai/chronon/eval/__init__.py +0 -122
- ai/chronon/eval/query_parsing.py +0 -19
- ai/chronon/eval/sample_tables.py +0 -100
- ai/chronon/eval/table_scan.py +0 -186
- ai/chronon/orchestration/ttypes.py +0 -4406
- ai/chronon/resources/gcp/README.md +0 -174
- ai/chronon/resources/gcp/zipline-cli-install.sh +0 -54
- awx_zipline_ai-0.2.1.dist-info/RECORD +0 -93
- awx_zipline_ai-0.2.1.dist-info/licenses/LICENSE +0 -202
- awx_zipline_ai-0.2.1.dist-info/top_level.txt +0 -3
- /jars/__init__.py → /__init__.py +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/WHEEL +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/entry_points.txt +0 -0
- {ai/chronon → gen_thrift}/api/__init__.py +0 -0
- {ai/chronon/api/common → gen_thrift/api}/constants.py +0 -0
- {ai/chronon/api → gen_thrift}/common/__init__.py +0 -0
- {ai/chronon/api → gen_thrift/common}/constants.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/__init__.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/constants.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/__init__.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/constants.py +0 -0
- {ai/chronon → gen_thrift}/fetcher/ttypes.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/__init__.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/constants.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/__init__.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/constants.py +0 -0
- {ai/chronon → gen_thrift}/planner/__init__.py +0 -0
- {ai/chronon → gen_thrift}/planner/constants.py +0 -0
ai/chronon/repo/cluster.py
CHANGED
@@ -1,65 +1,136 @@
 import json
 
 
-def generate_dataproc_cluster_config(
+def generate_dataproc_cluster_config(
+    num_workers,
+    project_id,
+    artifact_prefix,
+    master_host_type="n2-highmem-64",
+    worker_host_type="n2-highmem-16",
+    subnetwork="default",
+    idle_timeout="7200s",
+    initialization_actions=None,
+    tags=None,
+):
     """
     Create a configuration for a Dataproc cluster.
     :return: A json string representing the configuration.
     """
     if initialization_actions is None:
         initialization_actions = []
-    return json.dumps(
+    return json.dumps(
+        {
+            "gceClusterConfig": {
+                "subnetworkUri": subnetwork,
+                "serviceAccount": "dataproc@" + project_id + ".iam.gserviceaccount.com",
+                "serviceAccountScopes": [
+                    "https://www.googleapis.com/auth/cloud-platform",
+                    "https://www.googleapis.com/auth/monitoring",
+                    "https://www.googleapis.com/auth/cloud.useraccounts.readonly",
+                    "https://www.googleapis.com/auth/devstorage.read_write",
+                    "https://www.googleapis.com/auth/logging.write",
+                ],
+                "metadata": {
+                    "hive-version": "3.1.2",
+                    "SPARK_BQ_CONNECTOR_URL": "gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar",
+                    "artifact_prefix": artifact_prefix.rstrip("/"),
+                },
+                "tags": tags or [],
+            },
+            "masterConfig": {
+                "numInstances": 1,
+                "machineTypeUri": master_host_type,
+                "diskConfig": {"bootDiskType": "pd-standard", "bootDiskSizeGb": 1024},
+            },
+            "workerConfig": {
+                "numInstances": num_workers,
+                "machineTypeUri": worker_host_type,
+                "diskConfig": {
+                    "bootDiskType": "pd-standard",
+                    "bootDiskSizeGb": 64,
+                    "numLocalSsds": 2,
+                },
+            },
+            "softwareConfig": {
+                "imageVersion": "2.2.66-debian12",
+                "optionalComponents": [
+                    "FLINK",
+                    "JUPYTER",
+                ],
+                "properties": {
+                    "dataproc:dataproc.logging.stackdriver.enable": "true",
+                    "dataproc:jobs.file-backed-output.enable": "true",
+                    "dataproc:dataproc.logging.stackdriver.job.driver.enable": "true",
+                    "dataproc:dataproc.logging.stackdriver.job.yarn.container.enable": "true",
+                },
+            },
+            "initializationActions": [
+                {"executable_file": initialization_action}
+                for initialization_action in (
+                    (initialization_actions or [])
+                    + [artifact_prefix.rstrip("/") + "/scripts/copy_java_security.sh"]
+                )
             ],
+            "endpointConfig": {
+                "enableHttpPortAccess": True,
+            },
+            "lifecycleConfig": {
+                "idleDeleteTtl": idle_timeout,
             },
+        }
+    )
+
+
+def fixed_cluster(
+    size,
+    project_id,
+    artifact_prefix,
+    subnetwork="default",
+    initialization_actions=None,
+    tags=None,
+):
+    """
+    Create a Dataproc cluster configuration based on t-shirt sizes.
+
+    :param size: T-shirt size - 'small', 'medium', or 'large'
+    :param project_id: GCP project ID
+    :param artifact_prefix: Artifact prefix for initialization scripts
+    :param subnetwork: Subnetwork for the cluster
+    :param initialization_actions: List of initialization actions
+    :param tags: List of tags for the cluster
+    :return: A json string representing the cluster configuration
+    """
+    size_configs = {
+        "small": {
+            "num_workers": 20,
+            "worker_host_type": "n2-highmem-4",  # 16GB, 4 cores
+            "master_host_type": "n2-highmem-4",  # Same as worker for consistency
         },
-            "bootDiskType": "pd-standard",
-            "bootDiskSizeGb": 1024
-        }
+        "medium": {
+            "num_workers": 50,
+            "worker_host_type": "n2-highmem-16",  # 32GB, 8 cores
+            "master_host_type": "n2-highmem-16",  # Same as worker for consistency
         },
-            "bootDiskType": "pd-standard",
-            "bootDiskSizeGb": 64,
-            "numLocalSsds": 2
-        }
+        "large": {
+            "num_workers": 250,
+            "worker_host_type": "n2-highmem-16",  # 64GB, 16 cores
+            "master_host_type": "n2-highmem-16",  # Same as worker for consistency
         },
-        "imageVersion": "2.2.50-debian12",
-        "optionalComponents": [
-            "FLINK",
-            "JUPYTER",
-        ],
-        "properties": {
+    }
 
-    }
+    if size not in size_configs:
+        raise ValueError(f"Invalid size '{size}'. Must be one of: {list(size_configs.keys())}")
+
+    config = size_configs[size]
+
+    return generate_dataproc_cluster_config(
+        num_workers=config["num_workers"],
+        project_id=project_id,
+        artifact_prefix=artifact_prefix,
+        master_host_type=config["master_host_type"],
+        worker_host_type=config["worker_host_type"],
+        subnetwork=subnetwork,
+        idle_timeout="3600s",  # 1 hour of inactivity
+        initialization_actions=initialization_actions,
+        tags=tags,
+    )
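
For reference, a minimal usage sketch of the two helpers added above (assumes awx-zipline-ai 0.3.1 is installed; the project ID and artifact prefix values are illustrative, not taken from this diff):

    import json

    from ai.chronon.repo.cluster import fixed_cluster, generate_dataproc_cluster_config

    # T-shirt sizing: "small" resolves to 20 n2-highmem-4 workers per size_configs above.
    small_cfg = json.loads(
        fixed_cluster(
            size="small",
            project_id="example-project",                   # illustrative
            artifact_prefix="gs://example-bucket/zipline",  # illustrative
        )
    )
    print(small_cfg["workerConfig"]["numInstances"])  # 20

    # Explicit sizing with the lower-level helper, overriding the 7200s idle default.
    custom_cfg_json = generate_dataproc_cluster_config(
        num_workers=100,
        project_id="example-project",
        artifact_prefix="gs://example-bucket/zipline",
        idle_timeout="3600s",
    )

Both functions return a JSON string, so the result can be passed straight to tooling that expects a serialized Dataproc cluster config or parsed back with json.loads for inspection.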
ai/chronon/repo/compile.py
CHANGED
@@ -15,24 +15,30 @@ from ai.chronon.cli.compile.display.console import console
     help="Path to the root chronon folder",
     default=os.getcwd(),
 )
+@click.option(
+    "--ignore-python-errors",
+    is_flag=True,
+    default=False,
+    help="Allow compilation to proceed even with Python errors (useful for testing)",
+)
+def compile(chronon_root, ignore_python_errors):
     print()
 
+    if chronon_root is None or chronon_root == "":
+        chronon_root = os.getcwd()
+
     if chronon_root not in sys.path:
         console.print(
             f"Adding [cyan italic]{chronon_root}[/cyan italic] to python path, during compile."
         )
         sys.path.append(chronon_root)
     else:
-        console.print(
-            f"[cyan italic]{chronon_root}[/cyan italic] already on python path."
-        )
+        console.print(f"[cyan italic]{chronon_root}[/cyan italic] already on python path.")
 
-    return __compile(chronon_root)
+    return __compile(chronon_root, ignore_python_errors)
 
 
-def __compile(chronon_root):
+def __compile(chronon_root, ignore_python_errors=False):
     if chronon_root:
         chronon_root_path = os.path.expanduser(chronon_root)
         os.chdir(chronon_root_path)

@@ -46,7 +52,7 @@ def __compile(chronon_root):
         )
     )
 
-    compile_context = CompileContext()
+    compile_context = CompileContext(ignore_python_errors=ignore_python_errors)
     compiler = Compiler(compile_context)
     results = compiler.compile()
     return results
ai/chronon/repo/constants.py
CHANGED
@@ -103,7 +103,7 @@ MODE_ARGS = {
     RunMode.SOURCE_JOB: OFFLINE_ARGS,
     RunMode.JOIN_PART_JOB: OFFLINE_ARGS,
     RunMode.MERGE_JOB: OFFLINE_ARGS,
-    RunMode.METASTORE: "",
+    RunMode.METASTORE: "",  # purposely left blank. we'll handle this specifically
     RunMode.INFO: "",
 }
 
ai/chronon/repo/default_runner.py
CHANGED

@@ -63,14 +63,13 @@ class Runner:
             and (args.get("online_jar_fetch"))
         ):
             print("Downloading online_jar")
-            self.online_jar = utils.check_output(
-            )
+            self.online_jar = utils.check_output("{}".format(args["online_jar_fetch"])).decode(
+                "utf-8"
+            )
             os.environ["CHRONON_ONLINE_JAR"] = self.online_jar
             print("Downloaded jar to {}".format(self.online_jar))
 
-        if (self.
-            and (self.mode != "metastore")): # TODO: don't check for metastore
+        if self.conf and (self.mode != "metastore"):  # TODO: don't check for metastore
             try:
                 self.context, self.conf_type, self.team, _ = self.conf.split("/")[-4:]
             except Exception as e:

@@ -81,20 +80,16 @@ class Runner:
             )
             raise e
         possible_modes = list(ROUTES[self.conf_type].keys()) + UNIVERSAL_ROUTES
-        assert (
+        assert args["mode"] in possible_modes, (
+            "Invalid mode:{} for conf:{} of type:{}, please choose from {}".format(
+                args["mode"], self.conf, self.conf_type, possible_modes
+            )
         )
 
         self.ds = args["end_ds"] if "end_ds" in args and args["end_ds"] else args["ds"]
-        self.start_ds = (
-            args["start_ds"] if "start_ds" in args and args["start_ds"] else None
-        )
+        self.start_ds = args["start_ds"] if "start_ds" in args and args["start_ds"] else None
         self.parallelism = (
-            int(args["parallelism"])
-            if "parallelism" in args and args["parallelism"]
-            else 1
+            int(args["parallelism"]) if "parallelism" in args and args["parallelism"] else 1
         )
         self.jar_path = jar_path

@@ -103,9 +98,9 @@ class Runner:
         if self.mode == "streaming":
             self.spark_submit = args["spark_streaming_submit_path"]
         elif self.mode == "info":
-            assert os.path.exists(
-                args["render_info"]
-            )
+            assert os.path.exists(args["render_info"]), (
+                "Invalid path for the render info script: {}".format(args["render_info"])
+            )
             self.render_info = args["render_info"]
         else:
             self.spark_submit = args["spark_submit_path"]

@@ -113,21 +108,16 @@ class Runner:
 
         self.disable_cloud_logging = args.get("disable_cloud_logging")
 
-
     def run_spark_streaming(self):
         # streaming mode
         self.app_name = self.app_name.replace(
             "_streaming-client_", "_streaming_"
         ) # If the job is running cluster mode we want to kill it.
         print(
-            "Checking to see if a streaming job by the name {} already exists".format(
-                self.app_name
-            )
+            "Checking to see if a streaming job by the name {} already exists".format(self.app_name)
         )
         running_apps = (
-            utils.check_output("{}".format(self.list_apps_cmd))
-            .decode("utf-8")
-            .split("\n")
+            utils.check_output("{}".format(self.list_apps_cmd)).decode("utf-8").split("\n")
         )
         running_app_map = {}
         for app in running_apps:

@@ -150,9 +140,7 @@ class Runner:
             )
         )
         if self.mode == "streaming":
-            assert (
-                len(filtered_apps) == 1
-            ), "More than one found, please kill them all"
+            assert len(filtered_apps) == 1, "More than one found, please kill them all"
             print("All good. No need to start a new app.")
             return
         elif self.mode == "streaming-client":

@@ -203,9 +191,7 @@ class Runner:
                     "To use parallelism, please specify --start-ds and --end-ds to "
                     "break down into multiple backfill jobs"
                 )
-            date_ranges = utils.split_date_range(
-                self.start_ds, self.ds, self.parallelism
-            )
+            date_ranges = utils.split_date_range(self.start_ds, self.ds, self.parallelism)
             for start_ds, end_ds in date_ranges:
                 command = (
                     "bash {script} --class ai.chronon.spark.Driver "

@@ -215,9 +201,7 @@ class Runner:
                     jar=self.jar_path,
                     subcommand=ROUTES[self.conf_type][self.mode],
                     args=self._gen_final_args(start_ds=start_ds, end_ds=end_ds),
-                    additional_args=os.environ.get(
-                        "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
-                    ),
+                    additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
                 )
                 command_list.append(command)
         else:

@@ -229,9 +213,7 @@ class Runner:
                 jar=self.jar_path,
                 subcommand=ROUTES[self.conf_type][self.mode],
                 args=self._gen_final_args(self.start_ds),
-                additional_args=os.environ.get(
-                    "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
-                ),
+                additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
             )
             command_list.append(command)

@@ -239,17 +221,13 @@ class Runner:
             # parallel backfill mode
             with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
                 logging.info(
-                    "Running args list {} with pool size {}".format(
-                        command_list, self.parallelism
-                    )
+                    "Running args list {} with pool size {}".format(command_list, self.parallelism)
                 )
                 pool.map(utils.check_call, command_list)
         elif len(command_list) == 1:
             utils.check_call(command_list[0])
 
-    def _gen_final_args(
-        self, start_ds=None, end_ds=None, override_conf_path=None, **kwargs
-    ):
+    def _gen_final_args(self, start_ds=None, end_ds=None, override_conf_path=None, **kwargs):
         base_args = MODE_ARGS.get(self.mode).format(
             conf_path=override_conf_path if override_conf_path else self.conf,
             ds=end_ds if end_ds else self.ds,

@@ -261,7 +239,7 @@ class Runner:
 
         if self.conf_type:
             submitter_args.append(f"--conf-type={self.conf_type}")
-
+
         if self.uploader:
             submitter_args.append(f"--uploader={self.uploader}")

@@ -269,23 +247,23 @@ class Runner:
             submitter_args.append(f"--additional-jars={self.additional_jars}")
 
         if self.mode != RunMode.FETCH:
-            submitter_args.append(" --local-conf-path={conf}".format(
-                conf=self.local_abs_conf_path
-            ))
+            submitter_args.append(" --local-conf-path={conf}".format(conf=self.local_abs_conf_path))
             submitter_args.append(" --original-mode={mode}".format(mode=self.mode))
 
-        override_start_partition_arg = (
-            "--start-partition-override=" + start_ds if start_ds else ""
-        )
+        override_start_partition_arg = "--start-partition-override=" + start_ds if start_ds else ""
 
         additional_args = " ".join(
-            f"--{key.replace('_', '-')}={value}"
-            for key, value in kwargs.items()
-            if value
+            f"--{key.replace('_', '-')}={value}" for key, value in kwargs.items() if value
        )
 
         final_args = " ".join(
-            [
+            [
+                base_args,
+                str(self.args),
+                override_start_partition_arg,
+                " ".join(submitter_args),
+                additional_args,
+            ]
         )
 
         return final_args