awx-zipline-ai 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/ttypes.py +6 -6
- ai/chronon/airflow_helpers.py +20 -23
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +40 -17
- ai/chronon/cli/compile/compile_context.py +13 -17
- ai/chronon/cli/compile/compiler.py +59 -36
- ai/chronon/cli/compile/conf_validator.py +251 -99
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +6 -16
- ai/chronon/cli/compile/display/compile_status.py +10 -10
- ai/chronon/cli/compile/display/diff_result.py +79 -14
- ai/chronon/cli/compile/fill_templates.py +3 -8
- ai/chronon/cli/compile/parse_configs.py +10 -17
- ai/chronon/cli/compile/parse_teams.py +38 -34
- ai/chronon/cli/compile/serializer.py +3 -9
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +2 -13
- ai/chronon/cli/logger.py +0 -2
- ai/chronon/constants.py +1 -1
- ai/chronon/group_by.py +47 -47
- ai/chronon/join.py +46 -32
- ai/chronon/logger.py +1 -2
- ai/chronon/model.py +9 -4
- ai/chronon/query.py +2 -2
- ai/chronon/repo/__init__.py +1 -2
- ai/chronon/repo/aws.py +17 -31
- ai/chronon/repo/cluster.py +121 -50
- ai/chronon/repo/compile.py +14 -8
- ai/chronon/repo/constants.py +1 -1
- ai/chronon/repo/default_runner.py +32 -54
- ai/chronon/repo/explore.py +70 -73
- ai/chronon/repo/extract_objects.py +6 -9
- ai/chronon/repo/gcp.py +89 -88
- ai/chronon/repo/gitpython_utils.py +3 -2
- ai/chronon/repo/hub_runner.py +145 -55
- ai/chronon/repo/hub_uploader.py +2 -1
- ai/chronon/repo/init.py +12 -5
- ai/chronon/repo/join_backfill.py +19 -5
- ai/chronon/repo/run.py +42 -39
- ai/chronon/repo/serializer.py +4 -12
- ai/chronon/repo/utils.py +72 -63
- ai/chronon/repo/zipline.py +3 -19
- ai/chronon/repo/zipline_hub.py +211 -39
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +13 -17
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +4 -8
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +9 -6
- ai/chronon/resources/gcp/teams.py +9 -21
- ai/chronon/source.py +2 -4
- ai/chronon/staging_query.py +60 -19
- ai/chronon/types.py +3 -2
- ai/chronon/utils.py +21 -68
- ai/chronon/windows.py +2 -4
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/METADATA +48 -24
- awx_zipline_ai-0.3.1.dist-info/RECORD +96 -0
- awx_zipline_ai-0.3.1.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- {ai/chronon → gen_thrift}/api/ttypes.py +327 -197
- {ai/chronon/api → gen_thrift}/common/ttypes.py +9 -39
- gen_thrift/eval/ttypes.py +660 -0
- {ai/chronon → gen_thrift}/hub/ttypes.py +12 -131
- {ai/chronon → gen_thrift}/observability/ttypes.py +343 -180
- {ai/chronon → gen_thrift}/planner/ttypes.py +326 -45
- ai/chronon/eval/__init__.py +0 -122
- ai/chronon/eval/query_parsing.py +0 -19
- ai/chronon/eval/sample_tables.py +0 -100
- ai/chronon/eval/table_scan.py +0 -186
- ai/chronon/orchestration/ttypes.py +0 -4406
- ai/chronon/resources/gcp/README.md +0 -174
- ai/chronon/resources/gcp/zipline-cli-install.sh +0 -54
- awx_zipline_ai-0.2.1.dist-info/RECORD +0 -93
- awx_zipline_ai-0.2.1.dist-info/licenses/LICENSE +0 -202
- awx_zipline_ai-0.2.1.dist-info/top_level.txt +0 -3
- /jars/__init__.py → /__init__.py +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/WHEEL +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/entry_points.txt +0 -0
- {ai/chronon → gen_thrift}/api/__init__.py +0 -0
- {ai/chronon/api/common → gen_thrift/api}/constants.py +0 -0
- {ai/chronon/api → gen_thrift}/common/__init__.py +0 -0
- {ai/chronon/api → gen_thrift/common}/constants.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/__init__.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/constants.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/__init__.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/constants.py +0 -0
- {ai/chronon → gen_thrift}/fetcher/ttypes.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/__init__.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/constants.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/__init__.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/constants.py +0 -0
- {ai/chronon → gen_thrift}/planner/__init__.py +0 -0
- {ai/chronon → gen_thrift}/planner/constants.py +0 -0
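
Note that the generated Thrift modules move from the ai.chronon namespace to a new gen_thrift top-level package (see the {ai/chronon → gen_thrift} renames above), so downstream imports have to follow. A minimal before/after sketch, assuming the sub-module layout under gen_thrift mirrors the old one as the renames suggest; the imported name api_ttypes is illustrative only:

# 0.2.1 layout: generated Thrift types lived under the ai.chronon namespace.
# 0.3.1 layout: the same generated modules are published under gen_thrift.
# A defensive shim for code that must run against both versions:
try:
    from gen_thrift.api import ttypes as api_ttypes  # 0.3.1 layout
except ImportError:
    from ai.chronon.api import ttypes as api_ttypes  # 0.2.1 layout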
ai/chronon/repo/gcp.py
CHANGED

@@ -38,9 +38,7 @@ class GcpRunner(Runner):
     def __init__(self, args):
         self._remote_artifact_prefix = args.get("artifact_prefix")
         if not self._remote_artifact_prefix:
-            raise ValueError(
-                "GCP artifact prefix not set."
-            )
+            raise ValueError("GCP artifact prefix not set.")
 
         self._version = args.get("version")
         gcp_jar_path = GcpRunner.download_zipline_dataproc_jar(
@@ -55,18 +53,13 @@ class GcpRunner(Runner):
             self._version,
             ZIPLINE_GCP_SERVICE_JAR,
         )
-        jar_path = (
-            f"{service_jar_path}:{gcp_jar_path}"
-            if args["mode"] == "fetch"
-            else gcp_jar_path
-        )
+        jar_path = f"{gcp_jar_path}:{service_jar_path}" if args["mode"] == "fetch" else gcp_jar_path
 
         self._args = args
         self.job_id = str(uuid.uuid4())
 
         super().__init__(args, os.path.expanduser(jar_path))
 
-
     @staticmethod
     def get_gcp_project_id() -> str:
         return get_environ_arg("GCP_PROJECT_ID")
@@ -101,9 +94,7 @@ class GcpRunner(Runner):
             else:
                 return blob.exists(), blob.download_as_text()
         except Exception as e:
-            raise RuntimeError(
-                f"Failed to download {source_blob_name}: {str(e)}"
-            ) from e
+            raise RuntimeError(f"Failed to download {source_blob_name}: {str(e)}") from e
 
     @staticmethod
     @retry_decorator(retries=2, backoff=5)
@@ -128,9 +119,7 @@ class GcpRunner(Runner):
                 )
             )
         except Exception as e:
-            raise RuntimeError(
-                f"Failed to download {source_blob_name}: {str(e)}"
-            ) from e
+            raise RuntimeError(f"Failed to download {source_blob_name}: {str(e)}") from e
 
     @staticmethod
     @retry_decorator(retries=2, backoff=5)
@@ -163,9 +152,7 @@ class GcpRunner(Runner):
         blob = bucket.get_blob(blob_name)
 
         if not blob:
-            raise FileNotFoundError(
-                f"File {blob_name} not found in bucket {bucket_name}"
-            )
+            raise FileNotFoundError(f"File {blob_name} not found in bucket {bucket_name}")
 
         return blob.crc32c
 
@@ -191,9 +178,7 @@ class GcpRunner(Runner):
         return base64.b64encode(crc32c_hash.digest()).decode("utf-8")
 
     @staticmethod
-    def compare_gcs_and_local_file_hashes(
-        remote_file_path: str, local_file_path: str
-    ) -> bool:
+    def compare_gcs_and_local_file_hashes(remote_file_path: str, local_file_path: str) -> bool:
         """
         Compare hashes of a GCS file and a local file to check if they're identical.
 
@@ -219,15 +204,14 @@ class GcpRunner(Runner):
             return False
 
     @staticmethod
-    def download_zipline_dataproc_jar(remote_file_path: str, local_file_path: str, version: str, jar_name: str
+    def download_zipline_dataproc_jar(
+        remote_file_path: str, local_file_path: str, version: str, jar_name: str
     ):
        source_path = os.path.join(remote_file_path, "release", version, "jars", jar_name)
        dest_path = os.path.join(local_file_path, jar_name)
 
        are_identical = (
-            GcpRunner.compare_gcs_and_local_file_hashes(
-                source_path, dest_path
-            )
+            GcpRunner.compare_gcs_and_local_file_hashes(source_path, dest_path)
            if os.path.exists(dest_path)
            else False
        )
@@ -235,9 +219,7 @@ class GcpRunner(Runner):
        if are_identical:
            LOG.info(f"{dest_path} matches GCS {source_path}")
        else:
-            LOG.info(
-                f"{dest_path} does NOT match GCS {source_path}"
-            )
+            LOG.info(f"{dest_path} does NOT match GCS {source_path}")
            LOG.info(f"Downloading {jar_name} from GCS...")
 
            GcpRunner.download_gcs_file(source_path, dest_path)
@@ -251,7 +233,6 @@ class GcpRunner(Runner):
        job_type: JobType = JobType.SPARK,
        metadata_conf_path: str = None,
    ):
-
        parsed = urlparse(customer_artifact_prefix)
        source_blob_name = parsed.path.lstrip("/")
 
@@ -263,7 +244,7 @@ class GcpRunner(Runner):
                source_blob_name,
                "metadata",
                self.job_id,
-                f"{extract_filename_from_path(metadata_conf_path)}"
+                f"{extract_filename_from_path(metadata_conf_path)}",
            )
            gcs_files.append(
                GcpRunner.upload_gcs_blob(
@@ -283,35 +264,42 @@ class GcpRunner(Runner):
            main_class = "ai.chronon.flink.FlinkJob"
            flink_jar_uri = os.path.join(release_prefix, f"{ZIPLINE_GCP_FLINK_JAR_DEFAULT}")
            enable_pubsub = GcpRunner.is_pubsub_enabled()
-            flink_pubsub_connector_jar_uri = os.path.join(
-                …
+            flink_pubsub_connector_jar_uri = os.path.join(
+                release_prefix, f"{ZIPLINE_GCP_FLINK_PUBSUB_JAR_DEFAULT}"
+            )
+            base_formatted_args = (
+                final_args.format(
                    user_args=user_args,
                    jar_uri=jar_uri,
                    job_type=job_type.value,
                    main_class=main_class,
                    zipline_version=self._version,
                    job_id=self.job_id,
-            )
+                )
+                + f" --flink-main-jar-uri={flink_jar_uri}"
+            )
            if enable_pubsub:
                base_formatted_args += f" --flink-pubsub-jar-uri={flink_pubsub_connector_jar_uri}"
            return base_formatted_args
 
        elif job_type == JobType.SPARK:
            main_class = "ai.chronon.spark.Driver"
-            return " ".join(
-                …
+            return " ".join(
+                [
+                    final_args.format(
+                        user_args=user_args,
+                        jar_uri=jar_uri,
+                        job_type=job_type.value,
+                        main_class=main_class,
+                        zipline_version=self._version,
+                        job_id=self.job_id,
+                    ),
+                    "--is-gcp",
+                    f"--gcp-project-id={GcpRunner.get_gcp_project_id()}",
+                    f"--gcp-bigtable-instance-id={GcpRunner.get_gcp_bigtable_instance_id()}",
+                    f"--files={gcs_file_args}" if gcs_file_args else "",
+                ]
+            )
        else:
            raise ValueError(f"Invalid job type: {job_type}")
 
@@ -335,7 +323,6 @@ class GcpRunner(Runner):
            "--streaming-manifest-path": self.streaming_manifest_path,
            "--streaming-checkpoint-path": self.streaming_checkpoint_path,
            "--local-zipline-version": self._version,
-
            # Need these for extracting metadata name in submitter
            "--local-conf-path": self.local_abs_conf_path,
            "--original-mode": self.mode,
@@ -405,19 +392,19 @@ class GcpRunner(Runner):
        # for now only poking for a particular partition is supported.
        args = self._args.get("args")
        supported_subcommands = ["check-partitions"]
-        assert (
-            "…
-        )
-        assert (
-            "…
-        )
+        assert "check-partitions" in args, (
+            f"Must specify one of the following subcommands: {supported_subcommands}"
+        )
+        assert "--partition-names" in args, (
+            "Must specify a list of `--partition-names=schema.table/pk1=pv1/pk2=pv2"
+        )
 
        dataproc_args = self.generate_dataproc_submitter_args(
            # for now, self.conf is the only local file that requires uploading to gcs
            user_args=self._gen_final_args(),
            version=self._version,
            customer_artifact_prefix=self._remote_artifact_prefix,
-            metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None
+            metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None,
        )
        command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
        command_list.append(command)
@@ -446,14 +433,10 @@ class GcpRunner(Runner):
                # the file is copied to root and not the complete path
                # is copied.
                override_conf_path=(
-                    extract_filename_from_path(self.conf)
-                    if self.conf
-                    else None
+                    extract_filename_from_path(self.conf) if self.conf else None
                ),
            ),
-            additional_args=os.environ.get(
-                "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
-            ),
+            additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
        )
 
        dataproc_args = self.generate_dataproc_submitter_args(
@@ -461,11 +444,11 @@ class GcpRunner(Runner):
            user_args=user_args,
            version=self._version,
            customer_artifact_prefix=self._remote_artifact_prefix,
-            metadata_conf_path=str(os.path.join(self.repo, self.conf))
-            …
-            …
-            f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
+            metadata_conf_path=str(os.path.join(self.repo, self.conf))
+            if self.conf
+            else None,
        )
+        command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
        command_list.append(command)
    else:
        user_args = ("{subcommand} {args} {additional_args}").format(
@@ -481,15 +464,15 @@ class GcpRunner(Runner):
                    extract_filename_from_path(self.conf) if self.conf else None
                ),
            ),
-            additional_args=os.environ.get(
-                "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
-            ),
+            additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
        )
        dataproc_args = self.generate_dataproc_submitter_args(
            user_args=user_args,
            version=self._version,
            customer_artifact_prefix=self._remote_artifact_prefix,
-            metadata_conf_path=str(os.path.join(self.repo, self.conf))
+            metadata_conf_path=str(os.path.join(self.repo, self.conf))
+            if self.conf
+            else None,
        )
        command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
        command_list.append(command)
@@ -498,9 +481,7 @@ class GcpRunner(Runner):
        # parallel backfill mode
        with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
            LOG.info(
-                "Running args list {} with pool size {}".format(
-                    command_list, self.parallelism
-                )
+                "Running args list {} with pool size {}".format(command_list, self.parallelism)
            )
            pool.map(check_call, command_list)
    elif len(command_list) == 1:
@@ -518,7 +499,6 @@ class GcpRunner(Runner):
            log[log.index(dataproc_submitter_id_str) + len(dataproc_submitter_id_str) + 1 :]
        ).strip()
 
-
        if not self.disable_cloud_logging and submitted_job_id:
            LOG.info(
                """
@@ -537,17 +517,24 @@ class GcpRunner(Runner):
        # Fetch the final job state
        job_state = GcpRunner.get_state_dataproc_job(submitted_job_id)
 
-        LOG.info(
-            …
+        LOG.info(
+            "<<<<<<<<<<<<<<<<-----------------JOB STATUS----------------->>>>>>>>>>>>>>>>>"
+        )
+        if job_state != "DONE":
+            LOG.info(
+                f"Job {submitted_job_id} is not in DONE state. Current state: {job_state}"
+            )
            raise RuntimeError(f"Job {submitted_job_id} failed.")
        else:
            LOG.info(f"Job {submitted_job_id} is in DONE state.")
            return
 
    # If streaming deploy job, poll and check for final
-    if (
-        …
+    if (
+        submitted_job_id
+        and self.mode in ["streaming", "streaming-client"]
+        and "deploy" in self._args.get("args")
+    ):
        # Poll the dataproc job id for 5 minutes until the job
        total_time_seconds = 5 * 60
        interval_seconds = 10
@@ -555,28 +542,42 @@ class GcpRunner(Runner):
        while time.time() - start_time < total_time_seconds:
            current_state = GcpRunner.get_state_dataproc_job(submitted_job_id)
 
-            non_terminal_states = [
+            non_terminal_states = ["SETUP_DONE", "RUNNING", "PENDING", "STATE_UNSPECIFIED"]
            if current_state not in non_terminal_states:
-                raise RuntimeError(
-                    …
+                raise RuntimeError(
+                    f"Flink job is not in {non_terminal_states}. "
+                    f"Current state: {current_state}"
+                )
 
-            manifest_path = os.path.join(
+            manifest_path = os.path.join(
+                self.streaming_manifest_path, self.conf_metadata_name, "manifest.txt"
+            )
            manifest_exists, raw_manifest = self.download_gcs_to_text(str(manifest_path))
 
            if manifest_exists:
                manifest = raw_manifest.strip()
-                LOG.info(
+                LOG.info(
+                    f"Checking Flink manifest to confirm deployment. Manifest: [{manifest}]"
+                )
                manifest_tuples = manifest.split(",")
 
-                flink_job_id = [
-                    …
+                flink_job_id = [
+                    f.split("=")[1] for f in manifest_tuples if f.startswith("flinkJobId")
+                ][0]
+                parent_job_id = [
+                    f.split("=")[1] for f in manifest_tuples if f.startswith("parentJobId")
+                ][0]
 
                if parent_job_id == submitted_job_id:
-                    LOG.info(
-                        …
+                    LOG.info(
+                        f"Flink job has been deployed successfully. Flink job ID = [{flink_job_id}]."
+                        f" Dataproc job ID = [{submitted_job_id}]"
+                    )
                    break
                else:
-                    LOG.info(
+                    LOG.info(
+                        f"Flink manifest not updated with new Dataproc job id {submitted_job_id}."
+                    )
                LOG.info(f"Sleeping for {interval_seconds} seconds...")
                time.sleep(interval_seconds)
            else:
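
The streaming deploy path above polls a manifest.txt under the streaming manifest path and extracts flinkJobId and parentJobId from it. A standalone sketch of that parsing step, assuming a single comma-separated line of key=value pairs as implied by the code; the sample manifest contents and job id below are made up for illustration:

# Sketch of the manifest check performed after a streaming "deploy" submission.
raw_manifest = "flinkJobId=abc123,parentJobId=dataproc-uuid"  # illustrative values
submitted_job_id = "dataproc-uuid"                            # illustrative value

manifest_tuples = raw_manifest.strip().split(",")
flink_job_id = [f.split("=")[1] for f in manifest_tuples if f.startswith("flinkJobId")][0]
parent_job_id = [f.split("=")[1] for f in manifest_tuples if f.startswith("parentJobId")][0]

# The deployment is treated as confirmed once the manifest's parentJobId matches
# the Dataproc job id that was just submitted.
if parent_job_id == submitted_job_id:
    print(f"Flink job {flink_job_id} deployed by Dataproc job {submitted_job_id}")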
ai/chronon/repo/gitpython_utils.py
CHANGED

@@ -6,9 +6,10 @@ from git import Repo
 def get_default_origin_branch(path, repo: Optional[Repo] = None):
     if not repo:
         repo = Repo(path, search_parent_directories=True)
-    return repo.remotes.origin.refs.HEAD.reference.name.split(…
+    return repo.remotes.origin.refs.HEAD.reference.name.split("/")[-1]
+
 
 def get_current_branch(path, repo: Optional[Repo] = None):
     if not repo:
         repo = Repo(path, search_parent_directories=True)
-    return repo.active_branch.name
+    return repo.active_branch.name