awx-zipline-ai 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (96)
  1. agent/ttypes.py +6 -6
  2. ai/chronon/airflow_helpers.py +20 -23
  3. ai/chronon/cli/__init__.py +0 -0
  4. ai/chronon/cli/compile/__init__.py +0 -0
  5. ai/chronon/cli/compile/column_hashing.py +40 -17
  6. ai/chronon/cli/compile/compile_context.py +13 -17
  7. ai/chronon/cli/compile/compiler.py +59 -36
  8. ai/chronon/cli/compile/conf_validator.py +251 -99
  9. ai/chronon/cli/compile/display/__init__.py +0 -0
  10. ai/chronon/cli/compile/display/class_tracker.py +6 -16
  11. ai/chronon/cli/compile/display/compile_status.py +10 -10
  12. ai/chronon/cli/compile/display/diff_result.py +79 -14
  13. ai/chronon/cli/compile/fill_templates.py +3 -8
  14. ai/chronon/cli/compile/parse_configs.py +10 -17
  15. ai/chronon/cli/compile/parse_teams.py +38 -34
  16. ai/chronon/cli/compile/serializer.py +3 -9
  17. ai/chronon/cli/compile/version_utils.py +42 -0
  18. ai/chronon/cli/git_utils.py +2 -13
  19. ai/chronon/cli/logger.py +0 -2
  20. ai/chronon/constants.py +1 -1
  21. ai/chronon/group_by.py +47 -47
  22. ai/chronon/join.py +46 -32
  23. ai/chronon/logger.py +1 -2
  24. ai/chronon/model.py +9 -4
  25. ai/chronon/query.py +2 -2
  26. ai/chronon/repo/__init__.py +1 -2
  27. ai/chronon/repo/aws.py +17 -31
  28. ai/chronon/repo/cluster.py +121 -50
  29. ai/chronon/repo/compile.py +14 -8
  30. ai/chronon/repo/constants.py +1 -1
  31. ai/chronon/repo/default_runner.py +32 -54
  32. ai/chronon/repo/explore.py +70 -73
  33. ai/chronon/repo/extract_objects.py +6 -9
  34. ai/chronon/repo/gcp.py +89 -88
  35. ai/chronon/repo/gitpython_utils.py +3 -2
  36. ai/chronon/repo/hub_runner.py +145 -55
  37. ai/chronon/repo/hub_uploader.py +2 -1
  38. ai/chronon/repo/init.py +12 -5
  39. ai/chronon/repo/join_backfill.py +19 -5
  40. ai/chronon/repo/run.py +42 -39
  41. ai/chronon/repo/serializer.py +4 -12
  42. ai/chronon/repo/utils.py +72 -63
  43. ai/chronon/repo/zipline.py +3 -19
  44. ai/chronon/repo/zipline_hub.py +211 -39
  45. ai/chronon/resources/__init__.py +0 -0
  46. ai/chronon/resources/gcp/__init__.py +0 -0
  47. ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
  48. ai/chronon/resources/gcp/group_bys/test/data.py +13 -17
  49. ai/chronon/resources/gcp/joins/__init__.py +0 -0
  50. ai/chronon/resources/gcp/joins/test/data.py +4 -8
  51. ai/chronon/resources/gcp/sources/__init__.py +0 -0
  52. ai/chronon/resources/gcp/sources/test/data.py +9 -6
  53. ai/chronon/resources/gcp/teams.py +9 -21
  54. ai/chronon/source.py +2 -4
  55. ai/chronon/staging_query.py +60 -19
  56. ai/chronon/types.py +3 -2
  57. ai/chronon/utils.py +21 -68
  58. ai/chronon/windows.py +2 -4
  59. {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/METADATA +48 -24
  60. awx_zipline_ai-0.3.1.dist-info/RECORD +96 -0
  61. awx_zipline_ai-0.3.1.dist-info/top_level.txt +4 -0
  62. gen_thrift/__init__.py +0 -0
  63. {ai/chronon → gen_thrift}/api/ttypes.py +327 -197
  64. {ai/chronon/api → gen_thrift}/common/ttypes.py +9 -39
  65. gen_thrift/eval/ttypes.py +660 -0
  66. {ai/chronon → gen_thrift}/hub/ttypes.py +12 -131
  67. {ai/chronon → gen_thrift}/observability/ttypes.py +343 -180
  68. {ai/chronon → gen_thrift}/planner/ttypes.py +326 -45
  69. ai/chronon/eval/__init__.py +0 -122
  70. ai/chronon/eval/query_parsing.py +0 -19
  71. ai/chronon/eval/sample_tables.py +0 -100
  72. ai/chronon/eval/table_scan.py +0 -186
  73. ai/chronon/orchestration/ttypes.py +0 -4406
  74. ai/chronon/resources/gcp/README.md +0 -174
  75. ai/chronon/resources/gcp/zipline-cli-install.sh +0 -54
  76. awx_zipline_ai-0.2.1.dist-info/RECORD +0 -93
  77. awx_zipline_ai-0.2.1.dist-info/licenses/LICENSE +0 -202
  78. awx_zipline_ai-0.2.1.dist-info/top_level.txt +0 -3
  79. /jars/__init__.py → /__init__.py +0 -0
  80. {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/WHEEL +0 -0
  81. {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/entry_points.txt +0 -0
  82. {ai/chronon → gen_thrift}/api/__init__.py +0 -0
  83. {ai/chronon/api/common → gen_thrift/api}/constants.py +0 -0
  84. {ai/chronon/api → gen_thrift}/common/__init__.py +0 -0
  85. {ai/chronon/api → gen_thrift/common}/constants.py +0 -0
  86. {ai/chronon/fetcher → gen_thrift/eval}/__init__.py +0 -0
  87. {ai/chronon/fetcher → gen_thrift/eval}/constants.py +0 -0
  88. {ai/chronon/hub → gen_thrift/fetcher}/__init__.py +0 -0
  89. {ai/chronon/hub → gen_thrift/fetcher}/constants.py +0 -0
  90. {ai/chronon → gen_thrift}/fetcher/ttypes.py +0 -0
  91. {ai/chronon/observability → gen_thrift/hub}/__init__.py +0 -0
  92. {ai/chronon/observability → gen_thrift/hub}/constants.py +0 -0
  93. {ai/chronon/orchestration → gen_thrift/observability}/__init__.py +0 -0
  94. {ai/chronon/orchestration → gen_thrift/observability}/constants.py +0 -0
  95. {ai/chronon → gen_thrift}/planner/__init__.py +0 -0
  96. {ai/chronon → gen_thrift}/planner/constants.py +0 -0
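Beyond the file-level churn, the listing shows the generated Thrift modules moving out of the ai.chronon namespace into a new top-level gen_thrift package (entries 62-68 and 82-96). A minimal sketch of how a downstream import would shift, assuming the generated classes keep their names across the move:

    # 0.2.1 layout
    import ai.chronon.api.ttypes as api

    # 0.3.1 layout (assumed equivalent; generated class names such as GroupBy/Join are expected to be unchanged)
    import gen_thrift.api.ttypes as api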
@@ -1,122 +0,0 @@
- from typing import Any, List
-
- from pyspark.sql import DataFrame, SparkSession
-
- import ai.chronon.api.ttypes as chronon
- from ai.chronon.eval.query_parsing import get_tables_from_query
- from ai.chronon.eval.sample_tables import sample_tables, sample_with_query
- from ai.chronon.eval.table_scan import (
-     TableScan,
-     clean_table_name,
-     table_scans_in_group_by,
-     table_scans_in_join,
-     table_scans_in_source,
- )
-
-
- def eval(obj: Any) -> List[DataFrame]:
-
-     if isinstance(obj, chronon.Source):
-         return _run_table_scans(table_scans_in_source(obj))
-
-     elif isinstance(obj, chronon.GroupBy):
-         return _run_table_scans(table_scans_in_group_by(obj))
-
-     elif isinstance(obj, chronon.Join):
-         return _run_table_scans(table_scans_in_join(obj))
-
-     elif isinstance(obj, chronon.StagingQuery):
-         return _sample_and_eval_query(_render_staging_query(obj))
-
-     elif isinstance(obj, str):
-         has_white_spaces = any(char.isspace() for char in obj)
-         if has_white_spaces:
-             return _sample_and_eval_query(obj)
-         else:
-             return _sample_and_eval_query(f"SELECT * FROM {obj} LIMIT 1000")
-
-     elif isinstance(obj, chronon.Model):
-         _run_table_scans(table_scans_in_source(obj.source))
-
-     else:
-         raise Exception(f"Unsupported object type for: {obj}")
-
-
- def _sample_and_eval_query(query: str) -> DataFrame:
-
-     table_names = get_tables_from_query(query)
-     sample_tables(table_names)
-
-     clean_query = query
-     for table_name in table_names:
-         clean_name = clean_table_name(table_name)
-         clean_query = clean_query.replace(table_name, clean_name)
-
-     return _run_query(clean_query)
-
-
- def _run_query(query: str) -> DataFrame:
-     spark = _get_spark()
-     return spark.sql(query)
-
-
- def _sample_table_scan(table_scan: TableScan) -> str:
-     table = table_scan.table
-     output_path = table_scan.output_path()
-     query = table_scan.raw_scan_query(local_table_view=False)
-     return sample_with_query(table, query, output_path)
-
-
- def _run_table_scans(table_scans: List[TableScan]) -> List[DataFrame]:
-     spark = _get_spark()
-     df_list = []
-
-     for table_scan in table_scans:
-         output_path = table_scan.output_path()
-
-         status = " (exists)" if output_path.exists() else ""
-         print(
-             f"table: {table_scan.table}\n"
-             f"view: {table_scan.view_name()}\n"
-             f"local_file: {output_path}{status}\n"
-         )
-
-     for table_scan in table_scans:
-
-         view_name = table_scan.view_name()
-         output_path = _sample_table_scan(table_scan)
-
-         print(f"Creating view {view_name} from parquet file {output_path}")
-         df = spark.read.parquet(str(output_path))
-         df.createOrReplaceTempView(view_name)
-
-         scan_query = table_scan.scan_query(local_table_view=True)
-         print(f"Scanning {table_scan.table} with query: \n{scan_query}\n")
-         df = spark.sql(scan_query)
-         df.show(5)
-         df_list.append(df)
-
-     return df_list
-
-
- _spark: SparkSession = None
-
-
- def _get_spark() -> SparkSession:
-     global _spark
-     if not _spark:
-         _spark = (
-             SparkSession.builder.appName("Chronon Evaluator")
-             .config("spark.driver.bindAddress", "127.0.0.1")
-             .config("spark.driver.host", "127.0.0.1")
-             .config("spark.sql.parquet.columnarReaderBatchSize", "16")
-             .config("spark.executor.memory", "4g")
-             .config("spark.driver.memory", "4g")
-             .config("spark.driver.maxResultSize", "2g")
-             .getOrCreate()
-         )
-     return _spark
-
-
- def _render_staging_query(staging_query: chronon.StagingQuery) -> str:
-     raise NotImplementedError("Staging query evals are not yet implemented")
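The hunk above removes the local evaluation entry point (ai/chronon/eval/__init__.py) that sampled upstream tables and replayed their scans through a local Spark session. For reference, a rough sketch of how it was called against the 0.2.1 layout (my_group_by is a hypothetical GroupBy definition; pyspark and warehouse credentials are assumed to be available):

    from ai.chronon.eval import eval as chronon_eval

    # samples each source table to local parquet, registers temp views,
    # and returns one DataFrame per table scan
    dfs = chronon_eval(my_group_by)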
@@ -1,19 +0,0 @@
- from typing import List
-
-
- def get_tables_from_query(sql_query) -> List[str]:
-     import sqlglot
-
-     # Parse the query
-     parsed = sqlglot.parse_one(sql_query, dialect="bigquery")
-
-     # Extract all table references
-     tables = parsed.find_all(sqlglot.exp.Table)
-
-     table_names = []
-     for table in tables:
-         name_parts = [part for part in [table.catalog, table.db, table.name] if part]
-         table_name = ".".join(name_parts)
-         table_names.append(table_name)
-
-     return table_names
@@ -1,100 +0,0 @@
- import os
- from typing import List
-
- from ai.chronon.eval.table_scan import local_warehouse
-
-
- def sample_with_query(table, query, output_path) -> str:
-     # if file exists, skip
-     if os.path.exists(output_path):
-         print(f"File {output_path} already exists. Skipping sampling.")
-         return output_path
-
-     raw_scan_query = query
-     print(f"Sampling {table} with query: {raw_scan_query}")
-
-     _sample_internal(raw_scan_query, output_path)
-     return output_path
-
-
- def sample_tables(table_names: List[str]) -> None:
-
-     for table in table_names:
-         query = f"SELECT * FROM {table} LIMIT 10000"
-         sample_with_query(table, query, local_warehouse / f"{table}.parquet")
-
-
- _sampling_engine = os.getenv("CHRONON_SAMPLING_ENGINE", "bigquery")
-
-
- def _sample_internal(query, output_path) -> str:
-     if _sampling_engine == "bigquery":
-         _sample_bigquery(query, output_path)
-     elif _sampling_engine == "trino":
-         _sample_trino(query, output_path)
-     else:
-         raise ValueError("Invalid sampling engine")
-
-
- def _sample_trino(query, output_path):
-     raise NotImplementedError("Trino sampling is not yet implemented")
-
-
- def _sample_bigquery(query, output_path):
-
-     from google.cloud import bigquery
-
-     project_id = os.getenv("GCP_PROJECT_ID")
-     assert project_id, "Please set the GCP_PROJECT_ID environment variable"
-
-     client = bigquery.Client(project=project_id)
-
-     results = client.query_and_wait(query)
-
-     df = results.to_dataframe()
-     df.to_parquet(output_path)
-
-
- def _sample_bigquery_fast(query, destination_path):
-     import os
-
-     import pyarrow.parquet as pq
-     from google.cloud import bigquery
-     from google.cloud.bigquery_storage import BigQueryReadClient
-     from google.cloud.bigquery_storage_v1.types import DataFormat, ReadSession
-
-     project_id = os.getenv("GCP_PROJECT_ID")
-     assert project_id, "Please set the GCP_PROJECT_ID environment variable"
-
-     client = bigquery.Client(project=project_id)
-     bqstorage_client = BigQueryReadClient()
-
-     # Create query job
-     query_job = client.query(query)
-     table_ref = query_job.destination
-
-     # Create read session
-     read_session = ReadSession()
-     read_session.table = table_ref.to_bqstorage()
-     read_session.data_format = DataFormat.ARROW
-
-     print("Fetching from BigQuery... (this might take a while)")
-
-     session = bqstorage_client.create_read_session(
-         parent=f"projects/{client.project}",
-         read_session=read_session,
-         max_stream_count=1,
-     )
-
-     print("Writing to local parquet file...")
-
-     # Read using Arrow
-     stream = bqstorage_client.read_rows(session.streams[0].name)
-     table = stream.to_arrow(read_session=session)
-
-     # Write to Parquet directly
-     pq.write_table(table, destination_path)
-
-     print(f"Wrote results to {destination_path}")
-
-     return destination_path
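The sampling module removed above was configured entirely through environment variables. A short sketch of the knobs it read, with defaults taken from the code (the project id value is a placeholder):

    import os

    os.environ.setdefault("CHRONON_SAMPLING_ENGINE", "bigquery")  # "trino" existed only as a stub
    os.environ.setdefault("GCP_PROJECT_ID", "my-gcp-project")     # required by the BigQuery path; placeholder value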
@@ -1,186 +0,0 @@
- import hashlib
- import os
- import re
- from dataclasses import dataclass
- from datetime import datetime, timedelta
- from pathlib import Path
- from typing import List, Tuple
-
- import ai.chronon.api.ttypes as chronon
-
-
- def clean_table_name(name: str) -> str:
-     return re.sub(r"[^a-zA-Z0-9_]", "_", name)
-
-
- local_warehouse = Path(os.getenv("CHRONON_ROOT", os.getcwd())) / "local_warehouse"
- limit = int(os.getenv("SAMPLE_LIMIT", "100"))
- # create local_warehouse if it doesn't exist
- local_warehouse.mkdir(parents=True, exist_ok=True)
-
-
- @dataclass
- class TableScan:
-     table: str
-     partition_col: str
-     partition_date: str
-     query: chronon.Query
-     is_mutations: bool = False
-
-     def output_path(self) -> str:
-         return Path(local_warehouse) / f"{self.view_name()}.parquet"
-
-     def view_name(self) -> str:
-         return clean_table_name(self.table) + "_" + self.where_id()
-
-     def table_name(self, local_table_view) -> str:
-         return self.view_name() if local_table_view else self.table
-
-     def where_id(self) -> str:
-         return "_" + hashlib.md5(self.where_block().encode()).hexdigest()[:3]
-
-     def where_block(self) -> str:
-         wheres = []
-         partition_scan = f"{self.partition_col} = '{self.partition_date}'"
-         wheres.append(partition_scan)
-
-         if self.query.wheres:
-             wheres.extend(self.query.wheres)
-
-         return " AND\n ".join([f"({where})" for where in wheres])
-
-     def raw_scan_query(self, local_table_view: bool = True) -> str:
-         return f"""
-         SELECT * FROM {self.table_name(local_table_view)}
-         WHERE
-             {self.where_block()}
-         LIMIT {limit}
-         """
-
-     def scan_query(self, local_table_view=True) -> str:
-         selects = []
-         base_selects = self.query.selects.copy()
-
-         if self.is_mutations:
-             base_selects["is_before"] = coalesce(self.query.reversalColumn, "is_before")
-             base_selects["mutation_ts"] = coalesce(
-                 self.query.mutationTimeColumn, "mutation_ts"
-             )
-
-         if self.query.timeColumn:
-             base_selects["ts"] = coalesce(self.query.timeColumn, "ts")
-
-         for k, v in base_selects.items():
-             selects.append(f"{v} as {k}")
-         select_clauses = ",\n ".join(selects)
-
-         return f"""
-         SELECT
-             {select_clauses}
-         FROM
-             {self.table_name(local_table_view)}
-         WHERE
-             {self.where_block()}
-         LIMIT
-             {limit}
-         """
-
-
- # TODO: use teams.py to get the default date column
- DEFAULT_DATE_COLUMN = "_date"
- DEFAULT_DATE_FORMAT = "%Y-%m-%d"
-
- two_days_ago = (datetime.now() - timedelta(days=2)).strftime(DEFAULT_DATE_FORMAT)
-
- _sample_date = os.getenv("SAMPLE_DATE", two_days_ago)
-
-
- def get_date(query: chronon.Query) -> Tuple[str, str]:
-     assert query and query.selects, "please specify source.query.selects"
-
-     partition_col = query.selects.get("ds", DEFAULT_DATE_COLUMN)
-     partition_date = coalesce(query.endPartition, _sample_date)
-
-     return (partition_col, partition_date)
-
-
- def coalesce(*args):
-     for arg in args:
-         if arg:
-             return arg
-
-
- def table_scans_in_source(source: chronon.Source) -> List[TableScan]:
-     result = []
-
-     if not source:
-         return result
-
-     if source.entities:
-         query: chronon.Query = source.entities.query
-         col, date = get_date(query)
-
-         snapshot = TableScan(source.entities.snapshotTable, col, date, query)
-         result.append(snapshot)
-
-         if source.entities.mutationTable:
-             mutations = TableScan(source.entities.mutationTable, col, date, query, True)
-             result.append(mutations)
-
-     if source.events:
-         query = source.events.query
-         col, date = get_date(query)
-         table = TableScan(source.events.table, col, date, query)
-         result.append(table)
-
-     if source.joinSource:
-         result.extend(table_scans_in_source(source.joinSource.join.left))
-
-     return result
-
-
- def table_scans_in_sources(sources: List[chronon.Source]) -> List[TableScan]:
-     result = []
-
-     for source in sources:
-         result.extend(table_scans_in_source(source))
-
-     return result
-
-
- def table_scans_in_group_by(gb: chronon.GroupBy) -> List[TableScan]:
-     if not gb:
-         return []
-
-     return table_scans_in_sources(gb.sources)
-
-
- def table_scans_in_join(join: chronon.Join) -> List[TableScan]:
-
-     result = []
-
-     if not join:
-         return result
-
-     result.extend(table_scans_in_source(join.left))
-
-     parts: List[chronon.JoinPart] = join.joinParts
-     if parts:
-         for part in parts:
-             result.extend(table_scans_in_group_by(part.groupBy))
-
-     bootstraps: List[chronon.BootstrapPart] = join.bootstrapParts
-     if bootstraps:
-         for bootstrap in bootstraps:
-             query = bootstrap.query
-             col, date = get_date(query)
-             bootstrap = TableScan(bootstrap.table, col, date, query)
-
-             result.append(bootstrap)
-
-     if join.labelParts:
-         labelParts: List[chronon.JoinPart] = join.labelParts.labels
-         for part in labelParts:
-             result.extend(table_scans_in_sources(part.groupBy))
-
-     return result
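TableScan, defined in the hunk above, is the unit the removed evaluation code was built around: it pins one partition date and renders a LIMITed scan of a source table. A rough illustration against the 0.2.1 code, using a hypothetical table and Query (field names are taken from the code above; the exact whitespace of the rendered SQL differs):

    from ai.chronon.api.ttypes import Query
    from ai.chronon.eval.table_scan import TableScan

    # hypothetical source: one select column plus a user where-clause
    q = Query(selects={"user_id": "user_id"}, wheres=["event_type = 'purchase'"])
    scan = TableScan(table="project.dataset.events", partition_col="ds",
                     partition_date="2025-01-01", query=q)

    print(scan.raw_scan_query(local_table_view=False))
    # roughly: SELECT * FROM project.dataset.events
    #          WHERE (ds = '2025-01-01') AND (event_type = 'purchase')
    #          LIMIT 100   (limit comes from SAMPLE_LIMIT, default 100)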