mage-ai 0.8.26__py3-none-any.whl → 0.8.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mage-ai might be problematic.
- mage_ai/data_preparation/executors/streaming_pipeline_executor.py +2 -1
- mage_ai/data_preparation/logging/logger_manager.py +7 -1
- mage_ai/data_preparation/models/block/__init__.py +60 -29
- mage_ai/data_preparation/models/block/sql/__init__.py +25 -8
- mage_ai/data_preparation/models/block/sql/utils/shared.py +49 -3
- mage_ai/data_preparation/models/variable.py +6 -1
- mage_ai/data_preparation/repo_manager.py +5 -2
- mage_ai/data_preparation/shared/secrets.py +6 -3
- mage_ai/data_preparation/templates/sensors/bigquery.py +32 -0
- mage_ai/data_preparation/templates/sensors/mysql.py +33 -0
- mage_ai/data_preparation/templates/sensors/postgres.py +33 -0
- mage_ai/data_preparation/templates/sensors/redshift.py +33 -0
- mage_ai/data_preparation/templates/sensors/s3.py +11 -7
- mage_ai/data_preparation/templates/sensors/snowflake.py +33 -0
- mage_ai/io/postgres.py +13 -1
- mage_ai/server/constants.py +1 -1
- mage_ai/server/frontend_dist/404.html +2 -2
- mage_ai/server/frontend_dist/404.html.html +2 -2
- mage_ai/server/frontend_dist/_next/static/WbTBCvyjQQ9UFFLZOU1E5/_buildManifest.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/2344-f8ae030d6a6863ae.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/2626-e7fa4f83f8214c97.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/{4178-663d9f70bffc7a47.js → 4178-a6d1bd2be4706f51.js} +1 -1
- mage_ai/server/frontend_dist/_next/static/chunks/4261-88b0103fad331620.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/5141-57c3868a80196da8.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/6166-705b4fdecaf11e63.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/6532-b1bd0b3f422abec8.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/8180-8de652170ea5ed93.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/839-15c54471a9a9bf2e.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/{9386-9d6a4e5836229264.js → 9386-d4cc11bab74eec8d.js} +1 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/{_app-624f87faa4b5ee9a.js → _app-5f3dbed367342a3d.js} +1 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/manage-3046bc53d24917c7.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipeline-runs-f6059e27e601627c.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/backfills/{[...slug]-123556bdfe2e194b.js → [...slug]-050ef37b6672100a.js} +1 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/backfills-5f95bb4c3a2d7d46.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/edit-270c0198eeef1542.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/logs-c9f1df40e0aa6981.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/monitors/{block-runs-e0eb0098dcbf27ac.js → block-runs-d74850779dbe87b3.js} +1 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/monitors/block-runtime-848544a58563dbec.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/{monitors-204daac985c03b62.js → monitors-675171cfd7d7b346.js} +1 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/runs/{[run]-44533e244974a422.js → [run]-b2955f0ff960894e.js} +1 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/{runs-2d41695001370abc.js → runs-219960b3cc4742e3.js} +1 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/syncs-1767a2f57f887ef7.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/triggers/[...slug]-0f373aaa7deb98c9.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/{triggers-181343d8eb894426.js → triggers-bea0439ca2a862ba.js} +1 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines-7446a70bdd8381a5.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/settings/workspace/users-d72dfc596e943cc4.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/terminal-4c9ad80f8f9d1074.js +1 -0
- mage_ai/server/frontend_dist/_next/static/chunks/pages/triggers-783b9526167f1249.js +1 -0
- mage_ai/server/frontend_dist/index.html +2 -2
- mage_ai/server/frontend_dist/manage.html +4 -4
- mage_ai/server/frontend_dist/pipeline-runs.html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/backfills/[...slug].html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/backfills.html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/edit.html +2 -2
- mage_ai/server/frontend_dist/pipelines/[pipeline]/logs.html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/monitors/block-runs.html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/monitors/block-runtime.html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/monitors.html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/runs/[run].html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/runs.html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/syncs.html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/triggers/[...slug].html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline]/triggers.html +5 -5
- mage_ai/server/frontend_dist/pipelines/[pipeline].html +2 -2
- mage_ai/server/frontend_dist/pipelines.html +5 -5
- mage_ai/server/frontend_dist/settings/account/profile.html +5 -5
- mage_ai/server/frontend_dist/settings/workspace/preferences.html +5 -5
- mage_ai/server/frontend_dist/settings/workspace/sync-data.html +5 -5
- mage_ai/server/frontend_dist/settings/workspace/users.html +5 -5
- mage_ai/server/frontend_dist/settings.html +2 -2
- mage_ai/server/frontend_dist/sign-in.html +9 -9
- mage_ai/server/frontend_dist/terminal.html +5 -5
- mage_ai/server/frontend_dist/test.html +3 -3
- mage_ai/server/frontend_dist/triggers.html +5 -5
- mage_ai/server/server.py +1 -6
- mage_ai/shared/security.py +9 -2
- mage_ai/shared/utils.py +7 -0
- mage_ai/tests/shared/test_security.py +6 -6
- {mage_ai-0.8.26.dist-info → mage_ai-0.8.27.dist-info}/METADATA +1 -1
- {mage_ai-0.8.26.dist-info → mage_ai-0.8.27.dist-info}/RECORD +86 -81
- mage_ai/server/frontend_dist/_next/static/chunks/1830-3882c19d710feadd.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/2344-a82a406b72fe782b.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/2626-30c0fab7c3926578.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/3688-562e0f129b09d1cd.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/3699-dcc0946dd0709216.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/4463-777e71000be29fc4.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/6532-baf1818fbc89e6c1.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/6567-2488118bb39a9d99.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/manage-c580ee38f5442bef.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipeline-runs-dd72d2f3375064cf.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/backfills-26974ca695994804.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/edit-df603787c041cd8a.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/logs-72011b08ef047531.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/monitors/block-runtime-3b17a6d28cdde471.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/syncs-c31fec2be1160dbe.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines/[pipeline]/triggers/[...slug]-3a60003fce7dfb93.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/pipelines-ebb4a57934e4fa52.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/settings/workspace/users-ff24167e21f2f0cf.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/terminal-be9eab29e8ed712b.js +0 -1
- mage_ai/server/frontend_dist/_next/static/chunks/pages/triggers-e9c1789f6d5a0429.js +0 -1
- mage_ai/server/frontend_dist/_next/static/kiWhqtXdRSgsbVPwfDLY4/_buildManifest.js +0 -1
- /mage_ai/server/frontend_dist/_next/static/{kiWhqtXdRSgsbVPwfDLY4 → WbTBCvyjQQ9UFFLZOU1E5}/_middlewareManifest.js +0 -0
- /mage_ai/server/frontend_dist/_next/static/{kiWhqtXdRSgsbVPwfDLY4 → WbTBCvyjQQ9UFFLZOU1E5}/_ssgManifest.js +0 -0
- {mage_ai-0.8.26.dist-info → mage_ai-0.8.27.dist-info}/LICENSE +0 -0
- {mage_ai-0.8.26.dist-info → mage_ai-0.8.27.dist-info}/WHEEL +0 -0
- {mage_ai-0.8.26.dist-info → mage_ai-0.8.27.dist-info}/entry_points.txt +0 -0
- {mage_ai-0.8.26.dist-info → mage_ai-0.8.27.dist-info}/top_level.txt +0 -0
mage_ai/data_preparation/executors/streaming_pipeline_executor.py
CHANGED

@@ -3,6 +3,7 @@ from mage_ai.data_preparation.executors.pipeline_executor import PipelineExecutor
 from mage_ai.data_preparation.models.constants import BlockType
 from mage_ai.data_preparation.models.pipeline import Pipeline
 from mage_ai.data_preparation.shared.stream import StreamToLogger
+from mage_ai.shared.hash import merge_dict
 from typing import Callable, Dict, List, Union
 import os
 import yaml
@@ -83,7 +84,7 @@ class StreamingPipelineExecutor(PipelineExecutor):
         if not build_block_output_stdout:
             self.logger.exception(
                 f'Failed to execute streaming pipeline {self.pipeline.uuid}',
-                error=e,
+                **merge_dict(dict(error=e), tags),
             )
             raise e
mage_ai/data_preparation/logging/logger_manager.py
CHANGED

@@ -7,6 +7,8 @@ import io
 import logging
 import os
 
+MAX_LOG_FILE_SIZE = 5 * 1024 * 1024
+
 
 class LoggerManager:
     def __init__(
@@ -50,7 +52,11 @@ class LoggerManager:
             handler = self.create_stream_handler()
         else:
             log_filepath = self.get_log_filepath(create_dir=True)
-            handler = logging.
+            handler = logging.handlers.RotatingFileHandler(
+                log_filepath,
+                backupCount=10,
+                maxBytes=MAX_LOG_FILE_SIZE,
+            )
 
         handler.setLevel(self.log_level)
         handler.setFormatter(self.formatter)
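For context, logging.handlers.RotatingFileHandler is part of the Python standard library: once the log file reaches maxBytes it is rolled over, and at most backupCount older files are kept. A minimal standalone sketch of the behavior this change enables (the log path and logger name are illustrative only):

import logging
import logging.handlers

MAX_LOG_FILE_SIZE = 5 * 1024 * 1024  # 5 MiB, mirroring the new constant above

# Illustrative path; mage-ai computes the real path via get_log_filepath().
handler = logging.handlers.RotatingFileHandler(
    'pipeline.log',
    maxBytes=MAX_LOG_FILE_SIZE,
    backupCount=10,  # keeps pipeline.log.1 ... pipeline.log.10
)
logger = logging.getLogger('example')
logger.addHandler(handler)
logger.warning('rolls over to pipeline.log.1 once the file exceeds maxBytes')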
mage_ai/data_preparation/models/block/__init__.py
CHANGED

@@ -4,6 +4,7 @@ from inspect import Parameter, signature
 from logging import Logger
 from mage_ai.data_cleaner.shared.utils import (
     is_geo_dataframe,
+    is_spark_dataframe,
 )
 from mage_ai.data_preparation.models.block.extension.utils import handle_run_tests
 from mage_ai.data_preparation.models.block.utils import (
@@ -50,7 +51,6 @@ import functools
 import json
 import os
 import pandas as pd
-import re
 import simplejson
 import sys
 import time
@@ -252,6 +252,10 @@ class Block:
         self.dynamic_block_uuid = None
         self.dynamic_upstream_block_uuids = None
 
+        # Spark session
+        self.spark = None
+        self.spark_init = False
+
     @property
     def uuid(self):
         return self.dynamic_block_uuid or self._uuid
@@ -348,34 +352,22 @@ class Block:
     @property
     def full_table_name(self) -> str:
         from mage_ai.data_preparation.models.block.sql.utils.shared import (
-
+            extract_create_statement_table_name,
+            extract_insert_statement_table_names,
         )
 
         if not self.content:
             return None
 
-
-
-
-            r'\(',
-        )
-
-        if not statement_partial:
-            matches = re.findall(
-                r'insert(?: overwrite)*(?: into)*[\s]+([\w.]+)',
-                self.content,
-                re.IGNORECASE,
-            )
-            if len(matches) >= 1:
-                return matches[len(matches) - 1]
-            else:
-                return None
+        table_name = extract_create_statement_table_name(self.content)
+        if table_name:
+            return table_name
 
-
+        matches = extract_insert_statement_table_names(self.content)
+        if len(matches) == 0:
             return None
 
-
-        return parts[-1]
+        return matches[len(matches) - 1]
 
     @classmethod
     def after_create(self, block: 'Block', **kwargs):
@@ -1053,7 +1045,6 @@ class Block:
             block_uuid,
             partition=execution_partition,
         )
-
         if not include_print_outputs:
             all_variables = self.output_variables(execution_partition=execution_partition)
 
@@ -1063,6 +1054,7 @@ class Block:
             block_uuid,
             v,
             partition=execution_partition,
+            spark=self.__get_spark_session(),
         )
 
         if variable_type is not None and variable_object.variable_type != variable_type:
@@ -1071,6 +1063,7 @@ class Block:
             data = variable_object.read_data(
                 sample=True,
                 sample_count=sample_count,
+                spark=self.__get_spark_session(),
             )
             if type(data) is pd.DataFrame:
                 try:
@@ -1130,6 +1123,19 @@ df = get_variable('{self.pipeline.uuid}', '{self.uuid}', 'df')
                     type=DataType.TEXT,
                     variable_uuid=v,
                 )
+            elif is_spark_dataframe(data):
+                df = data.toPandas()
+                columns_to_display = df.columns.tolist()[:DATAFRAME_ANALYSIS_MAX_COLUMNS]
+                data = dict(
+                    sample_data=dict(
+                        columns=columns_to_display,
+                        rows=json.loads(df[columns_to_display].to_json(orient='split'))['data']
+                    ),
+                    type=DataType.TABLE,
+                    variable_uuid=v,
+                )
+                data_products.append(data)
+                continue
             outputs.append(data)
         return outputs + data_products
 
@@ -1166,6 +1172,7 @@ df = get_variable('{self.pipeline.uuid}', '{self.uuid}', 'df')
             block_uuid,
             v,
             partition=execution_partition,
+            spark=self.__get_spark_session(),
        )
 
         if variable_type is not None and variable_object.variable_type != variable_type:
@@ -1174,6 +1181,7 @@ df = get_variable('{self.pipeline.uuid}', '{self.uuid}', 'df')
             data = await variable_object.read_data_async(
                 sample=True,
                 sample_count=sample_count,
+                spark=self.__get_spark_session(),
             )
             if type(data) is pd.DataFrame:
                 try:
@@ -1233,6 +1241,19 @@ df = get_variable('{self.pipeline.uuid}', '{block_uuid}', 'df')
                     type=DataType.TEXT,
                     variable_uuid=v,
                 )
+            elif is_spark_dataframe(data):
+                df = data.toPandas()
+                columns_to_display = df.columns.tolist()[:DATAFRAME_ANALYSIS_MAX_COLUMNS]
+                data = dict(
+                    sample_data=dict(
+                        columns=columns_to_display,
+                        rows=json.loads(df[columns_to_display].to_json(orient='split'))['data']
+                    ),
+                    type=DataType.TABLE,
+                    variable_uuid=v,
+                )
+                data_products.append(data)
+                continue
             outputs.append(data)
         return outputs + data_products
 
@@ -1663,14 +1684,23 @@ df = get_variable('{self.pipeline.uuid}', '{block_uuid}', 'df')
                 is_spark_env()):
             global_vars = global_vars or dict()
             if not global_vars.get('spark'):
-
-
-                global_vars['spark'] =
-                    os.getenv('SPARK_MASTER_HOST', 'local')).getOrCreate()
-                except Exception:
-                    pass
+                spark = self.__get_spark_session()
+                if spark is not None:
+                    global_vars['spark'] = spark
         return global_vars
 
+    def __get_spark_session(self):
+        if self.spark_init:
+            return self.spark
+        try:
+            from pyspark.sql import SparkSession
+            self.spark = SparkSession.builder.master(
+                os.getenv('SPARK_MASTER_HOST', 'local')).getOrCreate()
+        except Exception:
+            self.spark = None
+        self.spark_init = True
+        return self.spark
+
     def __store_variables_prepare(
         self,
         variable_mapping: Dict,
@@ -1722,7 +1752,8 @@ df = get_variable('{self.pipeline.uuid}', '{block_uuid}', 'df')
             dynamic_block_uuid,
         )
         for uuid, data in variables_data['variable_mapping'].items():
-            if spark is not None and type
+            if spark is not None and self.pipeline.type == PipelineType.PYSPARK \
+                    and type(data) is pd.DataFrame:
                 data = spark.createDataFrame(data)
             self.pipeline.variable_manager.add_variable(
                 self.pipeline.uuid,
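The new __get_spark_session helper memoizes the SparkSession: initialization is attempted once, the result (or None on failure) is cached, and later calls return the cached value. A standalone sketch of the same lazy-init pattern, with class and attribute names chosen only for illustration:

import os

class SparkSessionCache:
    """Caches a SparkSession so failed or repeated initialization is not retried."""

    def __init__(self):
        self._session = None
        self._initialized = False

    def get(self):
        if self._initialized:
            return self._session
        try:
            from pyspark.sql import SparkSession
            self._session = SparkSession.builder.master(
                os.getenv('SPARK_MASTER_HOST', 'local')).getOrCreate()
        except Exception:
            self._session = None  # pyspark missing or the master is unreachable
        self._initialized = True
        return self._session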
mage_ai/data_preparation/models/block/sql/__init__.py
CHANGED

@@ -9,6 +9,7 @@ from mage_ai.data_preparation.models.block.sql import (
     trino,
 )
 from mage_ai.data_preparation.models.block.sql.utils.shared import (
+    has_create_or_insert_statement,
     interpolate_vars,
 )
 from mage_ai.data_preparation.models.constants import BlockType
@@ -389,11 +390,15 @@ def split_query_string(query_string: str) -> List[str]:
     arr = []
     for query in queries:
         query = query.strip()
+        if not query:
+            continue
+
+        lines = query.split('\n')
+        query = '\n'.join(list(filter(lambda x: not x.startswith('--'), lines)))
+        query = query.strip()
+        query = re.sub(MAGE_SEMI_COLON, ';', query)
+
         if query:
-            lines = query.split('\n')
-            query = '\n'.join(list(filter(lambda x: not x.startswith('--'), lines)))
-            query = query.strip()
-            query = re.sub(MAGE_SEMI_COLON, ';', query)
             arr.append(query)
 
     return arr
@@ -409,11 +414,23 @@ def execute_raw_sql(
     queries = []
     fetch_query_at_indexes = []
 
-
-        queries.append(query)
-        fetch_query_at_indexes.append(False)
+    has_create_or_insert = has_create_or_insert_statement(query_string)
 
-
+    for query in split_query_string(query_string):
+        if has_create_or_insert:
+            queries.append(query)
+            fetch_query_at_indexes.append(False)
+        else:
+            if should_query:
+                query = f"""SELECT *
+FROM (
+{query}
+) AS {block.table_name}__limit
+LIMIT 1000"""
+                queries.append(query)
+                fetch_query_at_indexes.append(True)
+
+    if should_query and has_create_or_insert:
         queries.append(f'SELECT * FROM {block.full_table_name} LIMIT 1000')
         fetch_query_at_indexes.append(block.full_table_name)
 
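The reworked loop in split_query_string drops '--' comment lines and empty fragments before deciding whether to keep a statement. A self-contained sketch of that filtering on a sample query string (the sample SQL and the plain ';' split are illustrative; the real function also restores an escaped-semicolon placeholder via MAGE_SEMI_COLON):

query_string = """
-- create the target table
CREATE TABLE IF NOT EXISTS analytics.users (id INT);

-- this fragment is only a comment
;

INSERT INTO analytics.users SELECT 1;
"""

statements = []
for query in query_string.split(';'):
    query = query.strip()
    if not query:
        continue
    lines = query.split('\n')
    query = '\n'.join(line for line in lines if not line.startswith('--'))
    query = query.strip()
    if query:
        statements.append(query)

print(statements)
# ['CREATE TABLE IF NOT EXISTS analytics.users (id INT)', 'INSERT INTO analytics.users SELECT 1']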
mage_ai/data_preparation/models/block/sql/utils/shared.py
CHANGED

@@ -47,23 +47,33 @@ def interpolate_input(block, query, replace_func=None):
     for idx, upstream_block in enumerate(block.upstream_blocks):
         matcher1 = '{} df_{} {}'.format('{{', idx + 1, '}}')
 
-
+        is_sql = BlockLanguage.SQL == upstream_block.language
+        if is_sql:
             configuration = upstream_block.configuration
         else:
             configuration = block.configuration
+        use_raw_sql = configuration.get('use_raw_sql')
 
         database = configuration.get('data_provider_database', '')
         schema = configuration.get('data_provider_schema', '')
 
+        replace_with = __replace_func(database, schema, upstream_block.table_name)
+        upstream_block_content = upstream_block.content
+        if is_sql and use_raw_sql and not has_create_or_insert_statement(upstream_block_content):
+            upstream_query = interpolate_input(upstream_block, upstream_block_content)
+            replace_with = f"""(
+{upstream_query}
+) AS {upstream_block.table_name}"""
+
         query = re.sub(
             '{}[ ]*df_{}[ ]*{}'.format(r'\{\{', idx + 1, r'\}\}'),
-
+            replace_with,
             query,
         )
 
         query = query.replace(
             f'{matcher1}',
-
+            replace_with,
         )
 
     return query
@@ -170,3 +180,39 @@ def extract_and_replace_text_between_strings(
     new_text = text[0:max(start_idx - 1, 0)] + replace_string + text[end_idx + 1:]
 
     return extracted_text, new_text
+
+
+def remove_comments(text: str) -> str:
+    lines = text.split('\n')
+    return '\n'.join(line for line in lines if not line.startswith('--'))
+
+
+def extract_create_statement_table_name(text: str) -> str:
+    statement_partial, _ = extract_and_replace_text_between_strings(
+        remove_comments(text),
+        r'create table(?: if not exists)*',
+        r'\(',
+    )
+    if not statement_partial:
+        return None
+
+    parts = statement_partial[:len(statement_partial) - 1].strip().split(' ')
+    return parts[-1]
+
+
+def extract_insert_statement_table_names(text: str) -> List[str]:
+    matches = re.findall(
+        r'insert(?: overwrite)*(?: into)*[\s]+([\w.]+)',
+        remove_comments(text),
+        re.IGNORECASE,
+    )
+    return matches
+
+
+def has_create_or_insert_statement(text: str) -> bool:
+    table_name = extract_create_statement_table_name(text)
+    if table_name:
+        return True
+
+    matches = extract_insert_statement_table_names(text)
+    return len(matches) >= 1
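As a quick illustration of the extraction helpers added above, the insert-statement pattern captures qualified table names after INSERT [OVERWRITE] [INTO]; the sample SQL and variable names below are made up for the example:

import re

SAMPLE_SQL = """
-- load yesterday's orders
INSERT INTO analytics.orders_daily
SELECT * FROM raw.orders;
"""

# Same pattern as extract_insert_statement_table_names in the diff above.
matches = re.findall(
    r'insert(?: overwrite)*(?: into)*[\s]+([\w.]+)',
    SAMPLE_SQL,
    re.IGNORECASE,
)
print(matches)  # ['analytics.orders_daily']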
mage_ai/data_preparation/models/variable.py
CHANGED

@@ -175,6 +175,8 @@ class Variable:
         """
         if self.variable_type == VariableType.DATAFRAME:
             return self.__read_parquet(sample=sample, sample_count=sample_count)
+        elif self.variable_type == VariableType.SPARK_DATAFRAME:
+            return self.__read_spark_parquet(sample=sample, sample_count=sample_count, spark=spark)
         elif self.variable_type == VariableType.DATAFRAME_ANALYSIS:
             return await self.__read_dataframe_analysis_async(
                 dataframe_analysis_keys=dataframe_analysis_keys,
@@ -367,7 +369,7 @@
     def __read_spark_parquet(self, sample: bool = False, sample_count: int = None, spark=None):
         if spark is None:
             return None
-
+        df = (
             spark.read
             .format('csv')
             .option('header', 'true')
@@ -375,6 +377,9 @@
             .option('delimiter', ',')
             .load(self.variable_path)
         )
+        if sample and sample_count:
+            df = df.limit(sample_count)
+        return df
 
     def __write_geo_dataframe(self, data) -> None:
         os.makedirs(self.variable_path, exist_ok=True)
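As background on the reader chain above: PySpark builds the CSV read lazily and limit() keeps sampling cheap. A minimal sketch, assuming a local Spark master and an illustrative path (mage-ai resolves the real path from the variable's storage layout):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').getOrCreate()

# Illustrative location only.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('delimiter', ',')
    .load('/tmp/example_variable')
)
sample = df.limit(10)  # nothing is scanned until an action such as count() runs
print(sample.count())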
mage_ai/data_preparation/repo_manager.py
CHANGED

@@ -114,12 +114,15 @@ def init_repo(repo_path: str) -> None:
     if os.path.exists(repo_path):
         raise FileExistsError(f'Repository {repo_path} already exists')
 
-    os.makedirs(
+    os.makedirs(
+        os.getenv(MAGE_DATA_DIR_ENV_VAR) or DEFAULT_MAGE_DATA_DIR,
+        exist_ok=True,
+    )
     copy_template_directory('repo', repo_path)
 
 
 def get_data_dir() -> str:
-    return os.getenv(MAGE_DATA_DIR_ENV_VAR
+    return os.getenv(MAGE_DATA_DIR_ENV_VAR) or DEFAULT_MAGE_DATA_DIR
 
 
 def get_repo_name() -> str:
mage_ai/data_preparation/shared/secrets.py
CHANGED

@@ -66,6 +66,9 @@ def get_secret_value(name: str) -> str:
     from mage_ai.orchestration.db.models import Secret
     fernet = Fernet(get_encryption_key())
 
-
-
-
+    try:
+        secret = Secret.query.filter(Secret.name == name).one_or_none()
+        if secret:
+            return fernet.decrypt(secret.value.encode('utf-8')).decode('utf-8')
+    except Exception:
+        print(f'WARNING: Could not find secret value for secret {name}')
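For background, Fernet (from the cryptography package) is symmetric encryption, so the decrypt call in the new code simply reverses an earlier encrypt with the same key. A self-contained round-trip sketch; the key and secret value are placeholders, and mage-ai obtains its real key via get_encryption_key():

from cryptography.fernet import Fernet

key = Fernet.generate_key()  # placeholder key for the example
fernet = Fernet(key)

token = fernet.encrypt('my-database-password'.encode('utf-8'))
plaintext = fernet.decrypt(token).decode('utf-8')
assert plaintext == 'my-database-password'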
mage_ai/data_preparation/templates/sensors/bigquery.py
ADDED

@@ -0,0 +1,32 @@
+from mage_ai.data_preparation.repo_manager import get_repo_path
+from mage_ai.io.bigquery import BigQuery
+from mage_ai.io.config import ConfigFileLoader
+from os import path
+
+if 'sensor' not in globals():
+    from mage_ai.data_preparation.decorators import sensor
+
+
+@sensor
+def query_bigquery_and_check_condition(**kwargs) -> bool:
+    """
+    Template code for checking the results of a BigQuery query.
+    Specify your configuration settings in 'io_config.yaml'.
+
+    Return: True if the sensor should complete, False if it should
+    keep waiting
+    """
+
+    config_path = path.join(get_repo_path(), 'io_config.yaml')
+    config_profile = 'default'
+
+    query = 'Your BigQuery query'  # Specify your SQL query here
+
+    loader = BigQuery.with_config(ConfigFileLoader(config_path, config_profile))
+    df = loader.load(query)
+
+    # Add your checks here
+    if df.empty:
+        return False
+
+    return True
mage_ai/data_preparation/templates/sensors/mysql.py
ADDED

@@ -0,0 +1,33 @@
+from mage_ai.data_preparation.repo_manager import get_repo_path
+from mage_ai.io.config import ConfigFileLoader
+from mage_ai.io.mysql import MySQL
+from os import path
+
+if 'sensor' not in globals():
+    from mage_ai.data_preparation.decorators import sensor
+
+
+@sensor
+def query_mysql_and_check_condition(**kwargs) -> bool:
+    """
+    Template code for checking the results of a MySQL query.
+    Specify your configuration settings in 'io_config.yaml'.
+
+    Return: True if the sensor should complete, False if it should
+    keep waiting
+    """
+
+    config_path = path.join(get_repo_path(), 'io_config.yaml')
+    config_profile = 'default'
+
+    query = 'Your MySQL query'  # Specify your SQL query here
+
+    with MySQL.with_config(
+            ConfigFileLoader(config_path, config_profile)) as loader:
+        df = loader.load(query)
+
+        # Add your checks here
+        if df.empty:
+            return False
+
+        return True
mage_ai/data_preparation/templates/sensors/postgres.py
ADDED

@@ -0,0 +1,33 @@
+from mage_ai.data_preparation.repo_manager import get_repo_path
+from mage_ai.io.config import ConfigFileLoader
+from mage_ai.io.postgres import Postgres
+from os import path
+
+if 'sensor' not in globals():
+    from mage_ai.data_preparation.decorators import sensor
+
+
+@sensor
+def query_postgres_and_check_condition(**kwargs) -> bool:
+    """
+    Template code for checking the results of a Postgres query.
+    Specify your configuration settings in 'io_config.yaml'.
+
+    Return: True if the sensor should complete, False if it should
+    keep waiting
+    """
+
+    config_path = path.join(get_repo_path(), 'io_config.yaml')
+    config_profile = 'default'
+
+    query = 'Your Postgres query'  # Specify your SQL query here
+
+    with Postgres.with_config(
+            ConfigFileLoader(config_path, config_profile)) as loader:
+        df = loader.load(query)
+
+        # Add your checks here
+        if df.empty:
+            return False
+
+        return True
mage_ai/data_preparation/templates/sensors/redshift.py
ADDED

@@ -0,0 +1,33 @@
+from mage_ai.data_preparation.repo_manager import get_repo_path
+from mage_ai.io.config import ConfigFileLoader
+from mage_ai.io.redshift import Redshift
+from os import path
+
+if 'sensor' not in globals():
+    from mage_ai.data_preparation.decorators import sensor
+
+
+@sensor
+def query_redshift_and_check_condition(**kwargs) -> bool:
+    """
+    Template code for checking the results of a Redshift query.
+    Specify your configuration settings in 'io_config.yaml'.
+
+    Return: True if the sensor should complete, False if it should
+    keep waiting
+    """
+
+    config_path = path.join(get_repo_path(), 'io_config.yaml')
+    config_profile = 'default'
+
+    query = 'Your Redshift query'  # Specify your SQL query here
+
+    with Redshift.with_config(
+            ConfigFileLoader(config_path, config_profile)) as loader:
+        df = loader.load(query)
+
+        # Add your checks here
+        if df.empty:
+            return False
+
+        return True
mage_ai/data_preparation/templates/sensors/s3.py
CHANGED

@@ -3,9 +3,6 @@ from mage_ai.io.config import ConfigFileLoader
 from mage_ai.io.s3 import S3
 from os import path
 
-import time
-
-
 if 'sensor' not in globals():
     from mage_ai.data_preparation.decorators import sensor
 
@@ -13,15 +10,22 @@ if 'sensor' not in globals():
 @sensor
 def check_condition(**kwargs) -> bool:
     """
-    Template code for checking if a
+    Template code for checking if a file or folder exists in a S3 bucket
+
+    You will also need to fill out the following AWS related fields
+    in `io_config.yaml`:
+    - AWS_ACCESS_KEY_ID
+    - AWS_SECRET_ACCESS_KEY
+    - AWS_REGION
     """
 
     config_path = path.join(get_repo_path(), 'io_config.yaml')
     config_profile = 'default'
 
     bucket_name = 'your_bucket_name'
-
+    s3_path = 'path/to/folder/or/file'
 
-
-
+    config_file_loader = ConfigFileLoader(config_path, config_profile)
+    return S3.with_config(config_file_loader).exists(
+        bucket_name, s3_path
     )
mage_ai/data_preparation/templates/sensors/snowflake.py
ADDED

@@ -0,0 +1,33 @@
+from mage_ai.data_preparation.repo_manager import get_repo_path
+from mage_ai.io.config import ConfigFileLoader
+from mage_ai.io.snowflake import Snowflake
+from os import path
+
+if 'sensor' not in globals():
+    from mage_ai.data_preparation.decorators import sensor
+
+
+@sensor
+def query_snowflake_and_check_condition(**kwargs) -> bool:
+    """
+    Template code for checking the results of a Snowflake query.
+    Specify your configuration settings in 'io_config.yaml'.
+
+    Return: True if the sensor should complete, False if it should
+    keep waiting
+    """
+
+    config_path = path.join(get_repo_path(), 'io_config.yaml')
+    config_profile = 'default'
+
+    query = 'Your Snowflake query'  # Specify your SQL query here
+
+    with Snowflake.with_config(
+            ConfigFileLoader(config_path, config_profile)) as loader:
+        df = loader.load(query)
+
+        # Add your checks here
+        if df.empty:
+            return False
+
+        return True
mage_ai/io/postgres.py
CHANGED
@@ -1,6 +1,7 @@
 from mage_ai.io.config import BaseConfigLoader, ConfigKey
 from mage_ai.io.export_utils import BadConversionError, PandasTypes
 from mage_ai.io.sql import BaseSQL
+from mage_ai.shared.utils import is_port_in_use
 from pandas import DataFrame, Series
 from psycopg2 import connect, _psycopg
 from sshtunnel import SSHTunnelForwarder
@@ -87,10 +88,21 @@ class Postgres(BaseSQL):
                 ssh_setting['ssh_pkey'] = self.settings['ssh_pkey']
             else:
                 ssh_setting['ssh_password'] = self.settings['ssh_password']
+
+            # Find an available local port
+            local_port = port
+            max_local_port = local_port + 100
+            while is_port_in_use(local_port):
+                if local_port > max_local_port:
+                    raise Exception(
+                        'Unable to find an open port, please clear your running processes '
+                        'if possible.'
+                    )
+                local_port += 1
             self.ssh_tunnel = SSHTunnelForwarder(
                 (self.settings['ssh_host'], self.settings['ssh_port']),
                 remote_bind_address=(host, port),
-                local_bind_address=('',
+                local_bind_address=('', local_port),
                 **ssh_setting,
             )
             self.ssh_tunnel.start()
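The changed-file list above also shows mage_ai/shared/utils.py gaining 7 lines, which is where the imported is_port_in_use helper lives; its body is not included in this diff. A common way to write such a check, offered only as an illustrative sketch rather than the actual implementation:

import socket

def is_port_in_use(port: int, host: str = 'localhost') -> bool:
    # connect_ex returns 0 when something is already listening on the port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        return sock.connect_ex((host, port)) == 0

print(is_port_in_use(5432))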
mage_ai/server/constants.py
CHANGED