PyPI - tracebloc-ingestor - Versions diffs - 0.1.0__tar.gz - Mend

tracebloc-ingestor 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

tracebloc_ingestor-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2021 tracebloc
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

tracebloc_ingestor-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,48 @@
+Metadata-Version: 2.1
+Name: tracebloc_ingestor
+Version: 0.1.0
+Summary: A flexible data ingestion library for various file formats
+Home-page: https://github.com/tracebloc/data-ingestors
+Author: Tracebloc
+Author-email: support@tracebloc.com
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+# Data Ingestors
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+## 📄 Description
+A robust data ingestion framework for machine learning pipelines. This repository provides tools and utilities for managing, processing, and validating training/test datasets. It supports various data sources, formats, and processing pipelines, making it easier to create and maintain ML datasets.
+## 🛠️ Tech Stack
+- Python 3.8+
+- Docker (for containerization)
+- Data processing libraries (Pandas, NumPy)
+## 🚀 Installation & Usage Instructions
+1. Clone the repository
+2. Install dependencies:
+   ```bash
+   pip install -r src/requirements.txt
+   ```
+3. Configure your environment
+4. Follow the documentation guide to [Create Your Training/Test Dataset](https://traceblocdocsdev.azureedge.net/environment-setup/create-your-dataset)
+## 📦 Features
+- Multi-source data ingestion
+- Data validation and preprocessing
+- Database integration
+- API endpoints for data management
+- Containerized deployment
+- Kubernetes support
+## 📜 License
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+## 📞 Support
+For additional support or questions, please refer to our documentation or contact the Tracebloc support team at `support@tracebloc.io`.

tracebloc_ingestor-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

tracebloc_ingestor-0.1.0/setup.py ADDED Viewed

@@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+with open("requirements.txt", "r") as f:
+    requirements = f.read().splitlines()
+setup(
+    name="tracebloc_ingestor",
+    version="0.1.0",
+    author="Tracebloc",
+    author_email="support@tracebloc.com",
+    description="A flexible data ingestion library for various file formats",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/tracebloc/data-ingestors",
+    packages=find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.8",
+    install_requires=requirements,
+)

tracebloc_ingestor-0.1.0/tracebloc_ingestor/__init__.py ADDED Viewed

@@ -0,0 +1,24 @@
+"""Tracebloc Data Ingestor Package.
+A flexible and extensible framework for ingesting data from various sources into a database
+and optionally sending it to an API. The package provides base classes for creating custom
+ingestors and processors, along with built-in support for common data formats.
+"""
+from .config import Config
+from .database import Database
+from .api.client import APIClient
+from .ingestors import BaseIngestor, CSVIngestor, JSONIngestor
+from .processors.base import BaseProcessor
+__version__ = '0.1.0'
+__all__ = [
+    'Config',
+    'Database',
+    'APIClient',
+    'BaseIngestor',
+    'CSVIngestor',
+    'JSONIngestor',
+    'BaseProcessor'
+]

tracebloc_ingestor-0.1.0/tracebloc_ingestor/api/__init__.py ADDED Viewed

File without changes

tracebloc_ingestor-0.1.0/tracebloc_ingestor/api/client.py ADDED Viewed

@@ -0,0 +1,267 @@
+from typing import List, Tuple, Dict, Any
+import requests, json
+import logging
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+from ..config import Config
+from ..utils.logging import setup_logging
+from ..utils.constants import DataCategory, API_TIMEOUT
+# Configure unified logging with config.
+# NOTE: these statements run at import time, so importing this module
+# instantiates Config and calls setup_logging() as a side effect.
+config = Config()
+setup_logging(config)
+logger = logging.getLogger(__name__)
+class APIClient:
+    def __init__(self, config: Config) -> None:
+        """Store the configuration, build the HTTP session and authenticate.
+        Args:
+            config: Settings object; its API_ENDPOINT, CLIENT_USERNAME and
+                CLIENT_PASSWORD are used by authenticate().
+        Raises:
+            requests.exceptions.RequestException: Re-raised by authenticate()
+                when the token request fails.
+        """
+        self.config = config
+        self.session = self._create_session()
+        # Constructing the client performs the login request right away.
+        self.token = self.authenticate()
+    def _create_session(self) -> requests.Session:
+        session = requests.Session()
+        # Configure retry strategy
+        retry_strategy = Retry(
+            total=3,
+            backoff_factor=1,
+            status_forcelist=[500, 502, 503, 504]
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        session.mount("http://", adapter)
+        session.mount("https://", adapter)
+        return session
+    def authenticate(self) -> str:
+        """Authenticate and return the token."""
+        try:
+            response = self.session.post(
+                f"{self.config.API_ENDPOINT}/api-token-auth/",
+                json={"username": self.config.CLIENT_USERNAME, "password": self.config.CLIENT_PASSWORD},
+                timeout=API_TIMEOUT
+            )
+            response.raise_for_status()
+            logger.info(f"Authentication response: {response.json()}")
+            return response.json().get("token")
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error during authentication: {str(e)}")
+            raise
+    def send_batch(self, records: List[Tuple[int, Dict[str, Any]]], table_name: str, ingestor_id: str) -> bool:
+        """
+        Send a batch of records to the remote API.
+        Args:
+            records: List of tuples containing (id, record) pairs
+            table_name: Name of the table to send data to
+            ingestor_id: Unique ID for the ingestor
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        try:
+            payload = json.dumps([
+                {
+                    "data_id": record_data.get("data_id"),
+                    "company": self.config.COMPANY,
+                    "data_intent": record_data.get("data_intent", "train"),
+                    "label": record_data.get("label", ""),
+                    "is_sample": False,
+                    "is_active": True,
+                    "injestor_id": ingestor_id,
+                    # "data": record_data
+                }
+                for _, record_data in records
+            ])
+            logger.info(f"Data to send: {payload}")
+            headers = {
+                "Authorization": f"TOKEN {self.token}",
+                "Content-Type": "application/json"
+            }
+            response = self.session.post(
+                f"{self.config.API_ENDPOINT}/global_meta/{table_name}/",
+                data=payload,
+                headers=headers,
+                timeout=API_TIMEOUT
+            )
+            response.raise_for_status()
+            logger.info(f"Successfully sent batch. Response: {response.json()}")
+            return True
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error sending batch to API: {str(e)}")
+            if hasattr(e.response, 'text'):
+                logger.error(f"Error response: {e.response.text}")
+            return False
+    def send_global_meta_meta(self, table_name: str, schema: Dict[str, str]) -> bool:
+        """
+        Sends global metadata, including the schema, to the remote server.
+        Args:
+            table_name: The type of the dataset
+            schema: A dictionary representing the schema
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        try:
+            payload = json.dumps({
+                "table_name": table_name,
+                "schema": schema
+            })
+            logger.info(f"Global metadata to send: {(payload)}")
+            headers = {
+                "Authorization": f"TOKEN {self.token}",
+                "Content-Type": "application/json"
+            }
+            response = self.session.post(
+                f"{self.config.API_ENDPOINT}/global_meta/global_metadata/",
+                data=payload,
+                headers=headers,
+                timeout=API_TIMEOUT
+            )
+            response.raise_for_status()
+            logger.info(f"Successfully sent global metadata. Response: {response.json()}")
+            return True
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error sending global metadata to API: {str(e)}")
+            if hasattr(e.response, 'text'):
+                logger.error(f"Error response: {e.response.text}")
+            return False
+    def send_generate_edge_label_meta(self, table_name: str, ingestor_id: str) -> bool:
+        """
+        Send a request to generate edge label metadata for the specified dataset type.
+        Args:
+            table_name: The type of the dataset
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        try:
+            url = f"{self.config.API_ENDPOINT}/global_meta/generate-edge-labels-meta/?table_name={table_name}&injestor_id={ingestor_id}"
+            headers = {
+                "Authorization": f"TOKEN {self.token}"
+            }
+            logger.info(f"Sending request to generate edge label metadata for dataset type: {table_name}")
+            response = self.session.get(url, headers=headers, timeout=API_TIMEOUT)
+            response.raise_for_status()
+            logger.info(f"Successfully generated edge label metadata. Response")
+            return True
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error generating edge label metadata: {str(e)}")
+            if hasattr(e.response, 'text'):
+                logger.error(f"Error response: {e.response.text}")
+            return False
+    def prepare_dataset(self, category: str, ingestor_id: str) -> bool:
+        """
+        Prepare data for a specific category and ingestor.
+        Args:
+            category: The category of data (must be one of DataCategory values)
+            injester_id: The unique identifier for the injester
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        if not DataCategory.is_valid_category(category):
+            logger.error(f"Invalid category: {category}")
+            return False
+        try:
+            url = f"{self.config.API_ENDPOINT}/global_meta/prepare/?category={category}&injestor_id={ingestor_id}"
+            headers = {
+                "Authorization": f"TOKEN {self.token}"
+            }
+            logger.info(f"Sending prepare request for category: {category}, injester_id: {ingestor_id}")
+            response = self.session.get(url, headers=headers, timeout=API_TIMEOUT)
+            response.raise_for_status()
+            logger.info(f"Successfully prepared data. Response: {response.json()}")
+            return True
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error preparing data: {str(e)}")
+            if hasattr(e.response, 'text'):
+                logger.error(f"Error response: {e.response.text}")
+            return False
+    def create_dataset(self, requires_gpu: bool = False, allow_feature_modification: bool = False, ingestor_id: str = None, category: str = None) -> Dict[str, Any]:
+        """
+        Create a new dataset with the specified parameters.
+        Args:
+            title: The title of the dataset (if None, will be generated from category and ingestor_id)
+            requires_gpu: Whether the dataset requires GPU processing
+            allow_feature_modification: Whether feature modification is allowed
+            ingestor_id: The unique identifier for the ingestor
+        Returns:
+            Dict[str, Any]: The created dataset information if successful
+        Raises:
+            requests.exceptions.RequestException: If the API request fails
+        """
+        try:
+            # Generate title from category and ingestor_id if not provided
+            if config.TITLE is None:
+                title = f"{category}_{ingestor_id}"
+            else:
+                title = config.TITLE  # Fallback to config title if no ingestor_id
+            if category == DataCategory.TABULAR_CLASSIFICATION:
+                allow_feature_modification = True
+            else:
+                allow_feature_modification = False
+            payload = json.dumps({
+                "title": title,
+                "requires_gpu": requires_gpu,
+                "allow_feature_modification": allow_feature_modification
+            })
+            logger.info(f"Creating dataset with payload: {payload}")
+            headers = {
+                "Authorization": f"TOKEN {self.token}",
+                "Content-Type": "application/json"
+            }
+            response = self.session.post(
+                f"{self.config.API_ENDPOINT}/dataset/",
+                data=payload,
+                headers=headers,
+                timeout=API_TIMEOUT
+            )
+            response.raise_for_status()
+            logger.info(f"Successfully created dataset. Response: {response.json()}")
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error creating dataset: {str(e)}")
+            if hasattr(e.response, 'text'):
+                logger.error(f"Error response: {e.response.text}")
+            raise
+    def __del__(self):
+        """Close the HTTP session when the client is destroyed."""
+        # The attribute is missing if __init__ failed before the session was created.
+        if hasattr(self, 'session'):
+            self.session.close()

tracebloc_ingestor-0.1.0/tracebloc_ingestor/config.py ADDED Viewed

@@ -0,0 +1,41 @@
+from typing import Dict, Any, Optional
+import os
+from dataclasses import dataclass
+import logging
+@dataclass
+class Config:
+    """Ingestor settings, with defaults read from environment variables.
+    Every default below is computed once, when this class body is executed
+    (i.e. when the module is imported); changing the environment afterwards
+    does not change the defaults.
+    """
+    # MySQL connection settings.
+    DB_HOST: str = os.getenv("MYSQL_HOST", "localhost")
+    DB_PORT: int = int(os.getenv("MYSQL_PORT", "3306"))
+    DB_USER: str = os.getenv("MYSQL_USER", "root")
+    DB_PASSWORD: str = os.getenv("MYSQL_PASSWORD", "")
+    DB_NAME: str = os.getenv("MYSQL_DATABASE", "ingestor_db")
+    BATCH_SIZE: int = int(os.getenv("BATCH_SIZE", "10"))
+    # Define API endpoints for different environments.
+    # Deliberately left without a type annotation: that keeps it a plain class
+    # attribute instead of a dataclass field (a dict is not allowed as a field default).
+    API_ENDPOINTS = {
+        "dev": "https://dev-api.tracebloc.io",
+        "stg": "https://stg-api.tracebloc.io",
+        "prod": "https://api.tracebloc.io"
+    }
+    # Get environment and set appropriate API endpoint, default to dev
+    # (an EDGE_ENV value that is not a key of API_ENDPOINTS also falls back to dev).
+    EDGE_ENV: str = os.getenv("EDGE_ENV", "dev")
+    API_ENDPOINT: str = API_ENDPOINTS.get(EDGE_ENV, API_ENDPOINTS["dev"])
+    # Credentials used for API authentication.
+    CLIENT_USERNAME: str = os.getenv("EDGE_USERNAME", "")
+    CLIENT_PASSWORD: str = os.getenv("EDGE_PASSWORD", "")
+    STORAGE_PATH: str = os.getenv("STORAGE_PATH", "/data/shared")
+    SRC_PATH: str = os.getenv("SRC_PATH", "") # path to the source data
+    DEST_PATH: str = os.path.join(os.getenv("DEST_PATH", ""), os.getenv("TABLE_NAME", "")) # path to the destination data with table name
+    LABEL_FILE: str = os.getenv("LABEL_FILE", "")
+    COMPANY: str = os.getenv("COMPANY", "")
+    TABLE_NAME: str = os.getenv("TABLE_NAME", "")
+    # Defaults to the empty string (not None) when TITLE is unset.
+    TITLE: str = os.getenv("TITLE", "")
+    # Logging configuration
+    # LOG_LEVEL must be numeric: int() raises ValueError for a name such as "INFO".
+    LOG_LEVEL: int = int(os.getenv("LOG_LEVEL", str(logging.INFO)))
+    LOG_FORMAT: Optional[str] = os.getenv("LOG_FORMAT", None)
+    LOG_DATE_FORMAT: Optional[str] = os.getenv("LOG_DATE_FORMAT", None)

tracebloc_ingestor-0.1.0/tracebloc_ingestor/database.py ADDED Viewed

@@ -0,0 +1,247 @@
+import logging
+from datetime import datetime
+from typing import List, Dict, Any, Optional, Tuple
+from urllib.parse import quote
+from sqlalchemy import create_engine, MetaData, Table, Column, BigInteger, DateTime, text, Text, Integer, String, Float, Boolean, inspect
+from sqlalchemy.dialects.mysql import insert, LONGBLOB, BLOB
+from sqlalchemy.engine import Engine
+from .config import Config
+from .utils.logging import setup_logging
+# Configure unified logging with config.
+# NOTE: these statements run at import time, so importing this module
+# instantiates Config and calls setup_logging() as a side effect.
+config = Config()
+setup_logging(config)
+logger = logging.getLogger(__name__)
+class Database:
+    def __init__(self, config: Config) -> None:
+        """Connect to MySQL and prepare empty table bookkeeping.
+        Args:
+            config: Settings object providing the DB_* connection values.
+        """
+        self.config = config
+        # Creating the engine also issues CREATE DATABASE IF NOT EXISTS (see _create_engine).
+        self.engine = self._create_engine()
+        self.metadata = MetaData()
+        # Cache of Table objects, keyed by table name; filled by create_table().
+        self.tables: Dict[str, Table] = {}
+        self.unique_id_column: Optional[str] = None  # Store table-specific unique ID column mappings
+    def _create_engine(self) -> Engine:
+        # First create database if it doesn't exist
+        base_connection_string = (
+            f"mysql+mysqldb://{self.config.DB_USER}:{quote(self.config.DB_PASSWORD)}"
+            f"@{self.config.DB_HOST}:{self.config.DB_PORT}"
+        )
+        engine = create_engine(base_connection_string, pool_pre_ping=True)
+        with engine.connect() as connection:
+            connection.execute(text(f"CREATE DATABASE IF NOT EXISTS {self.config.DB_NAME}"))
+            connection.commit()
+        # Now connect to the specific database
+        connection_string = f"{base_connection_string}/{self.config.DB_NAME}"
+        print(connection_string)
+        return create_engine(connection_string, pool_pre_ping=True)
+    def _get_sqlalchemy_type(self, mysql_type: str):
+        type_mapping = {
+            'VARCHAR': String,
+            'TEXT': Text,
+            'INT': Integer,
+            'BIGINT': BigInteger,
+            'FLOAT': Float,
+            'BOOLEAN': Boolean,
+            'DATETIME': DateTime,
+            'TIMESTAMP': DateTime,
+            'BLOB': BLOB,
+            'LONGBLOB': LONGBLOB,
+        }
+        for sql_type, alchemy_type in type_mapping.items():
+            if sql_type in mysql_type.upper():
+                length = None
+                if '(' in mysql_type:
+                    length = int(mysql_type.split('(')[1].split(')')[0])
+                return alchemy_type(length) if length else alchemy_type
+        raise ValueError(f"Unsupported MySQL type: {mysql_type}")
+    def create_table(self, table_name: str, schema: Dict[str, str]):
+        """
+        Creates a table if it doesn't exist, or returns existing table
+        Args:
+            table_name: Name of the table
+            schema: Dictionary defining the table schema
+        Returns:
+            SQLAlchemy Table object
+        """
+        # Return existing table if already created
+        if table_name in self.tables:
+            return self.tables[table_name]
+        # Check if table exists in database
+        inspector = inspect(self.engine)
+        if table_name in inspector.get_table_names():
+            # Reflect existing table using MetaData
+            self.metadata.reflect(self.engine, only=[table_name])
+            table = self.metadata.tables[table_name]
+            self.tables[table_name] = table
+            return table
+        # Define standard columns that should be present in all tables
+        standard_columns = [
+            Column('id', BigInteger, primary_key=True, autoincrement=True),
+            Column('created_at', DateTime, server_default=text('CURRENT_TIMESTAMP')),
+            Column('updated_at', DateTime, server_default=text('CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP')),
+            Column('status', Integer, server_default=text('0')),  # 1 for active, 0 for inactive
+            Column('label', String(255), nullable=True),
+            Column('data_intent', String(100), nullable=True),
+            Column('data_id', String(255), unique=True, nullable=False),
+            Column('annotation', Text, nullable=True),
+            Column('ingestor_id', String(255), nullable=True)
+        ]
+        # Add custom columns from the schema
+        custom_columns = [
+            Column(column_name, self._get_sqlalchemy_type(mysql_type))
+            for column_name, mysql_type in schema.items()
+        ]
+        # Combine standard and custom columns
+        table = Table(table_name, self.metadata, *(standard_columns + custom_columns))
+        self.tables[table_name] = table
+        # Create table if it doesn't exist
+        self.metadata.create_all(self.engine, tables=[table])
+        return table
+    def insert_batch(self, table_name: str, records: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Insert or update batch of records based on data_id
+        Args:
+            table_name: Name of the target table
+            records: List of records to insert/update
+        Returns:
+            Dictionary containing:
+            - success_ids: List of successfully processed record IDs
+            - failures: List of dictionaries containing failed records and their error messages
+        """
+        if not records:
+            return {"success_ids": [], "failures": []}
+        table = self.tables[table_name]
+        result = {"success_ids": [], "failures": []}
+        try:
+            with self.engine.connect() as connection:
+                current_time = datetime.now()
+                processed_records = []
+                for record in records:
+                    processed_record = {
+                        **record,
+                        'updated_at': current_time,
+                    }
+                    if 'created_at' not in record:
+                        processed_record['created_at'] = current_time
+                    processed_records.append(processed_record)
+                # Create an "INSERT ... ON DUPLICATE KEY UPDATE" statement
+                insert_stmt = insert(table)
+                update_dict = {
+                    column.name: text(f"VALUES({column.name})")
+                    for column in table.columns
+                    if column.name not in ['id', 'created_at', 'data_id']
+                }
+                try:
+                    # Execute upsert
+                    connection.execute(
+                        insert_stmt.values(processed_records).on_duplicate_key_update(**update_dict)
+                    )
+                    connection.commit()
+                    # Get IDs for successfully processed records
+                    data_ids = [record['data_id'] for record in records]
+                    select_stmt = table.select().where(table.c.data_id.in_(data_ids))
+                    rows = connection.execute(select_stmt).fetchall()
+                    result["success_ids"] = [row.id for row in rows]
+                except Exception as e:
+                    # If batch insert fails, try one by one to identify problematic records
+                    connection.rollback()
+                    logger.warning(f"Batch insert failed, attempting individual inserts: {str(e)}")
+                    for record in processed_records:
+                        try:
+                            stmt = insert_stmt.values([record]).on_duplicate_key_update(**update_dict)
+                            connection.execute(stmt)
+                            connection.commit()
+                            # Get ID for the successful record
+                            select_stmt = table.select().where(table.c.data_id == record['data_id'])
+                            row = connection.execute(select_stmt).fetchone()
+                            if row:
+                                result["success_ids"].append(row.id)
+                        except Exception as individual_error:
+                            result["failures"].append({
+                                "record": record,
+                                "error": str(individual_error)
+                            })
+                            connection.rollback()
+                            logger.error(f"Failed to process record {record['data_id']}: {str(individual_error)}")
+        except Exception as e:
+            logger.error(f"Database connection error in insert_batch: {str(e)}")
+            result["failures"].extend([{
+                "record": record,
+                "error": f"Database connection error: {str(e)}"
+            } for record in records])
+        return result["success_ids"], result["failures"]
+    def get_table_schema(self, table_name: str) -> Dict[str, str]:
+        """
+        Returns the schema of a table as a dictionary mapping column names to their MySQL types.
+        Args:
+            table_name: Name of the table to inspect
+        Returns:
+            Dictionary of column names and their MySQL types
+        """
+        inspector = inspect(self.engine)
+        # Get all columns from the table
+        columns = inspector.get_columns(table_name)
+        # Convert SQLAlchemy types back to MySQL types
+        type_mapping = {
+            'String': 'VARCHAR',
+            'Text': 'TEXT',
+            'Integer': 'INT',
+            'BigInteger': 'BIGINT',
+            'Float': 'FLOAT',
+            'Boolean': 'BOOLEAN',
+            'DateTime': 'DATETIME',
+            'BLOB': 'BLOB',
+            'LONGBLOB': 'LONGBLOB'
+        }
+        schema = {}
+        for column in columns:
+            # Get the type name
+            type_name = column['type'].__class__.__name__
+            # Convert SQLAlchemy type to MySQL type
+            mysql_type = type_mapping.get(type_name, 'VARCHAR')
+            # Add length for VARCHAR types
+            if mysql_type == 'VARCHAR' and hasattr(column['type'], 'length'):
+                mysql_type = f"{mysql_type}({column['type'].length})"
+            schema[column['name']] = mysql_type
+        return schema