padmy 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- padmy-0.4.0/PKG-INFO +19 -0
- padmy-0.4.0/padmy/__init__.py +0 -0
- padmy-0.4.0/padmy/anonymize/__init__.py +1 -0
- padmy-0.4.0/padmy/anonymize/anonymize.py +102 -0
- padmy-0.4.0/padmy/config.py +113 -0
- padmy-0.4.0/padmy/db.py +307 -0
- padmy-0.4.0/padmy/env.py +11 -0
- padmy-0.4.0/padmy/logs.py +13 -0
- padmy-0.4.0/padmy/migration/__init__.py +3 -0
- padmy-0.4.0/padmy/migration/create_files.py +57 -0
- padmy-0.4.0/padmy/migration/db.sql +23 -0
- padmy-0.4.0/padmy/migration/migration.py +244 -0
- padmy-0.4.0/padmy/migration/new_sql.py +36 -0
- padmy-0.4.0/padmy/migration/run.py +130 -0
- padmy-0.4.0/padmy/migration/utils.py +47 -0
- padmy-0.4.0/padmy/sampling/__init__.py +1 -0
- padmy-0.4.0/padmy/sampling/network.py +36 -0
- padmy-0.4.0/padmy/sampling/sampling.py +236 -0
- padmy-0.4.0/padmy/sampling/viz.py +165 -0
- padmy-0.4.0/padmy/utils.py +220 -0
- padmy-0.4.0/pyproject.toml +86 -0
- padmy-0.4.0/setup.py +44 -0
padmy-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: padmy
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary:
|
|
5
|
+
Author: andarius
|
|
6
|
+
Author-email: julien.brayere@tracktor.fr
|
|
7
|
+
Requires-Python: >=3.10,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Provides-Extra: network
|
|
12
|
+
Requires-Dist: Faker (>=13.15.1,<14.0.0)
|
|
13
|
+
Requires-Dist: PyYAML (>=6.0,<7.0)
|
|
14
|
+
Requires-Dist: asyncpg (>=0.27.0,<0.28.0)
|
|
15
|
+
Requires-Dist: dash (>=2.6.0,<3.0.0); extra == "network"
|
|
16
|
+
Requires-Dist: dash-cytoscape (>=0.3.0,<0.4.0); extra == "network"
|
|
17
|
+
Requires-Dist: networkx (>=2.8.5,<3.0.0); extra == "network"
|
|
18
|
+
Requires-Dist: piou (>=0.13.1,<0.14.0)
|
|
19
|
+
Requires-Dist: typing-extensions (>=4.3.0,<5.0.0)
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .anonymize import anonymize_db
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import itertools
|
|
3
|
+
import logging
|
|
4
|
+
import operator
|
|
5
|
+
from functools import partial
|
|
6
|
+
from typing import Any, Iterator
|
|
7
|
+
|
|
8
|
+
import asyncpg
|
|
9
|
+
from faker import Faker
|
|
10
|
+
|
|
11
|
+
from ..config import Config, ConfigTable, FieldType, AnoFields
|
|
12
|
+
from ..db import load_primary_keys, load_columns_type
|
|
13
|
+
from ..utils import get_conn, iterate_pg
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_update_query(table: str, pks: list[str], fields: list[str],
                     field_types: dict):
    """
    Build a batched ``UPDATE ... FROM (VALUES ...)`` statement.

    The statement updates *fields* of *table*, matching rows on *pks*.
    Each placeholder is cast to its column type from *field_types*
    (``{column: pg_type}``) so ``executemany`` can bind plain tuples of
    ``pks + fields`` values.
    """
    all_columns = pks + fields
    # SET f = u2.f for every anonymized column
    set_clause = ', '.join(f'{col} = u2.{col}' for col in fields)
    # $1::type, $2::type, ... — one typed placeholder per pk+field column
    placeholders = ', '.join(f'${pos + 1}::{field_types[col]}'
                             for pos, col in enumerate(all_columns))
    # Join the VALUES alias back to the target rows on the primary key(s)
    join_clause = ' and '.join(f'u2.{pk} = u.{pk}' for pk in pks)

    return f"""
    UPDATE {table} as u
    SET
    {set_clause}
    from (values
    ({placeholders})
    ) as u2({', '.join(all_columns)})
    where {join_clause}
    """
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _get_fake_value(faker: Faker, field: FieldType,
|
|
37
|
+
extra_fields: dict | None = None) -> Any:
|
|
38
|
+
_extra_fields = extra_fields or {}
|
|
39
|
+
match field:
|
|
40
|
+
case 'EMAIL':
|
|
41
|
+
return faker.email(**_extra_fields)
|
|
42
|
+
case _:
|
|
43
|
+
raise ValueError(f'Got unimplemented field type {field!r}')
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def gen_mock_data(faker: 'Faker',
                  fields: 'list[AnoFields]',
                  size: int) -> 'Iterator[dict]':
    """
    Lazily yield *size* rows of fake data.

    Each yielded dict maps every configured column to a freshly generated
    fake value (one new value per row).
    """
    for _ in range(size):
        row = {}
        for ano_field in fields:
            row[ano_field.column] = _get_fake_value(faker, ano_field.type,
                                                    ano_field.extra_args)
        yield row
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def dict_to_tuple(d: dict, fields: list[str]) -> tuple:
    """Project *d* onto *fields*, preserving the given order (KeyError on a missing key)."""
    return tuple(map(d.__getitem__, fields))
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
async def anonymize_table(conn: asyncpg.Connection,
                          table: ConfigTable,
                          pks: list[str],
                          faker: Faker,
                          *,
                          chunk_size: int = 1_000):
    """
    Overwrite the configured columns of *table* with fake data.

    Primary keys are streamed in chunks of *chunk_size*; for each chunk a
    batch UPDATE (via ``executemany``) replaces the anonymized columns.
    Everything runs inside a single transaction on *conn*.

    Raises ValueError when the table declares no fields or *pks* is empty.
    """
    if table.fields is None:
        raise ValueError('Fields must not be empty')
    if not pks:
        raise ValueError(f'No PKs found for {table.full_name!r}')

    # Only the PK columns are fetched; fake values are generated client-side
    # and joined back on the PKs by the UPDATE statement.
    query = f"SELECT {', '.join(pks)} from {table.schema}.{table.table}"

    fields = [x.column for x in table.fields]
    # Column types are needed to cast the VALUES placeholders in the UPDATE
    fields_types = await load_columns_type(conn, table.schema, table.table,
                                           pks + fields)
    update_query = get_update_query(table.full_name, pks, fields,
                                    fields_types)

    async with conn.transaction():
        async for chunk in iterate_pg(conn, query, chunk_size=chunk_size):
            # gen_mock_data is a lazy generator, so zip() only draws
            # len(chunk) fake rows even when the last chunk is short.
            mock_data = gen_mock_data(faker, fields=table.fields, size=chunk_size)
            # Each row: PK values from the chunk + fake values, ordered pks+fields
            new_data = [dict_to_tuple({**c, **m}, pks + fields) for c, m in zip(chunk, mock_data)]
            await conn.executemany(update_query, new_data)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
async def anonymize_db(pool: asyncpg.Pool, config: Config, faker: Faker):
    """
    Anonymize every table of *config* that declares fields to anonymize.

    Primary keys are loaded once for all relevant schemas, then each table
    is anonymized concurrently, each on its own pooled connection.
    """
    _tables_to_anonymize = [_table for _table in config.tables if _table.has_ano_fields]

    if not _tables_to_anonymize:
        logging.info('No tables found to anonymize in config file')
        return

    async with pool.acquire() as conn:
        pks = await load_primary_keys(conn, list({_table.schema for _table in _tables_to_anonymize}))

    # BUG FIX: itertools.groupby only groups *consecutive* items, so the
    # constraints must be sorted by the grouping key first — otherwise a
    # table whose PK rows came back interleaved would get a partial key list.
    _sorted_pks = sorted(pks, key=operator.attrgetter('full_name'))
    _pks = {_table_name: list(_table_pks) for _table_name, _table_pks in
            itertools.groupby(_sorted_pks, operator.attrgetter('full_name'))}

    await asyncio.gather(*[get_conn(pool, partial(anonymize_table,
                                                  table=_table,
                                                  pks=[x.column_name for x in _pks[_table.full_name]],
                                                  faker=faker))
                           for _table in _tables_to_anonymize])
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
# else:
|
|
4
|
+
# from typing import Self
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
# if sys.version_info.minor < 11 and sys.version_info.major >= 3:
|
|
10
|
+
|
|
11
|
+
FieldType = Literal['EMAIL']
|
|
12
|
+
|
|
13
|
+
SampleType = float | int
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _check_sample_size(sample: SampleType | None):
|
|
17
|
+
if sample is None:
|
|
18
|
+
return
|
|
19
|
+
if sample < 0 or sample > 100:
|
|
20
|
+
raise ValueError(f'Sample must be a value between 0 and 100 (got {sample})')
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
|
|
24
|
+
class AnoFields:
|
|
25
|
+
column: str
|
|
26
|
+
type: FieldType
|
|
27
|
+
extra_args: dict | None = None
|
|
28
|
+
|
|
29
|
+
@classmethod
|
|
30
|
+
def load(cls, data: dict):
|
|
31
|
+
|
|
32
|
+
if len(data) == 1:
|
|
33
|
+
column = next(iter(data))
|
|
34
|
+
_type = data[column]
|
|
35
|
+
extra_args = None
|
|
36
|
+
else:
|
|
37
|
+
column = data.pop('column')
|
|
38
|
+
_type = data.pop('type')
|
|
39
|
+
extra_args = data if data else None
|
|
40
|
+
|
|
41
|
+
return AnoFields(column=column,
|
|
42
|
+
type=_type,
|
|
43
|
+
extra_args=extra_args)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class ConfigTable:
    """Per-table sampling / anonymization configuration."""
    schema: str
    table: str
    # Sampling percentage (0-100); overrides the schema/global sample
    sample: 'SampleType | None' = None

    # Columns to anonymize (empty list when the table is only sampled)
    fields: 'list[AnoFields]' = field(default_factory=list)

    ignore: bool = False

    def __post_init__(self):
        _check_sample_size(self.sample)

    @property
    def full_name(self):
        """Qualified ``schema.table`` name."""
        return f'{self.schema}.{self.table}'

    @property
    def has_ano_fields(self):
        """
        True if at least one column is configured for anonymization.

        BUG FIX: `fields` defaults to an empty *list* (never None), so the
        previous ``self.fields is not None`` check was always True and sent
        field-less tables through the anonymizer.
        """
        return bool(self.fields)

    @classmethod
    def load(cls, table: dict):
        """
        Build a ConfigTable from its config dict (consumes the ``fields`` key).

        A single field may be given as a bare dict instead of a 1-item list.
        """
        _fields_any: dict | list = table.pop('fields', [])
        _fields: list = [_fields_any] if isinstance(_fields_any, dict) else _fields_any

        return cls(**table,
                   fields=[AnoFields.load(_field) for _field in _fields])
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class ConfigSchema:
    """Per-schema sampling configuration."""
    schema: str
    # Sampling percentage (0-100) applied to every table of the schema
    sample: 'SampleType | None' = None

    def __post_init__(self):
        _check_sample_size(self.sample)

    @classmethod
    def load(cls, v: str | dict):
        """Accept either a bare schema name or a ``{'name': ..., 'sample': ...}`` mapping."""
        if isinstance(v, str):
            return cls(v)
        return cls(v['name'], v.get('sample'))
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class Config:
    """Top-level configuration: a global sample size plus schema/table overrides."""
    sample: 'SampleType | None' = None
    schemas: 'list[ConfigSchema]' = field(default_factory=list)
    tables: 'list[ConfigTable]' = field(default_factory=list)

    def __post_init__(self):
        _check_sample_size(self.sample)

    @classmethod
    def load(cls, sample: 'SampleType', schemas: list[str]):
        """Build a config programmatically from a global sample size and schema names."""
        _schemas = [ConfigSchema(x) for x in schemas]
        return cls(sample=sample, schemas=_schemas)

    @classmethod
    def load_from_file(cls, path: Path):
        """
        Load and parse a YAML configuration file.

        Remaining top-level keys (after ``schemas``/``tables`` are popped)
        are forwarded to the constructor (e.g. ``sample``).
        """
        with path.open('r') as f:
            # SECURITY FIX: yaml.safe_load instead of yaml.load(Loader=yaml.Loader) —
            # the full loader can instantiate arbitrary Python objects from
            # the config file.
            config = yaml.safe_load(f)
        schemas = [ConfigSchema.load(schema) for schema in config.pop('schemas', [])]
        tables = [ConfigTable.load(table) for table in config.pop('tables', [])]

        return cls(**config, schemas=schemas, tables=tables)
|
padmy-0.4.0/padmy/db.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import functools
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
import asyncpg
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.table import Table as RTable
|
|
8
|
+
from typing_extensions import Self
|
|
9
|
+
|
|
10
|
+
from padmy.config import Config, SampleType
|
|
11
|
+
from padmy.utils import get_first, get_conn
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# if sys.version_info.minor < 11 and sys.version_info.major >= 3:
|
|
15
|
+
# else:
|
|
16
|
+
# from typing import Self
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _get_full_name(schema: str | None, table: str | None) -> str:
|
|
20
|
+
if schema is None or table is None:
|
|
21
|
+
raise ValueError('schema and table must not be empty')
|
|
22
|
+
return f'{schema}.{table}'
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class FKConstraint:
    """A foreign-key constraint as read from ``information_schema`` (see SCHEMA_FK_QUERY)."""
    # Referencing (child-side) column
    column_name: str
    constraint_name: str

    # references
    foreign_schema: str
    foreign_table: str
    foreign_column_name: str

    # Owning table/schema; back-filled by Table.__post_init__
    table: str | None = None
    schema: str | None = None

    @property
    def foreign_full_name(self):
        """Qualified name of the referenced (parent) table."""
        return _get_full_name(self.foreign_schema, self.foreign_table)

    @property
    def full_name(self):
        """Qualified name of the owning (child) table; raises if not back-filled yet."""
        return _get_full_name(self.schema, self.table)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class PKConstraint:
    """A primary-key column as read from ``information_schema`` (see SCHEMA_PK_QUERY)."""
    column_name: str
    table: str
    schema: str

    @property
    def full_name(self):
        """Qualified ``schema.table`` name of the owning table."""
        return _get_full_name(self.schema, self.table)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass(eq=False)
|
|
59
|
+
class Table:
|
|
60
|
+
schema: str
|
|
61
|
+
table: str
|
|
62
|
+
|
|
63
|
+
_count: int | None = None # field(init=False)
|
|
64
|
+
|
|
65
|
+
foreign_keys: list[FKConstraint] = field(default_factory=list)
|
|
66
|
+
primary_keys: list[PKConstraint] = field(default_factory=list)
|
|
67
|
+
|
|
68
|
+
parent_tables: set[Self] = field(default_factory=set)
|
|
69
|
+
child_tables: set[Self] = field(default_factory=set)
|
|
70
|
+
|
|
71
|
+
# Has already been sampled and it's temporary table has been created
|
|
72
|
+
has_been_processed: bool = False
|
|
73
|
+
|
|
74
|
+
# Sample size
|
|
75
|
+
sample_size: int | None = None
|
|
76
|
+
|
|
77
|
+
@property
|
|
78
|
+
def parent_tables_safe(self):
|
|
79
|
+
# return self.parent_tables - {self}
|
|
80
|
+
return {x for x in self.parent_tables if x.full_name != self.full_name}
|
|
81
|
+
|
|
82
|
+
@property
|
|
83
|
+
def child_tables_safe(self):
|
|
84
|
+
# return self.child_tables - {self}
|
|
85
|
+
return {x for x in self.child_tables if x.full_name != self.full_name}
|
|
86
|
+
|
|
87
|
+
@property
|
|
88
|
+
def full_name(self):
|
|
89
|
+
return _get_full_name(self.schema, self.table)
|
|
90
|
+
|
|
91
|
+
@property
|
|
92
|
+
def tmp_name(self):
|
|
93
|
+
return f'_{self.schema}_{self.table}_tmp'
|
|
94
|
+
|
|
95
|
+
@property
|
|
96
|
+
def has_parent(self):
|
|
97
|
+
return len(self.parent_tables_safe) > 0
|
|
98
|
+
|
|
99
|
+
@property
|
|
100
|
+
def has_children(self):
|
|
101
|
+
return len(self.child_tables_safe) > 0
|
|
102
|
+
|
|
103
|
+
@property
|
|
104
|
+
def children_has_been_processed(self):
|
|
105
|
+
return all(_child.has_been_processed for _child in self.child_tables_safe)
|
|
106
|
+
|
|
107
|
+
@property
|
|
108
|
+
def count(self) -> int:
|
|
109
|
+
if self._count is None:
|
|
110
|
+
raise ValueError('Count must be loaded first')
|
|
111
|
+
return self._count
|
|
112
|
+
|
|
113
|
+
@count.setter
|
|
114
|
+
def count(self, v: int):
|
|
115
|
+
self._count = v
|
|
116
|
+
|
|
117
|
+
async def load_count(self, conn: asyncpg.Connection):
|
|
118
|
+
self._count = await conn.fetchval(f'SELECT count(*) from {self.full_name}')
|
|
119
|
+
|
|
120
|
+
def __eq__(self, other: Self):
|
|
121
|
+
for k in ['full_name', 'has_been_processed']:
|
|
122
|
+
if getattr(self, k) != getattr(other, k):
|
|
123
|
+
return False
|
|
124
|
+
return True
|
|
125
|
+
|
|
126
|
+
def __hash__(self):
|
|
127
|
+
return hash((getattr(self, x) for x in ['full_name']))
|
|
128
|
+
|
|
129
|
+
def __repr__(self):
|
|
130
|
+
return f'Table(full_name={self.full_name!r} ' \
|
|
131
|
+
f'count={self._count} ' \
|
|
132
|
+
f'foreign_keys={len(self.foreign_keys)} ' \
|
|
133
|
+
f'parents={len(self.parent_tables)} ' \
|
|
134
|
+
f'children={len(self.child_tables)}' \
|
|
135
|
+
f')'
|
|
136
|
+
|
|
137
|
+
def __post_init__(self):
|
|
138
|
+
for fk in self.foreign_keys:
|
|
139
|
+
fk.table = self.table
|
|
140
|
+
fk.schema = self.schema
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
async def get_tables(conn: asyncpg.Connection, schemas: list[str]):
    """Return a `Table` for every base table of *schemas* (views are excluded)."""
    query = """
    select
        table_schema as schema, table_name as table
    from information_schema.tables
    where table_schema = ANY ($1::text[]) and
          table_type = 'BASE TABLE'
    """
    data = await conn.fetch(query, schemas)
    # Record columns are aliased to match the Table dataclass fields
    return [Table(**x) for x in data]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# All FOREIGN KEY constraints of the given schemas ($1: text[]).
# One row per constraint column, with both the referencing side
# (schema / table / column_name) and the referenced side (foreign_*),
# aliased to match the FKConstraint dataclass fields.
SCHEMA_FK_QUERY = """
SELECT
    tc.table_schema as schema,
    tc.constraint_name,
    tc.table_name as table,
    kcu.column_name,
    ccu.table_schema AS foreign_schema,
    ccu.table_name AS foreign_table,
    ccu.column_name AS foreign_column_name
FROM
    information_schema.table_constraints AS tc
    JOIN information_schema.key_column_usage AS kcu
        ON tc.constraint_name = kcu.constraint_name
        AND tc.table_schema = kcu.table_schema
    JOIN information_schema.constraint_column_usage AS ccu
        ON ccu.constraint_name = tc.constraint_name
        AND ccu.table_schema = tc.table_schema
WHERE tc.constraint_type = 'FOREIGN KEY' AND tc.table_schema = ANY ($1::text[]);
"""

# All PRIMARY KEY columns of the given schemas ($1: text[]),
# aliased to match the PKConstraint dataclass fields.
SCHEMA_PK_QUERY = """
SELECT
    tc.table_schema as schema,
    tc.table_name as table,
    c.column_name
FROM information_schema.table_constraints tc
    JOIN information_schema.constraint_column_usage AS ccu USING (constraint_schema, constraint_name)
    JOIN information_schema.columns AS c ON c.table_schema = tc.constraint_schema
        AND tc.table_name = c.table_name AND ccu.column_name = c.column_name
WHERE constraint_type = 'PRIMARY KEY' and tc.table_schema = ANY ($1::text[]);
"""
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
async def load_foreign_keys(conn: 'asyncpg.Connection', schemas: list[str]):
    """Load every FK constraint declared in *schemas*."""
    rows = await conn.fetch(SCHEMA_FK_QUERY, schemas)
    return [FKConstraint(**row) for row in rows]
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
async def load_primary_keys(conn: 'asyncpg.Connection', schemas: list[str]):
    """Load every PK column declared in *schemas*."""
    rows = await conn.fetch(SCHEMA_PK_QUERY, schemas)
    return [PKConstraint(**row) for row in rows]
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# Column -> data type for a subset of columns of one table
# ($1: schema, $2: table, $3: text[] of column names).
GET_COLUMNS_TYPE_QUERY = """
select
    column_name, data_type
from information_schema.columns
where table_schema = $1 and
      table_name = $2 and
      column_name = ANY ($3::text[])
"""


async def load_columns_type(conn: 'asyncpg.Connection', schema: str,
                            table: str,
                            columns: list[str]):
    """
    Return a ``{column_name: data_type}`` mapping for *columns* of
    *schema*.*table*. Columns that do not exist are silently absent.
    """
    data = await conn.fetch(GET_COLUMNS_TYPE_QUERY, schema,
                            table, columns)
    # Dict comprehension replaces the previous functools.reduce() that
    # rebuilt a brand-new dict per row (accidentally O(n²), harder to read).
    return {row['column_name']: row['data_type'] for row in data}
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
@dataclass
class Database:
    """A database's table graph, populated by `explore` and sized by `load_config`."""
    name: str
    tables: list[Table] = field(default_factory=list)

    async def explore(self, pool: asyncpg.Pool, schemas: list[str], *, load_count: bool = True):
        """
        Load tables, PK/FK constraints and (optionally) row counts for *schemas*,
        and wire up each table's parent/child relationship sets.
        """
        async with pool.acquire() as conn:
            self.tables = await get_tables(conn, schemas)
            fks = await load_foreign_keys(conn, schemas)
            pks = await load_primary_keys(conn, schemas)

        # Index by full name so constraints can be attached to their table
        _tables: dict[str, Table] = {_table.full_name: _table for _table in self.tables}

        for _pk in pks:
            _tables[_pk.full_name].primary_keys.append(_pk)

        for _fk in fks:
            # The FK's owner gains a parent (the referenced table), and the
            # referenced table gains the owner as a child.
            _tables[_fk.full_name].foreign_keys.append(_fk)
            _tables[_fk.full_name].parent_tables.add(_tables[_fk.foreign_full_name])
            _tables[_fk.foreign_full_name].child_tables.add(_tables[_fk.full_name])

        if load_count:
            # One count query per table, each on its own pooled connection
            await asyncio.gather(*[
                get_conn(pool, table.load_count) for table in self.tables
            ])

    def load_config(self, config: Config):
        """
        Loads the sample sizes for each tables from the config file.
        Tables need to have been loaded first
        """
        _schemas: dict[str, SampleType | None] = {schema.schema: schema.sample for schema in config.schemas}
        _tables: dict[str, SampleType | None] = {f'{_table.schema}.{_table.table}': _table.sample for _table in
                                                 config.tables}
        for _table in self.tables:
            _schema_sample = _schemas.get(_table.schema)
            _table_sample = _tables.get(_table.full_name)
            # Precedence: table override > schema override > global default
            _sample = get_first(_table_sample,
                                _schema_sample,
                                config.sample,
                                fn=lambda x: x is not None)
            if _sample is None:
                raise ValueError('_sample must not be empty')
            _table.sample_size = int(_sample)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def pretty_print_stats(database: 'Database'):
    """Print a per-table summary (row count, FKs, parents, children) of *database*."""
    stats = RTable(title=f"Stats for {database.name}")

    stats.add_column("Table", justify="left", style="cyan")
    stats.add_column("Count", justify="right", style="green")
    stats.add_column("# FKs", justify="right", style="magenta")
    stats.add_column("# Parents", justify="right", style="magenta")
    stats.add_column("# Children", justify="right", style="magenta")

    # Alphabetical order keeps the output stable between runs
    for tbl in sorted(database.tables, key=lambda t: t.full_name):
        stats.add_row(tbl.full_name,
                      str(tbl.count),
                      str(len(tbl.foreign_keys)),
                      str(len(tbl.parent_tables)),
                      str(len(tbl.child_tables)))

    console = Console()
    console.print(stats)
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def pprint_compared_dbs(db_1: 'Database', db_2: 'Database'):
    """
    Print a side-by-side row-count comparison of two databases.

    Tables are paired by sorted full name, so both databases are expected
    to contain the same set of tables (e.g. original vs sampled copy).
    """
    table = RTable(title=f"Comparing {db_1.name} and {db_2.name}")

    table.add_column("Table", justify="left", style="blue")
    table.add_column(f"Count {db_1.name!r}", justify="right", style="cyan")
    table.add_column(f"Count {db_2.name!r}", justify="right", style="cyan")
    table.add_column("Diff", justify="right", style="green")

    tables_1 = sorted(db_1.tables, key=lambda x: x.full_name)
    tables_2 = sorted(db_2.tables, key=lambda x: x.full_name)

    for _table1, _table2 in zip(tables_1, tables_2):
        # BUG FIX: guard on the *denominator*. The previous version tested
        # `_table2.count == 0` but divided by `_table1.count`, raising
        # ZeroDivisionError whenever the reference table was empty (and
        # wrongly reporting 100% for empty sampled tables). A dead
        # `if/elif/else: pass` chain was also removed.
        perc_diff = 100 if _table1.count == 0 else int(_table2.count * 100 / _table1.count)
        table.add_row(_table1.full_name,
                      str(_table1.count),
                      str(_table2.count),
                      f'{perc_diff}%')

    console = Console()
    console.print(table)
|
padmy-0.4.0/padmy/env.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
PG_DATABASE = os.getenv('PG_DATABASE', 'postgres')
|
|
4
|
+
PG_HOST = os.getenv('PG_HOST', 'localhost')
|
|
5
|
+
PG_PORT = int(os.getenv('PG_PORT', '5432'))
|
|
6
|
+
PG_USER = os.getenv('PG_USER', 'postgres')
|
|
7
|
+
PG_PASSWORD = os.getenv('PG_PASSWORD', 'postgres')
|
|
8
|
+
|
|
9
|
+
# Migration
|
|
10
|
+
SQL_DIR = os.getenv('SQL_DIR')
|
|
11
|
+
MIGRATION_DIR = os.getenv('MIGRATION_DIR')
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from rich.logging import RichHandler
|
|
3
|
+
|
|
4
|
+
logs = logging.getLogger('padmy')
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def setup_logging(level: int):
    """Configure root logging with a Rich handler and set the package logger level."""
    handler = RichHandler(rich_tracebacks=False, show_path=False)
    logging.basicConfig(
        datefmt="%H:%M:%S",
        format="%(message)s",
        handlers=[handler],
    )
    logs.setLevel(level)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import textwrap
|
|
2
|
+
import time
|
|
3
|
+
import uuid
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.markup import escape
|
|
8
|
+
from rich.prompt import Prompt
|
|
9
|
+
|
|
10
|
+
from padmy.logs import logs
|
|
11
|
+
from .utils import get_git_email, get_files, iter_migration_files
|
|
12
|
+
|
|
13
|
+
_CONSOLE = Console(markup=True, highlight=False)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _get_user_email() -> str | None:
    """Prompt for the migration author, defaulting to the configured git email."""
    return Prompt.ask("[blue]Author[/blue]",
                      default=get_git_email(),
                      console=_CONSOLE)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _get_last_migration_name(folder: Path) -> str | None:
    """Return the file name of the most recent *up* migration, or None if none exist."""
    files = get_files(reverse=True, folder=folder)
    if not files:
        return None
    # Files are sorted newest-first; take the first up/down pair
    up_file, _down_file = next(iter_migration_files(files))
    return up_file.path.name
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def create_new_migration(folder: Path):
    """
    Creates 2 new files, up and down
    """
    folder.mkdir(exist_ok=True, parents=True)

    # <epoch>-<short uuid>: sortable by creation time, still unique
    _base_name = f'{int(time.time())}-{str(uuid.uuid4())[:8]}'
    _CONSOLE.print(f'\nCreating new migration file ([green]{escape(_base_name)}[/green]):\n')

    last_migration = _get_last_migration_name(folder)
    logs.debug(f'Last migration files: {last_migration}')
    author = _get_user_email()
    logs.debug(f'User email: {author}')

    up_file = folder / Path(f'{_base_name}-up.sql')
    down_file = folder / Path(f'{_base_name}-down.sql')

    # Each migration records its predecessor and author in a SQL comment header
    file_header = textwrap.dedent(f"""
    -- Prev-file: {last_migration or ''}
    -- Author: {author or ''}
    """).strip()

    up_file.write_text(file_header)
    down_file.write_text(file_header)

    _CONSOLE.print('\nNew files created!\n')
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
SET SCHEMA 'public';

-- Create the MIGRATION_TYPE enum; CREATE TYPE has no IF NOT EXISTS clause,
-- so the duplicate_object error is swallowed to keep the script idempotent.
DO
$$
    BEGIN
        CREATE TYPE MIGRATION_TYPE AS ENUM (
            'up',
            'down'
            );
    EXCEPTION
        WHEN duplicate_object THEN null;
    END
$$;

-- Journal of applied migrations (one row per applied up/down file)
CREATE TABLE IF NOT EXISTS public.migration
(
    id             serial PRIMARY KEY NOT NULL,
    applied_at     timestamp          NOT NULL DEFAULT now(),
    migration_type MIGRATION_TYPE     NOT NULL,
    file_name      text               NOT NULL,
    -- Timestamp prefix of the migration file name
    file_ts        TIMESTAMP          NOT NULL,
    -- Short uuid part of the migration file name
    file_id        varchar(10)        NOT NULL
);
|