table2db 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
table2db/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from .converter import TableConverter
2
+ from .models import ConversionResult, TableInfo, ForeignKey, WorkbookData
3
+ from .loaders import BaseLoader, SqliteLoader
4
+ from .errors import (
5
+ ExcelToDbError, FileReadError, NoDataError,
6
+ UnsupportedFormatError, SchemaError,
7
+ )
8
+
9
+ __all__ = [
10
+ "TableConverter",
11
+ "ConversionResult", "TableInfo", "ForeignKey", "WorkbookData",
12
+ "BaseLoader", "SqliteLoader",
13
+ "ExcelToDbError", "FileReadError", "NoDataError",
14
+ "UnsupportedFormatError", "SchemaError",
15
+ ]
table2db/cli.py ADDED
@@ -0,0 +1,187 @@
1
+ """Command-line interface for table2db."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+ import os
7
+
8
+
9
+ def main(argv: list[str] | None = None) -> int:
10
+ parser = argparse.ArgumentParser(
11
+ prog="table2db",
12
+ description="Convert Excel files into clean SQLite databases.",
13
+ )
14
+ subparsers = parser.add_subparsers(dest="command")
15
+
16
+ # convert command
17
+ convert_parser = subparsers.add_parser(
18
+ "convert", help="Convert an Excel file to a SQLite database."
19
+ )
20
+ convert_parser.add_argument("input", help="Path to the Excel file (.xlsx or .xls)")
21
+ convert_parser.add_argument(
22
+ "-o", "--output", default=None,
23
+ help="Output .db file path (default: <input_name>.db in current directory)"
24
+ )
25
+ convert_parser.add_argument(
26
+ "--summary", action="store_true",
27
+ help="Also generate a Markdown summary file (<output>_summary.md)"
28
+ )
29
+ convert_parser.add_argument(
30
+ "--sample-rows", type=int, default=3,
31
+ help="Number of sample rows in summary (default: 3)"
32
+ )
33
+ convert_parser.add_argument(
34
+ "--type-threshold", type=float, default=0.8,
35
+ help="Type inference majority threshold (default: 0.8)"
36
+ )
37
+ convert_parser.add_argument(
38
+ "--fk-threshold", type=float, default=0.8,
39
+ help="Foreign key confidence threshold (default: 0.8)"
40
+ )
41
+
42
+ # describe command
43
+ describe_parser = subparsers.add_parser(
44
+ "describe", help="Generate a Markdown summary of an existing .db file."
45
+ )
46
+ describe_parser.add_argument("db_path", help="Path to the SQLite .db file")
47
+ describe_parser.add_argument(
48
+ "-o", "--output", default=None,
49
+ help="Output .md file path (default: print to stdout)"
50
+ )
51
+ describe_parser.add_argument(
52
+ "--sample-rows", type=int, default=3,
53
+ help="Number of sample rows (default: 3)"
54
+ )
55
+
56
+ args = parser.parse_args(argv)
57
+
58
+ if args.command is None:
59
+ parser.print_help()
60
+ return 1
61
+
62
+ if args.command == "convert":
63
+ return _cmd_convert(args)
64
+ elif args.command == "describe":
65
+ return _cmd_describe(args)
66
+ return 1
67
+
68
+
69
def _cmd_convert(args) -> int:
    """Handle the ``convert`` subcommand.

    Converts ``args.input`` (an Excel file) into a SQLite database and,
    when ``--summary`` is given, writes a Markdown summary next to it.

    Args:
        args: Parsed argparse namespace for the ``convert`` subparser.

    Returns:
        0 on success, 1 if the conversion raised.
    """
    from .converter import TableConverter
    from .loaders import SqliteLoader
    from .describe import generate_db_summary

    # Determine output path: explicit -o wins, otherwise <input stem>.db
    # in the current directory.
    if args.output:
        output_path = args.output
    else:
        base = os.path.splitext(os.path.basename(args.input))[0]
        output_path = f"{base}.db"

    converter = TableConverter(
        type_threshold=args.type_threshold,
        fk_confidence_threshold=args.fk_threshold,
    )
    loader = SqliteLoader(output_path=output_path)

    try:
        result = converter.convert(args.input, loader=loader)
    except Exception as e:
        # CLI boundary: surface any pipeline failure as a message + exit code.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    table_summary = ", ".join(f"{t.name}({t.row_count} rows)" for t in result.tables)
    print(f"Created {output_path}: {len(result.tables)} tables [{table_summary}]")

    if result.warnings:
        for w in result.warnings:
            print(f" Warning: {w}")

    if args.summary:
        summary = generate_db_summary(result, sample_rows=args.sample_rows)
        # BUG FIX: the old `output_path.replace(".db", "_summary.md")` could
        # match ".db" in the middle of the path, or — for an output that does
        # not end in ".db" — match nothing and overwrite the database file
        # itself with the Markdown text. Strip the real extension instead.
        root, _ext = os.path.splitext(output_path)
        summary_path = f"{root}_summary.md"
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write(summary)
        print(f"Summary written to {summary_path}")

    return 0
108
+
109
+
110
def _cmd_describe(args) -> int:
    """Handle the ``describe`` subcommand.

    Rebuilds a minimal ConversionResult by introspecting an existing SQLite
    file (plus the optional ``_meta`` table written at conversion time) and
    renders a Markdown summary from it.

    Args:
        args: Parsed argparse namespace for the ``describe`` subparser.

    Returns:
        0 on success, 1 if the database file does not exist.
    """
    from .models import ConversionResult, TableInfo, ForeignKey
    from .describe import generate_db_summary
    import sqlite3

    if not os.path.exists(args.db_path):
        print(f"Error: File not found: {args.db_path}", file=sys.stderr)
        return 1

    # Build a minimal ConversionResult from the .db file
    conn = sqlite3.connect(args.db_path)
    try:
        # Skip our metadata table and SQLite's internal bookkeeping tables
        # (e.g. sqlite_sequence created by AUTOINCREMENT columns).
        tables_raw = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' "
            "AND name != '_meta' AND name NOT LIKE 'sqlite_%'"
        ).fetchall()

        tables = []
        relationships = []
        source_file = args.db_path

        # Try to read metadata; a database produced elsewhere may not have it.
        try:
            meta = dict(conn.execute("SELECT key, value FROM _meta").fetchall())
            source_file = meta.get("source_file", args.db_path)
        except Exception:
            meta = {}

        for (tbl_name,) in tables_raw:
            row_count = conn.execute(f'SELECT COUNT(*) FROM "{tbl_name}"').fetchone()[0]
            cols_info = conn.execute(f'PRAGMA table_info("{tbl_name}")').fetchall()
            # PRAGMA table_info rows: (cid, name, type, notnull, dflt_value, pk)
            columns = [{"name": c[1], "type": c[2] or "TEXT"} for c in cols_info]
            pk_cols = [c[1] for c in cols_info if c[5] > 0]
            source_sheet = meta.get(f"table:{tbl_name}:source_sheet", tbl_name)
            tables.append(TableInfo(
                name=tbl_name,
                columns=columns,
                row_count=row_count,
                source_sheet=source_sheet,
                primary_key=pk_cols[0] if pk_cols else None,
            ))

        # Read FK relationships from _meta. Keys look like
        # "fk:<from_tbl>.<from_col>-><to_tbl>.<to_col>", value = confidence.
        for key, value in meta.items():
            if not key.startswith("fk:"):
                continue
            # BUG FIX: a malformed key (missing "->" or "." separators, or a
            # non-numeric confidence) used to raise an unhandled ValueError
            # and abort the whole command; skip such entries with a warning.
            try:
                from_part, to_part = key[3:].split("->")
                from_tbl, from_col = from_part.rsplit(".", 1)
                to_tbl, to_col = to_part.rsplit(".", 1)
                confidence = float(value)
            except ValueError:
                print(f"Warning: skipping malformed FK metadata key: {key}",
                      file=sys.stderr)
                continue
            relationships.append(ForeignKey(
                from_table=from_tbl, from_column=from_col,
                to_table=to_tbl, to_column=to_col,
                confidence=confidence,
            ))
    finally:
        conn.close()

    result = ConversionResult(
        db_path=args.db_path,
        tables=tables,
        relationships=relationships,
        warnings=[],
        metadata={"source_file": source_file},
    )

    summary = generate_db_summary(result, sample_rows=args.sample_rows)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(summary)
        print(f"Summary written to {args.output}")
    else:
        print(summary)

    return 0
184
+
185
+
186
# Allow direct execution: `python -m table2db.cli` or `python cli.py`.
if __name__ == "__main__":
    sys.exit(main())
table2db/converter.py ADDED
@@ -0,0 +1,122 @@
1
+ """TableConverter — orchestrates the 6-stage pipeline."""
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import logging
6
+ from typing import BinaryIO, Union
7
+ from .models import WorkbookData, ConversionResult
8
+ from .pipeline.reader import read_workbook
9
+ from .pipeline.structure import detect_structure
10
+ from .pipeline.cleaner import clean_data
11
+ from .pipeline.typer import infer_types
12
+ from .pipeline.relator import infer_relationships
13
+ from .loaders.sqlite_loader import SqliteLoader
14
+ from .loaders.base import BaseLoader
15
+ from .errors import NoDataError
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class TableConverter:
    """Orchestrates the six-stage Excel-to-database conversion pipeline."""

    def __init__(
        self,
        subtotal_keywords: list[str] | None = None,
        type_threshold: float = 0.8,
        skip_hidden_sheets: bool = True,
        fk_confidence_threshold: float = 0.8,
        header_min_fill_ratio: float = 0.5,
        header_min_string_ratio: float = 0.7,
    ):
        # Tuning knobs for the individual pipeline stages; each is simply
        # stored here and forwarded to the relevant stage function.
        self.subtotal_keywords = subtotal_keywords
        self.type_threshold = type_threshold
        self.skip_hidden_sheets = skip_hidden_sheets
        self.fk_confidence_threshold = fk_confidence_threshold
        self.header_min_fill_ratio = header_min_fill_ratio
        self.header_min_string_ratio = header_min_string_ratio

    def process(
        self, source: Union[str, BinaryIO], file_name: str | None = None
    ) -> tuple[WorkbookData, list[str]]:
        """Run stages 1-5 (read, structure, clean, type, relate).

        Args:
            source: File path (str) or file-like object (BytesIO, UploadFile.file).
            file_name: Original file name (required when source is a stream).

        Returns:
            Tuple of the processed WorkbookData and accumulated warnings.

        Raises:
            NoDataError: When no usable sheets survive reading or cleaning.
        """
        collected: list[str] = []
        # Human-readable label for log/error messages when source is a stream.
        label = source if isinstance(source, str) else (file_name or "stream")

        logger.info("Stage 1: Reading workbook from %s", label)
        workbook = read_workbook(
            source,
            skip_hidden_sheets=self.skip_hidden_sheets,
            file_name=file_name,
        )
        for sheet in workbook.sheets:
            collected.extend(sheet.metadata.get("warnings", []))

        logger.info("Stage 2: Detecting structure (%d sheets)", len(workbook.sheets))
        workbook, stage_warnings = detect_structure(
            workbook,
            header_min_fill_ratio=self.header_min_fill_ratio,
            header_min_string_ratio=self.header_min_string_ratio,
        )
        collected.extend(stage_warnings)

        if not workbook.sheets:
            raise NoDataError(f"No valid sheets found in {label}")

        logger.info("Stage 3: Cleaning data (%d sheets)", len(workbook.sheets))
        workbook, stage_warnings = clean_data(
            workbook, subtotal_keywords=self.subtotal_keywords
        )
        collected.extend(stage_warnings)

        if not workbook.sheets:
            raise NoDataError(f"No data remaining after cleaning in {label}")

        logger.info("Stage 4: Inferring types")
        workbook = infer_types(workbook, type_threshold=self.type_threshold)

        logger.info("Stage 5: Inferring relationships")
        workbook = infer_relationships(
            workbook, fk_confidence_threshold=self.fk_confidence_threshold
        )

        return workbook, collected

    def convert(
        self,
        source: Union[str, BinaryIO],
        loader: BaseLoader | None = None,
        file_name: str | None = None,
    ) -> ConversionResult:
        """Run the full pipeline (stages 1-6) and return a ConversionResult.

        Args:
            source: File path (str) or file-like object (BytesIO, UploadFile.file).
            loader: Optional custom loader; a fresh SqliteLoader by default.
            file_name: Original file name (required when source is a stream).
        """
        workbook, collected = self.process(source, file_name=file_name)

        sink = loader if loader is not None else SqliteLoader()

        logger.info("Stage 6: Loading with %s", type(sink).__name__)
        outcome = sink.load(workbook)
        # Attach stage 1-5 warnings so callers see the complete picture.
        outcome.warnings = collected

        logger.info("Conversion complete: %d tables, %d warnings",
                    len(outcome.tables), len(outcome.warnings))
        return outcome

    async def convert_async(
        self,
        source: Union[str, BinaryIO],
        loader: BaseLoader | None = None,
        file_name: str | None = None,
    ) -> ConversionResult:
        """Async wrapper around :meth:`convert`; runs it in a worker thread."""
        return await asyncio.to_thread(self.convert, source, loader, file_name)

    async def process_async(
        self, source: Union[str, BinaryIO], file_name: str | None = None
    ) -> tuple[WorkbookData, list[str]]:
        """Async wrapper around :meth:`process`; runs it in a worker thread."""
        return await asyncio.to_thread(self.process, source, file_name)
table2db/describe.py ADDED
@@ -0,0 +1,150 @@
1
+ """Generate a Markdown summary of a ConversionResult database."""
2
+ from __future__ import annotations
3
+
4
+ import sqlite3
5
+ from table2db.models import ConversionResult
6
+
7
+
8
def generate_db_summary(result: ConversionResult, sample_rows: int = 3) -> str:
    """Render the SQLite database referenced by *result* as Markdown.

    Args:
        result: Conversion output whose ``db_path`` points at the database.
        sample_rows: Number of example rows to include per table.

    Returns:
        The complete Markdown summary as a single string.
    """
    connection = sqlite3.connect(result.db_path)
    connection.row_factory = sqlite3.Row
    try:
        return _build_summary(connection, result, sample_rows)
    finally:
        # Always release the connection, even if summary building raises.
        connection.close()
16
+
17
+
18
def _build_summary(
    conn: sqlite3.Connection,
    result: ConversionResult,
    sample_rows: int,
) -> str:
    """Assemble the Markdown summary text for all tables in *result*.

    Args:
        conn: Open connection to the converted database.
        result: Table/relationship metadata gathered during conversion.
        sample_rows: Number of example rows to show per table.

    Returns:
        The Markdown document as a newline-joined string.
    """
    source = result.metadata.get("source_file", "unknown")
    lines: list[str] = [
        "# Database Summary",
        "",
        f"**Source:** {source}",
        f"**Tables:** {len(result.tables)}",
        "",
        "---",
    ]

    for table in result.tables:
        tname = table.name
        row_count = table.row_count
        pk = table.primary_key or "None"

        lines.append("")
        lines.append(f"## Table: {tname}")
        lines.append("")
        lines.append(
            f"**Rows:** {row_count} | **Source Sheet:** {table.source_sheet} "
            f"| **Primary Key:** {pk}"
        )

        # --- Columns ---
        lines.append("")
        lines.append("### Columns")
        lines.append("")
        lines.append("| Column | Type |")
        lines.append("|--------|------|")
        for col in table.columns:
            lines.append(f"| {col['name']} | {col['type']} |")

        # --- Sample Data ---
        lines.append("")
        lines.append(f"### Sample Data (first {sample_rows} rows)")
        lines.append("")
        col_names = [c["name"] for c in table.columns]
        lines.append("| " + " | ".join(col_names) + " |")
        lines.append("| " + " | ".join("---" for _ in col_names) + " |")

        # Quote identifiers: sheet-derived names may contain spaces etc.
        quoted_cols = ", ".join(f'"{c}"' for c in col_names)
        cur = conn.execute(
            f'SELECT {quoted_cols} FROM "{tname}" LIMIT ?', (sample_rows,)
        )
        for row in cur:
            cells = [_fmt(row[i]) for i in range(len(col_names))]
            lines.append("| " + " | ".join(cells) + " |")

        # --- Column Statistics ---
        lines.append("")
        lines.append("### Column Statistics")
        lines.append("")
        _append_stats(conn, tname, table, lines, row_count)

    # --- Relationships ---
    # FIX: this section renders the *global* relationship list, so it must
    # run exactly once after the per-table loop rather than inside it (where
    # it would be duplicated for every table).
    if result.relationships:
        lines.append("")
        lines.append("### Relationships")
        lines.append("")
        lines.append("| From | → | To |")
        lines.append("|------|---|----|")
        for fk in result.relationships:
            lines.append(
                f"| {fk.from_table}.{fk.from_column} | → "
                f"| {fk.to_table}.{fk.to_column} |"
            )

    return "\n".join(lines)
91
+
92
+
93
def _append_stats(
    conn: sqlite3.Connection,
    tname: str,
    table,
    lines: list[str],
    row_count: int,
) -> None:
    """Append a per-column statistics Markdown table for *tname* to *lines*.

    INTEGER/REAL columns get min/max/avg aggregates; every other column gets
    a follow-up row listing its three most frequent non-NULL values instead.

    Args:
        conn: Open connection to the database being described.
        tname: Table name (quoted into SQL, so it may contain spaces).
        table: Table metadata object exposing ``columns`` dicts (name/type).
        lines: Output accumulator; rows are appended in place.
        row_count: Total rows in the table, used for the NULL percentage.
    """
    lines.append("| Column | Type | Null % | Min | Max | Avg | Distinct |")
    lines.append("|--------|------|--------|-----|-----|-----|----------|")

    for column in table.columns:
        cname = column["name"]
        ctype = column["type"]

        # Share of NULL entries; guard against an empty table (division by 0).
        nulls = conn.execute(
            f'SELECT COUNT(*) FROM "{tname}" WHERE "{cname}" IS NULL'
        ).fetchone()[0]
        if row_count:
            null_pct = f"{nulls / row_count * 100:.0f}%"
        else:
            null_pct = "N/A"

        distinct = conn.execute(
            f'SELECT COUNT(DISTINCT "{cname}") FROM "{tname}"'
        ).fetchone()[0]

        if ctype not in ("INTEGER", "REAL"):
            # Text-ish column: no numeric aggregates; list top values below.
            lines.append(
                f"| {cname} | {ctype} | {null_pct} | - | - | - | {distinct} |"
            )
            frequent = conn.execute(
                f'SELECT "{cname}", COUNT(*) as cnt FROM "{tname}" '
                f'WHERE "{cname}" IS NOT NULL '
                f'GROUP BY "{cname}" ORDER BY cnt DESC LIMIT 3'
            ).fetchall()
            top_vals = ", ".join(f"{r[0]}({r[1]})" for r in frequent)
            if top_vals:
                lines.append(f"| | Top values: {top_vals} |||||")
        else:
            agg = conn.execute(
                f'SELECT MIN("{cname}"), MAX("{cname}"), AVG("{cname}") '
                f'FROM "{tname}"'
            ).fetchone()
            mn, mx, avg = _fmt(agg[0]), _fmt(agg[1]), _fmt(agg[2])
            lines.append(
                f"| {cname} | {ctype} | {null_pct} | {mn} | {mx} | {avg} | {distinct} |"
            )
145
+
146
+
147
+ def _fmt(value) -> str:
148
+ if value is None:
149
+ return "NULL"
150
+ return str(value)
table2db/errors.py ADDED
@@ -0,0 +1,14 @@
1
class ExcelToDbError(Exception):
    """Root of the table2db exception hierarchy; catch this for any library error."""


class FileReadError(ExcelToDbError):
    """Raised when the file cannot be read (corrupted, password-protected, missing)."""


class NoDataError(ExcelToDbError):
    """Raised when the file is readable but holds no usable data."""


class UnsupportedFormatError(ExcelToDbError):
    """Raised for file formats the library does not handle (e.g. .xlsb)."""


class SchemaError(ExcelToDbError):
    """Raised when no valid table structure can be inferred (e.g. no header row)."""
@@ -0,0 +1,4 @@
1
+ from .base import BaseLoader
2
+ from .sqlite_loader import SqliteLoader
3
+
4
+ __all__ = ["BaseLoader", "SqliteLoader"]
@@ -0,0 +1,22 @@
1
+ """Base loader protocol for table2db."""
2
+ from __future__ import annotations
3
+ from abc import ABC, abstractmethod
4
+ from table2db.models import WorkbookData, ConversionResult
5
+
6
+
7
class BaseLoader(ABC):
    """Abstract interface that every table2db database loader implements.

    Subclass it and implement :meth:`load` to target a new backend:

    Example:
        class MyPostgresLoader(BaseLoader):
            def load(self, wb: WorkbookData) -> ConversionResult:
                # Create tables and insert data into PostgreSQL
                ...
    """

    @abstractmethod
    def load(self, wb: WorkbookData) -> ConversionResult:
        """Persist *wb* into a database and return the resulting ConversionResult."""
        ...