table2db 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
table2db/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ from .converter import TableConverter
2
+ from .models import ConversionResult, TableInfo, ForeignKey, WorkbookData
3
+ from .loaders import BaseLoader, SqliteLoader
4
+ from .errors import (
5
+ ExcelToDbError, FileReadError, NoDataError,
6
+ UnsupportedFormatError, SchemaError,
7
+ )
8
+
9
+ __all__ = [
10
+ "TableConverter",
11
+ "ConversionResult", "TableInfo", "ForeignKey", "WorkbookData",
12
+ "BaseLoader", "SqliteLoader",
13
+ "ExcelToDbError", "FileReadError", "NoDataError",
14
+ "UnsupportedFormatError", "SchemaError",
15
+ ]
table2db/cli.py ADDED
@@ -0,0 +1,187 @@
1
+ """Command-line interface for table2db."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+ import os
7
+
8
+
9
+ def main(argv: list[str] | None = None) -> int:
10
+ parser = argparse.ArgumentParser(
11
+ prog="table2db",
12
+ description="Convert Excel files into clean SQLite databases.",
13
+ )
14
+ subparsers = parser.add_subparsers(dest="command")
15
+
16
+ # convert command
17
+ convert_parser = subparsers.add_parser(
18
+ "convert", help="Convert an Excel file to a SQLite database."
19
+ )
20
+ convert_parser.add_argument("input", help="Path to the Excel file (.xlsx or .xls)")
21
+ convert_parser.add_argument(
22
+ "-o", "--output", default=None,
23
+ help="Output .db file path (default: <input_name>.db in current directory)"
24
+ )
25
+ convert_parser.add_argument(
26
+ "--summary", action="store_true",
27
+ help="Also generate a Markdown summary file (<output>_summary.md)"
28
+ )
29
+ convert_parser.add_argument(
30
+ "--sample-rows", type=int, default=3,
31
+ help="Number of sample rows in summary (default: 3)"
32
+ )
33
+ convert_parser.add_argument(
34
+ "--type-threshold", type=float, default=0.8,
35
+ help="Type inference majority threshold (default: 0.8)"
36
+ )
37
+ convert_parser.add_argument(
38
+ "--fk-threshold", type=float, default=0.8,
39
+ help="Foreign key confidence threshold (default: 0.8)"
40
+ )
41
+
42
+ # describe command
43
+ describe_parser = subparsers.add_parser(
44
+ "describe", help="Generate a Markdown summary of an existing .db file."
45
+ )
46
+ describe_parser.add_argument("db_path", help="Path to the SQLite .db file")
47
+ describe_parser.add_argument(
48
+ "-o", "--output", default=None,
49
+ help="Output .md file path (default: print to stdout)"
50
+ )
51
+ describe_parser.add_argument(
52
+ "--sample-rows", type=int, default=3,
53
+ help="Number of sample rows (default: 3)"
54
+ )
55
+
56
+ args = parser.parse_args(argv)
57
+
58
+ if args.command is None:
59
+ parser.print_help()
60
+ return 1
61
+
62
+ if args.command == "convert":
63
+ return _cmd_convert(args)
64
+ elif args.command == "describe":
65
+ return _cmd_describe(args)
66
+ return 1
67
+
68
+
69
def _cmd_convert(args) -> int:
    """Handle the ``convert`` subcommand.

    Converts ``args.input`` (an Excel file) into a SQLite database and,
    when ``--summary`` is given, writes a Markdown summary next to it.

    Args:
        args: Parsed argparse namespace for the ``convert`` subparser.

    Returns:
        0 on success, 1 if the conversion raised.
    """
    from .converter import TableConverter
    from .loaders import SqliteLoader
    from .describe import generate_db_summary

    # Determine output path: explicit -o wins, otherwise <input stem>.db
    # in the current directory.
    if args.output:
        output_path = args.output
    else:
        base = os.path.splitext(os.path.basename(args.input))[0]
        output_path = f"{base}.db"

    converter = TableConverter(
        type_threshold=args.type_threshold,
        fk_confidence_threshold=args.fk_threshold,
    )
    loader = SqliteLoader(output_path=output_path)

    try:
        result = converter.convert(args.input, loader=loader)
    except Exception as e:
        # CLI boundary: surface any pipeline failure as a message + exit code.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    table_summary = ", ".join(f"{t.name}({t.row_count} rows)" for t in result.tables)
    print(f"Created {output_path}: {len(result.tables)} tables [{table_summary}]")

    if result.warnings:
        for w in result.warnings:
            print(f" Warning: {w}")

    if args.summary:
        summary = generate_db_summary(result, sample_rows=args.sample_rows)
        # BUG FIX: the old `output_path.replace(".db", "_summary.md")` could
        # match ".db" in the middle of the path, or — for an output that does
        # not end in ".db" — match nothing and overwrite the database file
        # itself with the Markdown text. Strip the real extension instead.
        root, _ext = os.path.splitext(output_path)
        summary_path = f"{root}_summary.md"
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write(summary)
        print(f"Summary written to {summary_path}")

    return 0
108
+
109
+
110
def _cmd_describe(args) -> int:
    """Handle the ``describe`` subcommand.

    Rebuilds a minimal ConversionResult by introspecting an existing SQLite
    file (plus the optional ``_meta`` table written at conversion time) and
    renders a Markdown summary from it.

    Args:
        args: Parsed argparse namespace for the ``describe`` subparser.

    Returns:
        0 on success, 1 if the database file does not exist.
    """
    from .models import ConversionResult, TableInfo, ForeignKey
    from .describe import generate_db_summary
    import sqlite3

    if not os.path.exists(args.db_path):
        print(f"Error: File not found: {args.db_path}", file=sys.stderr)
        return 1

    # Build a minimal ConversionResult from the .db file
    conn = sqlite3.connect(args.db_path)
    try:
        # Skip our metadata table and SQLite's internal bookkeeping tables
        # (e.g. sqlite_sequence created by AUTOINCREMENT columns).
        tables_raw = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' "
            "AND name != '_meta' AND name NOT LIKE 'sqlite_%'"
        ).fetchall()

        tables = []
        relationships = []
        source_file = args.db_path

        # Try to read metadata; a database produced elsewhere may not have it.
        try:
            meta = dict(conn.execute("SELECT key, value FROM _meta").fetchall())
            source_file = meta.get("source_file", args.db_path)
        except Exception:
            meta = {}

        for (tbl_name,) in tables_raw:
            row_count = conn.execute(f'SELECT COUNT(*) FROM "{tbl_name}"').fetchone()[0]
            cols_info = conn.execute(f'PRAGMA table_info("{tbl_name}")').fetchall()
            # PRAGMA table_info rows: (cid, name, type, notnull, dflt_value, pk)
            columns = [{"name": c[1], "type": c[2] or "TEXT"} for c in cols_info]
            pk_cols = [c[1] for c in cols_info if c[5] > 0]
            source_sheet = meta.get(f"table:{tbl_name}:source_sheet", tbl_name)
            tables.append(TableInfo(
                name=tbl_name,
                columns=columns,
                row_count=row_count,
                source_sheet=source_sheet,
                primary_key=pk_cols[0] if pk_cols else None,
            ))

        # Read FK relationships from _meta. Keys look like
        # "fk:<from_tbl>.<from_col>-><to_tbl>.<to_col>", value = confidence.
        for key, value in meta.items():
            if not key.startswith("fk:"):
                continue
            # BUG FIX: a malformed key (missing "->" or "." separators, or a
            # non-numeric confidence) used to raise an unhandled ValueError
            # and abort the whole command; skip such entries with a warning.
            try:
                from_part, to_part = key[3:].split("->")
                from_tbl, from_col = from_part.rsplit(".", 1)
                to_tbl, to_col = to_part.rsplit(".", 1)
                confidence = float(value)
            except ValueError:
                print(f"Warning: skipping malformed FK metadata key: {key}",
                      file=sys.stderr)
                continue
            relationships.append(ForeignKey(
                from_table=from_tbl, from_column=from_col,
                to_table=to_tbl, to_column=to_col,
                confidence=confidence,
            ))
    finally:
        conn.close()

    result = ConversionResult(
        db_path=args.db_path,
        tables=tables,
        relationships=relationships,
        warnings=[],
        metadata={"source_file": source_file},
    )

    summary = generate_db_summary(result, sample_rows=args.sample_rows)

    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(summary)
        print(f"Summary written to {args.output}")
    else:
        print(summary)

    return 0
184
+
185
+
186
# Allow direct execution: `python -m table2db.cli` or `python cli.py`.
if __name__ == "__main__":
    sys.exit(main())
table2db/converter.py ADDED
@@ -0,0 +1,122 @@
1
+ """TableConverter — orchestrates the 6-stage pipeline."""
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import logging
6
+ from typing import BinaryIO, Union
7
+ from .models import WorkbookData, ConversionResult
8
+ from .pipeline.reader import read_workbook
9
+ from .pipeline.structure import detect_structure
10
+ from .pipeline.cleaner import clean_data
11
+ from .pipeline.typer import infer_types
12
+ from .pipeline.relator import infer_relationships
13
+ from .loaders.sqlite_loader import SqliteLoader
14
+ from .loaders.base import BaseLoader
15
+ from .errors import NoDataError
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class TableConverter:
    """Orchestrates the six-stage Excel-to-database conversion pipeline."""

    def __init__(
        self,
        subtotal_keywords: list[str] | None = None,
        type_threshold: float = 0.8,
        skip_hidden_sheets: bool = True,
        fk_confidence_threshold: float = 0.8,
        header_min_fill_ratio: float = 0.5,
        header_min_string_ratio: float = 0.7,
    ):
        # Tuning knobs for the individual pipeline stages; each is simply
        # stored here and forwarded to the relevant stage function.
        self.subtotal_keywords = subtotal_keywords
        self.type_threshold = type_threshold
        self.skip_hidden_sheets = skip_hidden_sheets
        self.fk_confidence_threshold = fk_confidence_threshold
        self.header_min_fill_ratio = header_min_fill_ratio
        self.header_min_string_ratio = header_min_string_ratio

    def process(
        self, source: Union[str, BinaryIO], file_name: str | None = None
    ) -> tuple[WorkbookData, list[str]]:
        """Run stages 1-5 (read, structure, clean, type, relate).

        Args:
            source: File path (str) or file-like object (BytesIO, UploadFile.file).
            file_name: Original file name (required when source is a stream).

        Returns:
            Tuple of the processed WorkbookData and accumulated warnings.

        Raises:
            NoDataError: When no usable sheets survive reading or cleaning.
        """
        collected: list[str] = []
        # Human-readable label for log/error messages when source is a stream.
        label = source if isinstance(source, str) else (file_name or "stream")

        logger.info("Stage 1: Reading workbook from %s", label)
        workbook = read_workbook(
            source,
            skip_hidden_sheets=self.skip_hidden_sheets,
            file_name=file_name,
        )
        for sheet in workbook.sheets:
            collected.extend(sheet.metadata.get("warnings", []))

        logger.info("Stage 2: Detecting structure (%d sheets)", len(workbook.sheets))
        workbook, stage_warnings = detect_structure(
            workbook,
            header_min_fill_ratio=self.header_min_fill_ratio,
            header_min_string_ratio=self.header_min_string_ratio,
        )
        collected.extend(stage_warnings)

        if not workbook.sheets:
            raise NoDataError(f"No valid sheets found in {label}")

        logger.info("Stage 3: Cleaning data (%d sheets)", len(workbook.sheets))
        workbook, stage_warnings = clean_data(
            workbook, subtotal_keywords=self.subtotal_keywords
        )
        collected.extend(stage_warnings)

        if not workbook.sheets:
            raise NoDataError(f"No data remaining after cleaning in {label}")

        logger.info("Stage 4: Inferring types")
        workbook = infer_types(workbook, type_threshold=self.type_threshold)

        logger.info("Stage 5: Inferring relationships")
        workbook = infer_relationships(
            workbook, fk_confidence_threshold=self.fk_confidence_threshold
        )

        return workbook, collected

    def convert(
        self,
        source: Union[str, BinaryIO],
        loader: BaseLoader | None = None,
        file_name: str | None = None,
    ) -> ConversionResult:
        """Run the full pipeline (stages 1-6) and return a ConversionResult.

        Args:
            source: File path (str) or file-like object (BytesIO, UploadFile.file).
            loader: Optional custom loader; a fresh SqliteLoader by default.
            file_name: Original file name (required when source is a stream).
        """
        workbook, collected = self.process(source, file_name=file_name)

        sink = loader if loader is not None else SqliteLoader()

        logger.info("Stage 6: Loading with %s", type(sink).__name__)
        outcome = sink.load(workbook)
        # Attach stage 1-5 warnings so callers see the complete picture.
        outcome.warnings = collected

        logger.info("Conversion complete: %d tables, %d warnings",
                    len(outcome.tables), len(outcome.warnings))
        return outcome

    async def convert_async(
        self,
        source: Union[str, BinaryIO],
        loader: BaseLoader | None = None,
        file_name: str | None = None,
    ) -> ConversionResult:
        """Async wrapper around :meth:`convert`; runs it in a worker thread."""
        return await asyncio.to_thread(self.convert, source, loader, file_name)

    async def process_async(
        self, source: Union[str, BinaryIO], file_name: str | None = None
    ) -> tuple[WorkbookData, list[str]]:
        """Async wrapper around :meth:`process`; runs it in a worker thread."""
        return await asyncio.to_thread(self.process, source, file_name)
table2db/describe.py ADDED
@@ -0,0 +1,150 @@
1
+ """Generate a Markdown summary of a ConversionResult database."""
2
+ from __future__ import annotations
3
+
4
+ import sqlite3
5
+ from table2db.models import ConversionResult
6
+
7
+
8
def generate_db_summary(result: ConversionResult, sample_rows: int = 3) -> str:
    """Render the SQLite database referenced by *result* as Markdown.

    Args:
        result: Conversion output whose ``db_path`` points at the database.
        sample_rows: Number of example rows to include per table.

    Returns:
        The complete Markdown summary as a single string.
    """
    connection = sqlite3.connect(result.db_path)
    connection.row_factory = sqlite3.Row
    try:
        return _build_summary(connection, result, sample_rows)
    finally:
        # Always release the connection, even if summary building raises.
        connection.close()
16
+
17
+
18
def _build_summary(
    conn: sqlite3.Connection,
    result: ConversionResult,
    sample_rows: int,
) -> str:
    """Assemble the Markdown summary text for all tables in *result*.

    Args:
        conn: Open connection to the converted database.
        result: Table/relationship metadata gathered during conversion.
        sample_rows: Number of example rows to show per table.

    Returns:
        The Markdown document as a newline-joined string.
    """
    source = result.metadata.get("source_file", "unknown")
    lines: list[str] = [
        "# Database Summary",
        "",
        f"**Source:** {source}",
        f"**Tables:** {len(result.tables)}",
        "",
        "---",
    ]

    for table in result.tables:
        tname = table.name
        row_count = table.row_count
        pk = table.primary_key or "None"

        lines.append("")
        lines.append(f"## Table: {tname}")
        lines.append("")
        lines.append(
            f"**Rows:** {row_count} | **Source Sheet:** {table.source_sheet} "
            f"| **Primary Key:** {pk}"
        )

        # --- Columns ---
        lines.append("")
        lines.append("### Columns")
        lines.append("")
        lines.append("| Column | Type |")
        lines.append("|--------|------|")
        for col in table.columns:
            lines.append(f"| {col['name']} | {col['type']} |")

        # --- Sample Data ---
        lines.append("")
        lines.append(f"### Sample Data (first {sample_rows} rows)")
        lines.append("")
        col_names = [c["name"] for c in table.columns]
        lines.append("| " + " | ".join(col_names) + " |")
        lines.append("| " + " | ".join("---" for _ in col_names) + " |")

        # Quote identifiers: sheet-derived names may contain spaces etc.
        quoted_cols = ", ".join(f'"{c}"' for c in col_names)
        cur = conn.execute(
            f'SELECT {quoted_cols} FROM "{tname}" LIMIT ?', (sample_rows,)
        )
        for row in cur:
            cells = [_fmt(row[i]) for i in range(len(col_names))]
            lines.append("| " + " | ".join(cells) + " |")

        # --- Column Statistics ---
        lines.append("")
        lines.append("### Column Statistics")
        lines.append("")
        _append_stats(conn, tname, table, lines, row_count)

    # --- Relationships ---
    # FIX: this section renders the *global* relationship list, so it must
    # run exactly once after the per-table loop rather than inside it (where
    # it would be duplicated for every table).
    if result.relationships:
        lines.append("")
        lines.append("### Relationships")
        lines.append("")
        lines.append("| From | → | To |")
        lines.append("|------|---|----|")
        for fk in result.relationships:
            lines.append(
                f"| {fk.from_table}.{fk.from_column} | → "
                f"| {fk.to_table}.{fk.to_column} |"
            )

    return "\n".join(lines)
91
+
92
+
93
def _append_stats(
    conn: sqlite3.Connection,
    tname: str,
    table,
    lines: list[str],
    row_count: int,
) -> None:
    """Append a per-column statistics Markdown table for *tname* to *lines*.

    INTEGER/REAL columns get min/max/avg aggregates; every other column gets
    a follow-up row listing its three most frequent non-NULL values instead.

    Args:
        conn: Open connection to the database being described.
        tname: Table name (quoted into SQL, so it may contain spaces).
        table: Table metadata object exposing ``columns`` dicts (name/type).
        lines: Output accumulator; rows are appended in place.
        row_count: Total rows in the table, used for the NULL percentage.
    """
    lines.append("| Column | Type | Null % | Min | Max | Avg | Distinct |")
    lines.append("|--------|------|--------|-----|-----|-----|----------|")

    for column in table.columns:
        cname = column["name"]
        ctype = column["type"]

        # Share of NULL entries; guard against an empty table (division by 0).
        nulls = conn.execute(
            f'SELECT COUNT(*) FROM "{tname}" WHERE "{cname}" IS NULL'
        ).fetchone()[0]
        if row_count:
            null_pct = f"{nulls / row_count * 100:.0f}%"
        else:
            null_pct = "N/A"

        distinct = conn.execute(
            f'SELECT COUNT(DISTINCT "{cname}") FROM "{tname}"'
        ).fetchone()[0]

        if ctype not in ("INTEGER", "REAL"):
            # Text-ish column: no numeric aggregates; list top values below.
            lines.append(
                f"| {cname} | {ctype} | {null_pct} | - | - | - | {distinct} |"
            )
            frequent = conn.execute(
                f'SELECT "{cname}", COUNT(*) as cnt FROM "{tname}" '
                f'WHERE "{cname}" IS NOT NULL '
                f'GROUP BY "{cname}" ORDER BY cnt DESC LIMIT 3'
            ).fetchall()
            top_vals = ", ".join(f"{r[0]}({r[1]})" for r in frequent)
            if top_vals:
                lines.append(f"| | Top values: {top_vals} |||||")
        else:
            agg = conn.execute(
                f'SELECT MIN("{cname}"), MAX("{cname}"), AVG("{cname}") '
                f'FROM "{tname}"'
            ).fetchone()
            mn, mx, avg = _fmt(agg[0]), _fmt(agg[1]), _fmt(agg[2])
            lines.append(
                f"| {cname} | {ctype} | {null_pct} | {mn} | {mx} | {avg} | {distinct} |"
            )
145
+
146
+
147
+ def _fmt(value) -> str:
148
+ if value is None:
149
+ return "NULL"
150
+ return str(value)
table2db/errors.py ADDED
@@ -0,0 +1,14 @@
1
class ExcelToDbError(Exception):
    """Root of the table2db exception hierarchy; catch this for any library error."""


class FileReadError(ExcelToDbError):
    """Raised when the file cannot be read (corrupted, password-protected, missing)."""


class NoDataError(ExcelToDbError):
    """Raised when the file is readable but holds no usable data."""


class UnsupportedFormatError(ExcelToDbError):
    """Raised for file formats the library does not handle (e.g. .xlsb)."""


class SchemaError(ExcelToDbError):
    """Raised when no valid table structure can be inferred (e.g. no header row)."""
@@ -0,0 +1,4 @@
1
+ from .base import BaseLoader
2
+ from .sqlite_loader import SqliteLoader
3
+
4
+ __all__ = ["BaseLoader", "SqliteLoader"]
@@ -0,0 +1,22 @@
1
+ """Base loader protocol for table2db."""
2
+ from __future__ import annotations
3
+ from abc import ABC, abstractmethod
4
+ from table2db.models import WorkbookData, ConversionResult
5
+
6
+
7
class BaseLoader(ABC):
    """Abstract interface that every table2db database loader implements.

    Subclass it and implement :meth:`load` to target a new backend:

    Example:
        class MyPostgresLoader(BaseLoader):
            def load(self, wb: WorkbookData) -> ConversionResult:
                # Create tables and insert data into PostgreSQL
                ...
    """

    @abstractmethod
    def load(self, wb: WorkbookData) -> ConversionResult:
        """Persist *wb* into a database and return the resulting ConversionResult."""
        ...