seedloom 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
seedloom/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
seedloom/cli.py ADDED
@@ -0,0 +1,195 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import click
8
+ import psycopg2
9
+ from rich.console import Console
10
+ from rich.table import Table as RichTable
11
+
12
+ from .config import Config
13
+ from .generator import generate_rows
14
+ from .graph import CyclicDependencyError, resolve_seed_order
15
+ from .inserter import existing_column_values, insert_rows, table_row_count
16
+ from .introspect import introspect
17
+ from .models import Schema
18
+ from .providers import ProviderError, SUPPORTED_PROVIDERS, get_provider
19
+
20
+ console = Console()
21
+ SCHEMA_CACHE = Path(".seedloom_schema.json")
22
+
23
+
24
@click.group()
def main() -> None:
    """seedloom — AI-powered database seeding.

    Introspects your Postgres schema and uses an LLM provider (Anthropic by
    default; override with `seedloom run --provider`) to generate realistic,
    referentially-valid seed data.
    """
    # Click group entry point: the subcommands attach themselves through
    # @main.command(), so the group itself has nothing to execute.
31
+
32
+
33
@main.command()
def init() -> None:
    """Connect to the database, introspect the schema, and cache it locally."""
    # No provider/API key is needed just to read the schema.
    try:
        config = Config.load(require_provider=False)
    except EnvironmentError as exc:
        console.print(f"[red]{exc}[/red]")
        sys.exit(1)

    console.print("[cyan]Connecting and introspecting schema...[/cyan]")
    try:
        schema = introspect(config.database_url)
    except psycopg2.OperationalError as exc:
        console.print(f"[red]Could not connect to database: {exc}[/red]")
        sys.exit(1)

    # An empty schema is not an error, but there is nothing worth caching.
    if not schema.tables:
        console.print("[yellow]No tables found in the 'public' schema.[/yellow]")
        sys.exit(0)

    serialized = json.dumps(schema.to_dict(), indent=2)
    SCHEMA_CACHE.write_text(serialized)

    # Summarise what was discovered: one line per table.
    summary = RichTable(title="Discovered schema")
    for heading in ("Table", "Columns", "Foreign Keys"):
        summary.add_column(heading)
    for table in schema.tables.values():
        fk_labels = [
            f"{fk.column}->{fk.ref_table}.{fk.ref_column}" for fk in table.foreign_keys
        ]
        column_count = str(len(table.columns))
        summary.add_row(table.name, column_count, ", ".join(fk_labels) or "-")
    console.print(summary)
    console.print(f"[green]Schema cached to {SCHEMA_CACHE}[/green]. Run 'seedloom run' next.")
64
+
65
+
66
@main.command()
@click.option("--rows", default=10, show_default=True, help="Rows to generate per table.")
@click.option("--tables", default=None, help="Comma-separated subset of tables to seed (default: all).")
@click.option("--dry-run", is_flag=True, help="Generate data and print it without inserting.")
@click.option(
    "--provider",
    default=None,
    help=f"Override provider from config. Supported: {', '.join(SUPPORTED_PROVIDERS)}.",
)
@click.option("--model", default=None, help="Override model from config.")
@click.option("--base-url", default=None, help="Override base URL (openai_compatible or self-hosted endpoints).")
@click.option("--host", default=None, help="Override Ollama host (default: http://localhost:11434).")
def run(
    rows: int,
    tables: str | None,
    dry_run: bool,
    provider: str | None,
    model: str | None,
    base_url: str | None,
    host: str | None,
) -> None:
    """Generate and insert seed data, respecting foreign key order."""
    # Flow: load config -> load the cached schema -> order tables parents-first
    # -> for each table top it up to `rows` rows, feeding the key values of
    # already-seeded parents into the children that reference them.
    try:
        config = Config.load(provider_override=provider or "")
    except EnvironmentError as e:
        console.print(f"[red]{e}[/red]")
        sys.exit(1)

    if not SCHEMA_CACHE.exists():
        console.print("[red]No cached schema found. Run 'seedloom init' first.[/red]")
        sys.exit(1)

    schema = Schema.from_dict(json.loads(SCHEMA_CACHE.read_text()))

    try:
        order = resolve_seed_order(schema)
    except CyclicDependencyError as e:
        console.print(f"[red]{e}[/red]")
        sys.exit(1)

    if tables:
        wanted = {name.strip() for name in tables.split(",") if name.strip()}
        # A typo in --tables used to vanish silently; surface it instead.
        unknown = sorted(wanted - set(schema.tables))
        if unknown:
            console.print(
                f"[yellow]Ignoring unknown table(s): {', '.join(unknown)}[/yellow]"
            )
        order = [t for t in order if t in wanted]

    try:
        active_provider = get_provider(
            config.provider,
            api_key=config.api_key,
            model=model or config.model,
            base_url=base_url or config.base_url,
            host=host or config.host,
        )
    except ProviderError as e:
        console.print(f"[red]{e}[/red]")
        sys.exit(1)

    console.print(f"[cyan]Using provider: {config.provider}[/cyan]")

    # A dry run never touches the database, so no connection is opened.
    conn = None
    if not dry_run:
        try:
            conn = psycopg2.connect(config.database_url)
        except psycopg2.OperationalError as e:
            # Same friendly failure as `init` rather than a raw traceback.
            console.print(f"[red]Could not connect to database: {e}[/red]")
            sys.exit(1)

    fk_pools: dict[str, dict[str, list]] = {}  # table -> column -> values

    # For every table, the set of its columns that some foreign key points at;
    # only those values need to be collected for downstream tables.
    referenced_columns: dict[str, set[str]] = {}
    for t in schema.tables.values():
        for fk in t.foreign_keys:
            referenced_columns.setdefault(fk.ref_table, set()).add(fk.ref_column)

    try:
        for table_name in order:
            table = schema.tables[table_name]
            needed_columns = sorted(referenced_columns.get(table_name, set()))

            to_generate = rows
            existing_count = 0
            if conn is not None:
                existing_count = table_row_count(conn, table_name)
                if existing_count > 0 and needed_columns:
                    # Rows already in the table are valid FK targets too.
                    existing_values = existing_column_values(conn, table_name, needed_columns)
                    for col, vals in existing_values.items():
                        if vals:
                            fk_pools.setdefault(table_name, {})[col] = vals

                if existing_count >= rows:
                    console.print(
                        f"[yellow]Skipping '{table_name}' — already has {existing_count} row(s) "
                        f"(>= {rows} requested).[/yellow]"
                    )
                    continue

                to_generate = rows - existing_count
                if existing_count > 0:
                    console.print(
                        f"[cyan]'{table_name}' has {existing_count} row(s); generating "
                        f"{to_generate} more to reach {rows}...[/cyan]"
                    )
                else:
                    console.print(f"[cyan]Generating {to_generate} rows for '{table_name}'...[/cyan]")
            else:
                console.print(f"[cyan]Generating {to_generate} rows for '{table_name}'...[/cyan]")

            # Map each FK column of this table to the pool of parent values it
            # may reference. NOTE: in dry-run mode fk_pools is never filled
            # (nothing is read or inserted), so FK columns have no pool here.
            fk_value_pool: dict[str, list] = {}
            for fk in table.foreign_keys:
                parent_pool = fk_pools.get(fk.ref_table, {}).get(fk.ref_column, [])
                if parent_pool:
                    fk_value_pool[fk.column] = parent_pool

            try:
                generated = generate_rows(active_provider, table, to_generate, fk_value_pool)
            except (ProviderError, RuntimeError) as e:
                # generate_rows raises RuntimeError when the provider returns
                # no rows; report it like any other provider failure.
                console.print(f"[red]{e}[/red]")
                sys.exit(1)

            if dry_run:
                console.print(generated)
                continue

            inserted_values = insert_rows(conn, table, generated, needed_columns)
            for col, vals in inserted_values.items():
                if vals:
                    fk_pools.setdefault(table_name, {}).setdefault(col, [])
                    fk_pools[table_name][col].extend(vals)

            # Report what actually landed: insert_rows skips conflicting rows
            # and ignores empty rows, so len(generated) can overstate it.
            inserted_count = max(0, table_row_count(conn, table_name) - existing_count)
            console.print(f"[green]Inserted {inserted_count} rows into '{table_name}'.[/green]")
    finally:
        # Runs on sys.exit() as well, so the connection is always released.
        if conn is not None:
            conn.close()

    console.print("[bold green]Done.[/bold green]")
192
+
193
+
194
+ if __name__ == "__main__":
195
+ main()
seedloom/config.py ADDED
@@ -0,0 +1,85 @@
1
+ """Configuration loading: env vars + optional .env file, no external deps."""
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+ from .providers import NO_KEY_REQUIRED, SUPPORTED_PROVIDERS
9
+
10
# Provider name -> name of the environment variable that holds its API key.
_PROVIDER_KEY_ENV: dict[str, str] = dict(
    anthropic="ANTHROPIC_API_KEY",
    openai="OPENAI_API_KEY",
    gemini="GEMINI_API_KEY",
    groq="GROQ_API_KEY",
    together="TOGETHER_API_KEY",
    fireworks="FIREWORKS_API_KEY",
    openrouter="OPENROUTER_API_KEY",
    deepseek="DEEPSEEK_API_KEY",
    mistral="MISTRAL_API_KEY",
    openai_compatible="OPENAI_COMPATIBLE_API_KEY",
)
22
+
23
+
24
def _load_dotenv(path: Path = Path(".env")) -> None:
    """Minimal .env loader — avoids pulling in python-dotenv as a dependency.

    Reads ``KEY=VALUE`` lines from *path* into ``os.environ``. Variables that
    are already set in the environment win over the file. Blank lines, lines
    starting with ``#`` and lines without ``=`` are ignored. A leading
    ``export `` on the key is accepted, and one pair of matching surrounding
    quotes is removed from the value. A missing file is not an error.
    """
    if not path.is_file():
        return
    # utf-8-sig: explicit encoding instead of the platform default, and it
    # tolerates a byte-order mark at the start of the file.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        if key.startswith("export "):
            key = key[len("export "):].strip()
        if not key:
            # "=value" has no variable name to assign to.
            continue
        value = value.strip()
        # Strip quotes only when they form a matching pair; a value that
        # merely starts or ends with a quote character is kept verbatim.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        os.environ.setdefault(key, value)
36
+
37
+
38
@dataclass
class Config:
    """Settings resolved from environment variables (plus an optional .env file)."""

    database_url: str  # DATABASE_URL
    provider: str = "anthropic"  # lower-cased provider name
    api_key: str = ""  # stays empty when no key was required / looked up
    model: str = ""  # SEEDLOOM_MODEL, empty when unset
    base_url: str = ""  # SEEDLOOM_BASE_URL, empty when unset
    host: str = ""  # SEEDLOOM_HOST, empty when unset

    @classmethod
    def load(cls, provider_override: str = "", require_provider: bool = True) -> "Config":
        """Build a Config from the environment.

        Args:
            provider_override: Provider name that takes precedence over
                SEEDLOOM_PROVIDER when non-empty.
            require_provider: When False, the provider API key is not looked
                up (``api_key`` stays empty) and its absence is not an error.

        Raises:
            EnvironmentError: If the provider is not in SUPPORTED_PROVIDERS,
                or if DATABASE_URL / the required API key variable is missing.
        """
        _load_dotenv()
        db_url = os.environ.get("DATABASE_URL", "")
        # Chain with `or` rather than a .get() default so that an empty or
        # whitespace-only SEEDLOOM_PROVIDER (e.g. a bare "SEEDLOOM_PROVIDER="
        # line in .env) falls back to the default instead of being rejected
        # as the unknown provider ''.
        provider = (
            provider_override.strip()
            or os.environ.get("SEEDLOOM_PROVIDER", "").strip()
            or "anthropic"
        ).lower()
        model = os.environ.get("SEEDLOOM_MODEL", "")
        base_url = os.environ.get("SEEDLOOM_BASE_URL", "")
        host = os.environ.get("SEEDLOOM_HOST", "")

        missing = []
        if not db_url:
            missing.append("DATABASE_URL")

        if provider not in SUPPORTED_PROVIDERS:
            raise EnvironmentError(
                f"Unknown provider '{provider}'. Supported: {', '.join(SUPPORTED_PROVIDERS)}."
            )

        api_key = ""
        if require_provider and provider not in NO_KEY_REQUIRED:
            key_env = _PROVIDER_KEY_ENV.get(provider, f"{provider.upper()}_API_KEY")
            api_key = os.environ.get(key_env, "")
            if not api_key:
                missing.append(key_env)

        # Report every missing variable at once rather than one per run.
        if missing:
            raise EnvironmentError(
                f"Missing required environment variable(s): {', '.join(missing)}. "
                "Set them in your shell or in a .env file in the current directory."
            )

        return cls(
            database_url=db_url,
            provider=provider,
            api_key=api_key,
            model=model,
            base_url=base_url,
            host=host,
        )
seedloom/generator.py ADDED
@@ -0,0 +1,235 @@
1
+ """Generate realistic seed rows for a table using a pluggable LLM provider.
2
+
3
+ Key design choice: referential integrity is enforced *structurally*, not by
4
+ hoping the model behaves. Foreign-key columns are generated as a JSON Schema
5
+ `enum` of the actual parent-row key values already inserted — the model picks
6
+ from real values, it can't invent a dangling reference.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import random
11
+ import uuid
12
+ from typing import Any
13
+
14
+ from .models import Column, Table
15
+ from .providers import Provider
16
+
17
# Substrings that mark a text column as holding an image URL. "headshot" is
# listed because _random_media_url has a dedicated branch for it that a bare
# `headshot` / `headshot_url` column could otherwise never reach.
_MEDIA_KEYWORDS = (
    "avatar",
    "photo",
    "image",
    "picture",
    "logo",
    "banner",
    "thumbnail",
    "cover",
    "icon",
    "headshot",
)
28
+
29
+
30
def _is_media_url_column(col: Column) -> bool:
    """Return True for a text-typed column whose name contains a media keyword."""
    if col.data_type in ("text", "character varying", "character"):
        lowered = col.name.lower()
        for keyword in _MEDIA_KEYWORDS:
            if keyword in lowered:
                return True
    return False
35
+
36
+
37
def _random_media_url(col: Column) -> str:
    """Return a placeholder image URL whose shape suits the column's name.

    A fresh random 12-hex-character seed is embedded in every URL.
    """
    lowered = col.name.lower()
    seed = uuid.uuid4().hex[:12]

    def mentions(*words: str) -> bool:
        return any(word in lowered for word in words)

    if mentions("avatar", "headshot", "profile"):
        url = f"https://i.pravatar.cc/300?u={seed}"
    elif mentions("logo", "icon"):
        url = f"https://picsum.photos/seed/{seed}/200/200"
    elif mentions("banner", "cover"):
        url = f"https://picsum.photos/seed/{seed}/1200/400"
    else:
        url = f"https://picsum.photos/seed/{seed}/600/400"
    return url
47
+
48
+
49
# Substrings that mark a text column as holding a video URL.
_VIDEO_KEYWORDS = ("video", "mp4", "clip", "trailer", "movie", "recording", "footage")

# Sample clips handed out for video columns; all share one bucket prefix.
_SAMPLE_VIDEO_URLS = tuple(
    f"https://commondatastorage.googleapis.com/gtv-videos-bucket/sample/{title}.mp4"
    for title in (
        "BigBuckBunny",
        "ElephantsDream",
        "ForBiggerBlazes",
        "ForBiggerEscapes",
        "ForBiggerFun",
        "ForBiggerJoyrides",
        "ForBiggerMeltdowns",
        "Sintel",
        "SubaruOutbackOnStreetAndDirt",
        "TearsOfSteel",
    )
)
71
+
72
+
73
def _is_video_url_column(col: Column) -> bool:
    """Return True for a text-typed column whose name contains a video keyword."""
    is_textual = col.data_type in ("text", "character varying", "character")
    if not is_textual:
        return False
    lowered = col.name.lower()
    matches = [keyword for keyword in _VIDEO_KEYWORDS if keyword in lowered]
    return bool(matches)
78
+
79
+
80
def _random_video_url() -> str:
    """Pick one of the bundled sample video URLs at random."""
    chosen = random.choice(_SAMPLE_VIDEO_URLS)
    return chosen
82
+
83
# Postgres data type name -> JSON Schema fragment for one value of that type.
# Types with a strict textual format carry a description so the model knows
# what shape of string to produce; unlisted types fall back to plain string.
_PG_TYPE_TO_JSON_SCHEMA: dict[str, dict[str, Any]] = {
    "integer": {"type": "integer"},
    "bigint": {"type": "integer"},
    "smallint": {"type": "integer"},
    "numeric": {"type": "number"},
    "real": {"type": "number"},
    "double precision": {"type": "number"},
    "boolean": {"type": "boolean"},
    "text": {"type": "string"},
    "character varying": {"type": "string"},
    "character": {"type": "string"},
    "uuid": {
        "type": "string",
        "description": "UUID, e.g. 123e4567-e89b-12d3-a456-426614174000",
    },
    "date": {"type": "string", "description": "ISO 8601 date, e.g. 2024-03-15"},
    "time without time zone": {"type": "string", "description": "ISO 8601 time, e.g. 14:30:00"},
    "time with time zone": {
        "type": "string",
        "description": "ISO 8601 time with offset, e.g. 14:30:00+00:00",
    },
    "timestamp without time zone": {"type": "string", "description": "ISO 8601 datetime"},
    "timestamp with time zone": {"type": "string", "description": "ISO 8601 datetime with offset"},
    "json": {"type": "object"},
    "jsonb": {"type": "object"},
}


def _column_schema(col: Column) -> dict[str, Any]:
    """Return the JSON Schema fragment describing one value of *col*.

    Enum columns become a string enum of their allowed values. Everything
    else is looked up by data type (unknown types are treated as strings),
    and string columns with a length limit get a matching ``maxLength``.
    """
    if col.enum_values:
        return {"type": "string", "enum": col.enum_values}
    base = _PG_TYPE_TO_JSON_SCHEMA.get(col.data_type, {"type": "string"})
    # Copy so that adding maxLength never mutates the shared lookup table.
    schema = dict(base)
    if col.char_max_length and schema.get("type") == "string":
        schema["maxLength"] = col.char_max_length
    return schema
111
+
112
+
113
def build_row_schema(
    table: Table, fk_value_pool: dict[str, list[Any]]
) -> tuple[dict[str, Any], list[str]]:
    """Returns (json_schema_for_one_row, list_of_generatable_column_names).

    fk_value_pool maps column_name -> already-inserted parent key values,
    for columns that are foreign keys. Columns with an empty pool (parent
    table not seeded yet / no rows) are skipped — caller should seed in
    dependency order so this shouldn't happen for non-nullable FKs.

    Auto-generated columns are never part of the schema. Every generatable
    column that is not nullable is listed under ``required``.
    """
    properties: dict[str, Any] = {}
    generatable: list[str] = []
    required: list[str] = []
    fk_columns = {fk.column for fk in table.foreign_keys}

    for col in table.columns:
        if col.is_auto_generated:
            continue
        if col.name in fk_columns:
            candidates = [
                v for v in (fk_value_pool.get(col.name) or []) if v not in (None, "")
            ]
            # JSON Schema enum members should be unique; a pool can repeat a
            # value (e.g. one column of a multi-column key), so de-duplicate
            # while keeping first-seen order.
            try:
                pool = list(dict.fromkeys(candidates))
            except TypeError:
                # Unhashable values cannot be de-duplicated this way.
                pool = candidates
            if not pool:
                continue
            properties[col.name] = {"enum": pool}
        else:
            properties[col.name] = _column_schema(col)
        generatable.append(col.name)
        if not col.is_nullable:
            required.append(col.name)

    schema = {"type": "object", "properties": properties, "required": required}
    return schema, generatable
145
+
146
+
147
# Strings a model tends to emit when it means "no value".
_NULL_LITERALS = {"null", "none", "n/a", "na", ""}


def _sanitize_row(
    table: Table, row: dict[str, Any], fk_value_pool: dict[str, list[Any]]
) -> dict[str, Any]:
    """Clean one generated row so it is safe to insert.

    - Keys that are not columns of *table* are dropped (inserting them could
      only fail).
    - On nullable columns, null-ish strings ("null", "n/a", "", ...) become
      None.
    - Foreign-key values are forced into the parent pool: an equal-as-string
      pool member is preferred, then the value is tried as a 1-based and then
      0-based index into the pool, and finally a random pool member is used.
      A real None on a nullable foreign key is kept as NULL.
    - Over-long strings are truncated to the column's maximum length.
    """
    fk_columns = {fk.column for fk in table.foreign_keys}
    cleaned: dict[str, Any] = {}
    for key, value in row.items():
        col = table.column(key)
        if not col:
            # The model invented a column the table does not have.
            continue
        if (
            col.is_nullable
            and isinstance(value, str)
            and value.strip().lower() in _NULL_LITERALS
        ):
            cleaned[key] = None
            continue
        if key in fk_columns:
            # Same filter build_row_schema applies to the enum, so the
            # fallback below can never hand out a None / "" reference.
            pool = [v for v in (fk_value_pool.get(key) or []) if v not in (None, "")]
            if value is None and col.is_nullable:
                # An explicit null on a nullable FK is a legitimate choice,
                # exactly like the "null" string handled above.
                pass
            elif pool and value not in pool:
                match = next((item for item in pool if str(item) == str(value)), None)
                if match is not None:
                    # Right value, wrong type (e.g. "20" for 20).
                    value = match
                elif isinstance(value, int) and 1 <= value <= len(pool):
                    # Looks like a 1-based position in the enum.
                    value = pool[value - 1]
                elif isinstance(value, int) and 0 <= value < len(pool):
                    # Only reachable for 0: a 0-based position.
                    value = pool[value]
                else:
                    value = random.choice(pool)
        elif (
            isinstance(value, str)
            and col.char_max_length
            and len(value) > col.char_max_length
        ):
            value = value[: col.char_max_length]
        cleaned[key] = value
    return cleaned
186
+
187
+
188
def generate_rows(
    provider: Provider,
    table: Table,
    count: int,
    fk_value_pool: dict[str, list[Any]],
    context_hint: str = "",
) -> list[dict[str, Any]]:
    """Ask *provider* for *count* rows for *table* and return them sanitized.

    Args:
        provider: Object whose ``generate(system, prompt, schema, tool_name=...)``
            returns the generated rows.
        table: Table to generate for.
        count: Number of rows requested; surplus rows are discarded.
        fk_value_pool: FK column name -> parent values the column may take.
        context_hint: Optional extra line inserted into the user prompt.

    Returns:
        The sanitized rows, or ``[{}]`` when the table has no column that
        can be generated.

    Raises:
        RuntimeError: If the provider returns no usable rows.
    """
    row_schema, columns = build_row_schema(table, fk_value_pool)
    if not columns:
        return [{}]

    tool_schema = {
        "type": "object",
        "properties": {
            "rows": {
                "type": "array",
                "minItems": count,
                "maxItems": count,
                "items": row_schema,
            }
        },
        "required": ["rows"],
    }

    system = (
        "You generate realistic, internally-consistent fake data for database seeding. "
        "Values should look like real-world data (plausible names, emails matching names, "
        "believable dates/amounts), not placeholder text like 'test1'. "
        "Never reuse the exact same value twice within the batch unless the column is clearly "
        "meant to repeat (e.g. a status field)."
    )
    user_prompt = (
        f"Generate {count} realistic rows for the table `{table.name}`.\n"
        f"{context_hint}\n"
        "Call the generate_rows tool with the data."
    )

    rows = provider.generate(system, user_prompt, tool_schema, tool_name="generate_rows")
    # minItems/maxItems are only a request to the model: keep well-formed
    # rows and never return more than were asked for.
    rows = [row for row in (rows or []) if isinstance(row, dict)]
    if count > 0:
        rows = rows[:count]
    if not rows:
        raise RuntimeError(f"Provider did not return any rows for table {table.name}")

    # Decide once per column (not once per row) which ones get a stock URL.
    # FK columns are left alone: their value has to come from the parent pool,
    # and overwriting it would throw away the model's valid choice.
    fk_columns = {fk.column for fk in table.foreign_keys}
    media_columns: list[tuple[str, Column]] = []
    video_columns: list[str] = []
    for name in columns:
        if name in fk_columns:
            continue
        col = table.column(name)
        if not col:
            continue
        if _is_media_url_column(col):
            media_columns.append((name, col))
        elif _is_video_url_column(col):
            video_columns.append(name)

    for row in rows:
        for name, col in media_columns:
            row[name] = _random_media_url(col)
        for name in video_columns:
            row[name] = _random_video_url()
    return [_sanitize_row(table, row, fk_value_pool) for row in rows]
seedloom/graph.py ADDED
@@ -0,0 +1,50 @@
1
+ """Resolve the order tables must be seeded in, based on foreign key dependencies.
2
+
3
+ A table depends on every table its foreign keys point to (excluding self-references,
4
+ which are seeded as NULL-first-then-update or just left nullable). Raises on a
5
+ genuine cycle between two *different* tables, since that can't be seeded without
6
+ deferred constraints (out of scope for v1 — surfaced as a clear error instead of
7
+ a silent wrong answer).
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from .models import Schema
12
+
13
+
14
class CyclicDependencyError(Exception):
    """Raised when foreign keys between different tables form a cycle."""
16
+
17
+
18
def resolve_seed_order(schema: Schema) -> list[str]:
    """Return table names ordered so every table follows the tables it references.

    Self-references and foreign keys that point at tables missing from the
    schema are ignored. The order is deterministic: tables are visited in
    schema order and each table's dependencies in alphabetical order.

    Raises:
        CyclicDependencyError: If two or more different tables depend on
            each other through their foreign keys.
    """
    deps: dict[str, set[str]] = {}
    for table in schema.tables.values():
        deps[table.name] = {
            fk.ref_table
            for fk in table.foreign_keys
            if fk.ref_table != table.name and fk.ref_table in schema.tables
        }

    ordered: list[str] = []
    visited: set[str] = set()
    in_progress: set[str] = set()  # tables on the current depth-first path

    def visit(name: str, path: list[str]) -> None:
        if name in visited:
            return
        if name in in_progress:
            # Reaching a table that is still being expanded means the path
            # loops back on itself.
            raise CyclicDependencyError(
                f"Cyclic foreign key dependency detected: {' -> '.join(path + [name])}. "
                "seedloom can't resolve insert order for mutually-dependent tables in v1 — "
                "consider making one of the FKs nullable and seeding it in a second pass."
            )
        in_progress.add(name)
        # sorted(): set iteration order for strings changes between
        # interpreter runs, which made the seed order vary from run to run.
        for dep in sorted(deps.get(name, set())):
            visit(dep, path + [name])
        in_progress.discard(name)
        visited.add(name)
        # Appended only after all of its dependencies: parents come first.
        ordered.append(name)

    for table_name in schema.tables:
        visit(table_name, [])

    return ordered
seedloom/inserter.py ADDED
@@ -0,0 +1,93 @@
1
+ """Insert generated rows into Postgres, returning values for whichever
2
+ columns other tables' foreign keys point at, so downstream tables always
3
+ have a real, valid pool to pick from — not just the primary key."""
4
+ from __future__ import annotations
5
+
6
+ from typing import Any
7
+
8
+ import psycopg2
9
+ import psycopg2.extras
10
+ from rich.console import Console
11
+
12
+ from .models import Table
13
+
14
+ console = Console()
15
+
16
+
17
def _adapt_value(value: Any) -> Any:
    """Wrap dicts and lists in psycopg2's Json adapter; pass anything else through."""
    needs_json = isinstance(value, dict) or isinstance(value, list)
    return psycopg2.extras.Json(value) if needs_json else value
21
+
22
+
23
def table_row_count(conn, table_name: str) -> int:
    """Return the number of rows currently in *table_name*.

    *conn* is an open database connection providing ``cursor()``.
    """
    # Inside a quoted identifier a double quote is written as two double
    # quotes; without this a table name containing '"' breaks the statement.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    with conn.cursor() as cur:
        cur.execute(f"SELECT COUNT(*) FROM {quoted_table}")
        return cur.fetchone()[0]
27
+
28
+
29
def existing_column_values(conn, table_name: str, columns: list[str]) -> dict[str, list[Any]]:
    """Fetch current values for the given columns (e.g. columns other tables'
    foreign keys reference — which may or may not be the primary key).

    Returns ``{column: [value, ...]}`` with one entry per row of the table,
    in the order the rows were returned; ``{}`` when *columns* is empty.
    """
    if not columns:
        return {}

    def quote(identifier: str) -> str:
        # Double any embedded double quote so the identifier stays one token.
        return '"' + identifier.replace('"', '""') + '"'

    col_list = ", ".join(quote(c) for c in columns)
    result: dict[str, list[Any]] = {c: [] for c in columns}
    with conn.cursor() as cur:
        cur.execute(f"SELECT {col_list} FROM {quote(table_name)}")
        for record in cur.fetchall():
            for c, value in zip(columns, record):
                result[c].append(value)
    return result
42
+
43
+
44
def insert_rows(
    conn, table: Table, rows: list[dict[str, Any]], needed_columns: list[str] | None = None
) -> dict[str, list[Any]]:
    """Insert rows for one table. Returns {column_name: [values]} for each
    column in `needed_columns` (typically every column some other table's
    foreign key references), covering both auto-generated and model-supplied
    values.

    Rows that collide with an existing unique/primary key are skipped (logged,
    not treated as an error) rather than aborting the run.

    Returns ``{}`` without touching the database when *rows* is empty or its
    first row is empty. All rows are committed together; if any statement
    fails the transaction is rolled back and the error is re-raised.
    """
    if not rows or not rows[0]:
        return {}

    def quote(identifier: str) -> str:
        # Double embedded double quotes (identifier quoting) and percent
        # signs (the statement is executed with parameters, so a bare "%"
        # would be read as a placeholder).
        return '"' + identifier.replace('"', '""').replace("%", "%%") + '"'

    needed_columns = needed_columns or []
    # Union of keys across all rows: a row missing one of them inserts NULL.
    columns = sorted({c for row in rows for c in row})

    returning_clause = ""
    if needed_columns:
        returning_clause = " RETURNING " + ", ".join(quote(c) for c in needed_columns)

    col_list = ", ".join(quote(c) for c in columns)
    # Positional placeholders: named ones (%(name)s) break on column names
    # that contain ")" or "%".
    placeholders = ", ".join(["%s"] * len(columns))
    query = (
        f"INSERT INTO {quote(table.name)} ({col_list}) VALUES ({placeholders}) "
        f"ON CONFLICT DO NOTHING{returning_clause}"
    )

    collected: dict[str, list[Any]] = {c: [] for c in needed_columns}
    skipped = 0
    try:
        with conn.cursor() as cur:
            for row in rows:
                params = [_adapt_value(row.get(c)) for c in columns]
                cur.execute(query, params)
                if returning_clause:
                    # With ON CONFLICT DO NOTHING a skipped row returns nothing.
                    result = cur.fetchone()
                    if result is not None:
                        for c, value in zip(needed_columns, result):
                            collected[c].append(value)
                    else:
                        skipped += 1
                elif cur.rowcount == 0:
                    skipped += 1
        conn.commit()
    except Exception:
        # Leave the connection usable instead of stuck in a failed transaction.
        conn.rollback()
        raise

    if skipped:
        console.print(
            f"[yellow]  {skipped} row(s) already existed in '{table.name}', skipped.[/yellow]"
        )

    return collected