sqlseed 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. sqlseed/__init__.py +121 -0
  2. sqlseed/_utils/__init__.py +11 -0
  3. sqlseed/_utils/logger.py +30 -0
  4. sqlseed/_utils/metrics.py +45 -0
  5. sqlseed/_utils/progress.py +14 -0
  6. sqlseed/_utils/schema_helpers.py +51 -0
  7. sqlseed/_utils/sql_safe.py +45 -0
  8. sqlseed/_version.py +1 -0
  9. sqlseed/cli/__init__.py +3 -0
  10. sqlseed/cli/main.py +316 -0
  11. sqlseed/config/__init__.py +14 -0
  12. sqlseed/config/loader.py +66 -0
  13. sqlseed/config/models.py +99 -0
  14. sqlseed/config/snapshot.py +91 -0
  15. sqlseed/core/__init__.py +14 -0
  16. sqlseed/core/column_dag.py +108 -0
  17. sqlseed/core/constraints.py +116 -0
  18. sqlseed/core/expression.py +71 -0
  19. sqlseed/core/mapper.py +257 -0
  20. sqlseed/core/orchestrator.py +578 -0
  21. sqlseed/core/relation.py +124 -0
  22. sqlseed/core/result.py +23 -0
  23. sqlseed/core/schema.py +97 -0
  24. sqlseed/core/transform.py +27 -0
  25. sqlseed/database/__init__.py +14 -0
  26. sqlseed/database/_protocol.py +72 -0
  27. sqlseed/database/optimizer.py +96 -0
  28. sqlseed/database/raw_sqlite_adapter.py +197 -0
  29. sqlseed/database/sqlite_utils_adapter.py +183 -0
  30. sqlseed/generators/__init__.py +11 -0
  31. sqlseed/generators/_protocol.py +73 -0
  32. sqlseed/generators/base_provider.py +448 -0
  33. sqlseed/generators/faker_provider.py +157 -0
  34. sqlseed/generators/mimesis_provider.py +203 -0
  35. sqlseed/generators/registry.py +86 -0
  36. sqlseed/generators/stream.py +157 -0
  37. sqlseed/py.typed +0 -0
  38. sqlseed-0.1.0.dist-info/METADATA +934 -0
  39. sqlseed-0.1.0.dist-info/RECORD +42 -0
  40. sqlseed-0.1.0.dist-info/WHEEL +4 -0
  41. sqlseed-0.1.0.dist-info/entry_points.txt +6 -0
  42. sqlseed-0.1.0.dist-info/licenses/LICENSE +17 -0
@@ -0,0 +1,203 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from sqlseed._utils.logger import get_logger
6
+
7
+ logger = get_logger(__name__)
8
+
9
+
10
class MimesisProvider:
    """Mimesis-based data generator adapter.

    Wraps a ``mimesis.Generic`` instance and exposes it through
    ``generate_*`` methods. The locale and seed are remembered so that
    changing one never discards the other.
    """

    # POSIX-style locale names mapped to the codes handed to
    # ``mimesis.locales.Locale``. Names missing here fall back to the text
    # before the first underscore (see ``set_locale``).
    _LOCALE_MAP = {
        "en_US": "en",
        "en_GB": "en",
        "zh_CN": "zh",
        "zh_TW": "zh",
        "ja_JP": "ja",
        "ko_KR": "ko",
        "de_DE": "de",
        "fr_FR": "fr",
        "es_ES": "es",
        "pt_BR": "pt-br",
        "ru_RU": "ru",
    }

    def __init__(self) -> None:
        """Create a provider for locale ``"en"`` with no seed.

        Raises:
            ImportError: If mimesis cannot be imported.
        """
        self._generic: Any = None
        self._locale: str = "en"
        self._seed: int | None = None
        # Lazily created rstr generator; kept between calls so that a seeded
        # provider yields a sequence of patterns rather than one repeated value.
        self._rstr: Any = None
        self._init_mimesis()

    def _init_mimesis(self) -> None:
        """(Re)build ``self._generic`` from the current locale and seed.

        Raises:
            ImportError: If mimesis cannot be imported.
        """
        try:
            from mimesis import Generic
            from mimesis.locales import Locale
        except ImportError:
            raise ImportError("Mimesis is not installed. Install it with: pip install sqlseed[mimesis]") from None

        locale_enum = Locale(self._locale)
        if self._seed is None:
            # No seed requested: construct exactly as an unseeded provider.
            self._generic = Generic(locale_enum)
        else:
            self._generic = Generic(locale_enum, seed=self._seed)

    @property
    def name(self) -> str:
        """Registry name of this provider."""
        return "mimesis"

    def set_locale(self, locale: str) -> None:
        """Switch locale, keeping any seed that was set earlier.

        Args:
            locale: A name such as ``"de_DE"``. Known names are translated
                through ``_LOCALE_MAP``; anything else is reduced to the part
                before the first underscore.

        Raises:
            Whatever ``_init_mimesis`` raises for the resolved code. In that
            case the previous locale is restored so the provider stays usable.
        """
        resolved = self._LOCALE_MAP.get(locale, locale.split("_", maxsplit=1)[0])
        previous = self._locale
        self._locale = resolved
        try:
            self._init_mimesis()
        except Exception:
            # Do not keep a locale code the generator could not be built for.
            self._locale = previous
            raise

    def set_seed(self, seed: int) -> None:
        """Seed the generator; the current locale is kept."""
        self._seed = seed
        # Drop the cached pattern generator so it restarts from the new seed.
        self._rstr = None
        self._init_mimesis()

    def generate_string(
        self,
        *,
        min_length: int = 1,
        max_length: int = 100,
        charset: str | None = None,
    ) -> str:
        """Return a random string whose length is drawn from the given bounds.

        Args:
            min_length: Lower bound passed to the length draw.
            max_length: Upper bound passed to the length draw.
            charset: ``"alphanumeric"``, ``"alpha"`` or ``"digits"`` select a
                predefined alphabet; any other string is used verbatim as the
                alphabet; ``None`` selects letters, digits, space, ``_`` and ``-``.
        """
        import string

        if charset == "alphanumeric":
            chars = string.ascii_letters + string.digits
        elif charset == "alpha":
            chars = string.ascii_letters
        elif charset == "digits":
            chars = string.digits
        elif charset is not None:
            chars = charset
        else:
            chars = string.ascii_letters + string.digits + " _-"
        length = self._generic.numeric.integer_number(start=min_length, end=max_length)
        return "".join(self._generic.random.choice(chars) for _ in range(length))

    def generate_integer(self, *, min_value: int = 0, max_value: int = 999999) -> int:
        """Return an integer drawn with ``min_value``/``max_value`` as bounds."""
        return self._generic.numeric.integer_number(start=min_value, end=max_value)

    def generate_float(
        self,
        *,
        min_value: float = 0.0,
        max_value: float = 999999.0,
        precision: int = 2,
    ) -> float:
        """Return a float drawn within the bounds, rounded to ``precision`` digits."""
        return round(
            self._generic.numeric.float_number(start=min_value, end=max_value, precision=precision),
            precision,
        )

    def generate_boolean(self) -> bool:
        """Return a random boolean."""
        return self._generic.development.boolean()

    def generate_bytes(self, *, length: int = 16) -> bytes:
        """Return random bytes; ``length`` is forwarded to ``token_bytes``.

        NOTE(review): presumably ``token_bytes`` is not driven by the seed --
        confirm against the mimesis documentation if reproducibility matters.
        """
        return self._generic.cryptographic.token_bytes(length)

    def generate_name(self) -> str:
        """Return a full name."""
        return self._generic.person.full_name()

    def generate_first_name(self) -> str:
        """Return a first name."""
        return self._generic.person.first_name()

    def generate_last_name(self) -> str:
        """Return a last name."""
        return self._generic.person.last_name()

    def generate_email(self) -> str:
        """Return an email address."""
        return self._generic.person.email()

    def generate_phone(self) -> str:
        """Return a phone number."""
        return self._generic.person.phone_number()

    def generate_address(self) -> str:
        """Return an address."""
        return self._generic.address.address()

    def generate_company(self) -> str:
        """Return a company name."""
        return self._generic.finance.company()

    def generate_url(self) -> str:
        """Return a URL."""
        return self._generic.internet.url()

    def generate_ipv4(self) -> str:
        """Return an IPv4 address."""
        return self._generic.internet.ip_v4()

    def generate_uuid(self) -> str:
        """Return a UUID rendered as a string."""
        return str(self._generic.cryptographic.uuid_object())

    def generate_date(self, *, start_year: int = 2000, end_year: int | None = None) -> str:
        """Return a date as a string.

        Args:
            start_year: First year passed to the date generator.
            end_year: Last year passed to the date generator; defaults to the
                current year.
        """
        from datetime import datetime

        if end_year is None:
            end_year = datetime.now().year
        date = self._generic.datetime.date(start=start_year, end=end_year)
        return str(date)

    def generate_datetime(self, *, start_year: int = 2000, end_year: int | None = None) -> str:
        """Return a datetime as a string.

        Args:
            start_year: First year passed to the datetime generator.
            end_year: Last year passed to the datetime generator; defaults to
                the current year.
        """
        from datetime import datetime

        if end_year is None:
            end_year = datetime.now().year
        dt = self._generic.datetime.datetime(start=start_year, end=end_year)
        return str(dt)

    def generate_timestamp(self) -> int:
        """Return a timestamp as produced by the mimesis datetime provider."""
        return self._generic.datetime.timestamp()

    def generate_text(self, *, min_length: int = 50, max_length: int = 200) -> str:
        """Return text of at least ``min_length`` characters, cut at ``max_length``.

        Truncation is applied last, so the result is shorter than
        ``min_length`` when ``max_length`` is smaller than ``min_length``.
        """
        text = self._generic.text.text(quantity=1)
        while len(text) < min_length:
            text += " " + self._generic.text.text(quantity=1)
        return text[:max_length]

    def generate_sentence(self) -> str:
        """Return a sentence."""
        return self._generic.text.sentence()

    def generate_password(self, *, length: int = 16) -> str:
        """Return a password of the requested length."""
        return self._generic.person.password(length=length)

    def generate_choice(self, choices: list[Any]) -> Any:
        """Return one element of ``choices`` picked by the generator's RNG."""
        return self._generic.random.choice(choices)

    def generate_json(self, *, schema: dict[str, Any] | None = None) -> str:
        """Return a JSON document as a string.

        Args:
            schema: Optional schema dict interpreted by
                ``_generate_from_schema``. Without it, an object with the keys
                ``id``, ``name`` and ``active`` is produced.
        """
        import json

        if schema is None:
            data = {
                "id": self.generate_integer(min_value=1, max_value=999999),
                "name": self.generate_name(),
                "active": self.generate_boolean(),
            }
        else:
            data = self._generate_from_schema(schema)
        return json.dumps(data)

    def _generate_from_schema(self, schema: dict[str, Any]) -> Any:
        """Build a value for ``schema`` by dispatching on its ``"type"`` key.

        Handles ``string``, ``integer``, ``number``, ``boolean``, ``array``
        (1 to 5 items built from ``"items"``) and ``object`` (one value per
        entry of ``"properties"``). A missing type counts as ``string``; an
        unrecognised type yields a default string.
        """
        schema_type = schema.get("type", "string")
        if schema_type == "string":
            return self.generate_string(min_length=5, max_length=20)
        if schema_type == "integer":
            return self.generate_integer()
        if schema_type == "number":
            return self.generate_float()
        if schema_type == "boolean":
            return self.generate_boolean()
        if schema_type == "array":
            items = schema.get("items", {"type": "string"})
            count = self._generic.numeric.integer_number(start=1, end=5)
            return [self._generate_from_schema(items) for _ in range(count)]
        if schema_type == "object":
            properties = schema.get("properties", {})
            return {k: self._generate_from_schema(v) for k, v in properties.items()}
        return self.generate_string()

    def generate_pattern(self, *, regex: str) -> str:
        """Return a string generated from ``regex`` via ``rstr``.

        The underlying generator is created once and reused, so consecutive
        calls on a seeded provider advance one reproducible sequence instead
        of restarting from the seed on every call.
        """
        import random

        import rstr

        if self._rstr is None:
            self._rstr = rstr.Rstr(random.Random(self._seed))
        return self._rstr.xeger(regex)
@@ -0,0 +1,86 @@
1
+ from __future__ import annotations
2
+
3
+ from sqlseed._utils.logger import get_logger
4
+ from sqlseed.generators._protocol import DataProvider
5
+ from sqlseed.generators.base_provider import BaseProvider
6
+
7
+ logger = get_logger(__name__)
8
+
9
+
10
class ProviderRegistry:
    """Name-indexed collection of data providers with a selectable default."""

    def __init__(self) -> None:
        """Create a registry holding the built-in ``"base"`` provider."""
        self._providers: dict[str, DataProvider] = {}
        self._default_name: str = "base"
        self._register_builtin()

    def _register_builtin(self) -> None:
        """Register a ``BaseProvider`` under the name ``"base"``."""
        base = BaseProvider()
        self._providers["base"] = base

    def register(self, provider: DataProvider) -> None:
        """Store ``provider`` under ``provider.name``, replacing any previous entry."""
        name = provider.name
        self._providers[name] = provider
        logger.debug("Registered provider", name=name)

    def register_from_entry_points(self) -> None:
        """Register providers advertised in the ``sqlseed`` entry-point group.

        Discovery is best effort: a failure while loading one entry point is
        logged and does not stop the others, and a failure of discovery as a
        whole is logged at debug level. Objects that do not pass the
        ``DataProvider`` isinstance check are reported and skipped.
        """
        try:
            from importlib.metadata import entry_points

            eps = entry_points()
            # ``select`` is absent on the older dict-like return value.
            sqlseed_eps = eps.select(group="sqlseed") if hasattr(eps, "select") else eps.get("sqlseed", [])  # type: ignore[arg-type]
            for ep in sqlseed_eps:
                try:
                    provider_cls = ep.load()
                    provider = provider_cls()
                    if not isinstance(provider, DataProvider):
                        # Previously dropped without a trace, which made a
                        # misdeclared plugin look as if it was never installed.
                        logger.warning("Entry point is not a DataProvider, skipping", name=ep.name)
                        continue
                    self.register(provider)
                    logger.info("Auto-discovered provider", name=ep.name)
                except Exception as e:
                    logger.warning("Failed to load provider", name=ep.name, error=e)
        except Exception as e:
            logger.debug("Entry point discovery failed", error=e)

    def get(self, name: str | None = None) -> DataProvider:
        """Return the provider called ``name``, or the default when it is falsy.

        Raises:
            ValueError: If no provider is registered under the resolved name.
        """
        provider_name = name or self._default_name
        if provider_name not in self._providers:
            available = ", ".join(self._providers.keys())
            raise ValueError(f"Provider '{provider_name}' not found. Available: {available}")
        return self._providers[provider_name]

    def set_default(self, name: str) -> None:
        """Make ``name`` the default provider.

        Raises:
            ValueError: If ``name`` is not registered.
        """
        if name not in self._providers:
            raise ValueError(f"Provider '{name}' not found")
        self._default_name = name

    @property
    def default_name(self) -> str:
        """Name of the current default provider."""
        return self._default_name

    @property
    def available_providers(self) -> list[str]:
        """Names of all registered providers, in registration order."""
        return list(self._providers.keys())

    def ensure_provider(self, name: str) -> DataProvider:
        """Return the provider ``name``, creating ``faker``/``mimesis`` on demand.

        Raises:
            ImportError: If the optional provider module cannot be imported.
            ValueError: If ``name`` is neither registered nor one of the
                on-demand providers.
        """
        if name in self._providers:
            return self._providers[name]

        if name == "faker":
            try:
                from sqlseed.generators.faker_provider import FakerProvider

                provider: DataProvider = FakerProvider()
                self.register(provider)
                return provider
            except ImportError:
                raise ImportError("Faker is not installed. Install it with: pip install sqlseed[faker]") from None
        elif name == "mimesis":
            try:
                from sqlseed.generators.mimesis_provider import MimesisProvider

                provider = MimesisProvider()
                self.register(provider)
                return provider
            except ImportError:
                raise ImportError("Mimesis is not installed. Install it with: pip install sqlseed[mimesis]") from None

        raise ValueError(f"Unknown provider: {name}")
@@ -0,0 +1,157 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ if TYPE_CHECKING:
7
+ from collections.abc import Iterator
8
+
9
+ from sqlseed.core.column_dag import ColumnNode
10
+ from sqlseed.core.constraints import ConstraintSolver
11
+ from sqlseed.core.expression import ExpressionEngine
12
+ from sqlseed.core.mapper import GeneratorSpec
13
+ from sqlseed.core.transform import RowTransformFn
14
+
15
+
16
class UnknownGeneratorError(Exception):
    """Error describing an unknown generator name, optionally tied to a column."""

    def __init__(self, generator_name: str, column_name: str | None = None) -> None:
        """Build the message and keep both names as attributes.

        Args:
            generator_name: The generator name that is reported as unknown.
            column_name: Column mentioned in the message; omitted from the
                message when falsy.
        """
        message = f"Unknown generator '{generator_name}'"
        if column_name:
            message += f" for column {column_name}"
        super().__init__(message)
        self.generator_name = generator_name
        self.column_name = column_name
21
+
22
+
23
class DataStream:
    """Produce rows column by column, in the order of the supplied nodes.

    Each column value comes either from the expression engine (derived
    columns) or from a ``generate_*`` method of the provider, and must be
    accepted by the constraint solver before it becomes part of a row.
    """

    def __init__(
        self,
        dag_nodes: list[ColumnNode],
        provider: Any,
        expr_engine: ExpressionEngine,
        constraint_solver: ConstraintSolver,
        transform_fn: RowTransformFn | None = None,
        seed: int | None = None,
    ) -> None:
        """Store the collaborators and seed the random sources.

        Args:
            dag_nodes: Column nodes, iterated in the given order for each row.
            provider: Object exposing ``generate_*`` methods (and ``set_seed``
                when a seed is given).
            expr_engine: Evaluates expressions of derived columns.
            constraint_solver: Accepts or rejects candidate values.
            transform_fn: Optional callable applied to each finished row.
            seed: Seed for the internal RNG; also forwarded to the provider
                when not ``None``.
        """
        self._nodes = dag_nodes
        self._provider = provider
        self._expr_engine = expr_engine
        self._constraint_solver = constraint_solver
        self._transform_fn = transform_fn

        self._rng = random.Random(seed)
        # Number of rows returned so far; handed to the transform as row index.
        self._rows_emitted = 0
        if seed is not None:
            self._provider.set_seed(seed)

    def generate(
        self,
        count: int,
        batch_size: int = 5000,
    ) -> Iterator[list[dict[str, Any]]]:
        """Yield ``count`` rows in lists of at most ``batch_size`` rows.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1. Because this is a
                generator, the error surfaces when iteration starts.
        """
        if batch_size < 1:
            # A non-positive batch size would never advance and loop forever.
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        generated = 0
        while generated < count:
            current_batch_size = min(batch_size, count - generated)
            batch = [self._generate_row() for _ in range(current_batch_size)]
            yield batch
            generated += current_batch_size

    def _generate_row(self) -> dict[str, Any]:
        """Build one row, retrying the whole row when a column cannot be filled.

        Every failed attempt unregisters all values it had registered with the
        constraint solver, so discarded rows leave nothing behind. An attempt
        that produces no column at all counts as failed.

        Raises:
            RuntimeError: If no attempt succeeds within 1000 tries.
        """
        max_total_retries = 1000

        for _ in range(max_total_retries):
            row: dict[str, Any] = {}
            complete = True
            for node in self._nodes:
                if node.is_skip:
                    continue
                if not self._fill_column(node, row):
                    complete = False
                    break

            if complete and row:
                if self._transform_fn:
                    # 0-based index of this row within the stream.
                    ctx = {"row_number": self._rows_emitted}
                    row = self._transform_fn(row, ctx)
                self._rows_emitted += 1
                return row

            # Roll back everything this attempt registered, not only the
            # source columns of the column that failed.
            for col, val in row.items():
                self._constraint_solver.unregister(col, val)

        raise RuntimeError("Failed to generate row satisfying all constraints after maximum retries.")

    def _fill_column(self, node: ColumnNode, row: dict[str, Any]) -> bool:
        """Try to add a value for ``node`` to ``row``; return whether it worked.

        Gives up when the node's retry budget (100 without constraints) is
        used up, or as soon as the solver asks for a backtrack on a column
        that has source columns.
        """
        constraints = node.constraints
        max_retries = constraints.max_retries if constraints else 100
        is_unique = constraints.unique if constraints else False
        source_columns = node.depends_on if node.is_derived else None

        for _ in range(max_retries):
            if node.is_derived and node.expression:
                # ``value`` is the first dependency's value, for convenience.
                ctx = {"row": row, "value": row.get(node.depends_on[0]) if node.depends_on else None}
                val = self._expr_engine.evaluate(node.expression, ctx)
            else:
                val = self._apply_generator(node.generator_spec)

            result = self._constraint_solver.try_register(
                node.name,
                val,
                unique=is_unique,
                source_columns=source_columns,
            )
            if result.registered:
                row[node.name] = val
                return True
            if result.need_backtrack and source_columns:
                # The inputs themselves must change; retrying here is pointless.
                return False
        return False

    def _find_node_index(self, col_name: str) -> int | None:
        """Return the position of the node named ``col_name``, or ``None``."""
        for i, node in enumerate(self._nodes):
            if node.name == col_name:
                return i
        return None

    def _apply_generator(self, spec: GeneratorSpec) -> Any:
        """Produce a value for ``spec``.

        Returns ``None`` with probability ``spec.null_ratio``. Otherwise the
        provider's ``generate_<generator_name>`` method is used when present;
        ``choice`` and ``foreign_key`` have built-in fallbacks; any other name
        falls back to the provider's ``generate_string``.
        """
        if spec.null_ratio > 0 and self._rng.random() < spec.null_ratio:
            return None

        method_name = f"generate_{spec.generator_name}"
        if hasattr(self._provider, method_name):
            method = getattr(self._provider, method_name)
            return method(**spec.params) if spec.params else method()

        if spec.generator_name == "choice" and "choices" in spec.params:
            return self._rng.choice(spec.params["choices"])

        if spec.generator_name == "foreign_key":
            return self._handle_foreign_key(spec)

        return self._provider.generate_string(**spec.params) if spec.params else self._provider.generate_string()

    def _handle_foreign_key(self, spec: GeneratorSpec) -> Any:
        """Pick a referenced value, or a random integer id when none are known.

        Uses ``spec.params["_ref_values"]`` when it is non-empty; otherwise
        draws an integer between 1 and ``spec.params["max_ref"]`` (999999 by
        default) from the provider.
        """
        ref_values = spec.params.get("_ref_values", [])
        if ref_values:
            return self._rng.choice(ref_values)
        return self._provider.generate_integer(min_value=1, max_value=spec.params.get("max_ref", 999999))
sqlseed/py.typed ADDED
File without changes