anysite_cli-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. anysite/__init__.py +4 -0
  2. anysite/__main__.py +6 -0
  3. anysite/api/__init__.py +21 -0
  4. anysite/api/client.py +271 -0
  5. anysite/api/errors.py +137 -0
  6. anysite/api/schemas.py +333 -0
  7. anysite/batch/__init__.py +1 -0
  8. anysite/batch/executor.py +176 -0
  9. anysite/batch/input.py +160 -0
  10. anysite/batch/rate_limiter.py +98 -0
  11. anysite/cli/__init__.py +1 -0
  12. anysite/cli/config.py +176 -0
  13. anysite/cli/executor.py +388 -0
  14. anysite/cli/options.py +249 -0
  15. anysite/config/__init__.py +11 -0
  16. anysite/config/paths.py +46 -0
  17. anysite/config/settings.py +187 -0
  18. anysite/dataset/__init__.py +37 -0
  19. anysite/dataset/analyzer.py +268 -0
  20. anysite/dataset/cli.py +644 -0
  21. anysite/dataset/collector.py +686 -0
  22. anysite/dataset/db_loader.py +248 -0
  23. anysite/dataset/errors.py +30 -0
  24. anysite/dataset/exporters.py +121 -0
  25. anysite/dataset/history.py +153 -0
  26. anysite/dataset/models.py +245 -0
  27. anysite/dataset/notifications.py +87 -0
  28. anysite/dataset/scheduler.py +107 -0
  29. anysite/dataset/storage.py +171 -0
  30. anysite/dataset/transformer.py +213 -0
  31. anysite/db/__init__.py +38 -0
  32. anysite/db/adapters/__init__.py +1 -0
  33. anysite/db/adapters/base.py +158 -0
  34. anysite/db/adapters/postgres.py +201 -0
  35. anysite/db/adapters/sqlite.py +183 -0
  36. anysite/db/cli.py +709 -0
  37. anysite/db/config.py +92 -0
  38. anysite/db/manager.py +166 -0
  39. anysite/db/operations/__init__.py +1 -0
  40. anysite/db/operations/insert.py +199 -0
  41. anysite/db/operations/query.py +43 -0
  42. anysite/db/schema/__init__.py +1 -0
  43. anysite/db/schema/inference.py +213 -0
  44. anysite/db/schema/types.py +71 -0
  45. anysite/db/utils/__init__.py +1 -0
  46. anysite/db/utils/sanitize.py +99 -0
  47. anysite/main.py +498 -0
  48. anysite/models/__init__.py +1 -0
  49. anysite/output/__init__.py +11 -0
  50. anysite/output/console.py +45 -0
  51. anysite/output/formatters.py +301 -0
  52. anysite/output/templates.py +76 -0
  53. anysite/py.typed +0 -0
  54. anysite/streaming/__init__.py +1 -0
  55. anysite/streaming/progress.py +121 -0
  56. anysite/streaming/writer.py +130 -0
  57. anysite/utils/__init__.py +1 -0
  58. anysite/utils/fields.py +242 -0
  59. anysite/utils/retry.py +109 -0
  60. anysite_cli-0.1.2.dist-info/METADATA +455 -0
  61. anysite_cli-0.1.2.dist-info/RECORD +64 -0
  62. anysite_cli-0.1.2.dist-info/WHEEL +4 -0
  63. anysite_cli-0.1.2.dist-info/entry_points.txt +2 -0
  64. anysite_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
anysite/db/schema/inference.py
@@ -0,0 +1,213 @@
+"""Type inference for JSON data to SQL schemas."""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from typing import Any
+
+from anysite.db.schema.types import get_sql_type
+
+# Patterns for string subtype detection
+_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
+_DATETIME_RE = re.compile(
+    r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}(:\d{2})?(\.\d+)?(Z|[+-]\d{2}:?\d{2})?$"
+)
+_URL_RE = re.compile(r"^https?://", re.IGNORECASE)
+_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
+
+# Threshold for using TEXT vs VARCHAR
+_VARCHAR_MAX_LENGTH = 255
+
+
+@dataclass
+class ColumnSchema:
+    """Schema for a single column."""
+
+    name: str
+    inferred_type: str
+    nullable: bool = True
+    sample_values: list[Any] = field(default_factory=list)
+
+
+@dataclass
+class TableSchema:
+    """Schema for a table inferred from JSON data."""
+
+    table_name: str
+    columns: list[ColumnSchema] = field(default_factory=list)
+
+    def to_sql_types(self, dialect: str) -> dict[str, str]:
+        """Convert inferred types to SQL types for a given dialect.
+
+        Args:
+            dialect: Database dialect ('sqlite', 'postgres', 'mysql').
+
+        Returns:
+            Mapping of column name to SQL type string.
+        """
+        return {
+            col.name: get_sql_type(col.inferred_type, dialect)
+            for col in self.columns
+        }
+
+
+def infer_sql_type(value: Any) -> str:
+    """Infer the SQL-compatible type for a Python value.
+
+    Args:
+        value: A Python value from JSON.
+
+    Returns:
+        Inferred type name ('integer', 'float', 'boolean', 'text', etc.).
+    """
+    if value is None:
+        return "text"
+
+    if isinstance(value, bool):
+        return "boolean"
+
+    if isinstance(value, int):
+        return "integer"
+
+    if isinstance(value, float):
+        return "float"
+
+    if isinstance(value, (dict, list)):
+        return "json"
+
+    if isinstance(value, str):
+        return _infer_string_subtype(value)
+
+    return "text"
+
+
+def _infer_string_subtype(value: str) -> str:
+    """Infer a more specific type for string values.
+
+    Args:
+        value: String value to analyze.
+
+    Returns:
+        Inferred type name.
+    """
+    if not value:
+        return "text"
+
+    if _DATETIME_RE.match(value):
+        return "datetime"
+
+    if _DATE_RE.match(value):
+        return "date"
+
+    if _URL_RE.match(value):
+        return "url"
+
+    if _EMAIL_RE.match(value):
+        return "email"
+
+    if len(value) <= _VARCHAR_MAX_LENGTH:
+        return "varchar"
+
+    return "text"
+
+
+def _merge_types(type_a: str, type_b: str) -> str:
+    """Merge two inferred types into a compatible type.
+
+    When different rows have different types for the same column,
+    this picks the more general type.
+    """
+    if type_a == type_b:
+        return type_a
+
+    # Null/text absorbs anything
+    if type_a == "text" or type_b == "text":
+        return "text"
+
+    # Numeric promotion
+    if {type_a, type_b} == {"integer", "float"}:
+        return "float"
+
+    # String subtypes fall back to varchar or text
+    string_types = {"varchar", "url", "email", "date", "datetime"}
+    if type_a in string_types and type_b in string_types:
+        return "varchar"
+
+    # JSON stays as json
+    if type_a == "json" or type_b == "json":
+        return "json"
+
+    return "text"
+
+
+def infer_table_schema(
+    table_name: str,
+    rows: list[dict[str, Any]],
+    max_sample: int = 100,
+) -> TableSchema:
+    """Infer a table schema from a list of JSON rows.
+
+    Examines up to max_sample rows to determine column types.
+
+    Args:
+        table_name: Name for the inferred table.
+        rows: List of row dictionaries.
+        max_sample: Maximum number of rows to sample for inference.
+
+    Returns:
+        Inferred TableSchema.
+    """
+    if not rows:
+        return TableSchema(table_name=table_name)
+
+    sample = rows[:max_sample]
+
+    # Track column types and nullability
+    column_types: dict[str, str] = {}
+    column_nullable: dict[str, bool] = {}
+    column_samples: dict[str, list[Any]] = {}
+    # Preserve column order across all rows
+    column_order: list[str] = []
+    seen_columns: set[str] = set()
+
+    for row in sample:
+        for col_name, value in row.items():
+            if col_name not in seen_columns:
+                seen_columns.add(col_name)
+                column_order.append(col_name)
+
+            inferred = infer_sql_type(value)
+
+            if value is None:
+                column_nullable[col_name] = True
+            else:
+                if col_name in column_types:
+                    column_types[col_name] = _merge_types(column_types[col_name], inferred)
+                else:
+                    column_types[col_name] = inferred
+
+                if col_name not in column_nullable:
+                    column_nullable[col_name] = False
+
+            # Store sample values (up to 3)
+            samples = column_samples.setdefault(col_name, [])
+            if len(samples) < 3:
+                samples.append(value)
+
+        # Mark missing columns as nullable
+        for col_name in seen_columns:
+            if col_name not in row:
+                column_nullable[col_name] = True
+
+    columns = [
+        ColumnSchema(
+            name=col_name,
+            inferred_type=column_types.get(col_name, "text"),
+            nullable=column_nullable.get(col_name, True),
+            sample_values=column_samples.get(col_name, []),
+        )
+        for col_name in column_order
+    ]
+
+    return TableSchema(table_name=table_name, columns=columns)
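
For reference, a minimal usage sketch of this module (not from the package itself; the rows are invented, and the expected results follow from infer_sql_type and _merge_types as defined in the hunk above):

    from anysite.db.schema.inference import infer_table_schema

    rows = [
        {"id": 1, "score": 9.5, "email": "a@example.com", "tags": ["x"]},
        {"id": 2, "score": 7, "email": None},
    ]
    schema = infer_table_schema("users", rows)
    # "score" sees float then integer, so _merge_types promotes it to "float";
    # "email" becomes nullable because the second row holds None;
    # "tags" is absent from the second row, so it is marked nullable as well.
    print(schema.to_sql_types("postgres"))
    # {'id': 'BIGINT', 'score': 'DOUBLE PRECISION', 'email': 'TEXT', 'tags': 'JSONB'}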
anysite/db/schema/types.py
@@ -0,0 +1,71 @@
+"""SQL type mappings per database dialect."""
+
+from __future__ import annotations
+
+# Type mapping: Python-inferred type name -> SQL type per dialect
+TYPE_MAP: dict[str, dict[str, str]] = {
+    "integer": {
+        "sqlite": "INTEGER",
+        "postgres": "BIGINT",
+        "mysql": "BIGINT",
+    },
+    "float": {
+        "sqlite": "REAL",
+        "postgres": "DOUBLE PRECISION",
+        "mysql": "DOUBLE",
+    },
+    "boolean": {
+        "sqlite": "INTEGER",
+        "postgres": "BOOLEAN",
+        "mysql": "BOOLEAN",
+    },
+    "text": {
+        "sqlite": "TEXT",
+        "postgres": "TEXT",
+        "mysql": "TEXT",
+    },
+    "varchar": {
+        "sqlite": "TEXT",
+        "postgres": "VARCHAR(255)",
+        "mysql": "VARCHAR(255)",
+    },
+    "json": {
+        "sqlite": "TEXT",
+        "postgres": "JSONB",
+        "mysql": "JSON",
+    },
+    "date": {
+        "sqlite": "TEXT",
+        "postgres": "DATE",
+        "mysql": "DATE",
+    },
+    "datetime": {
+        "sqlite": "TEXT",
+        "postgres": "TIMESTAMPTZ",
+        "mysql": "DATETIME",
+    },
+    "url": {
+        "sqlite": "TEXT",
+        "postgres": "TEXT",
+        "mysql": "TEXT",
+    },
+    "email": {
+        "sqlite": "TEXT",
+        "postgres": "TEXT",
+        "mysql": "VARCHAR(320)",
+    },
+}
+
+
+def get_sql_type(inferred_type: str, dialect: str) -> str:
+    """Get the SQL type for a given inferred type and dialect.
+
+    Args:
+        inferred_type: The inferred Python type name.
+        dialect: The database dialect ('sqlite', 'postgres', 'mysql').
+
+    Returns:
+        SQL type string for the dialect.
+    """
+    type_entry = TYPE_MAP.get(inferred_type, TYPE_MAP["text"])
+    return type_entry.get(dialect, type_entry.get("sqlite", "TEXT"))
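
The lookup has two fallbacks worth noting: an unknown inferred type falls back to the "text" row, and an unknown dialect falls back to the sqlite column. A small sketch (the "uuid" type and "duckdb" dialect are invented inputs, used only to exercise the fallbacks):

    from anysite.db.schema.types import get_sql_type

    get_sql_type("datetime", "postgres")  # 'TIMESTAMPTZ'
    get_sql_type("datetime", "sqlite")    # 'TEXT' (SQLite stores dates as text)
    get_sql_type("uuid", "postgres")      # 'TEXT'     -- unknown type -> "text" entry
    get_sql_type("integer", "duckdb")     # 'INTEGER'  -- unknown dialect -> sqlite column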
anysite/db/utils/__init__.py
@@ -0,0 +1 @@
+"""Database utility functions."""
anysite/db/utils/sanitize.py
@@ -0,0 +1,99 @@
+"""SQL identifier sanitization utilities."""
+
+from __future__ import annotations
+
+import re
+
+# Valid SQL identifier: starts with letter/underscore, then letters/digits/underscores
+_IDENTIFIER_RE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
+
+# Reserved SQL keywords that must be quoted
+_RESERVED_WORDS = frozenset({
+    "all", "alter", "analyze", "and", "as", "asc", "between", "by", "case",
+    "check", "column", "constraint", "create", "cross", "current", "current_date",
+    "current_time", "current_timestamp", "current_user", "database", "default",
+    "delete", "desc", "distinct", "do", "drop", "else", "end", "exists", "false",
+    "fetch", "for", "foreign", "from", "full", "grant", "group", "having", "if",
+    "in", "index", "inner", "insert", "into", "is", "join", "key", "left", "like",
+    "limit", "natural", "not", "null", "offset", "on", "or", "order", "outer",
+    "primary", "references", "returning", "right", "row", "select", "session_user",
+    "set", "some", "table", "then", "to", "true", "union", "unique", "update",
+    "user", "using", "values", "view", "when", "where", "with",
+})
+
+# Maximum identifier length (conservative across databases)
+_MAX_IDENTIFIER_LENGTH = 63
+
+
+def sanitize_identifier(name: str) -> str:
+    """Sanitize a SQL identifier (column or table name).
+
+    Rules:
+    - Must be non-empty
+    - Must start with a letter or underscore
+    - Only letters, digits, and underscores allowed
+    - Reserved words are quoted with double quotes
+    - Max length 63 characters (PostgreSQL limit)
+    - Invalid characters are replaced with underscores
+
+    Args:
+        name: Raw identifier name.
+
+    Returns:
+        Sanitized identifier safe for use in SQL.
+
+    Raises:
+        ValueError: If name is empty or cannot be sanitized.
+    """
+    if not name or not name.strip():
+        raise ValueError("Identifier cannot be empty")
+
+    # Strip whitespace
+    cleaned = name.strip()
+
+    # Replace invalid characters with underscores
+    cleaned = re.sub(r"[^a-zA-Z0-9_]", "_", cleaned)
+
+    # Ensure starts with letter or underscore
+    if cleaned[0].isdigit():
+        cleaned = f"_{cleaned}"
+
+    # Collapse multiple underscores
+    cleaned = re.sub(r"__+", "_", cleaned)
+
+    # Strip trailing underscores
+    cleaned = cleaned.rstrip("_")
+
+    if not cleaned:
+        raise ValueError(f"Identifier '{name}' cannot be sanitized to a valid name")
+
+    # Truncate to max length
+    cleaned = cleaned[:_MAX_IDENTIFIER_LENGTH]
+
+    # Quote reserved words
+    if cleaned.lower() in _RESERVED_WORDS:
+        return f'"{cleaned}"'
+
+    return cleaned
+
+
+def sanitize_table_name(name: str) -> str:
+    """Sanitize a table name, supporting schema-qualified names.
+
+    Handles 'schema.table' notation by sanitizing each part separately.
+
+    Args:
+        name: Raw table name, optionally schema-qualified.
+
+    Returns:
+        Sanitized table name.
+
+    Raises:
+        ValueError: If name is empty or cannot be sanitized.
+    """
+    if not name or not name.strip():
+        raise ValueError("Table name cannot be empty")
+
+    parts = name.split(".", maxsplit=1)
+    sanitized_parts = [sanitize_identifier(part) for part in parts]
+    return ".".join(sanitized_parts)