anysite_cli-0.1.0-py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of anysite-cli might be problematic.
- anysite/__init__.py +4 -0
- anysite/__main__.py +6 -0
- anysite/api/__init__.py +21 -0
- anysite/api/client.py +271 -0
- anysite/api/errors.py +137 -0
- anysite/api/schemas.py +333 -0
- anysite/batch/__init__.py +1 -0
- anysite/batch/executor.py +176 -0
- anysite/batch/input.py +160 -0
- anysite/batch/rate_limiter.py +98 -0
- anysite/cli/__init__.py +1 -0
- anysite/cli/config.py +176 -0
- anysite/cli/executor.py +388 -0
- anysite/cli/options.py +249 -0
- anysite/config/__init__.py +11 -0
- anysite/config/paths.py +46 -0
- anysite/config/settings.py +187 -0
- anysite/dataset/__init__.py +37 -0
- anysite/dataset/analyzer.py +268 -0
- anysite/dataset/cli.py +644 -0
- anysite/dataset/collector.py +686 -0
- anysite/dataset/db_loader.py +248 -0
- anysite/dataset/errors.py +30 -0
- anysite/dataset/exporters.py +121 -0
- anysite/dataset/history.py +153 -0
- anysite/dataset/models.py +245 -0
- anysite/dataset/notifications.py +87 -0
- anysite/dataset/scheduler.py +107 -0
- anysite/dataset/storage.py +171 -0
- anysite/dataset/transformer.py +213 -0
- anysite/db/__init__.py +38 -0
- anysite/db/adapters/__init__.py +1 -0
- anysite/db/adapters/base.py +158 -0
- anysite/db/adapters/postgres.py +201 -0
- anysite/db/adapters/sqlite.py +183 -0
- anysite/db/cli.py +687 -0
- anysite/db/config.py +92 -0
- anysite/db/manager.py +166 -0
- anysite/db/operations/__init__.py +1 -0
- anysite/db/operations/insert.py +199 -0
- anysite/db/operations/query.py +43 -0
- anysite/db/schema/__init__.py +1 -0
- anysite/db/schema/inference.py +213 -0
- anysite/db/schema/types.py +71 -0
- anysite/db/utils/__init__.py +1 -0
- anysite/db/utils/sanitize.py +99 -0
- anysite/main.py +498 -0
- anysite/models/__init__.py +1 -0
- anysite/output/__init__.py +11 -0
- anysite/output/console.py +45 -0
- anysite/output/formatters.py +301 -0
- anysite/output/templates.py +76 -0
- anysite/py.typed +0 -0
- anysite/streaming/__init__.py +1 -0
- anysite/streaming/progress.py +121 -0
- anysite/streaming/writer.py +130 -0
- anysite/utils/__init__.py +1 -0
- anysite/utils/fields.py +242 -0
- anysite/utils/retry.py +109 -0
- anysite_cli-0.1.0.dist-info/METADATA +437 -0
- anysite_cli-0.1.0.dist-info/RECORD +64 -0
- anysite_cli-0.1.0.dist-info/WHEEL +4 -0
- anysite_cli-0.1.0.dist-info/entry_points.txt +2 -0
- anysite_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
anysite/db/schema/inference.py
@@ -0,0 +1,213 @@
+"""Type inference for JSON data to SQL schemas."""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from typing import Any
+
+from anysite.db.schema.types import get_sql_type
+
+# Patterns for string subtype detection
+_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
+_DATETIME_RE = re.compile(
+    r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}(:\d{2})?(\.\d+)?(Z|[+-]\d{2}:?\d{2})?$"
+)
+_URL_RE = re.compile(r"^https?://", re.IGNORECASE)
+_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
+
+# Threshold for using TEXT vs VARCHAR
+_VARCHAR_MAX_LENGTH = 255
+
+
+@dataclass
+class ColumnSchema:
+    """Schema for a single column."""
+
+    name: str
+    inferred_type: str
+    nullable: bool = True
+    sample_values: list[Any] = field(default_factory=list)
+
+
+@dataclass
+class TableSchema:
+    """Schema for a table inferred from JSON data."""
+
+    table_name: str
+    columns: list[ColumnSchema] = field(default_factory=list)
+
+    def to_sql_types(self, dialect: str) -> dict[str, str]:
+        """Convert inferred types to SQL types for a given dialect.
+
+        Args:
+            dialect: Database dialect ('sqlite', 'postgres', 'mysql').
+
+        Returns:
+            Mapping of column name to SQL type string.
+        """
+        return {
+            col.name: get_sql_type(col.inferred_type, dialect)
+            for col in self.columns
+        }
+
+
+def infer_sql_type(value: Any) -> str:
+    """Infer the SQL-compatible type for a Python value.
+
+    Args:
+        value: A Python value from JSON.
+
+    Returns:
+        Inferred type name ('integer', 'float', 'boolean', 'text', etc.).
+    """
+    if value is None:
+        return "text"
+
+    if isinstance(value, bool):
+        return "boolean"
+
+    if isinstance(value, int):
+        return "integer"
+
+    if isinstance(value, float):
+        return "float"
+
+    if isinstance(value, (dict, list)):
+        return "json"
+
+    if isinstance(value, str):
+        return _infer_string_subtype(value)
+
+    return "text"
+
+
+def _infer_string_subtype(value: str) -> str:
+    """Infer a more specific type for string values.
+
+    Args:
+        value: String value to analyze.
+
+    Returns:
+        Inferred type name.
+    """
+    if not value:
+        return "text"
+
+    if _DATETIME_RE.match(value):
+        return "datetime"
+
+    if _DATE_RE.match(value):
+        return "date"
+
+    if _URL_RE.match(value):
+        return "url"
+
+    if _EMAIL_RE.match(value):
+        return "email"
+
+    if len(value) <= _VARCHAR_MAX_LENGTH:
+        return "varchar"
+
+    return "text"
+
+
+def _merge_types(type_a: str, type_b: str) -> str:
+    """Merge two inferred types into a compatible type.
+
+    When different rows have different types for the same column,
+    this picks the more general type.
+    """
+    if type_a == type_b:
+        return type_a
+
+    # Null/text absorbs anything
+    if type_a == "text" or type_b == "text":
+        return "text"
+
+    # Numeric promotion
+    if {type_a, type_b} == {"integer", "float"}:
+        return "float"
+
+    # String subtypes fall back to varchar or text
+    string_types = {"varchar", "url", "email", "date", "datetime"}
+    if type_a in string_types and type_b in string_types:
+        return "varchar"
+
+    # JSON stays as json
+    if type_a == "json" or type_b == "json":
+        return "json"
+
+    return "text"
+
+
+def infer_table_schema(
+    table_name: str,
+    rows: list[dict[str, Any]],
+    max_sample: int = 100,
+) -> TableSchema:
+    """Infer a table schema from a list of JSON rows.
+
+    Examines up to max_sample rows to determine column types.
+
+    Args:
+        table_name: Name for the inferred table.
+        rows: List of row dictionaries.
+        max_sample: Maximum number of rows to sample for inference.
+
+    Returns:
+        Inferred TableSchema.
+    """
+    if not rows:
+        return TableSchema(table_name=table_name)
+
+    sample = rows[:max_sample]
+
+    # Track column types and nullability
+    column_types: dict[str, str] = {}
+    column_nullable: dict[str, bool] = {}
+    column_samples: dict[str, list[Any]] = {}
+    # Preserve column order across all rows
+    column_order: list[str] = []
+    seen_columns: set[str] = set()
+
+    for row in sample:
+        for col_name, value in row.items():
+            if col_name not in seen_columns:
+                seen_columns.add(col_name)
+                column_order.append(col_name)
+
+            inferred = infer_sql_type(value)
+
+            if value is None:
+                column_nullable[col_name] = True
+            else:
+                if col_name in column_types:
+                    column_types[col_name] = _merge_types(column_types[col_name], inferred)
+                else:
+                    column_types[col_name] = inferred
+
+                if col_name not in column_nullable:
+                    column_nullable[col_name] = False
+
+                # Store sample values (up to 3)
+                samples = column_samples.setdefault(col_name, [])
+                if len(samples) < 3:
+                    samples.append(value)
+
+        # Mark missing columns as nullable
+        for col_name in seen_columns:
+            if col_name not in row:
+                column_nullable[col_name] = True
+
+    columns = [
+        ColumnSchema(
+            name=col_name,
+            inferred_type=column_types.get(col_name, "text"),
+            nullable=column_nullable.get(col_name, True),
+            sample_values=column_samples.get(col_name, []),
+        )
+        for col_name in column_order
+    ]
+
+    return TableSchema(table_name=table_name, columns=columns)
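To illustrate the inference pipeline above, here is a minimal reviewer-added usage sketch (not part of the package); the sample rows are invented. It exercises numeric promotion in _merge_types and the nullability tracking for columns missing from some rows:

from anysite.db.schema.inference import infer_table_schema

# Invented sample rows: "score" mixes int and float; "bio" is absent in the second row.
rows = [
    {"id": 1, "email": "a@example.com", "score": 10, "bio": "hi"},
    {"id": 2, "email": "b@example.com", "score": 9.5},
]

schema = infer_table_schema("users", rows)
for col in schema.columns:
    print(col.name, col.inferred_type, col.nullable)
# id integer False
# email email False
# score float False   <- "integer" merged with "float" promotes to "float"
# bio varchar True    <- missing in one row, so nullable

print(schema.to_sql_types("postgres"))
# {'id': 'BIGINT', 'email': 'TEXT', 'score': 'DOUBLE PRECISION', 'bio': 'VARCHAR(255)'}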
anysite/db/schema/types.py
@@ -0,0 +1,71 @@
+"""SQL type mappings per database dialect."""
+
+from __future__ import annotations
+
+# Type mapping: Python-inferred type name -> SQL type per dialect
+TYPE_MAP: dict[str, dict[str, str]] = {
+    "integer": {
+        "sqlite": "INTEGER",
+        "postgres": "BIGINT",
+        "mysql": "BIGINT",
+    },
+    "float": {
+        "sqlite": "REAL",
+        "postgres": "DOUBLE PRECISION",
+        "mysql": "DOUBLE",
+    },
+    "boolean": {
+        "sqlite": "INTEGER",
+        "postgres": "BOOLEAN",
+        "mysql": "BOOLEAN",
+    },
+    "text": {
+        "sqlite": "TEXT",
+        "postgres": "TEXT",
+        "mysql": "TEXT",
+    },
+    "varchar": {
+        "sqlite": "TEXT",
+        "postgres": "VARCHAR(255)",
+        "mysql": "VARCHAR(255)",
+    },
+    "json": {
+        "sqlite": "TEXT",
+        "postgres": "JSONB",
+        "mysql": "JSON",
+    },
+    "date": {
+        "sqlite": "TEXT",
+        "postgres": "DATE",
+        "mysql": "DATE",
+    },
+    "datetime": {
+        "sqlite": "TEXT",
+        "postgres": "TIMESTAMPTZ",
+        "mysql": "DATETIME",
+    },
+    "url": {
+        "sqlite": "TEXT",
+        "postgres": "TEXT",
+        "mysql": "TEXT",
+    },
+    "email": {
+        "sqlite": "TEXT",
+        "postgres": "TEXT",
+        "mysql": "VARCHAR(320)",
+    },
+}
+
+
+def get_sql_type(inferred_type: str, dialect: str) -> str:
+    """Get the SQL type for a given inferred type and dialect.
+
+    Args:
+        inferred_type: The inferred Python type name.
+        dialect: The database dialect ('sqlite', 'postgres', 'mysql').
+
+    Returns:
+        SQL type string for the dialect.
+    """
+    type_entry = TYPE_MAP.get(inferred_type, TYPE_MAP["text"])
+    return type_entry.get(dialect, type_entry.get("sqlite", "TEXT"))
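A quick reviewer sketch of the lookup fallbacks in get_sql_type (example calls are invented): an unknown inferred type falls back to the "text" entry, and an unknown dialect falls back to the sqlite mapping.

from anysite.db.schema.types import get_sql_type

print(get_sql_type("datetime", "postgres"))  # TIMESTAMPTZ
print(get_sql_type("json", "sqlite"))        # TEXT (SQLite stores JSON as text)
print(get_sql_type("mystery", "postgres"))   # TEXT (unknown type -> "text" entry)
print(get_sql_type("integer", "oracle"))     # INTEGER (unknown dialect -> sqlite mapping)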
anysite/db/utils/__init__.py
@@ -0,0 +1 @@
+"""Database utility functions."""
anysite/db/utils/sanitize.py
@@ -0,0 +1,99 @@
+"""SQL identifier sanitization utilities."""
+
+from __future__ import annotations
+
+import re
+
+# Valid SQL identifier: starts with letter/underscore, then letters/digits/underscores
+_IDENTIFIER_RE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
+
+# Reserved SQL keywords that must be quoted
+_RESERVED_WORDS = frozenset({
+    "all", "alter", "analyze", "and", "as", "asc", "between", "by", "case",
+    "check", "column", "constraint", "create", "cross", "current", "current_date",
+    "current_time", "current_timestamp", "current_user", "database", "default",
+    "delete", "desc", "distinct", "do", "drop", "else", "end", "exists", "false",
+    "fetch", "for", "foreign", "from", "full", "grant", "group", "having", "if",
+    "in", "index", "inner", "insert", "into", "is", "join", "key", "left", "like",
+    "limit", "natural", "not", "null", "offset", "on", "or", "order", "outer",
+    "primary", "references", "returning", "right", "row", "select", "session_user",
+    "set", "some", "table", "then", "to", "true", "union", "unique", "update",
+    "user", "using", "values", "view", "when", "where", "with",
+})
+
+# Maximum identifier length (conservative across databases)
+_MAX_IDENTIFIER_LENGTH = 63
+
+
+def sanitize_identifier(name: str) -> str:
+    """Sanitize a SQL identifier (column or table name).
+
+    Rules:
+    - Must be non-empty
+    - Must start with a letter or underscore
+    - Only letters, digits, and underscores allowed
+    - Reserved words are quoted with double quotes
+    - Max length 63 characters (PostgreSQL limit)
+    - Invalid characters are replaced with underscores
+
+    Args:
+        name: Raw identifier name.
+
+    Returns:
+        Sanitized identifier safe for use in SQL.
+
+    Raises:
+        ValueError: If name is empty or cannot be sanitized.
+    """
+    if not name or not name.strip():
+        raise ValueError("Identifier cannot be empty")
+
+    # Strip whitespace
+    cleaned = name.strip()
+
+    # Replace invalid characters with underscores
+    cleaned = re.sub(r"[^a-zA-Z0-9_]", "_", cleaned)
+
+    # Ensure starts with letter or underscore
+    if cleaned[0].isdigit():
+        cleaned = f"_{cleaned}"
+
+    # Collapse multiple underscores
+    cleaned = re.sub(r"__+", "_", cleaned)
+
+    # Strip trailing underscores
+    cleaned = cleaned.rstrip("_")
+
+    if not cleaned:
+        raise ValueError(f"Identifier '{name}' cannot be sanitized to a valid name")
+
+    # Truncate to max length
+    cleaned = cleaned[:_MAX_IDENTIFIER_LENGTH]
+
+    # Quote reserved words
+    if cleaned.lower() in _RESERVED_WORDS:
+        return f'"{cleaned}"'
+
+    return cleaned
+
+
+def sanitize_table_name(name: str) -> str:
+    """Sanitize a table name, supporting schema-qualified names.
+
+    Handles 'schema.table' notation by sanitizing each part separately.
+
+    Args:
+        name: Raw table name, optionally schema-qualified.
+
+    Returns:
+        Sanitized table name.
+
+    Raises:
+        ValueError: If name is empty or cannot be sanitized.
+    """
+    if not name or not name.strip():
+        raise ValueError("Table name cannot be empty")
+
+    parts = name.split(".", maxsplit=1)
+    sanitized_parts = [sanitize_identifier(part) for part in parts]
+    return ".".join(sanitized_parts)
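A minimal reviewer sketch of the sanitization rules above (inputs are invented):

from anysite.db.utils.sanitize import sanitize_identifier, sanitize_table_name

print(sanitize_identifier("user name!"))    # user_name  (invalid chars become underscores)
print(sanitize_identifier("2nd_col"))       # _2nd_col   (leading digit gets an underscore prefix)
print(sanitize_identifier("select"))        # "select"   (reserved word, double-quoted)
print(sanitize_table_name("public.order"))  # public."order"

Note that the double-quote quoting matches PostgreSQL and SQLite; MySQL's default mode expects backticks, which this module does not emit.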