anysite-cli 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anysite/__init__.py +4 -0
- anysite/__main__.py +6 -0
- anysite/api/__init__.py +21 -0
- anysite/api/client.py +271 -0
- anysite/api/errors.py +137 -0
- anysite/api/schemas.py +333 -0
- anysite/batch/__init__.py +1 -0
- anysite/batch/executor.py +176 -0
- anysite/batch/input.py +160 -0
- anysite/batch/rate_limiter.py +98 -0
- anysite/cli/__init__.py +1 -0
- anysite/cli/config.py +176 -0
- anysite/cli/executor.py +388 -0
- anysite/cli/options.py +249 -0
- anysite/config/__init__.py +11 -0
- anysite/config/paths.py +46 -0
- anysite/config/settings.py +187 -0
- anysite/dataset/__init__.py +37 -0
- anysite/dataset/analyzer.py +268 -0
- anysite/dataset/cli.py +644 -0
- anysite/dataset/collector.py +686 -0
- anysite/dataset/db_loader.py +248 -0
- anysite/dataset/errors.py +30 -0
- anysite/dataset/exporters.py +121 -0
- anysite/dataset/history.py +153 -0
- anysite/dataset/models.py +245 -0
- anysite/dataset/notifications.py +87 -0
- anysite/dataset/scheduler.py +107 -0
- anysite/dataset/storage.py +171 -0
- anysite/dataset/transformer.py +213 -0
- anysite/db/__init__.py +38 -0
- anysite/db/adapters/__init__.py +1 -0
- anysite/db/adapters/base.py +158 -0
- anysite/db/adapters/postgres.py +201 -0
- anysite/db/adapters/sqlite.py +183 -0
- anysite/db/cli.py +709 -0
- anysite/db/config.py +92 -0
- anysite/db/manager.py +166 -0
- anysite/db/operations/__init__.py +1 -0
- anysite/db/operations/insert.py +199 -0
- anysite/db/operations/query.py +43 -0
- anysite/db/schema/__init__.py +1 -0
- anysite/db/schema/inference.py +213 -0
- anysite/db/schema/types.py +71 -0
- anysite/db/utils/__init__.py +1 -0
- anysite/db/utils/sanitize.py +99 -0
- anysite/main.py +498 -0
- anysite/models/__init__.py +1 -0
- anysite/output/__init__.py +11 -0
- anysite/output/console.py +45 -0
- anysite/output/formatters.py +301 -0
- anysite/output/templates.py +76 -0
- anysite/py.typed +0 -0
- anysite/streaming/__init__.py +1 -0
- anysite/streaming/progress.py +121 -0
- anysite/streaming/writer.py +130 -0
- anysite/utils/__init__.py +1 -0
- anysite/utils/fields.py +242 -0
- anysite/utils/retry.py +109 -0
- anysite_cli-0.1.2.dist-info/METADATA +455 -0
- anysite_cli-0.1.2.dist-info/RECORD +64 -0
- anysite_cli-0.1.2.dist-info/WHEEL +4 -0
- anysite_cli-0.1.2.dist-info/entry_points.txt +2 -0
- anysite_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""DuckDB-based analytics for dataset Parquet files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from anysite.dataset.models import DatasetConfig
|
|
9
|
+
from anysite.dataset.storage import get_source_dir
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def expand_dot_fields(fields_str: str) -> str:
    """Convert dot-notation field specs to DuckDB SQL expressions.

    Simple fields pass through unchanged. Dotted fields are converted to
    ``json_extract_string`` calls so that nested values stored as JSON
    strings in Parquet can be extracted directly.

    Examples::

        "name, age" -> "name, age"
        "urn.value AS urn_id" -> "json_extract_string(urn, '$.value') AS urn_id"
        "author.name" -> "json_extract_string(author, '$.name')"
        "a.b.c" -> "json_extract_string(a, '$.b.c')"
    """
    expressions: list[str] = []
    for raw_spec in fields_str.split(","):
        field = raw_spec.strip()
        if not field:
            continue

        # Peel off an optional trailing "AS alias" (case-insensitive).
        suffix = ""
        alias_match = re.search(r"\s+[Aa][Ss]\s+(\w+)$", field)
        if alias_match is not None:
            suffix = f" AS {alias_match.group(1)}"
            field = field[: alias_match.start()]

        if "." not in field:
            expressions.append(field + suffix)
        else:
            # First segment is the column; the rest becomes a JSON path.
            column, json_path = field.split(".", 1)
            expressions.append(
                f"json_extract_string({column}, '$.{json_path}'){suffix}"
            )

    return ", ".join(expressions)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _get_duckdb() -> Any:
    """Import and return the ``duckdb`` module.

    The import is deferred to call time so that merely importing this
    module does not require the optional ``duckdb`` dependency; a
    ``ModuleNotFoundError`` surfaces only when analytics are actually used.
    """
    import duckdb

    return duckdb
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class DatasetAnalyzer:
    """Run SQL queries and analytics over dataset Parquet files using DuckDB."""

    # Type-name prefixes treated as numeric when computing min/max/avg.
    # Prefix matching (not equality) also covers parameterized forms that
    # DuckDB's DESCRIBE reports, e.g. "DECIMAL(18,3)".
    _NUMERIC_TYPE_PREFIXES = (
        "TINYINT", "SMALLINT", "INTEGER", "BIGINT", "HUGEINT",
        "FLOAT", "DOUBLE", "DECIMAL",
    )

    def __init__(self, config: DatasetConfig) -> None:
        self.config = config
        # Root directory holding one sub-directory of Parquet files per source.
        self.base_path = config.storage_path()
        # Lazily-created in-memory DuckDB connection (see _get_conn).
        self._conn: Any = None

    @staticmethod
    def _view_name(source_id: str) -> str:
        """Derive a SQL-safe view name from a source id.

        Hyphens and dots are not valid in unquoted SQL identifiers, so
        they are mapped to underscores (``"my-src.v1"`` -> ``"my_src_v1"``).
        Shared by view registration, ``stats`` and ``profile`` so all three
        always agree on the mapping.
        """
        return source_id.replace("-", "_").replace(".", "_")

    def _get_conn(self) -> Any:
        """Get or create a DuckDB connection with views registered."""
        if self._conn is not None:
            return self._conn

        duckdb = _get_duckdb()
        self._conn = duckdb.connect(":memory:")
        self._register_views()
        return self._conn

    def _register_views(self) -> None:
        """Register a DuckDB view for each source's Parquet files."""
        conn = self._conn
        for source in self.config.sources:
            source_dir = get_source_dir(self.base_path, source.id)
            if source_dir.exists() and any(source_dir.glob("*.parquet")):
                # Double any single quotes so a quote in the path cannot
                # terminate (or inject into) the SQL string literal.
                parquet_glob = str(source_dir / "*.parquet").replace("'", "''")
                view_name = self._view_name(source.id)
                conn.execute(
                    f"CREATE OR REPLACE VIEW {view_name} AS "
                    f"SELECT * FROM read_parquet('{parquet_glob}')"
                )

    def close(self) -> None:
        """Close the DuckDB connection."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def __enter__(self) -> DatasetAnalyzer:
        return self

    def __exit__(self, *args: Any) -> None:
        self.close()

    def query(self, sql: str) -> list[dict[str, Any]]:
        """Execute a SQL query and return results as list of dicts.

        Args:
            sql: SQL query string.

        Returns:
            List of result dicts. Empty for statements that produce no
            result set (e.g. DDL such as CREATE).
        """
        conn = self._get_conn()
        result = conn.execute(sql)
        # Statements without a result set leave description unset;
        # iterating it would raise TypeError.
        if not result.description:
            return []
        columns = [desc[0] for desc in result.description]
        rows = result.fetchall()
        return [dict(zip(columns, row)) for row in rows]

    def stats(self, source_id: str) -> list[dict[str, Any]]:
        """Get column statistics for a source.

        Returns min, max, avg, null count, distinct count per column.
        """
        view_name = self._view_name(source_id)
        conn = self._get_conn()

        # Get column names and types
        info = conn.execute(f"DESCRIBE {view_name}").fetchall()
        results: list[dict[str, Any]] = []

        for col_name, col_type, *_ in info:
            stat: dict[str, Any] = {
                "column": col_name,
                "type": col_type,
            }
            quoted = f'"{col_name}"'
            # Count nulls and total
            row = conn.execute(
                f"SELECT COUNT(*) as total, "
                f"COUNT({quoted}) as non_null, "
                f"COUNT(DISTINCT {quoted}) as distinct_count "
                f"FROM {view_name}"
            ).fetchone()
            if row:
                stat["total"] = row[0]
                stat["non_null"] = row[1]
                stat["null_count"] = row[0] - row[1]
                stat["distinct"] = row[2]

            # Numeric stats; prefix match so "DECIMAL(18,3)" also qualifies.
            if col_type.upper().startswith(self._NUMERIC_TYPE_PREFIXES):
                num_row = conn.execute(
                    f"SELECT MIN({quoted}), MAX({quoted}), AVG({quoted}) "
                    f"FROM {view_name}"
                ).fetchone()
                if num_row:
                    stat["min"] = num_row[0]
                    stat["max"] = num_row[1]
                    stat["avg"] = round(num_row[2], 2) if num_row[2] is not None else None

            results.append(stat)

        return results

    def profile(self) -> list[dict[str, Any]]:
        """Profile all sources: record count, completeness, duplicates.

        Returns:
            List of dicts with source-level quality metrics.
        """
        results: list[dict[str, Any]] = []
        conn = self._get_conn()

        for source in self.config.sources:
            view_name = self._view_name(source.id)
            source_dir = get_source_dir(self.base_path, source.id)

            if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
                results.append({
                    "source": source.id,
                    "status": "no data",
                    "records": 0,
                })
                continue

            try:
                row = conn.execute(f"SELECT COUNT(*) FROM {view_name}").fetchone()
                total = row[0] if row else 0

                # Get columns
                info = conn.execute(f"DESCRIBE {view_name}").fetchall()
                col_names = [c[0] for c in info]

                # Completeness: fraction of non-null values across all cells
                if col_names:
                    non_null_exprs = [f'COUNT("{c}")' for c in col_names]
                    counts_row = conn.execute(
                        f"SELECT {', '.join(non_null_exprs)} FROM {view_name}"
                    ).fetchone()
                    if counts_row and total > 0:
                        completeness = sum(counts_row) / (total * len(col_names))
                    else:
                        completeness = 0.0
                else:
                    completeness = 0.0

                results.append({
                    "source": source.id,
                    "status": "ok",
                    "records": total,
                    "columns": len(col_names),
                    "completeness": round(completeness * 100, 1),
                })
            except Exception as e:
                # Best-effort per-source profiling: report the failure for
                # this source instead of aborting the whole profile run.
                results.append({
                    "source": source.id,
                    "status": f"error: {e}",
                    "records": 0,
                })

        return results

    def list_views(self) -> list[str]:
        """List all registered view names."""
        conn = self._get_conn()
        rows = conn.execute(
            "SELECT table_name FROM information_schema.tables WHERE table_type = 'VIEW'"
        ).fetchall()
        return [r[0] for r in rows]

    def interactive_shell(self) -> None:
        """Run an interactive SQL shell."""
        from rich.console import Console

        console = Console()
        conn = self._get_conn()
        views = self.list_views()

        console.print("[bold]Anysite Dataset SQL Shell[/bold]")
        console.print(f"Available views: {', '.join(views)}")
        console.print("Type 'exit' or 'quit' to leave.\n")

        while True:
            try:
                sql = input("anysite> ").strip()
            except (EOFError, KeyboardInterrupt):
                console.print("\nBye!")
                break

            if not sql:
                continue
            if sql.lower() in ("exit", "quit", "\\q"):
                break

            try:
                result = conn.execute(sql)
                if result.description:
                    columns = [desc[0] for desc in result.description]
                    rows = result.fetchall()
                    if rows:
                        from rich.table import Table

                        table = Table()
                        for col in columns:
                            table.add_column(col)
                        for row in rows:
                            table.add_row(*[str(v) for v in row])
                        console.print(table)
                    else:
                        console.print("[dim]Empty result set[/dim]")
                else:
                    console.print("[green]OK[/green]")
            except Exception as e:
                console.print(f"[red]Error:[/red] {e}")
|