anysite-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. anysite/__init__.py +4 -0
  2. anysite/__main__.py +6 -0
  3. anysite/api/__init__.py +21 -0
  4. anysite/api/client.py +271 -0
  5. anysite/api/errors.py +137 -0
  6. anysite/api/schemas.py +333 -0
  7. anysite/batch/__init__.py +1 -0
  8. anysite/batch/executor.py +176 -0
  9. anysite/batch/input.py +160 -0
  10. anysite/batch/rate_limiter.py +98 -0
  11. anysite/cli/__init__.py +1 -0
  12. anysite/cli/config.py +176 -0
  13. anysite/cli/executor.py +388 -0
  14. anysite/cli/options.py +249 -0
  15. anysite/config/__init__.py +11 -0
  16. anysite/config/paths.py +46 -0
  17. anysite/config/settings.py +187 -0
  18. anysite/dataset/__init__.py +37 -0
  19. anysite/dataset/analyzer.py +268 -0
  20. anysite/dataset/cli.py +644 -0
  21. anysite/dataset/collector.py +686 -0
  22. anysite/dataset/db_loader.py +248 -0
  23. anysite/dataset/errors.py +30 -0
  24. anysite/dataset/exporters.py +121 -0
  25. anysite/dataset/history.py +153 -0
  26. anysite/dataset/models.py +245 -0
  27. anysite/dataset/notifications.py +87 -0
  28. anysite/dataset/scheduler.py +107 -0
  29. anysite/dataset/storage.py +171 -0
  30. anysite/dataset/transformer.py +213 -0
  31. anysite/db/__init__.py +38 -0
  32. anysite/db/adapters/__init__.py +1 -0
  33. anysite/db/adapters/base.py +158 -0
  34. anysite/db/adapters/postgres.py +201 -0
  35. anysite/db/adapters/sqlite.py +183 -0
  36. anysite/db/cli.py +687 -0
  37. anysite/db/config.py +92 -0
  38. anysite/db/manager.py +166 -0
  39. anysite/db/operations/__init__.py +1 -0
  40. anysite/db/operations/insert.py +199 -0
  41. anysite/db/operations/query.py +43 -0
  42. anysite/db/schema/__init__.py +1 -0
  43. anysite/db/schema/inference.py +213 -0
  44. anysite/db/schema/types.py +71 -0
  45. anysite/db/utils/__init__.py +1 -0
  46. anysite/db/utils/sanitize.py +99 -0
  47. anysite/main.py +498 -0
  48. anysite/models/__init__.py +1 -0
  49. anysite/output/__init__.py +11 -0
  50. anysite/output/console.py +45 -0
  51. anysite/output/formatters.py +301 -0
  52. anysite/output/templates.py +76 -0
  53. anysite/py.typed +0 -0
  54. anysite/streaming/__init__.py +1 -0
  55. anysite/streaming/progress.py +121 -0
  56. anysite/streaming/writer.py +130 -0
  57. anysite/utils/__init__.py +1 -0
  58. anysite/utils/fields.py +242 -0
  59. anysite/utils/retry.py +109 -0
  60. anysite_cli-0.1.0.dist-info/METADATA +437 -0
  61. anysite_cli-0.1.0.dist-info/RECORD +64 -0
  62. anysite_cli-0.1.0.dist-info/WHEEL +4 -0
  63. anysite_cli-0.1.0.dist-info/entry_points.txt +2 -0
  64. anysite_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,268 @@
1
+ """DuckDB-based analytics for dataset Parquet files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+ from anysite.dataset.models import DatasetConfig
9
+ from anysite.dataset.storage import get_source_dir
10
+
11
+
12
def expand_dot_fields(fields_str: str) -> str:
    """Convert dot-notation field specs to DuckDB SQL expressions.

    Simple fields pass through unchanged. Dotted fields are converted to
    ``json_extract_string`` calls so that nested values stored as JSON
    strings in Parquet can be extracted directly.

    Examples::

        "name, age" -> "name, age"
        "urn.value AS urn_id" -> "json_extract_string(urn, '$.value') AS urn_id"
        "author.name" -> "json_extract_string(author, '$.name')"
        "a.b.c" -> "json_extract_string(a, '$.b.c')"
    """
    rendered: list[str] = []
    for raw in fields_str.split(","):
        token = raw.strip()
        if not token:
            continue

        # Peel off an optional trailing alias (case-insensitive "AS name").
        suffix = ""
        alias_match = re.search(r"\s+[Aa][Ss]\s+(\w+)$", token)
        if alias_match is not None:
            suffix = f" AS {alias_match.group(1)}"
            token = token[: alias_match.start()]

        if "." not in token:
            rendered.append(token + suffix)
            continue

        # Everything after the first dot becomes the JSON path.
        column, path = token.split(".", 1)
        rendered.append(f"json_extract_string({column}, '$.{path}'){suffix}")

    return ", ".join(rendered)
46
+
47
+
48
def _get_duckdb() -> Any:
    """Lazily import and return the optional ``duckdb`` dependency."""
    import duckdb as _duckdb

    return _duckdb
52
+
53
+
54
class DatasetAnalyzer:
    """Run SQL queries and analytics over dataset Parquet files using DuckDB.

    A lazily created in-memory DuckDB connection exposes one view per
    configured source (the source id, sanitized into a valid identifier).
    The analyzer may be used as a context manager to guarantee the
    connection is closed.
    """

    # DuckDB's DESCRIBE reports parameterized types such as "DECIMAL(18,3)",
    # so numeric columns are detected by prefix, not exact match.
    _NUMERIC_TYPE_PREFIXES = (
        "TINYINT",
        "SMALLINT",
        "INTEGER",
        "BIGINT",
        "HUGEINT",
        "FLOAT",
        "DOUBLE",
        "DECIMAL",
    )

    def __init__(self, config: DatasetConfig) -> None:
        """Store the dataset config; the DuckDB connection is created lazily."""
        self.config = config
        self.base_path = config.storage_path()
        self._conn: Any = None

    @staticmethod
    def _view_name(source_id: str) -> str:
        """Sanitize a source id into a usable DuckDB view identifier."""
        return source_id.replace("-", "_").replace(".", "_")

    def _get_conn(self) -> Any:
        """Get or create a DuckDB connection with views registered."""
        if self._conn is None:
            duckdb = _get_duckdb()
            self._conn = duckdb.connect(":memory:")
            self._register_views()
        return self._conn

    def _register_views(self) -> None:
        """Register a DuckDB view for each source that has Parquet files."""
        conn = self._conn
        for source in self.config.sources:
            source_dir = get_source_dir(self.base_path, source.id)
            if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
                continue
            # Escape single quotes so the path is safe inside the SQL
            # string literal passed to read_parquet.
            parquet_glob = str(source_dir / "*.parquet").replace("'", "''")
            view_name = self._view_name(source.id)
            conn.execute(
                f"CREATE OR REPLACE VIEW {view_name} AS "
                f"SELECT * FROM read_parquet('{parquet_glob}')"
            )

    def close(self) -> None:
        """Close the DuckDB connection (safe to call more than once)."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def __enter__(self) -> DatasetAnalyzer:
        return self

    def __exit__(self, *args: Any) -> None:
        self.close()

    def query(self, sql: str) -> list[dict[str, Any]]:
        """Execute a SQL query and return results as a list of dicts.

        Args:
            sql: SQL query string.

        Returns:
            One dict per result row, keyed by column name.
        """
        conn = self._get_conn()
        result = conn.execute(sql)
        columns = [desc[0] for desc in result.description]
        rows = result.fetchall()
        return [dict(zip(columns, row)) for row in rows]

    def stats(self, source_id: str) -> list[dict[str, Any]]:
        """Get column statistics for a source.

        Returns one dict per column with total/non-null/null/distinct
        counts and, for numeric columns, min/max/avg.
        """
        view_name = self._view_name(source_id)
        conn = self._get_conn()

        # Column names and types straight from the registered view.
        info = conn.execute(f"DESCRIBE {view_name}").fetchall()
        results: list[dict[str, Any]] = []

        for col_name, col_type, *_ in info:
            stat: dict[str, Any] = {
                "column": col_name,
                "type": col_type,
            }
            # Quote the column so names with spaces/keywords still work.
            quoted = f'"{col_name}"'
            row = conn.execute(
                f"SELECT COUNT(*) as total, "
                f"COUNT({quoted}) as non_null, "
                f"COUNT(DISTINCT {quoted}) as distinct_count "
                f"FROM {view_name}"
            ).fetchone()
            if row:
                stat["total"] = row[0]
                stat["non_null"] = row[1]
                stat["null_count"] = row[0] - row[1]
                stat["distinct"] = row[2]

            # Prefix match so parameterized types (e.g. DECIMAL(18,3)) and
            # small integer types also get numeric stats.
            if col_type.upper().startswith(self._NUMERIC_TYPE_PREFIXES):
                num_row = conn.execute(
                    f"SELECT MIN({quoted}), MAX({quoted}), AVG({quoted}) "
                    f"FROM {view_name}"
                ).fetchone()
                if num_row:
                    stat["min"] = num_row[0]
                    stat["max"] = num_row[1]
                    stat["avg"] = round(num_row[2], 2) if num_row[2] is not None else None

            results.append(stat)

        return results

    def profile(self) -> list[dict[str, Any]]:
        """Profile all sources: record count, column count, completeness.

        Returns:
            List of dicts with source-level quality metrics. Sources with
            no Parquet data get status "no data"; query failures are
            reported per-source rather than raised.
        """
        results: list[dict[str, Any]] = []
        conn = self._get_conn()

        for source in self.config.sources:
            view_name = self._view_name(source.id)
            source_dir = get_source_dir(self.base_path, source.id)

            if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
                results.append({
                    "source": source.id,
                    "status": "no data",
                    "records": 0,
                })
                continue

            try:
                row = conn.execute(f"SELECT COUNT(*) FROM {view_name}").fetchone()
                total = row[0] if row else 0

                info = conn.execute(f"DESCRIBE {view_name}").fetchall()
                col_names = [c[0] for c in info]

                # Completeness = fraction of non-null cells over all
                # rows x columns.
                if col_names:
                    non_null_exprs = [f'COUNT("{c}")' for c in col_names]
                    counts_row = conn.execute(
                        f"SELECT {', '.join(non_null_exprs)} FROM {view_name}"
                    ).fetchone()
                    if counts_row and total > 0:
                        completeness = sum(counts_row) / (total * len(col_names))
                    else:
                        completeness = 0.0
                else:
                    completeness = 0.0

                results.append({
                    "source": source.id,
                    "status": "ok",
                    "records": total,
                    "columns": len(col_names),
                    "completeness": round(completeness * 100, 1),
                })
            except Exception as e:
                # Best-effort profiling: report the failure per source
                # instead of aborting the whole profile run.
                results.append({
                    "source": source.id,
                    "status": f"error: {e}",
                    "records": 0,
                })

        return results

    def list_views(self) -> list[str]:
        """List all registered view names."""
        conn = self._get_conn()
        rows = conn.execute(
            "SELECT table_name FROM information_schema.tables WHERE table_type = 'VIEW'"
        ).fetchall()
        return [r[0] for r in rows]

    def interactive_shell(self) -> None:
        """Run an interactive SQL shell (reads from stdin until exit/EOF)."""
        from rich.console import Console

        console = Console()
        conn = self._get_conn()
        views = self.list_views()

        console.print("[bold]Anysite Dataset SQL Shell[/bold]")
        console.print(f"Available views: {', '.join(views)}")
        console.print("Type 'exit' or 'quit' to leave.\n")

        while True:
            try:
                sql = input("anysite> ").strip()
            except (EOFError, KeyboardInterrupt):
                console.print("\nBye!")
                break

            if not sql:
                continue
            if sql.lower() in ("exit", "quit", "\\q"):
                break

            try:
                result = conn.execute(sql)
                if result.description:
                    columns = [desc[0] for desc in result.description]
                    rows = result.fetchall()
                    if rows:
                        from rich.table import Table

                        table = Table()
                        for col in columns:
                            table.add_column(col)
                        for row in rows:
                            table.add_row(*[str(v) for v in row])
                        console.print(table)
                    else:
                        console.print("[dim]Empty result set[/dim]")
                else:
                    # Statements with no result set (DDL etc.) just ack.
                    console.print("[green]OK[/green]")
            except Exception as e:
                # Never let a bad query kill the shell session.
                console.print(f"[red]Error:[/red] {e}")