anysite-cli 0.1.2 (anysite_cli-0.1.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. anysite/__init__.py +4 -0
  2. anysite/__main__.py +6 -0
  3. anysite/api/__init__.py +21 -0
  4. anysite/api/client.py +271 -0
  5. anysite/api/errors.py +137 -0
  6. anysite/api/schemas.py +333 -0
  7. anysite/batch/__init__.py +1 -0
  8. anysite/batch/executor.py +176 -0
  9. anysite/batch/input.py +160 -0
  10. anysite/batch/rate_limiter.py +98 -0
  11. anysite/cli/__init__.py +1 -0
  12. anysite/cli/config.py +176 -0
  13. anysite/cli/executor.py +388 -0
  14. anysite/cli/options.py +249 -0
  15. anysite/config/__init__.py +11 -0
  16. anysite/config/paths.py +46 -0
  17. anysite/config/settings.py +187 -0
  18. anysite/dataset/__init__.py +37 -0
  19. anysite/dataset/analyzer.py +268 -0
  20. anysite/dataset/cli.py +644 -0
  21. anysite/dataset/collector.py +686 -0
  22. anysite/dataset/db_loader.py +248 -0
  23. anysite/dataset/errors.py +30 -0
  24. anysite/dataset/exporters.py +121 -0
  25. anysite/dataset/history.py +153 -0
  26. anysite/dataset/models.py +245 -0
  27. anysite/dataset/notifications.py +87 -0
  28. anysite/dataset/scheduler.py +107 -0
  29. anysite/dataset/storage.py +171 -0
  30. anysite/dataset/transformer.py +213 -0
  31. anysite/db/__init__.py +38 -0
  32. anysite/db/adapters/__init__.py +1 -0
  33. anysite/db/adapters/base.py +158 -0
  34. anysite/db/adapters/postgres.py +201 -0
  35. anysite/db/adapters/sqlite.py +183 -0
  36. anysite/db/cli.py +709 -0
  37. anysite/db/config.py +92 -0
  38. anysite/db/manager.py +166 -0
  39. anysite/db/operations/__init__.py +1 -0
  40. anysite/db/operations/insert.py +199 -0
  41. anysite/db/operations/query.py +43 -0
  42. anysite/db/schema/__init__.py +1 -0
  43. anysite/db/schema/inference.py +213 -0
  44. anysite/db/schema/types.py +71 -0
  45. anysite/db/utils/__init__.py +1 -0
  46. anysite/db/utils/sanitize.py +99 -0
  47. anysite/main.py +498 -0
  48. anysite/models/__init__.py +1 -0
  49. anysite/output/__init__.py +11 -0
  50. anysite/output/console.py +45 -0
  51. anysite/output/formatters.py +301 -0
  52. anysite/output/templates.py +76 -0
  53. anysite/py.typed +0 -0
  54. anysite/streaming/__init__.py +1 -0
  55. anysite/streaming/progress.py +121 -0
  56. anysite/streaming/writer.py +130 -0
  57. anysite/utils/__init__.py +1 -0
  58. anysite/utils/fields.py +242 -0
  59. anysite/utils/retry.py +109 -0
  60. anysite_cli-0.1.2.dist-info/METADATA +455 -0
  61. anysite_cli-0.1.2.dist-info/RECORD +64 -0
  62. anysite_cli-0.1.2.dist-info/WHEEL +4 -0
  63. anysite_cli-0.1.2.dist-info/entry_points.txt +2 -0
  64. anysite_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
anysite/dataset/db_loader.py
@@ -0,0 +1,248 @@
+ """Load dataset Parquet data into a relational database with FK linking."""
+
+ from __future__ import annotations
+
+ import json
+ from typing import Any
+
+ from anysite.dataset.models import DatasetConfig, DatasetSource
+ from anysite.dataset.storage import get_source_dir, read_parquet
+ from anysite.db.adapters.base import DatabaseAdapter
+ from anysite.db.schema.inference import infer_table_schema
+
+
+ def _get_dialect(adapter: DatabaseAdapter) -> str:
+     """Extract dialect string from adapter server info."""
+     info = adapter.get_server_info()
+     return info.get("type", "sqlite")
+
+
+ def _extract_dot_value(record: dict[str, Any], dot_path: str) -> Any:
+     """Extract a value from a record using dot notation.
+
+     Handles JSON strings stored in Parquet: if a field value is a JSON
+     string, it is parsed and the remainder of the dot path traversed.
+     """
+     parts = dot_path.split(".")
+     current: Any = record
+
+     for part in parts:
+         if isinstance(current, str):
+             try:
+                 current = json.loads(current)
+             except (json.JSONDecodeError, ValueError):
+                 return None
+
+         if isinstance(current, dict):
+             current = current.get(part)
+         else:
+             return None
+
+         if current is None:
+             return None
+
+     return current
+
+
+ def _table_name_for(source: DatasetSource) -> str:
+     """Get the DB table name for a source."""
+     if source.db_load and source.db_load.table:
+         return source.db_load.table
+     return source.id.replace("-", "_").replace(".", "_")
+
+
+ def _filter_record(
+     record: dict[str, Any],
+     source: DatasetSource,
+ ) -> dict[str, Any]:
+     """Filter and transform a record based on db_load config.
+
+     Applies field selection/exclusion and dot-notation extraction.
+     """
+     db_load = source.db_load
+     exclude = set(db_load.exclude) if db_load else {"_input_value", "_parent_source"}
+
+     if db_load and db_load.fields:
+         # Explicit field list — extract each field
+         row: dict[str, Any] = {}
+         for field_spec in db_load.fields:
+             # Parse "source_field AS alias" syntax
+             alias = None
+             upper = field_spec.upper()
+             as_idx = upper.find(" AS ")
+             if as_idx != -1:
+                 alias = field_spec[as_idx + 4:].strip()
+                 field_spec = field_spec[:as_idx].strip()
+
+             col_name = alias or field_spec.replace(".", "_")
+
+             if "." in field_spec:
+                 row[col_name] = _extract_dot_value(record, field_spec)
+             else:
+                 row[col_name] = record.get(field_spec)
+         return row
+     else:
+         # All fields minus exclusions
+         return {k: v for k, v in record.items() if k not in exclude}
+
+
+ class DatasetDbLoader:
+     """Load dataset Parquet data into a relational database.
+
+     Handles:
+     - Schema inference from Parquet records
+     - Auto-increment primary keys (``id`` column)
+     - Foreign key linking via provenance ``_input_value`` column
+     - Dot-notation field extraction for JSON columns
+     - Topological loading order (parents before children)
+     """
+
+     def __init__(
+         self,
+         config: DatasetConfig,
+         adapter: DatabaseAdapter,
+     ) -> None:
+         self.config = config
+         self.adapter = adapter
+         self.base_path = config.storage_path()
+         self._dialect = _get_dialect(adapter)
+         # Maps source_id -> {input_value -> db_id} for FK linking
+         self._value_to_id: dict[str, dict[str, int]] = {}
+
+     def load_all(
+         self,
+         *,
+         source_filter: str | None = None,
+         drop_existing: bool = False,
+         dry_run: bool = False,
+     ) -> dict[str, int]:
+         """Load all sources into the database in dependency order.
+
+         Args:
+             source_filter: Only load this source (and dependencies).
+             drop_existing: Drop tables before creating.
+             dry_run: Show plan without executing.
+
+         Returns:
+             Mapping of source_id to number of rows loaded.
+         """
+         sources = self.config.topological_sort()
+
+         if source_filter:
+             from anysite.dataset.collector import _filter_sources
+             sources = _filter_sources(sources, source_filter, self.config)
+
+         results: dict[str, int] = {}
+
+         for source in sources:
+             count = self._load_source(
+                 source,
+                 drop_existing=drop_existing,
+                 dry_run=dry_run,
+             )
+             results[source.id] = count
+
+         return results
+
+     def _load_source(
+         self,
+         source: DatasetSource,
+         *,
+         drop_existing: bool = False,
+         dry_run: bool = False,
+     ) -> int:
+         """Load a single source into the database."""
+         source_dir = get_source_dir(self.base_path, source.id)
+         if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
+             return 0
+
+         raw_records = read_parquet(source_dir)
+         if not raw_records:
+             return 0
+
+         table_name = _table_name_for(source)
+
+         # Determine parent info for FK linking
+         parent_source_id = None
+         parent_fk_col = None
+         if source.dependency:
+             parent_source_id = source.dependency.from_source
+             parent_fk_col = f"{parent_source_id.replace('-', '_').replace('.', '_')}_id"
+
+         # Transform records
+         rows: list[dict[str, Any]] = []
+         for record in raw_records:
+             row = _filter_record(record, source)
+
+             # Add FK column if this is a dependent source
+             if parent_source_id and parent_fk_col:
+                 input_val = record.get("_input_value")
+                 parent_map = self._value_to_id.get(parent_source_id, {})
+                 if input_val is not None and str(input_val) in parent_map:
+                     row[parent_fk_col] = parent_map[str(input_val)]
+                 else:
+                     row[parent_fk_col] = None
+
+             rows.append(row)
+
+         if dry_run:
+             return len(rows)
+
+         # Determine the lookup field for children to reference this source
+         # This is the field that child dependencies extract from this source
+         lookup_field = self._get_child_lookup_field(source)
+
+         # Create table
+         if drop_existing and self.adapter.table_exists(table_name):
+             self.adapter.execute(f"DROP TABLE {table_name}")
+
+         if not self.adapter.table_exists(table_name):
+             schema = infer_table_schema(table_name, rows)
+             sql_types = schema.to_sql_types(self._dialect)
+             # Add auto-increment id column
+             col_defs = {"id": self._auto_id_type()}
+             col_defs.update(sql_types)
+             self.adapter.create_table(table_name, col_defs, primary_key="id")
+
+         # Insert rows one at a time to capture auto-increment IDs for FK mapping
+         value_map: dict[str, int] = {}
+         for i, row in enumerate(rows):
+             self.adapter.insert_batch(table_name, [row])
+             # Get the last inserted id
+             last_id = self._get_last_id(table_name)
+
+             # Build value→id map for child sources
+             if lookup_field and last_id is not None:
+                 raw_record = raw_records[i]
+                 lookup_val = _extract_dot_value(raw_record, lookup_field)
+                 if lookup_val is None:
+                     lookup_val = raw_record.get(lookup_field)
+                 if lookup_val is not None:
+                     value_map[str(lookup_val)] = last_id
+
+         if value_map:
+             self._value_to_id[source.id] = value_map
+
+         return len(rows)
+
+     def _get_child_lookup_field(self, source: DatasetSource) -> str | None:
+         """Find which field children use to reference this source."""
+         for other in self.config.sources:
+             if other.dependency and other.dependency.from_source == source.id:
+                 return other.dependency.field
+         return None
+
+     def _auto_id_type(self) -> str:
+         """Get the auto-increment ID column type for the dialect."""
+         if self._dialect == "postgres":
+             return "SERIAL"
+         return "INTEGER"
+
+     def _get_last_id(self, table_name: str) -> int | None:
+         """Get the last inserted auto-increment ID."""
+         row = self.adapter.fetch_one(
+             f"SELECT MAX(id) as last_id FROM {table_name}"
+         )
+         if row:
+             return row.get("last_id")
+         return None
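
For orientation, a minimal sketch of how the loader above is driven. Only DatasetDbLoader, its constructor, and load_all are taken from this diff; the DatasetConfig.from_yaml factory and the SQLiteAdapter("dataset.db") constructor are assumptions about APIs defined in other modules of this package.

# Sketch only: from_yaml and SQLiteAdapter's signature are assumptions;
# DatasetDbLoader(config, adapter) and load_all(...) match the diff above.
from anysite.dataset.db_loader import DatasetDbLoader
from anysite.dataset.models import DatasetConfig
from anysite.db.adapters.sqlite import SQLiteAdapter

config = DatasetConfig.from_yaml("dataset.yml")   # assumed factory
adapter = SQLiteAdapter("dataset.db")             # assumed constructor

loader = DatasetDbLoader(config, adapter)

# Dry run: returns {source_id: row_count} without creating tables or inserting.
print(loader.load_all(dry_run=True))

# Real load: tables are dropped and recreated, and parents load before children
# so the provenance-based foreign keys can resolve via _input_value.
counts = loader.load_all(drop_existing=True)
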
anysite/dataset/errors.py
@@ -0,0 +1,30 @@
+ """Dataset-specific error classes."""
+
+ from anysite.api.errors import AnysiteError
+
+
+ class DatasetError(AnysiteError):
+     """Base error for dataset operations."""
+
+     def __init__(self, message: str) -> None:
+         super().__init__(message)
+
+
+ class CircularDependencyError(DatasetError):
+     """Raised when source dependencies form a cycle."""
+
+     def __init__(self, sources: list[str]) -> None:
+         self.sources = sources
+         cycle = " -> ".join(sources)
+         super().__init__(f"Circular dependency detected: {cycle}")
+
+
+ class SourceNotFoundError(DatasetError):
+     """Raised when a dependency references a non-existent source."""
+
+     def __init__(self, source_id: str, referenced_by: str) -> None:
+         self.source_id = source_id
+         self.referenced_by = referenced_by
+         super().__init__(
+             f"Source '{source_id}' referenced by '{referenced_by}' does not exist"
+         )
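
The messages and attributes these exceptions carry are easiest to see by constructing them directly; the source IDs below are made up for illustration.

from anysite.dataset.errors import CircularDependencyError, SourceNotFoundError

err = CircularDependencyError(["posts", "comments", "posts"])
print(err)          # "Circular dependency detected: posts -> comments -> posts"
print(err.sources)  # ['posts', 'comments', 'posts']

missing = SourceNotFoundError("authors", "posts")
print(missing)      # "Source 'authors' referenced by 'posts' does not exist"
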
anysite/dataset/exporters.py
@@ -0,0 +1,121 @@
+ """Export destinations — file and webhook exporters for per-source output.
+
+ These run after Parquet write as optional supplementary exports.
+ """
+
+ from __future__ import annotations
+
+ import csv
+ import json
+ import logging
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Any
+
+ from anysite.dataset.models import ExportDestination
+
+ logger = logging.getLogger(__name__)
+
+
+ async def run_exports(
+     records: list[dict[str, Any]],
+     exports: list[ExportDestination],
+     source_id: str,
+     dataset_name: str,
+ ) -> None:
+     """Run all export destinations for a source's records."""
+     for export in exports:
+         try:
+             if export.type == "file":
+                 await _export_file(records, export, source_id, dataset_name)
+             elif export.type == "webhook":
+                 await _export_webhook(records, export, source_id, dataset_name)
+         except Exception as e:
+             logger.error("Export %s failed for source %s: %s", export.type, source_id, e)
+
+
+ async def _export_file(
+     records: list[dict[str, Any]],
+     config: ExportDestination,
+     source_id: str,
+     dataset_name: str,
+ ) -> None:
+     """Write records to a file (JSON, JSONL, or CSV)."""
+     if not config.path or not records:
+         return
+
+     path = _expand_template(config.path, source_id, dataset_name)
+     parent = Path(path).parent
+     parent.mkdir(parents=True, exist_ok=True)
+
+     fmt = config.format.lower()
+
+     if fmt == "jsonl":
+         with open(path, "w", encoding="utf-8") as f:
+             for r in records:
+                 f.write(json.dumps(r, default=str, ensure_ascii=False) + "\n")
+     elif fmt == "json":
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump(records, f, default=str, ensure_ascii=False, indent=2)
+     elif fmt == "csv":
+         if not records:
+             return
+         fieldnames = list(records[0].keys())
+         with open(path, "w", newline="", encoding="utf-8") as f:
+             writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
+             writer.writeheader()
+             for r in records:
+                 writer.writerow({k: _csv_value(v) for k, v in r.items()})
+     else:
+         raise ValueError(f"Unsupported export format: {fmt}")
+
+     logger.info("Exported %d records to %s (%s)", len(records), path, fmt)
+
+
+ async def _export_webhook(
+     records: list[dict[str, Any]],
+     config: ExportDestination,
+     source_id: str,
+     dataset_name: str,
+ ) -> None:
+     """POST records to a webhook URL."""
+     if not config.url or not records:
+         return
+
+     import httpx
+
+     payload = {
+         "dataset": dataset_name,
+         "source": source_id,
+         "count": len(records),
+         "records": records,
+         "timestamp": datetime.now(UTC).isoformat(),
+     }
+
+     async with httpx.AsyncClient(timeout=30.0) as client:
+         resp = await client.post(
+             config.url,
+             json=payload,
+             headers=config.headers,
+         )
+         resp.raise_for_status()
+
+     logger.info("Exported %d records to webhook %s", len(records), config.url)
+
+
+ def _expand_template(path: str, source_id: str, dataset_name: str) -> str:
+     """Expand {{date}}, {{datetime}}, {{source}}, {{dataset}} placeholders."""
+     now = datetime.now(UTC)
+     return (
+         path.replace("{{date}}", now.strftime("%Y-%m-%d"))
+         .replace("{{datetime}}", now.strftime("%Y-%m-%dT%H%M%S"))
+         .replace("{{source}}", source_id)
+         .replace("{{dataset}}", dataset_name)
+     )
+
+
+ def _csv_value(v: Any) -> Any:
+     """Convert complex values to strings for CSV output."""
+     if isinstance(v, (dict, list)):
+         return json.dumps(v, default=str, ensure_ascii=False)
+     return v
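
A sketch of calling run_exports directly. The keyword arguments passed to ExportDestination mirror the attributes the exporters read (type, path, format, url, headers), but that model lives in anysite/dataset/models.py, which is not part of this diff, so its exact constructor is an assumption.

import asyncio

from anysite.dataset.exporters import run_exports
from anysite.dataset.models import ExportDestination

records = [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]

# Constructor fields assumed from the attributes read above (type/path/format/url/headers).
exports = [
    ExportDestination(type="file", path="out/{{dataset}}_{{date}}.jsonl", format="jsonl"),
    ExportDestination(type="webhook", url="https://example.com/hook", headers={}),
]

# Per-destination failures are logged rather than raised (see run_exports above).
asyncio.run(run_exports(records, exports, source_id="users", dataset_name="demo"))
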
anysite/dataset/history.py
@@ -0,0 +1,153 @@
+ """Dataset run history — SQLite-backed tracking and file-based logs."""
+
+ from __future__ import annotations
+
+ import logging
+ import sqlite3
+ from dataclasses import dataclass
+ from datetime import UTC, datetime
+ from pathlib import Path
+
+ from anysite.config.paths import get_config_dir
+
+ logger = logging.getLogger(__name__)
+
+ _DB_NAME = "dataset_history.db"
+ _LOG_DIR = "logs"
+
+
+ @dataclass
+ class RunRecord:
+     """A single dataset collection run."""
+
+     id: int | None = None
+     dataset_name: str = ""
+     status: str = "running"  # running | success | failed | partial
+     started_at: str = ""
+     finished_at: str | None = None
+     record_count: int = 0
+     source_count: int = 0
+     error: str | None = None
+     duration: float = 0.0
+
+
+ class HistoryStore:
+     """SQLite-backed run history at ~/.anysite/dataset_history.db."""
+
+     def __init__(self, db_path: Path | None = None) -> None:
+         self.db_path = db_path or (get_config_dir() / _DB_NAME)
+         self._ensure_table()
+
+     def _ensure_table(self) -> None:
+         self.db_path.parent.mkdir(parents=True, exist_ok=True)
+         with sqlite3.connect(str(self.db_path)) as conn:
+             conn.execute("""
+                 CREATE TABLE IF NOT EXISTS runs (
+                     id INTEGER PRIMARY KEY AUTOINCREMENT,
+                     dataset_name TEXT NOT NULL,
+                     status TEXT NOT NULL DEFAULT 'running',
+                     started_at TEXT NOT NULL,
+                     finished_at TEXT,
+                     record_count INTEGER DEFAULT 0,
+                     source_count INTEGER DEFAULT 0,
+                     error TEXT,
+                     duration REAL DEFAULT 0.0
+                 )
+             """)
+
+     def record_start(self, dataset_name: str) -> int:
+         """Record start of a collection run. Returns run ID."""
+         now = datetime.now(UTC).isoformat()
+         with sqlite3.connect(str(self.db_path)) as conn:
+             cursor = conn.execute(
+                 "INSERT INTO runs (dataset_name, status, started_at) VALUES (?, 'running', ?)",
+                 (dataset_name, now),
+             )
+             return cursor.lastrowid or 0
+
+     def record_finish(
+         self,
+         run_id: int,
+         *,
+         status: str = "success",
+         record_count: int = 0,
+         source_count: int = 0,
+         error: str | None = None,
+         duration: float = 0.0,
+     ) -> None:
+         """Record completion of a collection run."""
+         now = datetime.now(UTC).isoformat()
+         with sqlite3.connect(str(self.db_path)) as conn:
+             conn.execute(
+                 """UPDATE runs SET status=?, finished_at=?, record_count=?,
+                 source_count=?, error=?, duration=? WHERE id=?""",
+                 (status, now, record_count, source_count, error, duration, run_id),
+             )
+
+     def get_history(self, dataset_name: str, limit: int = 20) -> list[RunRecord]:
+         """Get recent runs for a dataset."""
+         with sqlite3.connect(str(self.db_path)) as conn:
+             conn.row_factory = sqlite3.Row
+             rows = conn.execute(
+                 "SELECT * FROM runs WHERE dataset_name=? ORDER BY id DESC LIMIT ?",
+                 (dataset_name, limit),
+             ).fetchall()
+             return [
+                 RunRecord(
+                     id=r["id"],
+                     dataset_name=r["dataset_name"],
+                     status=r["status"],
+                     started_at=r["started_at"],
+                     finished_at=r["finished_at"],
+                     record_count=r["record_count"],
+                     source_count=r["source_count"],
+                     error=r["error"],
+                     duration=r["duration"],
+                 )
+                 for r in rows
+             ]
+
+     def get_all_datasets(self) -> list[str]:
+         """Get list of all dataset names with history."""
+         with sqlite3.connect(str(self.db_path)) as conn:
+             rows = conn.execute(
+                 "SELECT DISTINCT dataset_name FROM runs ORDER BY dataset_name"
+             ).fetchall()
+             return [r[0] for r in rows]
+
+
+ class LogManager:
+     """File-based log storage at ~/.anysite/logs/."""
+
+     def __init__(self, log_dir: Path | None = None) -> None:
+         self.log_dir = log_dir or (get_config_dir() / _LOG_DIR)
+         self.log_dir.mkdir(parents=True, exist_ok=True)
+
+     def get_log_path(self, dataset_name: str, run_id: int) -> Path:
+         """Get the log file path for a specific run."""
+         return self.log_dir / f"{dataset_name}_{run_id}.log"
+
+     def create_handler(self, dataset_name: str, run_id: int) -> logging.FileHandler:
+         """Create a logging FileHandler for a run."""
+         path = self.get_log_path(dataset_name, run_id)
+         handler = logging.FileHandler(str(path))
+         handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
+         return handler
+
+     def read_log(self, dataset_name: str, run_id: int) -> str | None:
+         """Read a run's log file content."""
+         path = self.get_log_path(dataset_name, run_id)
+         if path.exists():
+             return path.read_text()
+         return None
+
+     def list_logs(self, dataset_name: str) -> list[tuple[int, Path]]:
+         """List available log files for a dataset."""
+         logs = []
+         for path in sorted(self.log_dir.glob(f"{dataset_name}_*.log")):
+             try:
+                 run_id = int(path.stem.split("_")[-1])
+                 logs.append((run_id, path))
+             except ValueError:
+                 continue
+         return logs
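
A sketch of the intended start/finish/log flow, using only the API defined above; the dataset name, counts, and the shape of the surrounding collection code are invented for illustration.

import logging
import time

from anysite.dataset.history import HistoryStore, LogManager

store = HistoryStore()   # ~/.anysite/dataset_history.db by default
logs = LogManager()      # ~/.anysite/logs/ by default

run_id = store.record_start("demo")
logging.getLogger("anysite").addHandler(logs.create_handler("demo", run_id))

started = time.monotonic()
try:
    # ... run the actual collection here ...
    store.record_finish(run_id, status="success", record_count=42,
                        source_count=3, duration=time.monotonic() - started)
except Exception as exc:
    store.record_finish(run_id, status="failed", error=str(exc),
                        duration=time.monotonic() - started)

for run in store.get_history("demo", limit=5):
    print(run.id, run.status, run.record_count, run.duration)
print(logs.read_log("demo", run_id))
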