anysite-cli 0.1.2 (anysite_cli-0.1.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. anysite/__init__.py +4 -0
  2. anysite/__main__.py +6 -0
  3. anysite/api/__init__.py +21 -0
  4. anysite/api/client.py +271 -0
  5. anysite/api/errors.py +137 -0
  6. anysite/api/schemas.py +333 -0
  7. anysite/batch/__init__.py +1 -0
  8. anysite/batch/executor.py +176 -0
  9. anysite/batch/input.py +160 -0
  10. anysite/batch/rate_limiter.py +98 -0
  11. anysite/cli/__init__.py +1 -0
  12. anysite/cli/config.py +176 -0
  13. anysite/cli/executor.py +388 -0
  14. anysite/cli/options.py +249 -0
  15. anysite/config/__init__.py +11 -0
  16. anysite/config/paths.py +46 -0
  17. anysite/config/settings.py +187 -0
  18. anysite/dataset/__init__.py +37 -0
  19. anysite/dataset/analyzer.py +268 -0
  20. anysite/dataset/cli.py +644 -0
  21. anysite/dataset/collector.py +686 -0
  22. anysite/dataset/db_loader.py +248 -0
  23. anysite/dataset/errors.py +30 -0
  24. anysite/dataset/exporters.py +121 -0
  25. anysite/dataset/history.py +153 -0
  26. anysite/dataset/models.py +245 -0
  27. anysite/dataset/notifications.py +87 -0
  28. anysite/dataset/scheduler.py +107 -0
  29. anysite/dataset/storage.py +171 -0
  30. anysite/dataset/transformer.py +213 -0
  31. anysite/db/__init__.py +38 -0
  32. anysite/db/adapters/__init__.py +1 -0
  33. anysite/db/adapters/base.py +158 -0
  34. anysite/db/adapters/postgres.py +201 -0
  35. anysite/db/adapters/sqlite.py +183 -0
  36. anysite/db/cli.py +709 -0
  37. anysite/db/config.py +92 -0
  38. anysite/db/manager.py +166 -0
  39. anysite/db/operations/__init__.py +1 -0
  40. anysite/db/operations/insert.py +199 -0
  41. anysite/db/operations/query.py +43 -0
  42. anysite/db/schema/__init__.py +1 -0
  43. anysite/db/schema/inference.py +213 -0
  44. anysite/db/schema/types.py +71 -0
  45. anysite/db/utils/__init__.py +1 -0
  46. anysite/db/utils/sanitize.py +99 -0
  47. anysite/main.py +498 -0
  48. anysite/models/__init__.py +1 -0
  49. anysite/output/__init__.py +11 -0
  50. anysite/output/console.py +45 -0
  51. anysite/output/formatters.py +301 -0
  52. anysite/output/templates.py +76 -0
  53. anysite/py.typed +0 -0
  54. anysite/streaming/__init__.py +1 -0
  55. anysite/streaming/progress.py +121 -0
  56. anysite/streaming/writer.py +130 -0
  57. anysite/utils/__init__.py +1 -0
  58. anysite/utils/fields.py +242 -0
  59. anysite/utils/retry.py +109 -0
  60. anysite_cli-0.1.2.dist-info/METADATA +455 -0
  61. anysite_cli-0.1.2.dist-info/RECORD +64 -0
  62. anysite_cli-0.1.2.dist-info/WHEEL +4 -0
  63. anysite_cli-0.1.2.dist-info/entry_points.txt +2 -0
  64. anysite_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
anysite/dataset/db_loader.py
@@ -0,0 +1,248 @@
+ """Load dataset Parquet data into a relational database with FK linking."""
+
+ from __future__ import annotations
+
+ import json
+ from typing import Any
+
+ from anysite.dataset.models import DatasetConfig, DatasetSource
+ from anysite.dataset.storage import get_source_dir, read_parquet
+ from anysite.db.adapters.base import DatabaseAdapter
+ from anysite.db.schema.inference import infer_table_schema
+
+
+ def _get_dialect(adapter: DatabaseAdapter) -> str:
+     """Extract dialect string from adapter server info."""
+     info = adapter.get_server_info()
+     return info.get("type", "sqlite")
+
+
+ def _extract_dot_value(record: dict[str, Any], dot_path: str) -> Any:
+     """Extract a value from a record using dot notation.
+
+     Handles JSON strings stored in Parquet: if a field value is a JSON
+     string, it is parsed and the remainder of the dot path traversed.
+     """
+     parts = dot_path.split(".")
+     current: Any = record
+
+     for part in parts:
+         if isinstance(current, str):
+             try:
+                 current = json.loads(current)
+             except (json.JSONDecodeError, ValueError):
+                 return None
+
+         if isinstance(current, dict):
+             current = current.get(part)
+         else:
+             return None
+
+         if current is None:
+             return None
+
+     return current
+
+
+ def _table_name_for(source: DatasetSource) -> str:
+     """Get the DB table name for a source."""
+     if source.db_load and source.db_load.table:
+         return source.db_load.table
+     return source.id.replace("-", "_").replace(".", "_")
+
+
+ def _filter_record(
+     record: dict[str, Any],
+     source: DatasetSource,
+ ) -> dict[str, Any]:
+     """Filter and transform a record based on db_load config.
+
+     Applies field selection/exclusion and dot-notation extraction.
+     """
+     db_load = source.db_load
+     exclude = set(db_load.exclude) if db_load else {"_input_value", "_parent_source"}
+
+     if db_load and db_load.fields:
+         # Explicit field list — extract each field
+         row: dict[str, Any] = {}
+         for field_spec in db_load.fields:
+             # Parse "source_field AS alias" syntax
+             alias = None
+             upper = field_spec.upper()
+             as_idx = upper.find(" AS ")
+             if as_idx != -1:
+                 alias = field_spec[as_idx + 4:].strip()
+                 field_spec = field_spec[:as_idx].strip()
+
+             col_name = alias or field_spec.replace(".", "_")
+
+             if "." in field_spec:
+                 row[col_name] = _extract_dot_value(record, field_spec)
+             else:
+                 row[col_name] = record.get(field_spec)
+         return row
+     else:
+         # All fields minus exclusions
+         return {k: v for k, v in record.items() if k not in exclude}
+
+
+ class DatasetDbLoader:
+     """Load dataset Parquet data into a relational database.
+
+     Handles:
+     - Schema inference from Parquet records
+     - Auto-increment primary keys (``id`` column)
+     - Foreign key linking via provenance ``_input_value`` column
+     - Dot-notation field extraction for JSON columns
+     - Topological loading order (parents before children)
+     """
+
+     def __init__(
+         self,
+         config: DatasetConfig,
+         adapter: DatabaseAdapter,
+     ) -> None:
+         self.config = config
+         self.adapter = adapter
+         self.base_path = config.storage_path()
+         self._dialect = _get_dialect(adapter)
+         # Maps source_id -> {input_value -> db_id} for FK linking
+         self._value_to_id: dict[str, dict[str, int]] = {}
+
+     def load_all(
+         self,
+         *,
+         source_filter: str | None = None,
+         drop_existing: bool = False,
+         dry_run: bool = False,
+     ) -> dict[str, int]:
+         """Load all sources into the database in dependency order.
+
+         Args:
+             source_filter: Only load this source (and dependencies).
+             drop_existing: Drop tables before creating.
+             dry_run: Show plan without executing.
+
+         Returns:
+             Mapping of source_id to number of rows loaded.
+         """
+         sources = self.config.topological_sort()
+
+         if source_filter:
+             from anysite.dataset.collector import _filter_sources
+             sources = _filter_sources(sources, source_filter, self.config)
+
+         results: dict[str, int] = {}
+
+         for source in sources:
+             count = self._load_source(
+                 source,
+                 drop_existing=drop_existing,
+                 dry_run=dry_run,
+             )
+             results[source.id] = count
+
+         return results
+
+     def _load_source(
+         self,
+         source: DatasetSource,
+         *,
+         drop_existing: bool = False,
+         dry_run: bool = False,
+     ) -> int:
+         """Load a single source into the database."""
+         source_dir = get_source_dir(self.base_path, source.id)
+         if not source_dir.exists() or not any(source_dir.glob("*.parquet")):
+             return 0
+
+         raw_records = read_parquet(source_dir)
+         if not raw_records:
+             return 0
+
+         table_name = _table_name_for(source)
+
+         # Determine parent info for FK linking
+         parent_source_id = None
+         parent_fk_col = None
+         if source.dependency:
+             parent_source_id = source.dependency.from_source
+             parent_fk_col = f"{parent_source_id.replace('-', '_').replace('.', '_')}_id"
+
+         # Transform records
+         rows: list[dict[str, Any]] = []
+         for record in raw_records:
+             row = _filter_record(record, source)
+
+             # Add FK column if this is a dependent source
+             if parent_source_id and parent_fk_col:
+                 input_val = record.get("_input_value")
+                 parent_map = self._value_to_id.get(parent_source_id, {})
+                 if input_val is not None and str(input_val) in parent_map:
+                     row[parent_fk_col] = parent_map[str(input_val)]
+                 else:
+                     row[parent_fk_col] = None
+
+             rows.append(row)
+
+         if dry_run:
+             return len(rows)
+
+         # Determine the lookup field for children to reference this source
+         # This is the field that child dependencies extract from this source
+         lookup_field = self._get_child_lookup_field(source)
+
+         # Create table
+         if drop_existing and self.adapter.table_exists(table_name):
+             self.adapter.execute(f"DROP TABLE {table_name}")
+
+         if not self.adapter.table_exists(table_name):
+             schema = infer_table_schema(table_name, rows)
+             sql_types = schema.to_sql_types(self._dialect)
+             # Add auto-increment id column
+             col_defs = {"id": self._auto_id_type()}
+             col_defs.update(sql_types)
+             self.adapter.create_table(table_name, col_defs, primary_key="id")
+
+         # Insert rows one at a time to capture auto-increment IDs for FK mapping
+         value_map: dict[str, int] = {}
+         for i, row in enumerate(rows):
+             self.adapter.insert_batch(table_name, [row])
+             # Get the last inserted id
+             last_id = self._get_last_id(table_name)
+
+             # Build value→id map for child sources
+             if lookup_field and last_id is not None:
+                 raw_record = raw_records[i]
+                 lookup_val = _extract_dot_value(raw_record, lookup_field)
+                 if lookup_val is None:
+                     lookup_val = raw_record.get(lookup_field)
+                 if lookup_val is not None:
+                     value_map[str(lookup_val)] = last_id
+
+         if value_map:
+             self._value_to_id[source.id] = value_map
+
+         return len(rows)
+
+     def _get_child_lookup_field(self, source: DatasetSource) -> str | None:
+         """Find which field children use to reference this source."""
+         for other in self.config.sources:
+             if other.dependency and other.dependency.from_source == source.id:
+                 return other.dependency.field
+         return None
+
+     def _auto_id_type(self) -> str:
+         """Get the auto-increment ID column type for the dialect."""
+         if self._dialect == "postgres":
+             return "SERIAL"
+         return "INTEGER"
+
+     def _get_last_id(self, table_name: str) -> int | None:
+         """Get the last inserted auto-increment ID."""
+         row = self.adapter.fetch_one(
+             f"SELECT MAX(id) as last_id FROM {table_name}"
+         )
+         if row:
+             return row.get("last_id")
+         return None
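
For orientation, a minimal sketch of how the loader above is driven. Only DatasetDbLoader, its constructor, and load_all are taken from this diff; the DatasetConfig.from_yaml factory and the SQLiteAdapter("dataset.db") constructor are assumptions about APIs defined in other modules of this package.

# Sketch only: from_yaml and SQLiteAdapter's signature are assumptions;
# DatasetDbLoader(config, adapter) and load_all(...) match the diff above.
from anysite.dataset.db_loader import DatasetDbLoader
from anysite.dataset.models import DatasetConfig
from anysite.db.adapters.sqlite import SQLiteAdapter

config = DatasetConfig.from_yaml("dataset.yml")   # assumed factory
adapter = SQLiteAdapter("dataset.db")             # assumed constructor

loader = DatasetDbLoader(config, adapter)

# Dry run: returns {source_id: row_count} without creating tables or inserting.
print(loader.load_all(dry_run=True))

# Real load: tables are dropped and recreated, and parents load before children
# so the provenance-based foreign keys can resolve via _input_value.
counts = loader.load_all(drop_existing=True)
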
anysite/dataset/errors.py
@@ -0,0 +1,30 @@
+ """Dataset-specific error classes."""
+
+ from anysite.api.errors import AnysiteError
+
+
+ class DatasetError(AnysiteError):
+     """Base error for dataset operations."""
+
+     def __init__(self, message: str) -> None:
+         super().__init__(message)
+
+
+ class CircularDependencyError(DatasetError):
+     """Raised when source dependencies form a cycle."""
+
+     def __init__(self, sources: list[str]) -> None:
+         self.sources = sources
+         cycle = " -> ".join(sources)
+         super().__init__(f"Circular dependency detected: {cycle}")
+
+
+ class SourceNotFoundError(DatasetError):
+     """Raised when a dependency references a non-existent source."""
+
+     def __init__(self, source_id: str, referenced_by: str) -> None:
+         self.source_id = source_id
+         self.referenced_by = referenced_by
+         super().__init__(
+             f"Source '{source_id}' referenced by '{referenced_by}' does not exist"
+         )
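
The messages and attributes these exceptions carry are easiest to see by constructing them directly; the source IDs below are made up for illustration.

from anysite.dataset.errors import CircularDependencyError, SourceNotFoundError

err = CircularDependencyError(["posts", "comments", "posts"])
print(err)          # "Circular dependency detected: posts -> comments -> posts"
print(err.sources)  # ['posts', 'comments', 'posts']

missing = SourceNotFoundError("authors", "posts")
print(missing)      # "Source 'authors' referenced by 'posts' does not exist"
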
anysite/dataset/exporters.py
@@ -0,0 +1,121 @@
+ """Export destinations — file and webhook exporters for per-source output.
+
+ These run after Parquet write as optional supplementary exports.
+ """
+
+ from __future__ import annotations
+
+ import csv
+ import json
+ import logging
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Any
+
+ from anysite.dataset.models import ExportDestination
+
+ logger = logging.getLogger(__name__)
+
+
+ async def run_exports(
+     records: list[dict[str, Any]],
+     exports: list[ExportDestination],
+     source_id: str,
+     dataset_name: str,
+ ) -> None:
+     """Run all export destinations for a source's records."""
+     for export in exports:
+         try:
+             if export.type == "file":
+                 await _export_file(records, export, source_id, dataset_name)
+             elif export.type == "webhook":
+                 await _export_webhook(records, export, source_id, dataset_name)
+         except Exception as e:
+             logger.error("Export %s failed for source %s: %s", export.type, source_id, e)
+
+
+ async def _export_file(
+     records: list[dict[str, Any]],
+     config: ExportDestination,
+     source_id: str,
+     dataset_name: str,
+ ) -> None:
+     """Write records to a file (JSON, JSONL, or CSV)."""
+     if not config.path or not records:
+         return
+
+     path = _expand_template(config.path, source_id, dataset_name)
+     parent = Path(path).parent
+     parent.mkdir(parents=True, exist_ok=True)
+
+     fmt = config.format.lower()
+
+     if fmt == "jsonl":
+         with open(path, "w", encoding="utf-8") as f:
+             for r in records:
+                 f.write(json.dumps(r, default=str, ensure_ascii=False) + "\n")
+     elif fmt == "json":
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump(records, f, default=str, ensure_ascii=False, indent=2)
+     elif fmt == "csv":
+         if not records:
+             return
+         fieldnames = list(records[0].keys())
+         with open(path, "w", newline="", encoding="utf-8") as f:
+             writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
+             writer.writeheader()
+             for r in records:
+                 writer.writerow({k: _csv_value(v) for k, v in r.items()})
+     else:
+         raise ValueError(f"Unsupported export format: {fmt}")
+
+     logger.info("Exported %d records to %s (%s)", len(records), path, fmt)
+
+
+ async def _export_webhook(
+     records: list[dict[str, Any]],
+     config: ExportDestination,
+     source_id: str,
+     dataset_name: str,
+ ) -> None:
+     """POST records to a webhook URL."""
+     if not config.url or not records:
+         return
+
+     import httpx
+
+     payload = {
+         "dataset": dataset_name,
+         "source": source_id,
+         "count": len(records),
+         "records": records,
+         "timestamp": datetime.now(UTC).isoformat(),
+     }
+
+     async with httpx.AsyncClient(timeout=30.0) as client:
+         resp = await client.post(
+             config.url,
+             json=payload,
+             headers=config.headers,
+         )
+         resp.raise_for_status()
+
+     logger.info("Exported %d records to webhook %s", len(records), config.url)
+
+
+ def _expand_template(path: str, source_id: str, dataset_name: str) -> str:
+     """Expand {{date}}, {{datetime}}, {{source}}, {{dataset}} placeholders."""
+     now = datetime.now(UTC)
+     return (
+         path.replace("{{date}}", now.strftime("%Y-%m-%d"))
+         .replace("{{datetime}}", now.strftime("%Y-%m-%dT%H%M%S"))
+         .replace("{{source}}", source_id)
+         .replace("{{dataset}}", dataset_name)
+     )
+
+
+ def _csv_value(v: Any) -> Any:
+     """Convert complex values to strings for CSV output."""
+     if isinstance(v, (dict, list)):
+         return json.dumps(v, default=str, ensure_ascii=False)
+     return v
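
A sketch of calling run_exports directly. The keyword arguments passed to ExportDestination mirror the attributes the exporters read (type, path, format, url, headers), but that model lives in anysite/dataset/models.py, which is not part of this diff, so its exact constructor is an assumption.

import asyncio

from anysite.dataset.exporters import run_exports
from anysite.dataset.models import ExportDestination

records = [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]

# Constructor fields assumed from the attributes read above (type/path/format/url/headers).
exports = [
    ExportDestination(type="file", path="out/{{dataset}}_{{date}}.jsonl", format="jsonl"),
    ExportDestination(type="webhook", url="https://example.com/hook", headers={}),
]

# Per-destination failures are logged rather than raised (see run_exports above).
asyncio.run(run_exports(records, exports, source_id="users", dataset_name="demo"))
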
anysite/dataset/history.py
@@ -0,0 +1,153 @@
+ """Dataset run history — SQLite-backed tracking and file-based logs."""
+
+ from __future__ import annotations
+
+ import logging
+ import sqlite3
+ from dataclasses import dataclass
+ from datetime import UTC, datetime
+ from pathlib import Path
+
+ from anysite.config.paths import get_config_dir
+
+ logger = logging.getLogger(__name__)
+
+ _DB_NAME = "dataset_history.db"
+ _LOG_DIR = "logs"
+
+
+ @dataclass
+ class RunRecord:
+     """A single dataset collection run."""
+
+     id: int | None = None
+     dataset_name: str = ""
+     status: str = "running"  # running | success | failed | partial
+     started_at: str = ""
+     finished_at: str | None = None
+     record_count: int = 0
+     source_count: int = 0
+     error: str | None = None
+     duration: float = 0.0
+
+
+ class HistoryStore:
+     """SQLite-backed run history at ~/.anysite/dataset_history.db."""
+
+     def __init__(self, db_path: Path | None = None) -> None:
+         self.db_path = db_path or (get_config_dir() / _DB_NAME)
+         self._ensure_table()
+
+     def _ensure_table(self) -> None:
+         self.db_path.parent.mkdir(parents=True, exist_ok=True)
+         with sqlite3.connect(str(self.db_path)) as conn:
+             conn.execute("""
+                 CREATE TABLE IF NOT EXISTS runs (
+                     id INTEGER PRIMARY KEY AUTOINCREMENT,
+                     dataset_name TEXT NOT NULL,
+                     status TEXT NOT NULL DEFAULT 'running',
+                     started_at TEXT NOT NULL,
+                     finished_at TEXT,
+                     record_count INTEGER DEFAULT 0,
+                     source_count INTEGER DEFAULT 0,
+                     error TEXT,
+                     duration REAL DEFAULT 0.0
+                 )
+             """)
+
+     def record_start(self, dataset_name: str) -> int:
+         """Record start of a collection run. Returns run ID."""
+         now = datetime.now(UTC).isoformat()
+         with sqlite3.connect(str(self.db_path)) as conn:
+             cursor = conn.execute(
+                 "INSERT INTO runs (dataset_name, status, started_at) VALUES (?, 'running', ?)",
+                 (dataset_name, now),
+             )
+             return cursor.lastrowid or 0
+
+     def record_finish(
+         self,
+         run_id: int,
+         *,
+         status: str = "success",
+         record_count: int = 0,
+         source_count: int = 0,
+         error: str | None = None,
+         duration: float = 0.0,
+     ) -> None:
+         """Record completion of a collection run."""
+         now = datetime.now(UTC).isoformat()
+         with sqlite3.connect(str(self.db_path)) as conn:
+             conn.execute(
+                 """UPDATE runs SET status=?, finished_at=?, record_count=?,
+                 source_count=?, error=?, duration=? WHERE id=?""",
+                 (status, now, record_count, source_count, error, duration, run_id),
+             )
+
+     def get_history(self, dataset_name: str, limit: int = 20) -> list[RunRecord]:
+         """Get recent runs for a dataset."""
+         with sqlite3.connect(str(self.db_path)) as conn:
+             conn.row_factory = sqlite3.Row
+             rows = conn.execute(
+                 "SELECT * FROM runs WHERE dataset_name=? ORDER BY id DESC LIMIT ?",
+                 (dataset_name, limit),
+             ).fetchall()
+             return [
+                 RunRecord(
+                     id=r["id"],
+                     dataset_name=r["dataset_name"],
+                     status=r["status"],
+                     started_at=r["started_at"],
+                     finished_at=r["finished_at"],
+                     record_count=r["record_count"],
+                     source_count=r["source_count"],
+                     error=r["error"],
+                     duration=r["duration"],
+                 )
+                 for r in rows
+             ]
+
+     def get_all_datasets(self) -> list[str]:
+         """Get list of all dataset names with history."""
+         with sqlite3.connect(str(self.db_path)) as conn:
+             rows = conn.execute(
+                 "SELECT DISTINCT dataset_name FROM runs ORDER BY dataset_name"
+             ).fetchall()
+             return [r[0] for r in rows]
+
+
+ class LogManager:
+     """File-based log storage at ~/.anysite/logs/."""
+
+     def __init__(self, log_dir: Path | None = None) -> None:
+         self.log_dir = log_dir or (get_config_dir() / _LOG_DIR)
+         self.log_dir.mkdir(parents=True, exist_ok=True)
+
+     def get_log_path(self, dataset_name: str, run_id: int) -> Path:
+         """Get the log file path for a specific run."""
+         return self.log_dir / f"{dataset_name}_{run_id}.log"
+
+     def create_handler(self, dataset_name: str, run_id: int) -> logging.FileHandler:
+         """Create a logging FileHandler for a run."""
+         path = self.get_log_path(dataset_name, run_id)
+         handler = logging.FileHandler(str(path))
+         handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
+         return handler
+
+     def read_log(self, dataset_name: str, run_id: int) -> str | None:
+         """Read a run's log file content."""
+         path = self.get_log_path(dataset_name, run_id)
+         if path.exists():
+             return path.read_text()
+         return None
+
+     def list_logs(self, dataset_name: str) -> list[tuple[int, Path]]:
+         """List available log files for a dataset."""
+         logs = []
+         for path in sorted(self.log_dir.glob(f"{dataset_name}_*.log")):
+             try:
+                 run_id = int(path.stem.split("_")[-1])
+                 logs.append((run_id, path))
+             except ValueError:
+                 continue
+         return logs
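
A sketch of the intended start/finish/log flow, using only the API defined above; the dataset name, counts, and the shape of the surrounding collection code are invented for illustration.

import logging
import time

from anysite.dataset.history import HistoryStore, LogManager

store = HistoryStore()   # ~/.anysite/dataset_history.db by default
logs = LogManager()      # ~/.anysite/logs/ by default

run_id = store.record_start("demo")
logging.getLogger("anysite").addHandler(logs.create_handler("demo", run_id))

started = time.monotonic()
try:
    # ... run the actual collection here ...
    store.record_finish(run_id, status="success", record_count=42,
                        source_count=3, duration=time.monotonic() - started)
except Exception as exc:
    store.record_finish(run_id, status="failed", error=str(exc),
                        duration=time.monotonic() - started)

for run in store.get_history("demo", limit=5):
    print(run.id, run.status, run.record_count, run.duration)
print(logs.read_log("demo", run_id))
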