PyPI - anysite-cli - Versions diffs - 0.1.2__tar.gz → 0.1.4__tar.gz - Mend

anysite-cli 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118) hide show

{anysite_cli-0.1.2 → anysite_cli-0.1.4}/CLAUDE.md RENAMED Viewed

@@ -53,6 +53,8 @@ anysite dataset history my-dataset
 anysite dataset logs my-dataset --run 42
 anysite dataset schedule dataset.yaml --incremental --load-db pg
 anysite dataset schedule dataset.yaml --systemd --load-db pg
+anysite dataset diff dataset.yaml --source profiles --key _input_value
+anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
 anysite dataset reset-cursor dataset.yaml
 anysite dataset reset-cursor dataset.yaml --source profiles
@@ -102,7 +104,8 @@ anysite db upsert mydb --table users --conflict-columns id --stdin
 - `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
 - `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
 - `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
-- `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `history`, `logs`, `schedule`, `reset-cursor`
+- `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters
+- `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `diff`, `history`, `logs`, `schedule`, `reset-cursor`
 - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
 - `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
 - `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
@@ -202,5 +205,5 @@ Tests are in `tests/` with subdirectories mirroring `src/anysite/`:
 - `test_streaming/` — Progress and writer
 - `test_output/` — Formatters and templates
 - `test_utils/` — Field selection and retry
-- `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications
+- `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications, differ
 - `test_db/` — Database adapters, schema inference, connection manager, operations

{anysite_cli-0.1.2 → anysite_cli-0.1.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: anysite-cli
-Version: 0.1.2
+Version: 0.1.4
 Summary: CLI for Anysite API - web data extraction for humans and AI agents
 Project-URL: Homepage, https://anysite.io
 Project-URL: Documentation, https://docs.anysite.io/cli
@@ -272,6 +272,7 @@ sources:
         - type: company
           value: "{value}"
       count: 5
+    refresh: always                       # Re-collect every run with --incremental
     db_load:
       fields: [name, url, headline]
@@ -327,6 +328,9 @@ anysite dataset logs my-dataset --run 42
 # Generate cron/systemd schedule
 anysite dataset schedule dataset.yaml --incremental --load-db pg
+# Compare snapshots (diff two collection dates)
+anysite dataset diff dataset.yaml --source employees --key _input_value
 # Reset incremental state
 anysite dataset reset-cursor dataset.yaml
 ```

{anysite_cli-0.1.2 → anysite_cli-0.1.4}/README.md RENAMED Viewed

@@ -209,6 +209,7 @@ sources:
         - type: company
           value: "{value}"
       count: 5
+    refresh: always                       # Re-collect every run with --incremental
     db_load:
       fields: [name, url, headline]
@@ -264,6 +265,9 @@ anysite dataset logs my-dataset --run 42
 # Generate cron/systemd schedule
 anysite dataset schedule dataset.yaml --incremental --load-db pg
+# Compare snapshots (diff two collection dates)
+anysite dataset diff dataset.yaml --source employees --key _input_value
 # Reset incremental state
 anysite dataset reset-cursor dataset.yaml
 ```

{anysite_cli-0.1.2 → anysite_cli-0.1.4}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "anysite-cli"
-version = "0.1.2"
+version = "0.1.4"
 description = "CLI for Anysite API - web data extraction for humans and AI agents"
 readme = "README.md"
 license = "MIT"

{anysite_cli-0.1.2 → anysite_cli-0.1.4}/skills/anysite-cli/SKILL.md RENAMED Viewed

@@ -133,6 +133,7 @@ sources:
       count: 5
     parallel: 3
     on_error: skip
+    refresh: always             # Re-collect every run even with --incremental
 storage:
   format: parquet
@@ -235,7 +236,17 @@ anysite api /api/linkedin/user user=satyanadella -q --format jsonl \
   | anysite db insert pg --table profiles --stdin --auto-create
 ```
-### Step 6: History, Scheduling, and Notifications
+### Step 6: Compare Snapshots
+```bash
+# Diff two most recent snapshots
+anysite dataset diff dataset.yaml --source employees --key _input_value
+# Diff specific dates, compare only certain fields
+anysite dataset diff dataset.yaml --source employees --key _input_value \
+  --from 2026-01-30 --to 2026-02-01 --fields "name,headline"
+```
+### Step 7: History, Scheduling, and Notifications
 ```bash
 # View run history
 anysite dataset history my-dataset

{anysite_cli-0.1.2 → anysite_cli-0.1.4}/skills/anysite-cli/references/dataset-guide.md RENAMED Viewed

@@ -24,6 +24,7 @@ sources:
     parallel: 3                # Concurrent requests
     rate_limit: "10/s"         # Rate limiting
     on_error: skip             # stop or skip
+    refresh: always            # auto (default) or always — re-collect every run
     transform:                 # Post-collection transform (for exports only)
       filter: '.count > 10'   # Safe filter expression
       fields: [name, url]     # Field selection with aliases
@@ -169,6 +170,25 @@ With `--incremental`:
 2. Dependent/from_file sources: skips individual input values already in `metadata.json`
 3. New values are still collected and tracked
+### Refresh Mode
+Per-source `refresh` field controls behavior with `--incremental`:
+```yaml
+- id: posts
+  endpoint: /api/linkedin/user/posts
+  dependency: { from_source: profiles, field: urn.value }
+  input_key: user
+  refresh: always    # Re-collect every run even with --incremental
+```
+| Setting | `--incremental` | No flag |
+|---------|----------------|---------|
+| `refresh: auto` (default) | Skip collected inputs | Collect all |
+| `refresh: always` | Collect all (ignore cache) | Collect all |
+Use `refresh: always` for sources with frequently changing data (e.g., posts, activity feeds) where you want fresh snapshots each run while still caching stable parent data.
 ### Storage Layout
 ```
@@ -428,6 +448,39 @@ Payload: `{event: "complete"|"failure", dataset, timestamp, record_count, source
 ---
+## Comparing Snapshots (Diff)
+Compare two collection snapshots to find added, removed, and changed records.
+```bash
+# Compare two most recent snapshots (auto-detect dates)
+anysite dataset diff dataset.yaml --source profiles --key _input_value
+# Compare specific dates
+anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+# Only compare specific fields
+anysite dataset diff dataset.yaml --source profiles --key urn --fields "name,headline,follower_count"
+# Output as JSON/CSV
+anysite dataset diff dataset.yaml --source profiles --key urn --format json --output diff.json
+```
+**Options:**
+- `--source, -s` (required) — source to compare
+- `--key, -k` (required) — field to match records by (e.g., `_input_value`, `urn`)
+- `--from` / `--to` — snapshot dates (default: two most recent)
+- `--fields, -f` — only compare these fields
+- `--format` — output format (table, json, jsonl, csv)
+- `--output, -o` — write to file
+**Output** shows summary counts and a table of changes:
+- **added** — records in the new snapshot but not the old
+- **removed** — records in the old snapshot but not the new
+- **changed** — records with the same key but different values (shows `old → new`)
+---
 ## Reset Incremental State
 Clear collected input tracking to force re-collection.

{anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/cli.py RENAMED Viewed

@@ -357,6 +357,10 @@ def load_db(
         bool,
         typer.Option("--quiet", "-q", help="Suppress progress output"),
     ] = False,
+    snapshot: Annotated[
+        str | None,
+        typer.Option("--snapshot", help="Load a specific snapshot date (YYYY-MM-DD)"),
+    ] = None,
 ) -> None:
     """Load collected Parquet data into a relational database with FK linking."""
     config = _load_config(config_path)
@@ -379,6 +383,7 @@ def load_db(
                 source_filter=source,
                 drop_existing=drop_existing,
                 dry_run=dry_run,
+                snapshot=snapshot,
             )
         except Exception as e:
             typer.echo(f"Load error: {e}", err=True)
@@ -413,6 +418,121 @@ def load_db(
         )
+@app.command("diff")
+def diff_cmd(
+    config_path: Annotated[
+        Path,
+        typer.Argument(help="Path to dataset.yaml"),
+    ],
+    source: Annotated[
+        str,
+        typer.Option("--source", "-s", help="Source to compare"),
+    ],
+    key: Annotated[
+        str,
+        typer.Option("--key", "-k", help="Field to match records by (e.g., _input_value, urn)"),
+    ],
+    from_date: Annotated[
+        str | None,
+        typer.Option("--from", help="Older snapshot date (YYYY-MM-DD)"),
+    ] = None,
+    to_date: Annotated[
+        str | None,
+        typer.Option("--to", help="Newer snapshot date (YYYY-MM-DD)"),
+    ] = None,
+    fields: Annotated[
+        str | None,
+        typer.Option("--fields", "-f", help="Only compare these fields (comma-separated)"),
+    ] = None,
+    format: Annotated[
+        str,
+        typer.Option("--format", help="Output format: table, json, jsonl, csv"),
+    ] = "table",
+    output: Annotated[
+        Path | None,
+        typer.Option("--output", "-o", help="Write output to file"),
+    ] = None,
+    quiet: Annotated[
+        bool,
+        typer.Option("--quiet", "-q", help="Suppress summary, only output data"),
+    ] = False,
+) -> None:
+    """Compare two snapshots of a source to show added, removed, and changed records."""
+    from datetime import date as date_type
+    from anysite.dataset.differ import (
+        DatasetDiffer,
+        format_diff_records,
+        format_diff_table,
+    )
+    config = _load_config(config_path)
+    # Validate source exists
+    src = config.get_source(source)
+    if src is None:
+        typer.echo(f"Error: source '{source}' not found in dataset", err=True)
+        raise typer.Exit(1)
+    differ = DatasetDiffer(config.storage_path())
+    # Parse dates
+    parsed_from = None
+    parsed_to = None
+    try:
+        if from_date:
+            parsed_from = date_type.fromisoformat(from_date)
+        if to_date:
+            parsed_to = date_type.fromisoformat(to_date)
+    except ValueError as e:
+        typer.echo(f"Error: invalid date format: {e}", err=True)
+        raise typer.Exit(1) from None
+    # Parse fields
+    field_list = None
+    if fields:
+        field_list = [f.strip() for f in fields.split(",") if f.strip()]
+    try:
+        result = differ.diff(
+            source,
+            key,
+            from_date=parsed_from,
+            to_date=parsed_to,
+            fields=field_list,
+        )
+    except DatasetError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+    # Print summary unless quiet
+    if not quiet:
+        console = Console()
+        console.print(
+            f"\n[bold]Diff: {source}[/bold] "
+            f"({result.from_date.isoformat()} → {result.to_date.isoformat()})\n"
+        )
+        console.print(f"  [green]Added:[/green]     {len(result.added)}")
+        console.print(f"  [red]Removed:[/red]   {len(result.removed)}")
+        console.print(f"  [yellow]Changed:[/yellow]   {len(result.changed)}")
+        console.print(f"  Unchanged: {result.unchanged_count}")
+        console.print()
+    if not result.has_changes:
+        if not quiet:
+            Console().print("[dim]No changes detected.[/dim]")
+        return
+    # Format and output
+    rows = (
+        format_diff_table(result, output_fields=field_list)
+        if format == "table"
+        else format_diff_records(result, output_fields=field_list)
+    )
+    _output_results(rows, format, output)
 @app.command("history")
 def history(
     name: Annotated[

{anysite_cli-0.1.2 → anysite_cli-0.1.4}/src/anysite/dataset/collector.py RENAMED Viewed

@@ -43,6 +43,7 @@ class CollectionPlan:
         params: dict[str, Any] | None = None,
         dependency: str | None = None,
         estimated_requests: int | None = None,
+        refresh: str = "auto",
     ) -> None:
         self.steps.append({
             "source": source_id,
@@ -51,6 +52,7 @@ class CollectionPlan:
             "params": params or {},
             "dependency": dependency,
             "estimated_requests": estimated_requests,
+            "refresh": refresh,
         })
@@ -116,8 +118,8 @@ async def collect_dataset(
     try:
         for source in ordered:
-            # Check incremental skip
-            if incremental:
+            # Check incremental skip (refresh: always bypasses this)
+            if incremental and source.refresh != "always":
                 parquet_path = get_parquet_path(base_path, source.id, today)
                 if parquet_path.exists():
                     if not quiet:
@@ -276,8 +278,8 @@ async def _collect_from_file(
             print_warning(f"No values extracted from {file_path}")
         return []
-    # Filter already-collected inputs in incremental mode
-    if incremental and metadata:
+    # Filter already-collected inputs in incremental mode (refresh: always bypasses)
+    if incremental and source.refresh != "always" and metadata:
         already = metadata.get_collected_inputs(source.id)
         if already:
             original = len(values)
@@ -432,8 +434,8 @@ async def _collect_dependent(
             f"Source {source.id} has a dependency but no input_key defined"
         )
-    # Filter already-collected inputs in incremental mode
-    if incremental and metadata:
+    # Filter already-collected inputs in incremental mode (refresh: always bypasses)
+    if incremental and source.refresh != "always" and metadata:
         already = metadata.get_collected_inputs(source.id)
         if already:
             original = len(values)
@@ -579,7 +581,7 @@ def _build_plan(
     plan = CollectionPlan()
     for source in ordered:
-        if incremental:
+        if incremental and source.refresh != "always":
             parquet_path = get_parquet_path(base_path, source.id, today)
             if parquet_path.exists():
                 continue
@@ -592,6 +594,7 @@ def _build_plan(
                 kind="from_file",
                 params={"file": source.from_file, "field": source.file_field},
                 estimated_requests=est,
+                refresh=source.refresh,
             )
         elif source.dependency is None:
             plan.add_step(
@@ -600,6 +603,7 @@ def _build_plan(
                 kind="independent",
                 params=source.params,
                 estimated_requests=1,
+                refresh=source.refresh,
             )
         else:
             est = _count_dependent_inputs(source, base_path, metadata)
@@ -609,6 +613,7 @@ def _build_plan(
                 kind="dependent",
                 dependency=source.dependency.from_source,
                 estimated_requests=est,
+                refresh=source.refresh,
             )
     return plan
@@ -665,11 +670,14 @@ def _print_plan(plan: CollectionPlan) -> dict[str, int]:
     table.add_column("Est. Requests")
     for i, step in enumerate(plan.steps, 1):
+        kind = step["kind"]
+        if step.get("refresh") == "always":
+            kind += " (refresh)"
         table.add_row(
             str(i),
             step["source"],
             step["endpoint"],
-            step["kind"],
+            kind,
             step.get("dependency") or "-",
             str(step.get("estimated_requests") or "?"),
         )

anysite-cli 0.1.2__tar.gz → 0.1.4__tar.gz

anysite-cli 0.1.2__tar.gz → 0.1.4__tar.gz