anysite-cli 0.1.1__tar.gz → 0.1.3__tar.gz
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/.gitignore +1 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/CLAUDE.md +5 -2
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/PKG-INFO +25 -3
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/README.md +18 -2
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/pyproject.toml +8 -1
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/skills/anysite-cli/SKILL.md +25 -3
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/skills/anysite-cli/references/dataset-guide.md +61 -1
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/cli.py +111 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/collector.py +16 -8
- anysite_cli-0.1.3/src/anysite/dataset/differ.py +355 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/models.py +4 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/cli.py +22 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_collector.py +191 -0
- anysite_cli-0.1.3/tests/test_dataset/test_differ.py +338 -0
- anysite_cli-0.1.3/tests/test_dataset/test_integration_csv.py +291 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/.claude/settings.local.json +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/LICENSE +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/skills/anysite-cli/references/api-reference.md +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/__main__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/api/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/api/client.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/api/errors.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/api/schemas.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/batch/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/batch/executor.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/batch/input.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/batch/rate_limiter.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/cli/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/cli/config.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/cli/executor.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/cli/options.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/config/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/config/paths.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/config/settings.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/analyzer.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/db_loader.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/errors.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/exporters.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/history.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/notifications.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/scheduler.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/storage.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/transformer.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/adapters/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/adapters/base.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/adapters/postgres.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/adapters/sqlite.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/config.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/manager.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/operations/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/operations/insert.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/operations/query.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/schema/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/schema/inference.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/schema/types.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/utils/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/utils/sanitize.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/main.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/models/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/output/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/output/console.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/output/formatters.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/output/templates.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/py.typed +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/streaming/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/streaming/progress.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/streaming/writer.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/utils/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/utils/fields.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/utils/retry.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/enriched_partners_sample_10.csv +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/linkedin-partners/company_aliases.txt +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/linkedin-partners/dataset.yaml +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/partners-deep/dataset.yaml +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/partners-intel/dataset.yaml +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/partners-linkedin/company_aliases.txt +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/partners-linkedin/dataset.yaml +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/partners-pipeline/dataset.yaml +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/conftest.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_api/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_batch/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_batch/test_executor.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_batch/test_input.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_batch/test_rate_limiter.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_cli/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_cli/test_main.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_analyzer.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_db_loader.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_exporters.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_history.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_models.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_notifications.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_scheduler.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_storage.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_transformer.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_cli.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_config.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_inference.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_insert.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_manager.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_postgres_adapter.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_sanitize.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_sqlite_adapter.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_output/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_output/test_formatters.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_output/test_templates.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_streaming/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_streaming/test_progress.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_streaming/test_writer.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_utils/__init__.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_utils/test_fields.py +0 -0
- {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_utils/test_retry.py +0 -0
{anysite_cli-0.1.1 → anysite_cli-0.1.3}/CLAUDE.md

@@ -53,6 +53,8 @@ anysite dataset history my-dataset
 anysite dataset logs my-dataset --run 42
 anysite dataset schedule dataset.yaml --incremental --load-db pg
 anysite dataset schedule dataset.yaml --systemd --load-db pg
+anysite dataset diff dataset.yaml --source profiles --key _input_value
+anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
 anysite dataset reset-cursor dataset.yaml
 anysite dataset reset-cursor dataset.yaml --source profiles
 

@@ -102,7 +104,8 @@ anysite db upsert mydb --table users --conflict-columns id --stdin
 - `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
 - `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
 - `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
-- `dataset/
+- `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters
+- `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `diff`, `history`, `logs`, `schedule`, `reset-cursor`
 - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
 - `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
 - `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
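The new `dataset/differ.py` described above compares Parquet snapshots with DuckDB, but the module body is not reproduced in this diff. As a rough illustration of the approach, a minimal sketch — a hypothetical helper with an assumed query shape, not the actual `DatasetDiffer` implementation:

```python
import duckdb


def diff_snapshots(old_path: str, new_path: str, key: str) -> dict[str, list]:
    """Hypothetical sketch: join two Parquet snapshots on a key column."""
    con = duckdb.connect()  # in-memory DuckDB database
    # Keys present only in the new snapshot -> "added".
    added = con.execute(
        f"SELECT n.* FROM read_parquet('{new_path}') n "
        f'LEFT JOIN read_parquet(\'{old_path}\') o ON n."{key}" = o."{key}" '
        f'WHERE o."{key}" IS NULL'
    ).fetchall()
    # Keys present only in the old snapshot -> "removed".
    removed = con.execute(
        f"SELECT o.* FROM read_parquet('{old_path}') o "
        f'LEFT JOIN read_parquet(\'{new_path}\') n ON o."{key}" = n."{key}" '
        f'WHERE n."{key}" IS NULL'
    ).fetchall()
    return {"added": added, "removed": removed}
```

Detecting "changed" records would additionally compare the remaining columns for keys present in both snapshots.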
@@ -202,5 +205,5 @@ Tests are in `tests/` with subdirectories mirroring `src/anysite/`:
 - `test_streaming/` — Progress and writer
 - `test_output/` — Formatters and templates
 - `test_utils/` — Field selection and retry
-- `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications
+- `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications, differ
 - `test_db/` — Database adapters, schema inference, connection manager, operations
{anysite_cli-0.1.1 → anysite_cli-0.1.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: anysite-cli
-Version: 0.1.1
+Version: 0.1.3
 Summary: CLI for Anysite API - web data extraction for humans and AI agents
 Project-URL: Homepage, https://anysite.io
 Project-URL: Documentation, https://docs.anysite.io/cli

@@ -33,6 +33,12 @@ Requires-Dist: pyyaml>=6.0.0
 Requires-Dist: rich>=13.0.0
 Requires-Dist: tabulate>=0.9.0
 Requires-Dist: typer[all]>=0.9.0
+Provides-Extra: all
+Requires-Dist: duckdb>=1.0.0; extra == 'all'
+Requires-Dist: httpx>=0.25.0; extra == 'all'
+Requires-Dist: psycopg[binary]>=3.1.0; extra == 'all'
+Requires-Dist: pyarrow>=15.0.0; extra == 'all'
+Requires-Dist: pymysql>=1.1.0; extra == 'all'
 Provides-Extra: data
 Requires-Dist: duckdb>=1.0.0; extra == 'data'
 Requires-Dist: httpx>=0.25.0; extra == 'data'

@@ -65,11 +71,21 @@ Web data extraction for humans and AI agents.
 pip install anysite-cli
 ```
 
+Optional extras:
+
+```bash
+pip install "anysite-cli[data]"      # DuckDB + PyArrow for dataset pipelines
+pip install "anysite-cli[postgres]"  # PostgreSQL support
+pip install "anysite-cli[all]"       # All optional dependencies
+```
+
 Or install from source:
 
 ```bash
 git clone https://github.com/anysiteio/anysite-cli.git
 cd anysite-cli
+python -m venv .venv
+source .venv/bin/activate
 pip install -e .
 ```

@@ -256,6 +272,7 @@ sources:
   - type: company
     value: "{value}"
   count: 5
+  refresh: always  # Re-collect every run with --incremental
   db_load:
     fields: [name, url, headline]
 

@@ -311,6 +328,9 @@ anysite dataset logs my-dataset --run 42
 # Generate cron/systemd schedule
 anysite dataset schedule dataset.yaml --incremental --load-db pg
 
+# Compare snapshots (diff two collection dates)
+anysite dataset diff dataset.yaml --source employees --key _input_value
+
 # Reset incremental state
 anysite dataset reset-cursor dataset.yaml
 ```

@@ -320,8 +340,10 @@ anysite dataset reset-cursor dataset.yaml
 Manage database connections and run queries.
 
 ```bash
-# Add a connection
-anysite db add pg
+# Add a connection (--password auto-stores via env var reference)
+anysite db add pg --type postgres --host localhost --database mydb --user app --password secret
+# Or reference an existing env var
+anysite db add pg --type postgres --host localhost --database mydb --user app --password-env PGPASS
 
 # List and test connections
 anysite db list
{anysite_cli-0.1.1 → anysite_cli-0.1.3}/README.md

@@ -8,11 +8,21 @@ Web data extraction for humans and AI agents.
 pip install anysite-cli
 ```
 
+Optional extras:
+
+```bash
+pip install "anysite-cli[data]"      # DuckDB + PyArrow for dataset pipelines
+pip install "anysite-cli[postgres]"  # PostgreSQL support
+pip install "anysite-cli[all]"       # All optional dependencies
+```
+
 Or install from source:
 
 ```bash
 git clone https://github.com/anysiteio/anysite-cli.git
 cd anysite-cli
+python -m venv .venv
+source .venv/bin/activate
 pip install -e .
 ```

@@ -199,6 +209,7 @@ sources:
   - type: company
     value: "{value}"
   count: 5
+  refresh: always  # Re-collect every run with --incremental
   db_load:
     fields: [name, url, headline]
 

@@ -254,6 +265,9 @@ anysite dataset logs my-dataset --run 42
 # Generate cron/systemd schedule
 anysite dataset schedule dataset.yaml --incremental --load-db pg
 
+# Compare snapshots (diff two collection dates)
+anysite dataset diff dataset.yaml --source employees --key _input_value
+
 # Reset incremental state
 anysite dataset reset-cursor dataset.yaml
 ```

@@ -263,8 +277,10 @@ anysite dataset reset-cursor dataset.yaml
 Manage database connections and run queries.
 
 ```bash
-# Add a connection
-anysite db add pg
+# Add a connection (--password auto-stores via env var reference)
+anysite db add pg --type postgres --host localhost --database mydb --user app --password secret
+# Or reference an existing env var
+anysite db add pg --type postgres --host localhost --database mydb --user app --password-env PGPASS
 
 # List and test connections
 anysite db list
{anysite_cli-0.1.1 → anysite_cli-0.1.3}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "anysite-cli"
-version = "0.1.1"
+version = "0.1.3"
 description = "CLI for Anysite API - web data extraction for humans and AI agents"
 readme = "README.md"
 license = "MIT"

@@ -57,6 +57,13 @@ db = [
     "psycopg[binary]>=3.1.0",
     "pymysql>=1.1.0",
 ]
+all = [
+    "duckdb>=1.0.0",
+    "pyarrow>=15.0.0",
+    "httpx>=0.25.0",
+    "psycopg[binary]>=3.1.0",
+    "pymysql>=1.1.0",
+]
 dev = [
     "pytest>=7.4.0",
     "pytest-asyncio>=0.21.0",
{anysite_cli-0.1.1 → anysite_cli-0.1.3}/skills/anysite-cli/SKILL.md

@@ -10,9 +10,15 @@ Command-line tool for web data extraction, dataset pipelines, and database opera
 ## Prerequisites
 
 ```bash
-# Ensure CLI is installed
+# Ensure CLI is installed (activate venv if installed from source)
+source .venv/bin/activate  # if using a virtual environment
 anysite --version
 
+# Install extras for dataset pipelines and database support
+pip install "anysite-cli[data]"      # DuckDB + PyArrow for dataset commands
+pip install "anysite-cli[postgres]"  # PostgreSQL adapter
+pip install "anysite-cli[all]"       # All optional dependencies
+
 # Configure API key (one-time)
 anysite config set api_key sk-xxxxx
 

@@ -30,6 +36,9 @@ anysite api /api/linkedin/user user=satyanadella
 anysite api /api/linkedin/company company=anthropic --format table
 anysite api /api/linkedin/search/users title=CTO count=50 --format csv --output ctos.csv
 
+# Search with specific parameters (always check with `anysite describe` first)
+anysite api /api/linkedin/search/users first_name=Andrew last_name=Kulikov company_keywords=Anysite count=5
+
 # Field selection
 anysite api /api/linkedin/user user=satyanadella --fields "name,headline,follower_count"
 anysite api /api/linkedin/user user=satyanadella --exclude "certifications,patents"

@@ -124,6 +133,7 @@ sources:
     count: 5
     parallel: 3
     on_error: skip
+    refresh: always  # Re-collect every run even with --incremental
 
 storage:
   format: parquet

@@ -203,7 +213,9 @@ Optional `db_load` config per source controls which fields go to DB:
 
 ```bash
 # Add connection
-anysite db add pg
+anysite db add pg --type postgres --host localhost --port 5432 --database mydb --user myuser --password mypass
+# Or use env var reference (password not stored in config):
+anysite db add pg --type postgres --host localhost --database mydb --user myuser --password-env PGPASS
 
 # Test and inspect
 anysite db test pg

@@ -224,7 +236,17 @@ anysite api /api/linkedin/user user=satyanadella -q --format jsonl \
 | anysite db insert pg --table profiles --stdin --auto-create
 ```
 
-### Step 6:
+### Step 6: Compare Snapshots
+```bash
+# Diff two most recent snapshots
+anysite dataset diff dataset.yaml --source employees --key _input_value
+
+# Diff specific dates, compare only certain fields
+anysite dataset diff dataset.yaml --source employees --key _input_value \
+  --from 2026-01-30 --to 2026-02-01 --fields "name,headline"
+```
+
+### Step 7: History, Scheduling, and Notifications
 ```bash
 # View run history
 anysite dataset history my-dataset
{anysite_cli-0.1.1 → anysite_cli-0.1.3}/skills/anysite-cli/references/dataset-guide.md

@@ -24,6 +24,7 @@ sources:
     parallel: 3               # Concurrent requests
     rate_limit: "10/s"        # Rate limiting
     on_error: skip            # stop or skip
+    refresh: always           # auto (default) or always — re-collect every run
     transform:                # Post-collection transform (for exports only)
       filter: '.count > 10'   # Safe filter expression
       fields: [name, url]     # Field selection with aliases

@@ -101,6 +102,12 @@ Sources are topologically sorted — parents always run before children. Multi-l
 companies → employees → profiles → posts → comments
 ```
 
+**Common dependency fields:**
+- `/api/linkedin/company/employees` returns: `name`, `headline`, `url`, `image`, `location`, `internal_id`, `urn` — use `urn.value` (not `alias`) to chain into `/api/linkedin/user`
+- `/api/linkedin/user` accepts both human-readable aliases (`satyanadella`) and URN values as the `user` parameter
+
+Always run `anysite describe <endpoint>` to verify available fields before setting up dependencies.
+
 ### input_template
 
 Transforms extracted values before passing to the API. Use `{value}` placeholder:

@@ -163,6 +170,25 @@ With `--incremental`:
 2. Dependent/from_file sources: skips individual input values already in `metadata.json`
 3. New values are still collected and tracked
 
+### Refresh Mode
+
+Per-source `refresh` field controls behavior with `--incremental`:
+
+```yaml
+- id: posts
+  endpoint: /api/linkedin/user/posts
+  dependency: { from_source: profiles, field: urn.value }
+  input_key: user
+  refresh: always  # Re-collect every run even with --incremental
+```
+
+| Setting | `--incremental` | No flag |
+|---------|----------------|---------|
+| `refresh: auto` (default) | Skip collected inputs | Collect all |
+| `refresh: always` | Collect all (ignore cache) | Collect all |
+
+Use `refresh: always` for sources with frequently changing data (e.g., posts, activity feeds) where you want fresh snapshots each run while still caching stable parent data.
+
 ### Storage Layout
 
 ```

@@ -265,7 +291,8 @@ Result in database:
 
 ### Connection Management
 ```bash
-anysite db add <name>
+anysite db add <name> --type postgres --host localhost --database mydb --user app --password secret
+anysite db add <name> --type postgres --host localhost --database mydb --user app --password-env DB_PASS
 anysite db list          # List all connections
 anysite db test <name>   # Test connectivity
 anysite db info <name>   # Show connection details

@@ -421,6 +448,39 @@ Payload: `{event: "complete"|"failure", dataset, timestamp, record_count, source
 
 ---
 
+## Comparing Snapshots (Diff)
+
+Compare two collection snapshots to find added, removed, and changed records.
+
+```bash
+# Compare two most recent snapshots (auto-detect dates)
+anysite dataset diff dataset.yaml --source profiles --key _input_value
+
+# Compare specific dates
+anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+
+# Only compare specific fields
+anysite dataset diff dataset.yaml --source profiles --key urn --fields "name,headline,follower_count"
+
+# Output as JSON/CSV
+anysite dataset diff dataset.yaml --source profiles --key urn --format json --output diff.json
+```
+
+**Options:**
+- `--source, -s` (required) — source to compare
+- `--key, -k` (required) — field to match records by (e.g., `_input_value`, `urn`)
+- `--from` / `--to` — snapshot dates (default: two most recent)
+- `--fields, -f` — only compare these fields
+- `--format` — output format (table, json, jsonl, csv)
+- `--output, -o` — write to file
+
+**Output** shows summary counts and a table of changes:
+- **added** — records in the new snapshot but not the old
+- **removed** — records in the old snapshot but not the new
+- **changed** — records with the same key but different values (shows `old → new`)
+
+---
+
 ## Reset Incremental State
 
 Clear collected input tracking to force re-collection.
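For programmatic use, the sketch below mirrors how the new `diff` subcommand (see `src/anysite/dataset/cli.py` below) drives `DatasetDiffer`; the storage path, source id, key, and dates here are placeholder values:

```python
from datetime import date
from pathlib import Path

from anysite.dataset.differ import DatasetDiffer, format_diff_table

# Placeholder storage path -- the CLI derives this from dataset.yaml
# via config.storage_path().
differ = DatasetDiffer(Path("data/my-dataset"))
result = differ.diff(
    "profiles",                   # source id
    "urn",                        # key field to match records by
    from_date=date(2026, 1, 30),
    to_date=date(2026, 2, 1),
    fields=["name", "headline"],  # optional: restrict compared fields
)
print(f"added={len(result.added)} removed={len(result.removed)} "
      f"changed={len(result.changed)} unchanged={result.unchanged_count}")
if result.has_changes:
    rows = format_diff_table(result)  # flat rows, as used for table output
```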
{anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/cli.py

@@ -413,6 +413,117 @@ def load_db(
     )
 
 
+@app.command("diff")
+def diff_cmd(
+    config_path: Annotated[
+        Path,
+        typer.Argument(help="Path to dataset.yaml"),
+    ],
+    source: Annotated[
+        str,
+        typer.Option("--source", "-s", help="Source to compare"),
+    ],
+    key: Annotated[
+        str,
+        typer.Option("--key", "-k", help="Field to match records by (e.g., _input_value, urn)"),
+    ],
+    from_date: Annotated[
+        str | None,
+        typer.Option("--from", help="Older snapshot date (YYYY-MM-DD)"),
+    ] = None,
+    to_date: Annotated[
+        str | None,
+        typer.Option("--to", help="Newer snapshot date (YYYY-MM-DD)"),
+    ] = None,
+    fields: Annotated[
+        str | None,
+        typer.Option("--fields", "-f", help="Only compare these fields (comma-separated)"),
+    ] = None,
+    format: Annotated[
+        str,
+        typer.Option("--format", help="Output format: table, json, jsonl, csv"),
+    ] = "table",
+    output: Annotated[
+        Path | None,
+        typer.Option("--output", "-o", help="Write output to file"),
+    ] = None,
+    quiet: Annotated[
+        bool,
+        typer.Option("--quiet", "-q", help="Suppress summary, only output data"),
+    ] = False,
+) -> None:
+    """Compare two snapshots of a source to show added, removed, and changed records."""
+    from datetime import date as date_type
+
+    from anysite.dataset.differ import (
+        DatasetDiffer,
+        format_diff_records,
+        format_diff_table,
+    )
+
+    config = _load_config(config_path)
+
+    # Validate source exists
+    src = config.get_source(source)
+    if src is None:
+        typer.echo(f"Error: source '{source}' not found in dataset", err=True)
+        raise typer.Exit(1)
+
+    differ = DatasetDiffer(config.storage_path())
+
+    # Parse dates
+    parsed_from = None
+    parsed_to = None
+    try:
+        if from_date:
+            parsed_from = date_type.fromisoformat(from_date)
+        if to_date:
+            parsed_to = date_type.fromisoformat(to_date)
+    except ValueError as e:
+        typer.echo(f"Error: invalid date format: {e}", err=True)
+        raise typer.Exit(1) from None
+
+    # Parse fields
+    field_list = None
+    if fields:
+        field_list = [f.strip() for f in fields.split(",") if f.strip()]
+
+    try:
+        result = differ.diff(
+            source,
+            key,
+            from_date=parsed_from,
+            to_date=parsed_to,
+            fields=field_list,
+        )
+    except DatasetError as e:
+        typer.echo(f"Error: {e}", err=True)
+        raise typer.Exit(1) from None
+
+    # Print summary unless quiet
+    if not quiet:
+        console = Console()
+        console.print(
+            f"\n[bold]Diff: {source}[/bold] "
+            f"({result.from_date.isoformat()} → {result.to_date.isoformat()})\n"
+        )
+        console.print(f" [green]Added:[/green] {len(result.added)}")
+        console.print(f" [red]Removed:[/red] {len(result.removed)}")
+        console.print(f" [yellow]Changed:[/yellow] {len(result.changed)}")
+        console.print(f" Unchanged: {result.unchanged_count}")
+        console.print()
+
+    if not result.has_changes:
+        if not quiet:
+            Console().print("[dim]No changes detected.[/dim]")
+        return
+
+    # Format and output
+    rows = format_diff_table(result) if format == "table" else format_diff_records(result)
+
+    _output_results(rows, format, output)
+
+
 @app.command("history")
 def history(
     name: Annotated[
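A quick way to exercise the new subcommand end to end is Typer's test runner. A minimal sketch, assuming the module-level Typer instance is importable as `app` (consistent with the `@app.command` decorators above, though the packaged test suite may do this differently):

```python
from typer.testing import CliRunner

from anysite.dataset.cli import app  # assumes the module-level Typer instance

runner = CliRunner()
# Invoke `diff` against a local dataset.yaml (path is illustrative).
result = runner.invoke(
    app,
    ["diff", "dataset.yaml", "--source", "profiles", "--key", "urn", "--format", "json"],
)
print(result.exit_code)
print(result.output)
```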
{anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/collector.py

@@ -43,6 +43,7 @@ class CollectionPlan:
         params: dict[str, Any] | None = None,
         dependency: str | None = None,
         estimated_requests: int | None = None,
+        refresh: str = "auto",
     ) -> None:
         self.steps.append({
             "source": source_id,

@@ -51,6 +52,7 @@ class CollectionPlan:
             "params": params or {},
             "dependency": dependency,
             "estimated_requests": estimated_requests,
+            "refresh": refresh,
         })
 
 
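For illustration, a plan step carrying the new flag might be recorded as below. Only the keyword arguments are confirmed by this diff; the leading positional arguments (`source_id`, `endpoint`) are assumptions about the unshown part of the `add_step` signature:

```python
# Hypothetical call -- keyword names match this diff; the positional
# source_id/endpoint values and their order are assumed from context.
plan = CollectionPlan()
plan.add_step(
    "posts",                      # source_id (assumed positional)
    "/api/linkedin/user/posts",   # endpoint (assumed positional)
    kind="dependent",
    dependency="profiles",
    estimated_requests=120,
    refresh="always",             # new in 0.1.3; defaults to "auto"
)
```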
@@ -116,8 +118,8 @@ async def collect_dataset(
 
     try:
         for source in ordered:
-            # Check incremental skip
-            if incremental:
+            # Check incremental skip (refresh: always bypasses this)
+            if incremental and source.refresh != "always":
                 parquet_path = get_parquet_path(base_path, source.id, today)
                 if parquet_path.exists():
                     if not quiet:

@@ -276,8 +278,8 @@ async def _collect_from_file(
         print_warning(f"No values extracted from {file_path}")
         return []
 
-    # Filter already-collected inputs in incremental mode
-    if incremental and metadata:
+    # Filter already-collected inputs in incremental mode (refresh: always bypasses)
+    if incremental and source.refresh != "always" and metadata:
         already = metadata.get_collected_inputs(source.id)
         if already:
             original = len(values)

@@ -432,8 +434,8 @@ async def _collect_dependent(
             f"Source {source.id} has a dependency but no input_key defined"
         )
 
-    # Filter already-collected inputs in incremental mode
-    if incremental and metadata:
+    # Filter already-collected inputs in incremental mode (refresh: always bypasses)
+    if incremental and source.refresh != "always" and metadata:
         already = metadata.get_collected_inputs(source.id)
         if already:
             original = len(values)
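The same guard now appears three times in `collector.py` (the top-level skip, `_collect_from_file`, and `_collect_dependent`). Distilled into a standalone predicate — a sketch for clarity, not a helper that exists in the package — it matches the refresh-mode table from the dataset guide:

```python
def should_skip_cached(incremental: bool, refresh: str) -> bool:
    """Cached inputs may be skipped only in incremental mode, and only
    for sources that have not opted into refresh: always."""
    return incremental and refresh != "always"


assert should_skip_cached(True, "auto")        # --incremental, default: skip cached
assert not should_skip_cached(True, "always")  # --incremental + refresh: always: collect all
assert not should_skip_cached(False, "auto")   # no flag: collect all
```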
@@ -579,7 +581,7 @@ def _build_plan(
     plan = CollectionPlan()
 
     for source in ordered:
-        if incremental:
+        if incremental and source.refresh != "always":
             parquet_path = get_parquet_path(base_path, source.id, today)
             if parquet_path.exists():
                 continue

@@ -592,6 +594,7 @@ def _build_plan(
                 kind="from_file",
                 params={"file": source.from_file, "field": source.file_field},
                 estimated_requests=est,
+                refresh=source.refresh,
             )
         elif source.dependency is None:
             plan.add_step(

@@ -600,6 +603,7 @@ def _build_plan(
                 kind="independent",
                 params=source.params,
                 estimated_requests=1,
+                refresh=source.refresh,
             )
         else:
             est = _count_dependent_inputs(source, base_path, metadata)

@@ -609,6 +613,7 @@ def _build_plan(
                 kind="dependent",
                 dependency=source.dependency.from_source,
                 estimated_requests=est,
+                refresh=source.refresh,
             )
 
     return plan

@@ -665,11 +670,14 @@ def _print_plan(plan: CollectionPlan) -> dict[str, int]:
     table.add_column("Est. Requests")
 
     for i, step in enumerate(plan.steps, 1):
+        kind = step["kind"]
+        if step.get("refresh") == "always":
+            kind += " (refresh)"
         table.add_row(
             str(i),
             step["source"],
             step["endpoint"],
-            step["kind"],
+            kind,
             step.get("dependency") or "-",
             str(step.get("estimated_requests") or "?"),
         )