anysite-cli 0.1.1__tar.gz → 0.1.3__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of anysite-cli might be problematic.

Files changed (118)
  1. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/.gitignore +1 -0
  2. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/CLAUDE.md +5 -2
  3. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/PKG-INFO +25 -3
  4. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/README.md +18 -2
  5. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/pyproject.toml +8 -1
  6. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/skills/anysite-cli/SKILL.md +25 -3
  7. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/skills/anysite-cli/references/dataset-guide.md +61 -1
  8. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/cli.py +111 -0
  9. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/collector.py +16 -8
  10. anysite_cli-0.1.3/src/anysite/dataset/differ.py +355 -0
  11. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/models.py +4 -0
  12. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/cli.py +22 -0
  13. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_collector.py +191 -0
  14. anysite_cli-0.1.3/tests/test_dataset/test_differ.py +338 -0
  15. anysite_cli-0.1.3/tests/test_dataset/test_integration_csv.py +291 -0
  16. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/.claude/settings.local.json +0 -0
  17. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/LICENSE +0 -0
  18. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/skills/anysite-cli/references/api-reference.md +0 -0
  19. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/__init__.py +0 -0
  20. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/__main__.py +0 -0
  21. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/api/__init__.py +0 -0
  22. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/api/client.py +0 -0
  23. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/api/errors.py +0 -0
  24. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/api/schemas.py +0 -0
  25. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/batch/__init__.py +0 -0
  26. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/batch/executor.py +0 -0
  27. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/batch/input.py +0 -0
  28. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/batch/rate_limiter.py +0 -0
  29. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/cli/__init__.py +0 -0
  30. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/cli/config.py +0 -0
  31. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/cli/executor.py +0 -0
  32. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/cli/options.py +0 -0
  33. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/config/__init__.py +0 -0
  34. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/config/paths.py +0 -0
  35. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/config/settings.py +0 -0
  36. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/__init__.py +0 -0
  37. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/analyzer.py +0 -0
  38. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/db_loader.py +0 -0
  39. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/errors.py +0 -0
  40. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/exporters.py +0 -0
  41. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/history.py +0 -0
  42. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/notifications.py +0 -0
  43. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/scheduler.py +0 -0
  44. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/storage.py +0 -0
  45. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/dataset/transformer.py +0 -0
  46. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/__init__.py +0 -0
  47. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/adapters/__init__.py +0 -0
  48. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/adapters/base.py +0 -0
  49. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/adapters/postgres.py +0 -0
  50. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/adapters/sqlite.py +0 -0
  51. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/config.py +0 -0
  52. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/manager.py +0 -0
  53. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/operations/__init__.py +0 -0
  54. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/operations/insert.py +0 -0
  55. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/operations/query.py +0 -0
  56. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/schema/__init__.py +0 -0
  57. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/schema/inference.py +0 -0
  58. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/schema/types.py +0 -0
  59. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/utils/__init__.py +0 -0
  60. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/db/utils/sanitize.py +0 -0
  61. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/main.py +0 -0
  62. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/models/__init__.py +0 -0
  63. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/output/__init__.py +0 -0
  64. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/output/console.py +0 -0
  65. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/output/formatters.py +0 -0
  66. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/output/templates.py +0 -0
  67. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/py.typed +0 -0
  68. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/streaming/__init__.py +0 -0
  69. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/streaming/progress.py +0 -0
  70. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/streaming/writer.py +0 -0
  71. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/utils/__init__.py +0 -0
  72. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/utils/fields.py +0 -0
  73. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/src/anysite/utils/retry.py +0 -0
  74. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/enriched_partners_sample_10.csv +0 -0
  75. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/linkedin-partners/company_aliases.txt +0 -0
  76. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/linkedin-partners/dataset.yaml +0 -0
  77. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/partners-deep/dataset.yaml +0 -0
  78. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/partners-intel/dataset.yaml +0 -0
  79. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/partners-linkedin/company_aliases.txt +0 -0
  80. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/partners-linkedin/dataset.yaml +0 -0
  81. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/test_data/partners-pipeline/dataset.yaml +0 -0
  82. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/__init__.py +0 -0
  83. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/conftest.py +0 -0
  84. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_api/__init__.py +0 -0
  85. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_batch/__init__.py +0 -0
  86. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_batch/test_executor.py +0 -0
  87. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_batch/test_input.py +0 -0
  88. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_batch/test_rate_limiter.py +0 -0
  89. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_cli/__init__.py +0 -0
  90. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_cli/test_main.py +0 -0
  91. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/__init__.py +0 -0
  92. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_analyzer.py +0 -0
  93. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_db_loader.py +0 -0
  94. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_exporters.py +0 -0
  95. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_history.py +0 -0
  96. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_models.py +0 -0
  97. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_notifications.py +0 -0
  98. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_scheduler.py +0 -0
  99. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_storage.py +0 -0
  100. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_dataset/test_transformer.py +0 -0
  101. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/__init__.py +0 -0
  102. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_cli.py +0 -0
  103. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_config.py +0 -0
  104. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_inference.py +0 -0
  105. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_insert.py +0 -0
  106. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_manager.py +0 -0
  107. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_postgres_adapter.py +0 -0
  108. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_sanitize.py +0 -0
  109. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_db/test_sqlite_adapter.py +0 -0
  110. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_output/__init__.py +0 -0
  111. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_output/test_formatters.py +0 -0
  112. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_output/test_templates.py +0 -0
  113. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_streaming/__init__.py +0 -0
  114. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_streaming/test_progress.py +0 -0
  115. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_streaming/test_writer.py +0 -0
  116. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_utils/__init__.py +0 -0
  117. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_utils/test_fields.py +0 -0
  118. {anysite_cli-0.1.1 → anysite_cli-0.1.3}/tests/test_utils/test_retry.py +0 -0
@@ -87,3 +87,4 @@ Thumbs.db
  init_docs/
 
  data/
+ issues/
@@ -53,6 +53,8 @@ anysite dataset history my-dataset
  anysite dataset logs my-dataset --run 42
  anysite dataset schedule dataset.yaml --incremental --load-db pg
  anysite dataset schedule dataset.yaml --systemd --load-db pg
+ anysite dataset diff dataset.yaml --source profiles --key _input_value
+ anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
  anysite dataset reset-cursor dataset.yaml
  anysite dataset reset-cursor dataset.yaml --source profiles
 
@@ -102,7 +104,8 @@ anysite db upsert mydb --table users --conflict-columns id --stdin
  - `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
  - `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
  - `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
- - `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `history`, `logs`, `schedule`, `reset-cursor`
+ - `dataset/differ.py` - `DatasetDiffer`: compare two Parquet snapshots using DuckDB (added/removed/changed records). `DiffResult` dataclass, `format_diff_table()` and `format_diff_records()` formatters
+ - `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `diff`, `history`, `logs`, `schedule`, `reset-cursor`
  - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
  - `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
  - `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
@@ -202,5 +205,5 @@ Tests are in `tests/` with subdirectories mirroring `src/anysite/`:
  - `test_streaming/` — Progress and writer
  - `test_output/` — Formatters and templates
  - `test_utils/` — Field selection and retry
- - `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications
+ - `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications, differ
  - `test_db/` — Database adapters, schema inference, connection manager, operations
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: anysite-cli
- Version: 0.1.1
+ Version: 0.1.3
  Summary: CLI for Anysite API - web data extraction for humans and AI agents
  Project-URL: Homepage, https://anysite.io
  Project-URL: Documentation, https://docs.anysite.io/cli
@@ -33,6 +33,12 @@ Requires-Dist: pyyaml>=6.0.0
  Requires-Dist: rich>=13.0.0
  Requires-Dist: tabulate>=0.9.0
  Requires-Dist: typer[all]>=0.9.0
+ Provides-Extra: all
+ Requires-Dist: duckdb>=1.0.0; extra == 'all'
+ Requires-Dist: httpx>=0.25.0; extra == 'all'
+ Requires-Dist: psycopg[binary]>=3.1.0; extra == 'all'
+ Requires-Dist: pyarrow>=15.0.0; extra == 'all'
+ Requires-Dist: pymysql>=1.1.0; extra == 'all'
  Provides-Extra: data
  Requires-Dist: duckdb>=1.0.0; extra == 'data'
  Requires-Dist: httpx>=0.25.0; extra == 'data'
@@ -65,11 +71,21 @@ Web data extraction for humans and AI agents.
  pip install anysite-cli
  ```
 
+ Optional extras:
+
+ ```bash
+ pip install "anysite-cli[data]" # DuckDB + PyArrow for dataset pipelines
+ pip install "anysite-cli[postgres]" # PostgreSQL support
+ pip install "anysite-cli[all]" # All optional dependencies
+ ```
+
  Or install from source:
 
  ```bash
  git clone https://github.com/anysiteio/anysite-cli.git
  cd anysite-cli
+ python -m venv .venv
+ source .venv/bin/activate
  pip install -e .
  ```
 
@@ -256,6 +272,7 @@ sources:
  - type: company
  value: "{value}"
  count: 5
+ refresh: always # Re-collect every run with --incremental
  db_load:
  fields: [name, url, headline]
 
@@ -311,6 +328,9 @@ anysite dataset logs my-dataset --run 42
  # Generate cron/systemd schedule
  anysite dataset schedule dataset.yaml --incremental --load-db pg
 
+ # Compare snapshots (diff two collection dates)
+ anysite dataset diff dataset.yaml --source employees --key _input_value
+
  # Reset incremental state
  anysite dataset reset-cursor dataset.yaml
  ```
@@ -320,8 +340,10 @@ anysite dataset reset-cursor dataset.yaml
  Manage database connections and run queries.
 
  ```bash
- # Add a connection
- anysite db add pg
+ # Add a connection (--password auto-stores via env var reference)
+ anysite db add pg --type postgres --host localhost --database mydb --user app --password secret
+ # Or reference an existing env var
+ anysite db add pg --type postgres --host localhost --database mydb --user app --password-env PGPASS
 
  # List and test connections
  anysite db list
@@ -8,11 +8,21 @@ Web data extraction for humans and AI agents.
  pip install anysite-cli
  ```
 
+ Optional extras:
+
+ ```bash
+ pip install "anysite-cli[data]" # DuckDB + PyArrow for dataset pipelines
+ pip install "anysite-cli[postgres]" # PostgreSQL support
+ pip install "anysite-cli[all]" # All optional dependencies
+ ```
+
  Or install from source:
 
  ```bash
  git clone https://github.com/anysiteio/anysite-cli.git
  cd anysite-cli
+ python -m venv .venv
+ source .venv/bin/activate
  pip install -e .
  ```
 
@@ -199,6 +209,7 @@ sources:
  - type: company
  value: "{value}"
  count: 5
+ refresh: always # Re-collect every run with --incremental
  db_load:
  fields: [name, url, headline]
 
@@ -254,6 +265,9 @@ anysite dataset logs my-dataset --run 42
  # Generate cron/systemd schedule
  anysite dataset schedule dataset.yaml --incremental --load-db pg
 
+ # Compare snapshots (diff two collection dates)
+ anysite dataset diff dataset.yaml --source employees --key _input_value
+
  # Reset incremental state
  anysite dataset reset-cursor dataset.yaml
  ```
@@ -263,8 +277,10 @@ anysite dataset reset-cursor dataset.yaml
  Manage database connections and run queries.
 
  ```bash
- # Add a connection
- anysite db add pg
+ # Add a connection (--password auto-stores via env var reference)
+ anysite db add pg --type postgres --host localhost --database mydb --user app --password secret
+ # Or reference an existing env var
+ anysite db add pg --type postgres --host localhost --database mydb --user app --password-env PGPASS
 
  # List and test connections
  anysite db list
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
  [project]
  name = "anysite-cli"
- version = "0.1.1"
+ version = "0.1.3"
  description = "CLI for Anysite API - web data extraction for humans and AI agents"
  readme = "README.md"
  license = "MIT"
@@ -57,6 +57,13 @@ db = [
      "psycopg[binary]>=3.1.0",
      "pymysql>=1.1.0",
  ]
+ all = [
+     "duckdb>=1.0.0",
+     "pyarrow>=15.0.0",
+     "httpx>=0.25.0",
+     "psycopg[binary]>=3.1.0",
+     "pymysql>=1.1.0",
+ ]
  dev = [
      "pytest>=7.4.0",
      "pytest-asyncio>=0.21.0",
@@ -10,9 +10,15 @@ Command-line tool for web data extraction, dataset pipelines, and database opera
  ## Prerequisites
 
  ```bash
- # Ensure CLI is installed
+ # Ensure CLI is installed (activate venv if installed from source)
+ source .venv/bin/activate # if using a virtual environment
  anysite --version
 
+ # Install extras for dataset pipelines and database support
+ pip install "anysite-cli[data]" # DuckDB + PyArrow for dataset commands
+ pip install "anysite-cli[postgres]" # PostgreSQL adapter
+ pip install "anysite-cli[all]" # All optional dependencies
+
  # Configure API key (one-time)
  anysite config set api_key sk-xxxxx
 
@@ -30,6 +36,9 @@ anysite api /api/linkedin/user user=satyanadella
  anysite api /api/linkedin/company company=anthropic --format table
  anysite api /api/linkedin/search/users title=CTO count=50 --format csv --output ctos.csv
 
+ # Search with specific parameters (always check with `anysite describe` first)
+ anysite api /api/linkedin/search/users first_name=Andrew last_name=Kulikov company_keywords=Anysite count=5
+
  # Field selection
  anysite api /api/linkedin/user user=satyanadella --fields "name,headline,follower_count"
  anysite api /api/linkedin/user user=satyanadella --exclude "certifications,patents"
@@ -124,6 +133,7 @@ sources:
  count: 5
  parallel: 3
  on_error: skip
+ refresh: always # Re-collect every run even with --incremental
 
  storage:
  format: parquet
@@ -203,7 +213,9 @@ Optional `db_load` config per source controls which fields go to DB:
 
  ```bash
  # Add connection
- anysite db add pg # Interactive prompts for type, host, port, etc.
+ anysite db add pg --type postgres --host localhost --port 5432 --database mydb --user myuser --password mypass
+ # Or use env var reference (password not stored in config):
+ anysite db add pg --type postgres --host localhost --database mydb --user myuser --password-env PGPASS
 
  # Test and inspect
  anysite db test pg
@@ -224,7 +236,17 @@ anysite api /api/linkedin/user user=satyanadella -q --format jsonl \
  | anysite db insert pg --table profiles --stdin --auto-create
  ```
 
- ### Step 6: History, Scheduling, and Notifications
+ ### Step 6: Compare Snapshots
+ ```bash
+ # Diff two most recent snapshots
+ anysite dataset diff dataset.yaml --source employees --key _input_value
+
+ # Diff specific dates, compare only certain fields
+ anysite dataset diff dataset.yaml --source employees --key _input_value \
+   --from 2026-01-30 --to 2026-02-01 --fields "name,headline"
+ ```
+
+ ### Step 7: History, Scheduling, and Notifications
  ```bash
  # View run history
  anysite dataset history my-dataset
@@ -24,6 +24,7 @@ sources:
  parallel: 3 # Concurrent requests
  rate_limit: "10/s" # Rate limiting
  on_error: skip # stop or skip
+ refresh: always # auto (default) or always — re-collect every run
  transform: # Post-collection transform (for exports only)
  filter: '.count > 10' # Safe filter expression
  fields: [name, url] # Field selection with aliases
@@ -101,6 +102,12 @@ Sources are topologically sorted — parents always run before children. Multi-l
  companies → employees → profiles → posts → comments
  ```
 
+ **Common dependency fields:**
+ - `/api/linkedin/company/employees` returns: `name`, `headline`, `url`, `image`, `location`, `internal_id`, `urn` — use `urn.value` (not `alias`) to chain into `/api/linkedin/user`
+ - `/api/linkedin/user` accepts both human-readable aliases (`satyanadella`) and URN values as the `user` parameter
+
+ Always run `anysite describe <endpoint>` to verify available fields before setting up dependencies.
+
  ### input_template
 
  Transforms extracted values before passing to the API. Use `{value}` placeholder:
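
The dependency note in the hunk above chains the nested `urn.value` field from one source into the next, and the `db_loader` module description mentions dot-notation field extraction. As a rough sketch of how such a path can resolve against a nested record — the helper name here is hypothetical, not the package's actual API:

```python
from typing import Any


def extract_field(record: dict[str, Any], path: str) -> Any:
    """Resolve a dot-notation path such as 'urn.value' against a nested record."""
    value: Any = record
    for part in path.split("."):
        if not isinstance(value, dict):
            return None  # path descends past a non-mapping value
        value = value.get(part)
    return value


# An employees record chaining into /api/linkedin/user via its URN:
employee = {"name": "Jane Doe", "urn": {"value": "ACoAAABC123"}}
print(extract_field(employee, "urn.value"))  # ACoAAABC123
```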
@@ -163,6 +170,25 @@ With `--incremental`:
  2. Dependent/from_file sources: skips individual input values already in `metadata.json`
  3. New values are still collected and tracked
 
+ ### Refresh Mode
+
+ Per-source `refresh` field controls behavior with `--incremental`:
+
+ ```yaml
+ - id: posts
+   endpoint: /api/linkedin/user/posts
+   dependency: { from_source: profiles, field: urn.value }
+   input_key: user
+   refresh: always # Re-collect every run even with --incremental
+ ```
+
+ | Setting | `--incremental` | No flag |
+ |---------|----------------|---------|
+ | `refresh: auto` (default) | Skip collected inputs | Collect all |
+ | `refresh: always` | Collect all (ignore cache) | Collect all |
+
+ Use `refresh: always` for sources with frequently changing data (e.g., posts, activity feeds) where you want fresh snapshots each run while still caching stable parent data.
+
  ### Storage Layout
 
  ```
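
The refresh table above reduces to a single predicate; a minimal Python sketch mirroring the `incremental and source.refresh != "always"` check this release adds to the collector:

```python
def should_skip_collected(incremental: bool, refresh: str = "auto") -> bool:
    """True when a source's already-collected inputs may be skipped this run."""
    # `refresh: always` bypasses the incremental cache; without --incremental,
    # everything is collected regardless of the refresh setting.
    return incremental and refresh != "always"


assert should_skip_collected(True, "auto")        # skip cached inputs
assert not should_skip_collected(True, "always")  # re-collect everything
assert not should_skip_collected(False, "auto")   # full run without the flag
```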
@@ -265,7 +291,8 @@ Result in database:
 
  ### Connection Management
  ```bash
- anysite db add <name> # Interactive add
+ anysite db add <name> --type postgres --host localhost --database mydb --user app --password secret
+ anysite db add <name> --type postgres --host localhost --database mydb --user app --password-env DB_PASS
  anysite db list # List all connections
  anysite db test <name> # Test connectivity
  anysite db info <name> # Show connection details
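
The new `--password-env` flag keeps the secret out of the stored config by recording only the variable name. A minimal sketch of how such a reference might be resolved at connect time — the function and config shape are illustrative assumptions, not the package's actual code:

```python
import os


def resolve_password(conn: dict) -> str | None:
    """Return the connection password, preferring an env var reference."""
    env_name = conn.get("password_env")
    if env_name:
        value = os.environ.get(env_name)
        if value is None:
            raise RuntimeError(f"environment variable {env_name!r} is not set")
        return value
    return conn.get("password")  # literal password, if one was stored


# A connection saved with --password-env DB_PASS:
os.environ["DB_PASS"] = "s3cret"
print(resolve_password({"password_env": "DB_PASS"}))  # s3cret
```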
@@ -421,6 +448,39 @@ Payload: `{event: "complete"|"failure", dataset, timestamp, record_count, source
 
  ---
 
+ ## Comparing Snapshots (Diff)
+
+ Compare two collection snapshots to find added, removed, and changed records.
+
+ ```bash
+ # Compare two most recent snapshots (auto-detect dates)
+ anysite dataset diff dataset.yaml --source profiles --key _input_value
+
+ # Compare specific dates
+ anysite dataset diff dataset.yaml --source profiles --key urn --from 2026-01-30 --to 2026-02-01
+
+ # Only compare specific fields
+ anysite dataset diff dataset.yaml --source profiles --key urn --fields "name,headline,follower_count"
+
+ # Output as JSON/CSV
+ anysite dataset diff dataset.yaml --source profiles --key urn --format json --output diff.json
+ ```
+
+ **Options:**
+ - `--source, -s` (required) — source to compare
+ - `--key, -k` (required) — field to match records by (e.g., `_input_value`, `urn`)
+ - `--from` / `--to` — snapshot dates (default: two most recent)
+ - `--fields, -f` — only compare these fields
+ - `--format` — output format (table, json, jsonl, csv)
+ - `--output, -o` — write to file
+
+ **Output** shows summary counts and a table of changes:
+ - **added** — records in the new snapshot but not the old
+ - **removed** — records in the old snapshot but not the new
+ - **changed** — records with the same key but different values (shows `old → new`)
+
+ ---
+
  ## Reset Incremental State
 
  Clear collected input tracking to force re-collection.
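
Per the module notes above, `DatasetDiffer` compares two Parquet snapshots with DuckDB. A self-contained sketch of the core idea — finding added and removed keys with anti-join-style queries over `read_parquet` — where the paths, view names, and SQL are illustrative, not the package's actual implementation:

```python
import duckdb


def diff_snapshots(old_path: str, new_path: str, key: str) -> dict[str, list]:
    """Rows added to and removed from the newer snapshot, matched by `key`."""
    con = duckdb.connect()
    con.execute(f"CREATE VIEW snap_old AS SELECT * FROM read_parquet('{old_path}')")
    con.execute(f"CREATE VIEW snap_new AS SELECT * FROM read_parquet('{new_path}')")
    added = con.execute(
        f"SELECT * FROM snap_new n "
        f"WHERE NOT EXISTS (SELECT 1 FROM snap_old o WHERE o.{key} = n.{key})"
    ).fetchall()
    removed = con.execute(
        f"SELECT * FROM snap_old o "
        f"WHERE NOT EXISTS (SELECT 1 FROM snap_new n WHERE n.{key} = o.{key})"
    ).fetchall()
    return {"added": added, "removed": removed}
```

Changed records are then the keys present in both snapshots whose compared columns differ; the `--fields` option narrows which columns take part in that comparison.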
@@ -413,6 +413,117 @@ def load_db(
      )
 
 
+ @app.command("diff")
+ def diff_cmd(
+     config_path: Annotated[
+         Path,
+         typer.Argument(help="Path to dataset.yaml"),
+     ],
+     source: Annotated[
+         str,
+         typer.Option("--source", "-s", help="Source to compare"),
+     ],
+     key: Annotated[
+         str,
+         typer.Option("--key", "-k", help="Field to match records by (e.g., _input_value, urn)"),
+     ],
+     from_date: Annotated[
+         str | None,
+         typer.Option("--from", help="Older snapshot date (YYYY-MM-DD)"),
+     ] = None,
+     to_date: Annotated[
+         str | None,
+         typer.Option("--to", help="Newer snapshot date (YYYY-MM-DD)"),
+     ] = None,
+     fields: Annotated[
+         str | None,
+         typer.Option("--fields", "-f", help="Only compare these fields (comma-separated)"),
+     ] = None,
+     format: Annotated[
+         str,
+         typer.Option("--format", help="Output format: table, json, jsonl, csv"),
+     ] = "table",
+     output: Annotated[
+         Path | None,
+         typer.Option("--output", "-o", help="Write output to file"),
+     ] = None,
+     quiet: Annotated[
+         bool,
+         typer.Option("--quiet", "-q", help="Suppress summary, only output data"),
+     ] = False,
+ ) -> None:
+     """Compare two snapshots of a source to show added, removed, and changed records."""
+     from datetime import date as date_type
+
+     from anysite.dataset.differ import (
+         DatasetDiffer,
+         format_diff_records,
+         format_diff_table,
+     )
+
+     config = _load_config(config_path)
+
+     # Validate source exists
+     src = config.get_source(source)
+     if src is None:
+         typer.echo(f"Error: source '{source}' not found in dataset", err=True)
+         raise typer.Exit(1)
+
+     differ = DatasetDiffer(config.storage_path())
+
+     # Parse dates
+     parsed_from = None
+     parsed_to = None
+     try:
+         if from_date:
+             parsed_from = date_type.fromisoformat(from_date)
+         if to_date:
+             parsed_to = date_type.fromisoformat(to_date)
+     except ValueError as e:
+         typer.echo(f"Error: invalid date format: {e}", err=True)
+         raise typer.Exit(1) from None
+
+     # Parse fields
+     field_list = None
+     if fields:
+         field_list = [f.strip() for f in fields.split(",") if f.strip()]
+
+     try:
+         result = differ.diff(
+             source,
+             key,
+             from_date=parsed_from,
+             to_date=parsed_to,
+             fields=field_list,
+         )
+     except DatasetError as e:
+         typer.echo(f"Error: {e}", err=True)
+         raise typer.Exit(1) from None
+
+     # Print summary unless quiet
+     if not quiet:
+         console = Console()
+         console.print(
+             f"\n[bold]Diff: {source}[/bold] "
+             f"({result.from_date.isoformat()} → {result.to_date.isoformat()})\n"
+         )
+         console.print(f" [green]Added:[/green] {len(result.added)}")
+         console.print(f" [red]Removed:[/red] {len(result.removed)}")
+         console.print(f" [yellow]Changed:[/yellow] {len(result.changed)}")
+         console.print(f" Unchanged: {result.unchanged_count}")
+         console.print()
+
+     if not result.has_changes:
+         if not quiet:
+             Console().print("[dim]No changes detected.[/dim]")
+         return
+
+     # Format and output
+     rows = format_diff_table(result) if format == "table" else format_diff_records(result)
+
+     _output_results(rows, format, output)
+
+
  @app.command("history")
  def history(
      name: Annotated[
@@ -43,6 +43,7 @@ class CollectionPlan:
          params: dict[str, Any] | None = None,
          dependency: str | None = None,
          estimated_requests: int | None = None,
+         refresh: str = "auto",
      ) -> None:
          self.steps.append({
              "source": source_id,
@@ -51,6 +52,7 @@ class CollectionPlan:
              "params": params or {},
              "dependency": dependency,
              "estimated_requests": estimated_requests,
+             "refresh": refresh,
          })
 
 
@@ -116,8 +118,8 @@ async def collect_dataset(
 
      try:
          for source in ordered:
-             # Check incremental skip
-             if incremental:
+             # Check incremental skip (refresh: always bypasses this)
+             if incremental and source.refresh != "always":
                  parquet_path = get_parquet_path(base_path, source.id, today)
                  if parquet_path.exists():
                      if not quiet:
@@ -276,8 +278,8 @@ async def _collect_from_file(
          print_warning(f"No values extracted from {file_path}")
          return []
 
-     # Filter already-collected inputs in incremental mode
-     if incremental and metadata:
+     # Filter already-collected inputs in incremental mode (refresh: always bypasses)
+     if incremental and source.refresh != "always" and metadata:
          already = metadata.get_collected_inputs(source.id)
          if already:
              original = len(values)
@@ -432,8 +434,8 @@ async def _collect_dependent(
              f"Source {source.id} has a dependency but no input_key defined"
          )
 
-     # Filter already-collected inputs in incremental mode
-     if incremental and metadata:
+     # Filter already-collected inputs in incremental mode (refresh: always bypasses)
+     if incremental and source.refresh != "always" and metadata:
          already = metadata.get_collected_inputs(source.id)
          if already:
              original = len(values)
@@ -579,7 +581,7 @@ def _build_plan(
      plan = CollectionPlan()
 
      for source in ordered:
-         if incremental:
+         if incremental and source.refresh != "always":
              parquet_path = get_parquet_path(base_path, source.id, today)
              if parquet_path.exists():
                  continue
@@ -592,6 +594,7 @@
                  kind="from_file",
                  params={"file": source.from_file, "field": source.file_field},
                  estimated_requests=est,
+                 refresh=source.refresh,
              )
          elif source.dependency is None:
              plan.add_step(
@@ -600,6 +603,7 @@
                  kind="independent",
                  params=source.params,
                  estimated_requests=1,
+                 refresh=source.refresh,
              )
          else:
              est = _count_dependent_inputs(source, base_path, metadata)
@@ -609,6 +613,7 @@
                  kind="dependent",
                  dependency=source.dependency.from_source,
                  estimated_requests=est,
+                 refresh=source.refresh,
              )
 
      return plan
@@ -665,11 +670,14 @@ def _print_plan(plan: CollectionPlan) -> dict[str, int]:
      table.add_column("Est. Requests")
 
      for i, step in enumerate(plan.steps, 1):
+         kind = step["kind"]
+         if step.get("refresh") == "always":
+             kind += " (refresh)"
          table.add_row(
              str(i),
              step["source"],
              step["endpoint"],
-             step["kind"],
+             kind,
              step.get("dependency") or "-",
              str(step.get("estimated_requests") or "?"),
          )