anysite-cli 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- anysite_cli-0.1.2/.claude/settings.local.json +17 -0
- anysite_cli-0.1.2/.gitignore +90 -0
- anysite_cli-0.1.2/CLAUDE.md +206 -0
- anysite_cli-0.1.2/LICENSE +21 -0
- anysite_cli-0.1.2/PKG-INFO +455 -0
- anysite_cli-0.1.2/README.md +392 -0
- anysite_cli-0.1.2/pyproject.toml +141 -0
- anysite_cli-0.1.2/skills/anysite-cli/SKILL.md +303 -0
- anysite_cli-0.1.2/skills/anysite-cli/references/api-reference.md +160 -0
- anysite_cli-0.1.2/skills/anysite-cli/references/dataset-guide.md +441 -0
- anysite_cli-0.1.2/src/anysite/__init__.py +4 -0
- anysite_cli-0.1.2/src/anysite/__main__.py +6 -0
- anysite_cli-0.1.2/src/anysite/api/__init__.py +21 -0
- anysite_cli-0.1.2/src/anysite/api/client.py +271 -0
- anysite_cli-0.1.2/src/anysite/api/errors.py +137 -0
- anysite_cli-0.1.2/src/anysite/api/schemas.py +333 -0
- anysite_cli-0.1.2/src/anysite/batch/__init__.py +1 -0
- anysite_cli-0.1.2/src/anysite/batch/executor.py +176 -0
- anysite_cli-0.1.2/src/anysite/batch/input.py +160 -0
- anysite_cli-0.1.2/src/anysite/batch/rate_limiter.py +98 -0
- anysite_cli-0.1.2/src/anysite/cli/__init__.py +1 -0
- anysite_cli-0.1.2/src/anysite/cli/config.py +176 -0
- anysite_cli-0.1.2/src/anysite/cli/executor.py +388 -0
- anysite_cli-0.1.2/src/anysite/cli/options.py +249 -0
- anysite_cli-0.1.2/src/anysite/config/__init__.py +11 -0
- anysite_cli-0.1.2/src/anysite/config/paths.py +46 -0
- anysite_cli-0.1.2/src/anysite/config/settings.py +187 -0
- anysite_cli-0.1.2/src/anysite/dataset/__init__.py +37 -0
- anysite_cli-0.1.2/src/anysite/dataset/analyzer.py +268 -0
- anysite_cli-0.1.2/src/anysite/dataset/cli.py +644 -0
- anysite_cli-0.1.2/src/anysite/dataset/collector.py +686 -0
- anysite_cli-0.1.2/src/anysite/dataset/db_loader.py +248 -0
- anysite_cli-0.1.2/src/anysite/dataset/errors.py +30 -0
- anysite_cli-0.1.2/src/anysite/dataset/exporters.py +121 -0
- anysite_cli-0.1.2/src/anysite/dataset/history.py +153 -0
- anysite_cli-0.1.2/src/anysite/dataset/models.py +245 -0
- anysite_cli-0.1.2/src/anysite/dataset/notifications.py +87 -0
- anysite_cli-0.1.2/src/anysite/dataset/scheduler.py +107 -0
- anysite_cli-0.1.2/src/anysite/dataset/storage.py +171 -0
- anysite_cli-0.1.2/src/anysite/dataset/transformer.py +213 -0
- anysite_cli-0.1.2/src/anysite/db/__init__.py +38 -0
- anysite_cli-0.1.2/src/anysite/db/adapters/__init__.py +1 -0
- anysite_cli-0.1.2/src/anysite/db/adapters/base.py +158 -0
- anysite_cli-0.1.2/src/anysite/db/adapters/postgres.py +201 -0
- anysite_cli-0.1.2/src/anysite/db/adapters/sqlite.py +183 -0
- anysite_cli-0.1.2/src/anysite/db/cli.py +709 -0
- anysite_cli-0.1.2/src/anysite/db/config.py +92 -0
- anysite_cli-0.1.2/src/anysite/db/manager.py +166 -0
- anysite_cli-0.1.2/src/anysite/db/operations/__init__.py +1 -0
- anysite_cli-0.1.2/src/anysite/db/operations/insert.py +199 -0
- anysite_cli-0.1.2/src/anysite/db/operations/query.py +43 -0
- anysite_cli-0.1.2/src/anysite/db/schema/__init__.py +1 -0
- anysite_cli-0.1.2/src/anysite/db/schema/inference.py +213 -0
- anysite_cli-0.1.2/src/anysite/db/schema/types.py +71 -0
- anysite_cli-0.1.2/src/anysite/db/utils/__init__.py +1 -0
- anysite_cli-0.1.2/src/anysite/db/utils/sanitize.py +99 -0
- anysite_cli-0.1.2/src/anysite/main.py +498 -0
- anysite_cli-0.1.2/src/anysite/models/__init__.py +1 -0
- anysite_cli-0.1.2/src/anysite/output/__init__.py +11 -0
- anysite_cli-0.1.2/src/anysite/output/console.py +45 -0
- anysite_cli-0.1.2/src/anysite/output/formatters.py +301 -0
- anysite_cli-0.1.2/src/anysite/output/templates.py +76 -0
- anysite_cli-0.1.2/src/anysite/py.typed +0 -0
- anysite_cli-0.1.2/src/anysite/streaming/__init__.py +1 -0
- anysite_cli-0.1.2/src/anysite/streaming/progress.py +121 -0
- anysite_cli-0.1.2/src/anysite/streaming/writer.py +130 -0
- anysite_cli-0.1.2/src/anysite/utils/__init__.py +1 -0
- anysite_cli-0.1.2/src/anysite/utils/fields.py +242 -0
- anysite_cli-0.1.2/src/anysite/utils/retry.py +109 -0
- anysite_cli-0.1.2/test_data/enriched_partners_sample_10.csv +11 -0
- anysite_cli-0.1.2/test_data/linkedin-partners/company_aliases.txt +10 -0
- anysite_cli-0.1.2/test_data/linkedin-partners/dataset.yaml +77 -0
- anysite_cli-0.1.2/test_data/partners-deep/dataset.yaml +102 -0
- anysite_cli-0.1.2/test_data/partners-intel/dataset.yaml +70 -0
- anysite_cli-0.1.2/test_data/partners-linkedin/company_aliases.txt +10 -0
- anysite_cli-0.1.2/test_data/partners-linkedin/dataset.yaml +103 -0
- anysite_cli-0.1.2/test_data/partners-pipeline/dataset.yaml +131 -0
- anysite_cli-0.1.2/tests/__init__.py +1 -0
- anysite_cli-0.1.2/tests/conftest.py +90 -0
- anysite_cli-0.1.2/tests/test_api/__init__.py +1 -0
- anysite_cli-0.1.2/tests/test_batch/__init__.py +0 -0
- anysite_cli-0.1.2/tests/test_batch/test_executor.py +128 -0
- anysite_cli-0.1.2/tests/test_batch/test_input.py +97 -0
- anysite_cli-0.1.2/tests/test_batch/test_rate_limiter.py +60 -0
- anysite_cli-0.1.2/tests/test_cli/__init__.py +1 -0
- anysite_cli-0.1.2/tests/test_cli/test_main.py +73 -0
- anysite_cli-0.1.2/tests/test_dataset/__init__.py +0 -0
- anysite_cli-0.1.2/tests/test_dataset/test_analyzer.py +158 -0
- anysite_cli-0.1.2/tests/test_dataset/test_collector.py +550 -0
- anysite_cli-0.1.2/tests/test_dataset/test_db_loader.py +346 -0
- anysite_cli-0.1.2/tests/test_dataset/test_exporters.py +115 -0
- anysite_cli-0.1.2/tests/test_dataset/test_history.py +104 -0
- anysite_cli-0.1.2/tests/test_dataset/test_models.py +251 -0
- anysite_cli-0.1.2/tests/test_dataset/test_notifications.py +81 -0
- anysite_cli-0.1.2/tests/test_dataset/test_scheduler.py +68 -0
- anysite_cli-0.1.2/tests/test_dataset/test_storage.py +148 -0
- anysite_cli-0.1.2/tests/test_dataset/test_transformer.py +143 -0
- anysite_cli-0.1.2/tests/test_db/__init__.py +0 -0
- anysite_cli-0.1.2/tests/test_db/test_cli.py +228 -0
- anysite_cli-0.1.2/tests/test_db/test_config.py +154 -0
- anysite_cli-0.1.2/tests/test_db/test_inference.py +169 -0
- anysite_cli-0.1.2/tests/test_db/test_insert.py +126 -0
- anysite_cli-0.1.2/tests/test_db/test_manager.py +112 -0
- anysite_cli-0.1.2/tests/test_db/test_postgres_adapter.py +428 -0
- anysite_cli-0.1.2/tests/test_db/test_sanitize.py +79 -0
- anysite_cli-0.1.2/tests/test_db/test_sqlite_adapter.py +198 -0
- anysite_cli-0.1.2/tests/test_output/__init__.py +1 -0
- anysite_cli-0.1.2/tests/test_output/test_formatters.py +151 -0
- anysite_cli-0.1.2/tests/test_output/test_templates.py +70 -0
- anysite_cli-0.1.2/tests/test_streaming/__init__.py +0 -0
- anysite_cli-0.1.2/tests/test_streaming/test_progress.py +47 -0
- anysite_cli-0.1.2/tests/test_streaming/test_writer.py +90 -0
- anysite_cli-0.1.2/tests/test_utils/__init__.py +0 -0
- anysite_cli-0.1.2/tests/test_utils/test_fields.py +103 -0
- anysite_cli-0.1.2/tests/test_utils/test_retry.py +102 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(source:*)",
|
|
5
|
+
"Bash(anysite linkedin user --help:*)",
|
|
6
|
+
"Bash(pytest:*)",
|
|
7
|
+
"Bash(tree:*)",
|
|
8
|
+
"Bash(find:*)",
|
|
9
|
+
"Bash(anysite:*)",
|
|
10
|
+
"Bash(anysite twitter user --help:*)",
|
|
11
|
+
"Bash(echo:*)",
|
|
12
|
+
"Bash(SCRATCHPAD=\"/private/tmp/claude-501/-Users-kulia-anysite-cli/fd73f4eb-9c04-4b25-8eca-83be56ae0e5e/scratchpad\":*)",
|
|
13
|
+
"Bash(export PG_TEST_PASS=testpass)",
|
|
14
|
+
"Bash(python -m pytest:*)"
|
|
15
|
+
]
|
|
16
|
+
}
|
|
17
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
.installed.cfg
|
|
25
|
+
*.egg
|
|
26
|
+
|
|
27
|
+
# PyInstaller
|
|
28
|
+
*.manifest
|
|
29
|
+
*.spec
|
|
30
|
+
|
|
31
|
+
# Installer logs
|
|
32
|
+
pip-log.txt
|
|
33
|
+
pip-delete-this-directory.txt
|
|
34
|
+
|
|
35
|
+
# Unit test / coverage reports
|
|
36
|
+
htmlcov/
|
|
37
|
+
.tox/
|
|
38
|
+
.nox/
|
|
39
|
+
.coverage
|
|
40
|
+
.coverage.*
|
|
41
|
+
.cache
|
|
42
|
+
nosetests.xml
|
|
43
|
+
coverage.xml
|
|
44
|
+
*.cover
|
|
45
|
+
*.py,cover
|
|
46
|
+
.hypothesis/
|
|
47
|
+
.pytest_cache/
|
|
48
|
+
|
|
49
|
+
# Translations
|
|
50
|
+
*.mo
|
|
51
|
+
*.pot
|
|
52
|
+
|
|
53
|
+
# Environments
|
|
54
|
+
.env
|
|
55
|
+
.env.local
|
|
56
|
+
.venv
|
|
57
|
+
env/
|
|
58
|
+
venv/
|
|
59
|
+
ENV/
|
|
60
|
+
env.bak/
|
|
61
|
+
venv.bak/
|
|
62
|
+
|
|
63
|
+
# IDE
|
|
64
|
+
.idea/
|
|
65
|
+
.vscode/
|
|
66
|
+
*.swp
|
|
67
|
+
*.swo
|
|
68
|
+
*~
|
|
69
|
+
|
|
70
|
+
# mypy
|
|
71
|
+
.mypy_cache/
|
|
72
|
+
.dmypy.json
|
|
73
|
+
dmypy.json
|
|
74
|
+
|
|
75
|
+
# ruff
|
|
76
|
+
.ruff_cache/
|
|
77
|
+
|
|
78
|
+
# OS
|
|
79
|
+
.DS_Store
|
|
80
|
+
Thumbs.db
|
|
81
|
+
|
|
82
|
+
# Project specific
|
|
83
|
+
*.log
|
|
84
|
+
.anysite/
|
|
85
|
+
|
|
86
|
+
# Internal docs
|
|
87
|
+
init_docs/
|
|
88
|
+
|
|
89
|
+
data/
|
|
90
|
+
issues/
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Commands
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Install with dev dependencies
|
|
9
|
+
pip install -e ".[dev]"
|
|
10
|
+
|
|
11
|
+
# Install with dataset support (duckdb, pyarrow)
|
|
12
|
+
pip install -e ".[dev,data]"
|
|
13
|
+
|
|
14
|
+
# Run all tests
|
|
15
|
+
pytest
|
|
16
|
+
|
|
17
|
+
# Run single test file
|
|
18
|
+
pytest tests/test_cli/test_main.py
|
|
19
|
+
|
|
20
|
+
# Run single test
|
|
21
|
+
pytest tests/test_cli/test_main.py::test_version
|
|
22
|
+
|
|
23
|
+
# Run with coverage
|
|
24
|
+
pytest --cov=anysite --cov-report=term-missing
|
|
25
|
+
|
|
26
|
+
# Lint and format
|
|
27
|
+
ruff check src/
|
|
28
|
+
ruff check src/ --fix
|
|
29
|
+
ruff format src/
|
|
30
|
+
|
|
31
|
+
# Type check
|
|
32
|
+
mypy src/
|
|
33
|
+
|
|
34
|
+
# Test CLI directly
|
|
35
|
+
anysite --help
|
|
36
|
+
anysite api /api/linkedin/user user=satyanadella
|
|
37
|
+
anysite describe /api/linkedin/user
|
|
38
|
+
anysite schema update
|
|
39
|
+
|
|
40
|
+
# Dataset commands
|
|
41
|
+
anysite dataset init my-dataset
|
|
42
|
+
anysite dataset collect dataset.yaml
|
|
43
|
+
anysite dataset collect dataset.yaml --source linkedin_profiles --incremental --dry-run
|
|
44
|
+
anysite dataset collect dataset.yaml --load-db pg
|
|
45
|
+
anysite dataset status dataset.yaml
|
|
46
|
+
anysite dataset query dataset.yaml --sql "SELECT * FROM profiles LIMIT 10"
|
|
47
|
+
anysite dataset query dataset.yaml --source profiles --fields "name, urn.value AS urn_id, headline"
|
|
48
|
+
anysite dataset query dataset.yaml --interactive
|
|
49
|
+
anysite dataset stats dataset.yaml --source profiles
|
|
50
|
+
anysite dataset profile dataset.yaml
|
|
51
|
+
anysite dataset load-db dataset.yaml -c pg --drop-existing
|
|
52
|
+
anysite dataset history my-dataset
|
|
53
|
+
anysite dataset logs my-dataset --run 42
|
|
54
|
+
anysite dataset schedule dataset.yaml --incremental --load-db pg
|
|
55
|
+
anysite dataset schedule dataset.yaml --systemd --load-db pg
|
|
56
|
+
anysite dataset reset-cursor dataset.yaml
|
|
57
|
+
anysite dataset reset-cursor dataset.yaml --source profiles
|
|
58
|
+
|
|
59
|
+
# Database commands
|
|
60
|
+
anysite db add mydb
|
|
61
|
+
anysite db list
|
|
62
|
+
anysite db test mydb
|
|
63
|
+
anysite db info mydb
|
|
64
|
+
anysite db remove mydb
|
|
65
|
+
anysite db schema mydb
|
|
66
|
+
anysite db schema mydb --table users
|
|
67
|
+
anysite db insert mydb --table users --stdin --auto-create
|
|
68
|
+
anysite db query mydb --sql "SELECT * FROM users LIMIT 10" --format table
|
|
69
|
+
anysite db upsert mydb --table users --conflict-columns id --stdin
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Architecture
|
|
73
|
+
|
|
74
|
+
**CLI Framework**: Typer with Rich for terminal output.
|
|
75
|
+
|
|
76
|
+
**Module Structure**:
|
|
77
|
+
- `main.py` - Typer app entry point. Registers `api`, `describe`, `schema`, `config`, `dataset` commands. Handles global options (`--api-key`, `--debug`, `--no-color`).
|
|
78
|
+
- `cli/config.py` - Config management commands (set, get, list, path, init, reset)
|
|
79
|
+
- `cli/executor.py` - Async execution wrappers: `run_search_command()` for list/search endpoints, `run_single_command()` for single-item + batch
|
|
80
|
+
- `cli/options.py` - Reusable Typer option type aliases (FormatOption, FieldsOption, etc.) and `ErrorHandling` enum
|
|
81
|
+
- `api/client.py` - Async HTTP client (`AnysiteClient`) with retry logic, exponential backoff, auth via `access-token` header
|
|
82
|
+
- `api/errors.py` - Exception hierarchy (AuthenticationError, RateLimitError, NotFoundError, ValidationError, ServerError, NetworkError, TimeoutError)
|
|
83
|
+
- `api/schemas.py` - OpenAPI schema cache: fetch spec, resolve `$ref`, extract input/output, search/list endpoints, auto-convert CLI arg types
|
|
84
|
+
- `config/settings.py` - Pydantic Settings with priority: CLI > ENV > config file > defaults
|
|
85
|
+
- `config/paths.py` - Config/cache file paths (`~/.anysite/config.yaml`, `~/.anysite/schema.json`)
|
|
86
|
+
- `output/formatters.py` - JSON, JSONL, CSV, Table formatters with field selection and exclusion
|
|
87
|
+
- `output/templates.py` - Filename templates for batch output (`{id}`, `{username}`, `{date}`, `{index}`)
|
|
88
|
+
- `batch/executor.py` - BatchExecutor: parallel/sequential execution with semaphore, error handling (stop/skip/retry), progress callbacks
|
|
89
|
+
- `batch/input.py` - InputParser: text, JSONL, CSV input file parsing
|
|
90
|
+
- `batch/rate_limiter.py` - Token bucket rate limiter (`"10/s"`, `"100/m"`)
|
|
91
|
+
- `streaming/writer.py` - StreamingWriter for JSONL/CSV with field filtering, append mode, auto-flush
|
|
92
|
+
- `streaming/progress.py` - Rich progress bars, auto-detect TTY, statistics
|
|
93
|
+
- `utils/fields.py` - Field selection with dot notation, array wildcards, built-in presets (minimal, contact, recruiting)
|
|
94
|
+
- `utils/retry.py` - RetryConfig and retry logic
|
|
95
|
+
- `dataset/__init__.py` - `check_data_deps()` — verifies optional duckdb/pyarrow are installed
|
|
96
|
+
- `dataset/models.py` - Pydantic models for dataset YAML config (`DatasetConfig`, `DatasetSource`, `SourceDependency`, `StorageConfig`, `TransformConfig`, `ExportDestination`, `ScheduleConfig`, `NotificationsConfig`, `WebhookNotification`), topological sort (Kahn's algorithm)
|
|
97
|
+
- `dataset/storage.py` - Parquet read/write via pyarrow, directory layout (`raw/<source_id>/<date>.parquet`), `MetadataStore` for `metadata.json`
|
|
98
|
+
- `dataset/collector.py` - Collection orchestrator: topo-sorted execution, three source types (independent, from_file, dependent), per-source transform/export, run history, notifications. Uses `BatchExecutor` + `AnysiteClient`
|
|
99
|
+
- `dataset/analyzer.py` - DuckDB analytics: SQL query, column stats, profile, interactive shell. Registers views over Parquet files
|
|
100
|
+
- `dataset/transformer.py` - `RecordTransformer`: safe filter parser (no `eval()`), field selection with dot-notation/aliases, static column injection. Filter syntax: `.field > 10`, `.status == "active"`, `and`/`or`
|
|
101
|
+
- `dataset/exporters.py` - Per-source export after Parquet write: `FileExporter` (JSON/JSONL/CSV with `{{date}}`/`{{source}}`/`{{dataset}}` path templates), `WebhookExporter` (POST records to URL)
|
|
102
|
+
- `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
|
|
103
|
+
- `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
|
|
104
|
+
- `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
|
|
105
|
+
- `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `history`, `logs`, `schedule`, `reset-cursor`
|
|
106
|
+
- `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
|
|
107
|
+
- `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
|
|
108
|
+
- `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
|
|
109
|
+
- `db/config.py` - `ConnectionConfig`, `DatabaseType`, `OnConflict` enums and models
|
|
110
|
+
- `db/manager.py` - `ConnectionManager`: named connections stored in `~/.anysite/connections.yaml`, adapter factory
|
|
111
|
+
- `db/adapters/base.py` - `DatabaseAdapter` ABC: connect, execute, fetch, insert_batch, create_table, transaction
|
|
112
|
+
- `db/adapters/sqlite.py` - `SQLiteAdapter`: stdlib sqlite3, WAL mode, FK support, JSON serialization
|
|
113
|
+
- `db/adapters/postgres.py` - `PostgresAdapter`: psycopg v3, JSONB support, parameterized queries
|
|
114
|
+
- `db/schema/inference.py` - `infer_table_schema()`: auto-detect column types from JSON data (integer, float, boolean, date, datetime, url, email, json, varchar, text)
|
|
115
|
+
- `db/schema/types.py` - `get_sql_type()`: maps inferred types to SQL types per dialect (sqlite, postgres, mysql)
|
|
116
|
+
- `db/operations/insert.py` - `insert_from_stream()`: batch insert with auto-create, conflict handling
|
|
117
|
+
- `db/operations/query.py` - `execute_query()`: SQL execution with output formatting
|
|
118
|
+
- `db/utils/sanitize.py` - `sanitize_identifier()`, `sanitize_table_name()`: safe SQL identifier quoting
|
|
119
|
+
- `db/cli.py` - Typer subcommands: `add`, `list`, `test`, `info`, `remove`, `schema`, `insert`, `upsert`, `query`, `create-table`
|
|
120
|
+
|
|
121
|
+
**API Pattern**: All Anysite API endpoints use POST with JSON body. Auth is via `access-token` header.
|
|
122
|
+
|
|
123
|
+
**Universal API Command**: Instead of per-platform CLI modules, a single `anysite api` command works with any endpoint. Parameters are `key=value` pairs, auto-typed via the schema cache.
|
|
124
|
+
|
|
125
|
+
**Two Execution Paths**:
|
|
126
|
+
- `execute_search_command()` - for list/search endpoints (single request, optional streaming)
|
|
127
|
+
- `execute_single_command()` - for single-item endpoints with optional batch support (from-file, stdin, parallel)
|
|
128
|
+
|
|
129
|
+
**Schema Cache**: `anysite schema update` fetches the OpenAPI spec, resolves all `$ref`/`allOf`/`anyOf`, and caches a compact representation to `~/.anysite/schema.json`. Used by `anysite describe` and for auto-typing `api` command parameters.
|
|
130
|
+
|
|
131
|
+
**Config Location**: `~/.anysite/config.yaml`
|
|
132
|
+
|
|
133
|
+
**Dataset Subsystem** (`anysite dataset`): Multi-source data collection, Parquet storage, DuckDB analytics, relational DB loading, per-source transforms/exports, run history, scheduling, and webhook notifications. Optional — requires `pip install anysite-cli[data]`. Registered in `main.py` via try/except ImportError.
|
|
134
|
+
|
|
135
|
+
**Dataset YAML Config**: Declarative multi-source pipelines. Three source types:
|
|
136
|
+
- **Independent** — single API call with `params`
|
|
137
|
+
- **from_file** — batch API calls with input values from CSV/JSONL/text file (`from_file` + `file_field` + `input_key`)
|
|
138
|
+
- **Dependent** — batch API calls using values extracted from a parent source's Parquet output (`dependency.from_source` + `dependency.field` + `input_key`)
|
|
139
|
+
|
|
140
|
+
Sources are topologically sorted by dependencies. `input_template` allows transforming extracted values before passing them to the API (e.g., `{type: company, value: "{value}"}`). Nested objects stored as JSON strings in Parquet are auto-parsed back when extracting with dot-notation paths.
|
|
141
|
+
|
|
142
|
+
**Per-Source Transform**: Optional `transform` block per source with `filter` (safe expression parser, e.g., `.count > 10 and .status == "active"`), `fields` (select/rename with dot-notation aliases), and `add_columns` (inject static values). Transforms apply to export destinations only — Parquet always stores full records to preserve dependency resolution.
|
|
143
|
+
|
|
144
|
+
**Per-Source Export**: Optional `export` list per source. Runs after Parquet write. Supports `type: file` (JSON/JSONL/CSV with `{{date}}`/`{{source}}`/`{{dataset}}` path templates) and `type: webhook` (POST records to URL with custom headers).
|
|
145
|
+
|
|
146
|
+
**Collect + Load-DB**: `anysite dataset collect --load-db <connection>` collects data and auto-loads into a database in one step. Used for scheduled pipelines.
|
|
147
|
+
|
|
148
|
+
**Run History**: `HistoryStore` records every collection run in SQLite (`~/.anysite/dataset_history.db`): start/finish time, status, record/source counts, duration, errors. `LogManager` stores per-run log files at `~/.anysite/logs/`.
|
|
149
|
+
|
|
150
|
+
**Scheduling**: `ScheduleGenerator` generates crontab entries and systemd timer/service units from `schedule.cron` in dataset config. Supports `--incremental` and `--load-db` flags in generated commands.
|
|
151
|
+
|
|
152
|
+
**Webhook Notifications**: `WebhookNotifier` sends POST notifications on collection complete/failure to URLs defined in `notifications.on_complete` / `notifications.on_failure`.
|
|
153
|
+
|
|
154
|
+
**Provenance Tracking**: Dependent and from_file source records are annotated with `_input_value` (the raw extracted value that produced the record) and `_parent_source` (parent source ID for dependent sources). This enables FK linking when loading into a relational database.
|
|
155
|
+
|
|
156
|
+
**Incremental Deduplication**: `MetadataStore` tracks which input values have been collected per source via `collected_inputs` in `metadata.json`. Running `--incremental` skips already-collected values for dependent and from_file sources. `anysite dataset reset-cursor` clears this state.
|
|
157
|
+
|
|
158
|
+
**Dot-Notation Query**: `expand_dot_fields()` converts `urn.value AS id` to `json_extract_string(urn, '$.value') AS id` for DuckDB queries. The `--source` and `--fields` options on `dataset query` auto-generate SQL with dot-notation expansion.
|
|
159
|
+
|
|
160
|
+
**Dataset DB Loading** (`dataset load-db`): `DatasetDbLoader` loads Parquet data into a relational database (SQLite/Postgres). Features:
|
|
161
|
+
- Schema inference from Parquet records via `infer_table_schema()`
|
|
162
|
+
- Auto-increment `id` primary key per table
|
|
163
|
+
- FK linking via provenance: parent `_input_value` → child `{parent}_id` column
|
|
164
|
+
- Optional `db_load` config per source: field selection, dot-notation extraction, custom table names, field exclusion
|
|
165
|
+
- Topological loading order (parents before children)
|
|
166
|
+
|
|
167
|
+
**Dataset Storage Layout**:
|
|
168
|
+
```
|
|
169
|
+
<storage.path>/
|
|
170
|
+
raw/<source_id>/<date>.parquet
|
|
171
|
+
metadata.json
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
**Database Subsystem** (`anysite db`): Named database connections, schema inspection, data insertion, SQL queries. Supports SQLite and PostgreSQL.
|
|
175
|
+
|
|
176
|
+
**Connection Storage**: `~/.anysite/connections.yaml`. Passwords are stored as environment variable references (`password_env: PG_PASS`).
|
|
177
|
+
|
|
178
|
+
**Adapter Pattern**: `DatabaseAdapter` ABC with implementations for SQLite (stdlib) and PostgreSQL (psycopg v3). Context manager for connect/disconnect. Methods: `execute`, `fetch_one`, `fetch_all`, `insert_batch`, `create_table`, `table_exists`, `get_table_schema`, `transaction`.
|
|
179
|
+
|
|
180
|
+
**Schema Inference**: `infer_table_schema()` auto-detects column types from JSON data: integer, float, boolean, date, datetime, url, email, json, varchar, text. Type merging across rows. Dialect-aware SQL type mapping (sqlite, postgres, mysql).
|
|
181
|
+
|
|
182
|
+
## Common CLI Options Pattern
|
|
183
|
+
|
|
184
|
+
Reusable Typer option type aliases are defined in `cli/options.py`:
|
|
185
|
+
- `FormatOption` - output format (json/jsonl/csv/table)
|
|
186
|
+
- `FieldsOption` - comma-separated field selection
|
|
187
|
+
- `OutputOption` - file path for output
|
|
188
|
+
- `QuietOption` - suppress non-data output
|
|
189
|
+
- `ExcludeOption` - fields to exclude
|
|
190
|
+
- `CompactOption` - compact JSON output
|
|
191
|
+
- `FromFileOption`, `StdinOption` - batch input
|
|
192
|
+
- `ParallelOption`, `DelayOption`, `RateLimitOption` - concurrency control
|
|
193
|
+
- `OnErrorOption` - error handling mode (stop/skip/retry)
|
|
194
|
+
- `ProgressOption`, `StatsOption`, `VerboseOption` - feedback
|
|
195
|
+
|
|
196
|
+
## Testing
|
|
197
|
+
|
|
198
|
+
Tests are in `tests/` with subdirectories mirroring `src/anysite/`:
|
|
199
|
+
- `test_cli/` — CLI commands
|
|
200
|
+
- `test_api/` — API client
|
|
201
|
+
- `test_batch/` — Batch executor, rate limiter, input parser
|
|
202
|
+
- `test_streaming/` — Progress and writer
|
|
203
|
+
- `test_output/` — Formatters and templates
|
|
204
|
+
- `test_utils/` — Field selection and retry
|
|
205
|
+
- `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications
|
|
206
|
+
- `test_db/` — Database adapters, schema inference, connection manager, operations
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anysite Team
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|