anysite-cli 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. anysite_cli-0.1.2/.claude/settings.local.json +17 -0
  2. anysite_cli-0.1.2/.gitignore +90 -0
  3. anysite_cli-0.1.2/CLAUDE.md +206 -0
  4. anysite_cli-0.1.2/LICENSE +21 -0
  5. anysite_cli-0.1.2/PKG-INFO +455 -0
  6. anysite_cli-0.1.2/README.md +392 -0
  7. anysite_cli-0.1.2/pyproject.toml +141 -0
  8. anysite_cli-0.1.2/skills/anysite-cli/SKILL.md +303 -0
  9. anysite_cli-0.1.2/skills/anysite-cli/references/api-reference.md +160 -0
  10. anysite_cli-0.1.2/skills/anysite-cli/references/dataset-guide.md +441 -0
  11. anysite_cli-0.1.2/src/anysite/__init__.py +4 -0
  12. anysite_cli-0.1.2/src/anysite/__main__.py +6 -0
  13. anysite_cli-0.1.2/src/anysite/api/__init__.py +21 -0
  14. anysite_cli-0.1.2/src/anysite/api/client.py +271 -0
  15. anysite_cli-0.1.2/src/anysite/api/errors.py +137 -0
  16. anysite_cli-0.1.2/src/anysite/api/schemas.py +333 -0
  17. anysite_cli-0.1.2/src/anysite/batch/__init__.py +1 -0
  18. anysite_cli-0.1.2/src/anysite/batch/executor.py +176 -0
  19. anysite_cli-0.1.2/src/anysite/batch/input.py +160 -0
  20. anysite_cli-0.1.2/src/anysite/batch/rate_limiter.py +98 -0
  21. anysite_cli-0.1.2/src/anysite/cli/__init__.py +1 -0
  22. anysite_cli-0.1.2/src/anysite/cli/config.py +176 -0
  23. anysite_cli-0.1.2/src/anysite/cli/executor.py +388 -0
  24. anysite_cli-0.1.2/src/anysite/cli/options.py +249 -0
  25. anysite_cli-0.1.2/src/anysite/config/__init__.py +11 -0
  26. anysite_cli-0.1.2/src/anysite/config/paths.py +46 -0
  27. anysite_cli-0.1.2/src/anysite/config/settings.py +187 -0
  28. anysite_cli-0.1.2/src/anysite/dataset/__init__.py +37 -0
  29. anysite_cli-0.1.2/src/anysite/dataset/analyzer.py +268 -0
  30. anysite_cli-0.1.2/src/anysite/dataset/cli.py +644 -0
  31. anysite_cli-0.1.2/src/anysite/dataset/collector.py +686 -0
  32. anysite_cli-0.1.2/src/anysite/dataset/db_loader.py +248 -0
  33. anysite_cli-0.1.2/src/anysite/dataset/errors.py +30 -0
  34. anysite_cli-0.1.2/src/anysite/dataset/exporters.py +121 -0
  35. anysite_cli-0.1.2/src/anysite/dataset/history.py +153 -0
  36. anysite_cli-0.1.2/src/anysite/dataset/models.py +245 -0
  37. anysite_cli-0.1.2/src/anysite/dataset/notifications.py +87 -0
  38. anysite_cli-0.1.2/src/anysite/dataset/scheduler.py +107 -0
  39. anysite_cli-0.1.2/src/anysite/dataset/storage.py +171 -0
  40. anysite_cli-0.1.2/src/anysite/dataset/transformer.py +213 -0
  41. anysite_cli-0.1.2/src/anysite/db/__init__.py +38 -0
  42. anysite_cli-0.1.2/src/anysite/db/adapters/__init__.py +1 -0
  43. anysite_cli-0.1.2/src/anysite/db/adapters/base.py +158 -0
  44. anysite_cli-0.1.2/src/anysite/db/adapters/postgres.py +201 -0
  45. anysite_cli-0.1.2/src/anysite/db/adapters/sqlite.py +183 -0
  46. anysite_cli-0.1.2/src/anysite/db/cli.py +709 -0
  47. anysite_cli-0.1.2/src/anysite/db/config.py +92 -0
  48. anysite_cli-0.1.2/src/anysite/db/manager.py +166 -0
  49. anysite_cli-0.1.2/src/anysite/db/operations/__init__.py +1 -0
  50. anysite_cli-0.1.2/src/anysite/db/operations/insert.py +199 -0
  51. anysite_cli-0.1.2/src/anysite/db/operations/query.py +43 -0
  52. anysite_cli-0.1.2/src/anysite/db/schema/__init__.py +1 -0
  53. anysite_cli-0.1.2/src/anysite/db/schema/inference.py +213 -0
  54. anysite_cli-0.1.2/src/anysite/db/schema/types.py +71 -0
  55. anysite_cli-0.1.2/src/anysite/db/utils/__init__.py +1 -0
  56. anysite_cli-0.1.2/src/anysite/db/utils/sanitize.py +99 -0
  57. anysite_cli-0.1.2/src/anysite/main.py +498 -0
  58. anysite_cli-0.1.2/src/anysite/models/__init__.py +1 -0
  59. anysite_cli-0.1.2/src/anysite/output/__init__.py +11 -0
  60. anysite_cli-0.1.2/src/anysite/output/console.py +45 -0
  61. anysite_cli-0.1.2/src/anysite/output/formatters.py +301 -0
  62. anysite_cli-0.1.2/src/anysite/output/templates.py +76 -0
  63. anysite_cli-0.1.2/src/anysite/py.typed +0 -0
  64. anysite_cli-0.1.2/src/anysite/streaming/__init__.py +1 -0
  65. anysite_cli-0.1.2/src/anysite/streaming/progress.py +121 -0
  66. anysite_cli-0.1.2/src/anysite/streaming/writer.py +130 -0
  67. anysite_cli-0.1.2/src/anysite/utils/__init__.py +1 -0
  68. anysite_cli-0.1.2/src/anysite/utils/fields.py +242 -0
  69. anysite_cli-0.1.2/src/anysite/utils/retry.py +109 -0
  70. anysite_cli-0.1.2/test_data/enriched_partners_sample_10.csv +11 -0
  71. anysite_cli-0.1.2/test_data/linkedin-partners/company_aliases.txt +10 -0
  72. anysite_cli-0.1.2/test_data/linkedin-partners/dataset.yaml +77 -0
  73. anysite_cli-0.1.2/test_data/partners-deep/dataset.yaml +102 -0
  74. anysite_cli-0.1.2/test_data/partners-intel/dataset.yaml +70 -0
  75. anysite_cli-0.1.2/test_data/partners-linkedin/company_aliases.txt +10 -0
  76. anysite_cli-0.1.2/test_data/partners-linkedin/dataset.yaml +103 -0
  77. anysite_cli-0.1.2/test_data/partners-pipeline/dataset.yaml +131 -0
  78. anysite_cli-0.1.2/tests/__init__.py +1 -0
  79. anysite_cli-0.1.2/tests/conftest.py +90 -0
  80. anysite_cli-0.1.2/tests/test_api/__init__.py +1 -0
  81. anysite_cli-0.1.2/tests/test_batch/__init__.py +0 -0
  82. anysite_cli-0.1.2/tests/test_batch/test_executor.py +128 -0
  83. anysite_cli-0.1.2/tests/test_batch/test_input.py +97 -0
  84. anysite_cli-0.1.2/tests/test_batch/test_rate_limiter.py +60 -0
  85. anysite_cli-0.1.2/tests/test_cli/__init__.py +1 -0
  86. anysite_cli-0.1.2/tests/test_cli/test_main.py +73 -0
  87. anysite_cli-0.1.2/tests/test_dataset/__init__.py +0 -0
  88. anysite_cli-0.1.2/tests/test_dataset/test_analyzer.py +158 -0
  89. anysite_cli-0.1.2/tests/test_dataset/test_collector.py +550 -0
  90. anysite_cli-0.1.2/tests/test_dataset/test_db_loader.py +346 -0
  91. anysite_cli-0.1.2/tests/test_dataset/test_exporters.py +115 -0
  92. anysite_cli-0.1.2/tests/test_dataset/test_history.py +104 -0
  93. anysite_cli-0.1.2/tests/test_dataset/test_models.py +251 -0
  94. anysite_cli-0.1.2/tests/test_dataset/test_notifications.py +81 -0
  95. anysite_cli-0.1.2/tests/test_dataset/test_scheduler.py +68 -0
  96. anysite_cli-0.1.2/tests/test_dataset/test_storage.py +148 -0
  97. anysite_cli-0.1.2/tests/test_dataset/test_transformer.py +143 -0
  98. anysite_cli-0.1.2/tests/test_db/__init__.py +0 -0
  99. anysite_cli-0.1.2/tests/test_db/test_cli.py +228 -0
  100. anysite_cli-0.1.2/tests/test_db/test_config.py +154 -0
  101. anysite_cli-0.1.2/tests/test_db/test_inference.py +169 -0
  102. anysite_cli-0.1.2/tests/test_db/test_insert.py +126 -0
  103. anysite_cli-0.1.2/tests/test_db/test_manager.py +112 -0
  104. anysite_cli-0.1.2/tests/test_db/test_postgres_adapter.py +428 -0
  105. anysite_cli-0.1.2/tests/test_db/test_sanitize.py +79 -0
  106. anysite_cli-0.1.2/tests/test_db/test_sqlite_adapter.py +198 -0
  107. anysite_cli-0.1.2/tests/test_output/__init__.py +1 -0
  108. anysite_cli-0.1.2/tests/test_output/test_formatters.py +151 -0
  109. anysite_cli-0.1.2/tests/test_output/test_templates.py +70 -0
  110. anysite_cli-0.1.2/tests/test_streaming/__init__.py +0 -0
  111. anysite_cli-0.1.2/tests/test_streaming/test_progress.py +47 -0
  112. anysite_cli-0.1.2/tests/test_streaming/test_writer.py +90 -0
  113. anysite_cli-0.1.2/tests/test_utils/__init__.py +0 -0
  114. anysite_cli-0.1.2/tests/test_utils/test_fields.py +103 -0
  115. anysite_cli-0.1.2/tests/test_utils/test_retry.py +102 -0
@@ -0,0 +1,17 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(source:*)",
5
+ "Bash(anysite linkedin user --help:*)",
6
+ "Bash(pytest:*)",
7
+ "Bash(tree:*)",
8
+ "Bash(find:*)",
9
+ "Bash(anysite:*)",
10
+ "Bash(anysite twitter user --help:*)",
11
+ "Bash(echo:*)",
12
+ "Bash(SCRATCHPAD=\"/private/tmp/claude-501/-Users-kulia-anysite-cli/fd73f4eb-9c04-4b25-8eca-83be56ae0e5e/scratchpad\":*)",
13
+ "Bash(export PG_TEST_PASS=testpass)",
14
+ "Bash(python -m pytest:*)"
15
+ ]
16
+ }
17
+ }
@@ -0,0 +1,90 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # PyInstaller
28
+ *.manifest
29
+ *.spec
30
+
31
+ # Installer logs
32
+ pip-log.txt
33
+ pip-delete-this-directory.txt
34
+
35
+ # Unit test / coverage reports
36
+ htmlcov/
37
+ .tox/
38
+ .nox/
39
+ .coverage
40
+ .coverage.*
41
+ .cache
42
+ nosetests.xml
43
+ coverage.xml
44
+ *.cover
45
+ *.py,cover
46
+ .hypothesis/
47
+ .pytest_cache/
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Environments
54
+ .env
55
+ .env.local
56
+ .venv
57
+ env/
58
+ venv/
59
+ ENV/
60
+ env.bak/
61
+ venv.bak/
62
+
63
+ # IDE
64
+ .idea/
65
+ .vscode/
66
+ *.swp
67
+ *.swo
68
+ *~
69
+
70
+ # mypy
71
+ .mypy_cache/
72
+ .dmypy.json
73
+ dmypy.json
74
+
75
+ # ruff
76
+ .ruff_cache/
77
+
78
+ # OS
79
+ .DS_Store
80
+ Thumbs.db
81
+
82
+ # Project specific
83
+ *.log
84
+ .anysite/
85
+
86
+ # Internal docs
87
+ init_docs/
88
+
89
+ data/
90
+ issues/
@@ -0,0 +1,206 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Commands
6
+
7
+ ```bash
8
+ # Install with dev dependencies
9
+ pip install -e ".[dev]"
10
+
11
+ # Install with dataset support (duckdb, pyarrow)
12
+ pip install -e ".[dev,data]"
13
+
14
+ # Run all tests
15
+ pytest
16
+
17
+ # Run single test file
18
+ pytest tests/test_cli/test_main.py
19
+
20
+ # Run single test
21
+ pytest tests/test_cli/test_main.py::test_version
22
+
23
+ # Run with coverage
24
+ pytest --cov=anysite --cov-report=term-missing
25
+
26
+ # Lint and format
27
+ ruff check src/
28
+ ruff check src/ --fix
29
+ ruff format src/
30
+
31
+ # Type check
32
+ mypy src/
33
+
34
+ # Test CLI directly
35
+ anysite --help
36
+ anysite api /api/linkedin/user user=satyanadella
37
+ anysite describe /api/linkedin/user
38
+ anysite schema update
39
+
40
+ # Dataset commands
41
+ anysite dataset init my-dataset
42
+ anysite dataset collect dataset.yaml
43
+ anysite dataset collect dataset.yaml --source linkedin_profiles --incremental --dry-run
44
+ anysite dataset collect dataset.yaml --load-db pg
45
+ anysite dataset status dataset.yaml
46
+ anysite dataset query dataset.yaml --sql "SELECT * FROM profiles LIMIT 10"
47
+ anysite dataset query dataset.yaml --source profiles --fields "name, urn.value AS urn_id, headline"
48
+ anysite dataset query dataset.yaml --interactive
49
+ anysite dataset stats dataset.yaml --source profiles
50
+ anysite dataset profile dataset.yaml
51
+ anysite dataset load-db dataset.yaml -c pg --drop-existing
52
+ anysite dataset history my-dataset
53
+ anysite dataset logs my-dataset --run 42
54
+ anysite dataset schedule dataset.yaml --incremental --load-db pg
55
+ anysite dataset schedule dataset.yaml --systemd --load-db pg
56
+ anysite dataset reset-cursor dataset.yaml
57
+ anysite dataset reset-cursor dataset.yaml --source profiles
58
+
59
+ # Database commands
60
+ anysite db add mydb
61
+ anysite db list
62
+ anysite db test mydb
63
+ anysite db info mydb
64
+ anysite db remove mydb
65
+ anysite db schema mydb
66
+ anysite db schema mydb --table users
67
+ anysite db insert mydb --table users --stdin --auto-create
68
+ anysite db query mydb --sql "SELECT * FROM users LIMIT 10" --format table
69
+ anysite db upsert mydb --table users --conflict-columns id --stdin
70
+ ```
71
+
72
+ ## Architecture
73
+
74
+ **CLI Framework**: Typer with Rich for terminal output.
75
+
76
+ **Module Structure**:
77
+ - `main.py` - Typer app entry point. Registers `api`, `describe`, `schema`, `config`, `dataset` commands. Handles global options (`--api-key`, `--debug`, `--no-color`).
78
+ - `cli/config.py` - Config management commands (set, get, list, path, init, reset)
79
+ - `cli/executor.py` - Async execution wrappers: `run_search_command()` for list/search endpoints, `run_single_command()` for single-item + batch
80
+ - `cli/options.py` - Reusable Typer option type aliases (FormatOption, FieldsOption, etc.) and `ErrorHandling` enum
81
+ - `api/client.py` - Async HTTP client (`AnysiteClient`) with retry logic, exponential backoff, auth via `access-token` header
82
+ - `api/errors.py` - Exception hierarchy (AuthenticationError, RateLimitError, NotFoundError, ValidationError, ServerError, NetworkError, TimeoutError)
83
+ - `api/schemas.py` - OpenAPI schema cache: fetch spec, resolve `$ref`, extract input/output, search/list endpoints, auto-convert CLI arg types
84
+ - `config/settings.py` - Pydantic Settings with priority: CLI > ENV > config file > defaults
85
+ - `config/paths.py` - Config/cache file paths (`~/.anysite/config.yaml`, `~/.anysite/schema.json`)
86
+ - `output/formatters.py` - JSON, JSONL, CSV, Table formatters with field selection and exclusion
87
+ - `output/templates.py` - Filename templates for batch output (`{id}`, `{username}`, `{date}`, `{index}`)
88
+ - `batch/executor.py` - BatchExecutor: parallel/sequential execution with semaphore, error handling (stop/skip/retry), progress callbacks
89
+ - `batch/input.py` - InputParser: text, JSONL, CSV input file parsing
90
+ - `batch/rate_limiter.py` - Token bucket rate limiter (`"10/s"`, `"100/m"`)
91
+ - `streaming/writer.py` - StreamingWriter for JSONL/CSV with field filtering, append mode, auto-flush
92
+ - `streaming/progress.py` - Rich progress bars, auto-detect TTY, statistics
93
+ - `utils/fields.py` - Field selection with dot notation, array wildcards, built-in presets (minimal, contact, recruiting)
94
+ - `utils/retry.py` - RetryConfig and retry logic
95
+ - `dataset/__init__.py` - `check_data_deps()` — verifies optional duckdb/pyarrow are installed
96
+ - `dataset/models.py` - Pydantic models for dataset YAML config (`DatasetConfig`, `DatasetSource`, `SourceDependency`, `StorageConfig`, `TransformConfig`, `ExportDestination`, `ScheduleConfig`, `NotificationsConfig`, `WebhookNotification`), topological sort (Kahn's algorithm)
97
+ - `dataset/storage.py` - Parquet read/write via pyarrow, directory layout (`raw/<source_id>/<date>.parquet`), `MetadataStore` for `metadata.json`
98
+ - `dataset/collector.py` - Collection orchestrator: topo-sorted execution, three source types (independent, from_file, dependent), per-source transform/export, run history, notifications. Uses `BatchExecutor` + `AnysiteClient`
99
+ - `dataset/analyzer.py` - DuckDB analytics: SQL query, column stats, profile, interactive shell. Registers views over Parquet files
100
+ - `dataset/transformer.py` - `RecordTransformer`: safe filter parser (no `eval()`), field selection with dot-notation/aliases, static column injection. Filter syntax: `.field > 10`, `.status == "active"`, `and`/`or`
101
+ - `dataset/exporters.py` - Per-source export after Parquet write: `FileExporter` (JSON/JSONL/CSV with `{{date}}`/`{{source}}`/`{{dataset}}` path templates), `WebhookExporter` (POST records to URL)
102
+ - `dataset/history.py` - `HistoryStore` (SQLite at `~/.anysite/dataset_history.db`): run start/finish tracking. `LogManager`: file-based per-run logs at `~/.anysite/logs/`
103
+ - `dataset/scheduler.py` - `ScheduleGenerator`: crontab and systemd timer unit generation from cron expressions
104
+ - `dataset/notifications.py` - `WebhookNotifier`: POST to webhook URLs on collection complete/failure
105
+ - `dataset/cli.py` - Typer subcommands: `init`, `collect` (with `--load-db`), `status`, `query`, `stats`, `profile`, `load-db`, `history`, `logs`, `schedule`, `reset-cursor`
106
+ - `dataset/db_loader.py` - `DatasetDbLoader`: loads Parquet data into relational DB with FK linking via provenance, dot-notation field extraction, schema inference
107
+ - `dataset/errors.py` - `DatasetError`, `CircularDependencyError`, `SourceNotFoundError`
108
+ - `db/__init__.py` - `check_db_deps()` — verifies optional psycopg is installed for Postgres
109
+ - `db/config.py` - `ConnectionConfig`, `DatabaseType`, `OnConflict` enums and models
110
+ - `db/manager.py` - `ConnectionManager`: named connections stored in `~/.anysite/connections.yaml`, adapter factory
111
+ - `db/adapters/base.py` - `DatabaseAdapter` ABC: connect, execute, fetch, insert_batch, create_table, transaction
112
+ - `db/adapters/sqlite.py` - `SQLiteAdapter`: stdlib sqlite3, WAL mode, FK support, JSON serialization
113
+ - `db/adapters/postgres.py` - `PostgresAdapter`: psycopg v3, JSONB support, parameterized queries
114
+ - `db/schema/inference.py` - `infer_table_schema()`: auto-detect column types from JSON data (integer, float, boolean, date, datetime, url, email, json, varchar, text)
115
+ - `db/schema/types.py` - `get_sql_type()`: maps inferred types to SQL types per dialect (sqlite, postgres, mysql)
116
+ - `db/operations/insert.py` - `insert_from_stream()`: batch insert with auto-create, conflict handling
117
+ - `db/operations/query.py` - `execute_query()`: SQL execution with output formatting
118
+ - `db/utils/sanitize.py` - `sanitize_identifier()`, `sanitize_table_name()`: safe SQL identifier quoting
119
+ - `db/cli.py` - Typer subcommands: `add`, `list`, `test`, `info`, `remove`, `schema`, `insert`, `upsert`, `query`, `create-table`
120
+
121
+ **API Pattern**: All Anysite API endpoints use POST with JSON body. Auth is via `access-token` header.
122
+
123
+ **Universal API Command**: Instead of per-platform CLI modules, a single `anysite api` command works with any endpoint. Parameters are `key=value` pairs, auto-typed via the schema cache.
124
+
125
+ **Two Execution Paths**:
126
+ - `execute_search_command()` - for list/search endpoints (single request, optional streaming)
127
+ - `execute_single_command()` - for single-item endpoints with optional batch support (from-file, stdin, parallel)
128
+
129
+ **Schema Cache**: `anysite schema update` fetches the OpenAPI spec, resolves all `$ref`/`allOf`/`anyOf`, and caches a compact representation to `~/.anysite/schema.json`. Used by `anysite describe` and for auto-typing `api` command parameters.
130
+
131
+ **Config Location**: `~/.anysite/config.yaml`
132
+
133
+ **Dataset Subsystem** (`anysite dataset`): Multi-source data collection, Parquet storage, DuckDB analytics, relational DB loading, per-source transforms/exports, run history, scheduling, and webhook notifications. Optional — requires `pip install "anysite-cli[data]"` (quote the extras so shells such as zsh do not treat the brackets as a glob). Registered in `main.py` via try/except ImportError.
134
+
135
+ **Dataset YAML Config**: Declarative multi-source pipelines. Three source types:
136
+ - **Independent** — single API call with `params`
137
+ - **from_file** — batch API calls with input values from CSV/JSONL/text file (`from_file` + `file_field` + `input_key`)
138
+ - **Dependent** — batch API calls using values extracted from a parent source's Parquet output (`dependency.from_source` + `dependency.field` + `input_key`)
139
+
140
+ Sources are topologically sorted by their dependencies. `input_template` transforms extracted values before they are passed to the API (e.g., `{type: company, value: "{value}"}`). Nested objects stored as JSON strings in Parquet are automatically parsed back into objects when values are extracted with dot-notation paths.
141
+
142
+ **Per-Source Transform**: Optional `transform` block per source with `filter` (safe expression parser, e.g., `.count > 10 and .status == "active"`), `fields` (select/rename with dot-notation aliases), and `add_columns` (inject static values). Transforms apply to export destinations only — Parquet always stores full records to preserve dependency resolution.
143
+
144
+ **Per-Source Export**: Optional `export` list per source. Runs after Parquet write. Supports `type: file` (JSON/JSONL/CSV with `{{date}}`/`{{source}}`/`{{dataset}}` path templates) and `type: webhook` (POST records to URL with custom headers).
145
+
146
+ **Collect + Load-DB**: `anysite dataset collect --load-db <connection>` collects data and auto-loads into a database in one step. Used for scheduled pipelines.
147
+
148
+ **Run History**: `HistoryStore` records every collection run in SQLite (`~/.anysite/dataset_history.db`): start/finish time, status, record/source counts, duration, errors. `LogManager` stores per-run log files at `~/.anysite/logs/`.
149
+
150
+ **Scheduling**: `ScheduleGenerator` generates crontab entries and systemd timer/service units from `schedule.cron` in dataset config. Supports `--incremental` and `--load-db` flags in generated commands.
151
+
152
+ **Webhook Notifications**: `WebhookNotifier` sends POST notifications on collection complete/failure to URLs defined in `notifications.on_complete` / `notifications.on_failure`.
153
+
154
+ **Provenance Tracking**: Dependent and from_file source records are annotated with `_input_value` (the raw extracted value that produced the record) and `_parent_source` (parent source ID for dependent sources). This enables FK linking when loading into a relational database.
155
+
156
+ **Incremental Deduplication**: `MetadataStore` tracks which input values have been collected per source via `collected_inputs` in `metadata.json`. Running a collection with `--incremental` skips already-collected values for dependent and from_file sources. `anysite dataset reset-cursor` clears this state.
157
+
158
+ **Dot-Notation Query**: `expand_dot_fields()` converts `urn.value AS id` to `json_extract_string(urn, '$.value') AS id` for DuckDB queries. The `--source` and `--fields` options on `dataset query` auto-generate SQL with dot-notation expansion.
159
+
160
+ **Dataset DB Loading** (`dataset load-db`): `DatasetDbLoader` loads Parquet data into a relational database (SQLite/Postgres). Features:
161
+ - Schema inference from Parquet records via `infer_table_schema()`
162
+ - Auto-increment `id` primary key per table
163
+ - FK linking via provenance: parent `_input_value` → child `{parent}_id` column
164
+ - Optional `db_load` config per source: field selection, dot-notation extraction, custom table names, field exclusion
165
+ - Topological loading order (parents before children)
166
+
167
+ **Dataset Storage Layout**:
168
+ ```
169
+ <storage.path>/
170
+ raw/<source_id>/<date>.parquet
171
+ metadata.json
172
+ ```
173
+
174
+ **Database Subsystem** (`anysite db`): Named database connections, schema inspection, data insertion, SQL queries. Supports SQLite and PostgreSQL.
175
+
176
+ **Connection Storage**: `~/.anysite/connections.yaml`. Passwords stored as environment variable references (`password_env: PG_PASS`).
177
+
178
+ **Adapter Pattern**: `DatabaseAdapter` ABC with implementations for SQLite (stdlib) and PostgreSQL (psycopg v3). Context manager for connect/disconnect. Methods: `execute`, `fetch_one`, `fetch_all`, `insert_batch`, `create_table`, `table_exists`, `get_table_schema`, `transaction`.
179
+
180
+ **Schema Inference**: `infer_table_schema()` auto-detects column types from JSON data: integer, float, boolean, date, datetime, url, email, json, varchar, text. Type merging across rows. Dialect-aware SQL type mapping (sqlite, postgres, mysql).
181
+
182
+ ## Common CLI Options Pattern
183
+
184
+ Reusable Typer option type aliases are defined in `cli/options.py`:
185
+ - `FormatOption` - output format (json/jsonl/csv/table)
186
+ - `FieldsOption` - comma-separated field selection
187
+ - `OutputOption` - file path for output
188
+ - `QuietOption` - suppress non-data output
189
+ - `ExcludeOption` - fields to exclude
190
+ - `CompactOption` - compact JSON output
191
+ - `FromFileOption`, `StdinOption` - batch input
192
+ - `ParallelOption`, `DelayOption`, `RateLimitOption` - concurrency control
193
+ - `OnErrorOption` - error handling mode (stop/skip/retry)
194
+ - `ProgressOption`, `StatsOption`, `VerboseOption` - feedback
195
+
196
+ ## Testing
197
+
198
+ Tests are in `tests/` with subdirectories mirroring `src/anysite/`:
199
+ - `test_cli/` — CLI commands
200
+ - `test_api/` — API client
201
+ - `test_batch/` — Batch executor, rate limiter, input parser
202
+ - `test_streaming/` — Progress and writer
203
+ - `test_output/` — Formatters and templates
204
+ - `test_utils/` — Field selection and retry
205
+ - `test_dataset/` — Dataset models, storage, collector (mocked API), DuckDB analyzer, DB loader (SQLite in-memory), transformer, exporters, history, scheduler, notifications
206
+ - `test_db/` — Database CLI commands, connection config, adapters (SQLite, Postgres), schema inference, connection manager, insert operations, identifier sanitization
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Anysite Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.