data-collection-framework 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_collection_framework-0.1.0/PKG-INFO +19 -0
- data_collection_framework-0.1.0/README.md +209 -0
- data_collection_framework-0.1.0/data_collection_framework.egg-info/PKG-INFO +19 -0
- data_collection_framework-0.1.0/data_collection_framework.egg-info/SOURCES.txt +53 -0
- data_collection_framework-0.1.0/data_collection_framework.egg-info/dependency_links.txt +1 -0
- data_collection_framework-0.1.0/data_collection_framework.egg-info/entry_points.txt +2 -0
- data_collection_framework-0.1.0/data_collection_framework.egg-info/requires.txt +15 -0
- data_collection_framework-0.1.0/data_collection_framework.egg-info/top_level.txt +1 -0
- data_collection_framework-0.1.0/dcf/__init__.py +4 -0
- data_collection_framework-0.1.0/dcf/cli.py +841 -0
- data_collection_framework-0.1.0/dcf/config/__init__.py +4 -0
- data_collection_framework-0.1.0/dcf/config/loader.py +77 -0
- data_collection_framework-0.1.0/dcf/config/models.py +240 -0
- data_collection_framework-0.1.0/dcf/engine/__init__.py +6 -0
- data_collection_framework-0.1.0/dcf/engine/fetcher.py +118 -0
- data_collection_framework-0.1.0/dcf/engine/iterator.py +96 -0
- data_collection_framework-0.1.0/dcf/engine/projector.py +56 -0
- data_collection_framework-0.1.0/dcf/engine/runner.py +90 -0
- data_collection_framework-0.1.0/dcf/engine/transforms.py +41 -0
- data_collection_framework-0.1.0/dcf/gcp/__init__.py +0 -0
- data_collection_framework-0.1.0/dcf/gcp/_collector_utils.py +87 -0
- data_collection_framework-0.1.0/dcf/gcp/auth.py +1 -0
- data_collection_framework-0.1.0/dcf/gcp/batch_deploy.py +548 -0
- data_collection_framework-0.1.0/dcf/gcp/bootstrap.py +131 -0
- data_collection_framework-0.1.0/dcf/gcp/gcloud.py +42 -0
- data_collection_framework-0.1.0/dcf/gcp/terraform.py +151 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/main.tf +32 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
- data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/variables.tf +25 -0
- data_collection_framework-0.1.0/dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
- data_collection_framework-0.1.0/dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
- data_collection_framework-0.1.0/dcf/infra/templates/docker-compose.yml.tftpl +76 -0
- data_collection_framework-0.1.0/dcf/local_deploy.py +756 -0
- data_collection_framework-0.1.0/dcf/project.py +23 -0
- data_collection_framework-0.1.0/dcf/spark_session.py +66 -0
- data_collection_framework-0.1.0/dcf/warehouse_reader.py +323 -0
- data_collection_framework-0.1.0/dcf/writer/__init__.py +3 -0
- data_collection_framework-0.1.0/dcf/writer/iceberg.py +315 -0
- data_collection_framework-0.1.0/pyproject.toml +38 -0
- data_collection_framework-0.1.0/setup.cfg +4 -0
- data_collection_framework-0.1.0/tests/test_deploy_cli.py +148 -0
- data_collection_framework-0.1.0/tests/test_deploy_model.py +83 -0
- data_collection_framework-0.1.0/tests/test_fetcher.py +69 -0
- data_collection_framework-0.1.0/tests/test_runner_errors.py +106 -0
- data_collection_framework-0.1.0/tests/test_transforms.py +38 -0
- data_collection_framework-0.1.0/tests/test_warehouse_reader.py +300 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: data-collection-framework
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Requires-Python: >=3.12
|
|
5
|
+
Requires-Dist: pyspark==4.0.2
|
|
6
|
+
Requires-Dist: pandas
|
|
7
|
+
Requires-Dist: requests
|
|
8
|
+
Requires-Dist: pyarrow
|
|
9
|
+
Requires-Dist: pytz
|
|
10
|
+
Requires-Dist: pydantic>=2
|
|
11
|
+
Requires-Dist: typer[all]>=0.9
|
|
12
|
+
Requires-Dist: google-auth>=2.29
|
|
13
|
+
Requires-Dist: google-cloud-storage>=2.16
|
|
14
|
+
Requires-Dist: google-cloud-secret-manager>=2.20
|
|
15
|
+
Requires-Dist: kafka-python>=2.0
|
|
16
|
+
Requires-Dist: google-api-python-client>=2.126
|
|
17
|
+
Requires-Dist: pyyaml>=6.0
|
|
18
|
+
Requires-Dist: mcp>=1.0
|
|
19
|
+
Requires-Dist: duckdb>=1.0
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# dcf
|
|
2
|
+
|
|
3
|
+
**D**ata **C**ollection **F**ramework
|
|
4
|
+
|
|
5
|
+
It works like this:
|
|
6
|
+
1. User defines collectors with basic configs in a YAML (like a dbt model)
|
|
7
|
+
2. dcf builds and runs the collector
|
|
8
|
+
3. The collected data lands in your data lake
|
|
9
|
+
|
|
10
|
+
## Quickstart
|
|
11
|
+
|
|
12
|
+
This guide walks you from zero to a working data collector. The example ingests the commit history of the dcf GitHub repository.
|
|
13
|
+
|
|
14
|
+
### 1. Create a project
|
|
15
|
+
|
|
16
|
+
dcf is a tool you depend on, not a repo you clone. Create a fresh directory:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
mkdir dcf-demo && cd dcf-demo
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**`pyproject.toml`:**
|
|
23
|
+
|
|
24
|
+
```toml
|
|
25
|
+
[project]
|
|
26
|
+
name = "dcf-demo"
|
|
27
|
+
version = "0.1.0"
|
|
28
|
+
requires-python = ">=3.12"
|
|
29
|
+
dependencies = [
|
|
30
|
+
"dcf",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[tool.uv]
|
|
34
|
+
package = false
|
|
35
|
+
|
|
36
|
+
[tool.uv.sources]
|
|
37
|
+
dcf = { git = "https://github.com/zephschafer/dcf.git" }
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
**`project.yml`:**
|
|
41
|
+
|
|
42
|
+
```yaml
|
|
43
|
+
catalog: local
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
**`.gitignore`:**
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
warehouse/
|
|
50
|
+
project.yml
|
|
51
|
+
.venv/
|
|
52
|
+
__pycache__/
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
mkdir collectors
|
|
57
|
+
uv sync
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
### 2. Write a collector
|
|
63
|
+
|
|
64
|
+
Create `collectors/dcf_commits.yml`:
|
|
65
|
+
|
|
66
|
+
```yaml
|
|
67
|
+
name: dcf_commits
|
|
68
|
+
namespace: github
|
|
69
|
+
description: Commits to the dcf repository.
|
|
70
|
+
|
|
71
|
+
source:
|
|
72
|
+
type: http
|
|
73
|
+
url: https://api.github.com/repos/zephschafer/dcf/commits
|
|
74
|
+
method: GET
|
|
75
|
+
params:
|
|
76
|
+
- name: sha
|
|
77
|
+
type: string
|
|
78
|
+
value: main
|
|
79
|
+
- name: per_page
|
|
80
|
+
type: integer
|
|
81
|
+
value: 100
|
|
82
|
+
schema:
|
|
83
|
+
columns:
|
|
84
|
+
- {name: sha, path: sha, type: string}
|
|
85
|
+
- {name: author, path: commit.author.name, type: string}
|
|
86
|
+
- {name: message, path: commit.message, type: string}
|
|
87
|
+
- {name: committed_at, path: commit.author.date, type: timestamp}
|
|
88
|
+
|
|
89
|
+
cadence:
|
|
90
|
+
strategy: incremental
|
|
91
|
+
primary_key: sha
|
|
92
|
+
|
|
93
|
+
deployment:
|
|
94
|
+
schedule: "0 8 * * *"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
### 3. Validate
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
uv run dcf validate dcf_commits
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
### 4. Run
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
uv run dcf run dcf_commits
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
### 5. Query the warehouse
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
uv run dcf query 'SELECT * FROM github.dcf_commits'
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
You can also save your SQL to a file and run it with `--file`:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
uv run dcf query --file my_query.sql
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
### 6. Deploy
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
uv run dcf deploy dcf_commits
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
This schedules the collector to run daily at 8 AM UTC, as configured in `deployment.schedule`.
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## Developing dcf
|
|
140
|
+
|
|
141
|
+
Clone this repo, then create or point to a project for testing:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
git clone https://github.com/Data-Dispatch/dcf
|
|
145
|
+
cd dcf
|
|
146
|
+
uv sync
|
|
147
|
+
|
|
148
|
+
# Test against the demo project
|
|
149
|
+
git clone https://github.com/Data-Dispatch/quipu-data-generator ../quipu-data-generator
|
|
150
|
+
cd ../quipu-data-generator
|
|
151
|
+
uv sync # picks up dcf from ../dcf via editable path dep
|
|
152
|
+
uv run dcf validate all
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Or create a minimal test project:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
mkdir my-test-project && cd my-test-project
|
|
159
|
+
cat > pyproject.toml << 'EOF'
|
|
160
|
+
[project]
|
|
161
|
+
name = "my-test-project"
|
|
162
|
+
version = "0.1.0"
|
|
163
|
+
requires-python = ">=3.12"
|
|
164
|
+
dependencies = ["dcf"]
|
|
165
|
+
|
|
166
|
+
[tool.uv]
|
|
167
|
+
package = false
|
|
168
|
+
|
|
169
|
+
[tool.uv.sources]
|
|
170
|
+
dcf = { path = "../dcf", editable = true }
|
|
171
|
+
EOF
|
|
172
|
+
|
|
173
|
+
cat > project.yml << 'EOF'
|
|
174
|
+
catalog: local
|
|
175
|
+
EOF
|
|
176
|
+
|
|
177
|
+
mkdir collectors
|
|
178
|
+
uv sync
|
|
179
|
+
uv run dcf validate all # "OK — 0 collector(s)"
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## dcf package structure
|
|
185
|
+
|
|
186
|
+
```
|
|
187
|
+
dcf/
|
|
188
|
+
├── cli.py Entry point (Typer app)
|
|
189
|
+
├── project.py Project root discovery (CWD walk / DCF_PROJECT_DIR)
|
|
190
|
+
├── spark_session.py PySpark + Iceberg session factory
|
|
191
|
+
├── mcp_server.py MCP server (FastMCP)
|
|
192
|
+
├── warehouse_reader.py DuckDB-based warehouse query layer
|
|
193
|
+
├── config/
|
|
194
|
+
│ ├── models.py Pydantic models for collector YAML
|
|
195
|
+
│ └── loader.py YAML loading + env var resolution
|
|
196
|
+
├── engine/
|
|
197
|
+
│ ├── runner.py Outer loop (expand cadence → fetch → project → write)
|
|
198
|
+
│ ├── fetcher.py HTTP and Python source fetchers
|
|
199
|
+
│ ├── iterator.py Cartesian iteration over date ranges and categoricals
|
|
200
|
+
│ ├── projector.py Schema projection (path extraction, transforms)
|
|
201
|
+
│ └── transforms.py Column transforms (crs_reproject, etc.)
|
|
202
|
+
├── writer/
|
|
203
|
+
│ └── iceberg.py Iceberg write strategies (incremental / append / full_refresh)
|
|
204
|
+
└── gcp/
|
|
205
|
+
├── bootstrap.py GCS bucket + service account provisioning
|
|
206
|
+
├── terraform.py Terraform wrapper for lake infrastructure
|
|
207
|
+
├── auth.py GCP credential helpers
|
|
208
|
+
└── gcloud.py gcloud CLI wrappers
|
|
209
|
+
```
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: data-collection-framework
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Requires-Python: >=3.12
|
|
5
|
+
Requires-Dist: pyspark==4.0.2
|
|
6
|
+
Requires-Dist: pandas
|
|
7
|
+
Requires-Dist: requests
|
|
8
|
+
Requires-Dist: pyarrow
|
|
9
|
+
Requires-Dist: pytz
|
|
10
|
+
Requires-Dist: pydantic>=2
|
|
11
|
+
Requires-Dist: typer[all]>=0.9
|
|
12
|
+
Requires-Dist: google-auth>=2.29
|
|
13
|
+
Requires-Dist: google-cloud-storage>=2.16
|
|
14
|
+
Requires-Dist: google-cloud-secret-manager>=2.20
|
|
15
|
+
Requires-Dist: kafka-python>=2.0
|
|
16
|
+
Requires-Dist: google-api-python-client>=2.126
|
|
17
|
+
Requires-Dist: pyyaml>=6.0
|
|
18
|
+
Requires-Dist: mcp>=1.0
|
|
19
|
+
Requires-Dist: duckdb>=1.0
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
data_collection_framework.egg-info/PKG-INFO
|
|
4
|
+
data_collection_framework.egg-info/SOURCES.txt
|
|
5
|
+
data_collection_framework.egg-info/dependency_links.txt
|
|
6
|
+
data_collection_framework.egg-info/entry_points.txt
|
|
7
|
+
data_collection_framework.egg-info/requires.txt
|
|
8
|
+
data_collection_framework.egg-info/top_level.txt
|
|
9
|
+
dcf/__init__.py
|
|
10
|
+
dcf/cli.py
|
|
11
|
+
dcf/local_deploy.py
|
|
12
|
+
dcf/project.py
|
|
13
|
+
dcf/spark_session.py
|
|
14
|
+
dcf/warehouse_reader.py
|
|
15
|
+
dcf/config/__init__.py
|
|
16
|
+
dcf/config/loader.py
|
|
17
|
+
dcf/config/models.py
|
|
18
|
+
dcf/engine/__init__.py
|
|
19
|
+
dcf/engine/fetcher.py
|
|
20
|
+
dcf/engine/iterator.py
|
|
21
|
+
dcf/engine/projector.py
|
|
22
|
+
dcf/engine/runner.py
|
|
23
|
+
dcf/engine/transforms.py
|
|
24
|
+
dcf/gcp/__init__.py
|
|
25
|
+
dcf/gcp/_collector_utils.py
|
|
26
|
+
dcf/gcp/auth.py
|
|
27
|
+
dcf/gcp/batch_deploy.py
|
|
28
|
+
dcf/gcp/bootstrap.py
|
|
29
|
+
dcf/gcp/gcloud.py
|
|
30
|
+
dcf/gcp/terraform.py
|
|
31
|
+
dcf/infra/modules/batch_collector/gcp/main.tf
|
|
32
|
+
dcf/infra/modules/batch_collector/gcp/outputs.tf
|
|
33
|
+
dcf/infra/modules/batch_collector/gcp/variables.tf
|
|
34
|
+
dcf/infra/modules/batch_collector/gcp/airflow/main.tf
|
|
35
|
+
dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf
|
|
36
|
+
dcf/infra/modules/batch_collector/gcp/airflow/variables.tf
|
|
37
|
+
dcf/infra/modules/batch_collector/local/main.tf
|
|
38
|
+
dcf/infra/modules/batch_collector/local/outputs.tf
|
|
39
|
+
dcf/infra/modules/batch_collector/local/variables.tf
|
|
40
|
+
dcf/infra/modules/batch_collector/local/airflow/main.tf
|
|
41
|
+
dcf/infra/modules/batch_collector/local/airflow/outputs.tf
|
|
42
|
+
dcf/infra/modules/batch_collector/local/airflow/variables.tf
|
|
43
|
+
dcf/infra/templates/airflow.Dockerfile.tftpl
|
|
44
|
+
dcf/infra/templates/batch_collector.Dockerfile.tftpl
|
|
45
|
+
dcf/infra/templates/docker-compose.yml.tftpl
|
|
46
|
+
dcf/writer/__init__.py
|
|
47
|
+
dcf/writer/iceberg.py
|
|
48
|
+
tests/test_deploy_cli.py
|
|
49
|
+
tests/test_deploy_model.py
|
|
50
|
+
tests/test_fetcher.py
|
|
51
|
+
tests/test_runner_errors.py
|
|
52
|
+
tests/test_transforms.py
|
|
53
|
+
tests/test_warehouse_reader.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
pyspark==4.0.2
|
|
2
|
+
pandas
|
|
3
|
+
requests
|
|
4
|
+
pyarrow
|
|
5
|
+
pytz
|
|
6
|
+
pydantic>=2
|
|
7
|
+
typer[all]>=0.9
|
|
8
|
+
google-auth>=2.29
|
|
9
|
+
google-cloud-storage>=2.16
|
|
10
|
+
google-cloud-secret-manager>=2.20
|
|
11
|
+
kafka-python>=2.0
|
|
12
|
+
google-api-python-client>=2.126
|
|
13
|
+
pyyaml>=6.0
|
|
14
|
+
mcp>=1.0
|
|
15
|
+
duckdb>=1.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dcf
|