data-collection-framework 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. data_collection_framework-0.1.0/PKG-INFO +19 -0
  2. data_collection_framework-0.1.0/README.md +209 -0
  3. data_collection_framework-0.1.0/data_collection_framework.egg-info/PKG-INFO +19 -0
  4. data_collection_framework-0.1.0/data_collection_framework.egg-info/SOURCES.txt +53 -0
  5. data_collection_framework-0.1.0/data_collection_framework.egg-info/dependency_links.txt +1 -0
  6. data_collection_framework-0.1.0/data_collection_framework.egg-info/entry_points.txt +2 -0
  7. data_collection_framework-0.1.0/data_collection_framework.egg-info/requires.txt +15 -0
  8. data_collection_framework-0.1.0/data_collection_framework.egg-info/top_level.txt +1 -0
  9. data_collection_framework-0.1.0/dcf/__init__.py +4 -0
  10. data_collection_framework-0.1.0/dcf/cli.py +841 -0
  11. data_collection_framework-0.1.0/dcf/config/__init__.py +4 -0
  12. data_collection_framework-0.1.0/dcf/config/loader.py +77 -0
  13. data_collection_framework-0.1.0/dcf/config/models.py +240 -0
  14. data_collection_framework-0.1.0/dcf/engine/__init__.py +6 -0
  15. data_collection_framework-0.1.0/dcf/engine/fetcher.py +118 -0
  16. data_collection_framework-0.1.0/dcf/engine/iterator.py +96 -0
  17. data_collection_framework-0.1.0/dcf/engine/projector.py +56 -0
  18. data_collection_framework-0.1.0/dcf/engine/runner.py +90 -0
  19. data_collection_framework-0.1.0/dcf/engine/transforms.py +41 -0
  20. data_collection_framework-0.1.0/dcf/gcp/__init__.py +0 -0
  21. data_collection_framework-0.1.0/dcf/gcp/_collector_utils.py +87 -0
  22. data_collection_framework-0.1.0/dcf/gcp/auth.py +1 -0
  23. data_collection_framework-0.1.0/dcf/gcp/batch_deploy.py +548 -0
  24. data_collection_framework-0.1.0/dcf/gcp/bootstrap.py +131 -0
  25. data_collection_framework-0.1.0/dcf/gcp/gcloud.py +42 -0
  26. data_collection_framework-0.1.0/dcf/gcp/terraform.py +151 -0
  27. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/airflow/main.tf +194 -0
  28. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf +9 -0
  29. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/airflow/variables.tf +52 -0
  30. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/main.tf +70 -0
  31. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/outputs.tf +4 -0
  32. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/gcp/variables.tf +40 -0
  33. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/airflow/main.tf +64 -0
  34. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/airflow/outputs.tf +9 -0
  35. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/airflow/variables.tf +59 -0
  36. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/main.tf +32 -0
  37. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/outputs.tf +4 -0
  38. data_collection_framework-0.1.0/dcf/infra/modules/batch_collector/local/variables.tf +25 -0
  39. data_collection_framework-0.1.0/dcf/infra/templates/airflow.Dockerfile.tftpl +6 -0
  40. data_collection_framework-0.1.0/dcf/infra/templates/batch_collector.Dockerfile.tftpl +14 -0
  41. data_collection_framework-0.1.0/dcf/infra/templates/docker-compose.yml.tftpl +76 -0
  42. data_collection_framework-0.1.0/dcf/local_deploy.py +756 -0
  43. data_collection_framework-0.1.0/dcf/project.py +23 -0
  44. data_collection_framework-0.1.0/dcf/spark_session.py +66 -0
  45. data_collection_framework-0.1.0/dcf/warehouse_reader.py +323 -0
  46. data_collection_framework-0.1.0/dcf/writer/__init__.py +3 -0
  47. data_collection_framework-0.1.0/dcf/writer/iceberg.py +315 -0
  48. data_collection_framework-0.1.0/pyproject.toml +38 -0
  49. data_collection_framework-0.1.0/setup.cfg +4 -0
  50. data_collection_framework-0.1.0/tests/test_deploy_cli.py +148 -0
  51. data_collection_framework-0.1.0/tests/test_deploy_model.py +83 -0
  52. data_collection_framework-0.1.0/tests/test_fetcher.py +69 -0
  53. data_collection_framework-0.1.0/tests/test_runner_errors.py +106 -0
  54. data_collection_framework-0.1.0/tests/test_transforms.py +38 -0
  55. data_collection_framework-0.1.0/tests/test_warehouse_reader.py +300 -0
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-collection-framework
3
+ Version: 0.1.0
4
+ Requires-Python: >=3.12
5
+ Requires-Dist: pyspark==4.0.2
6
+ Requires-Dist: pandas
7
+ Requires-Dist: requests
8
+ Requires-Dist: pyarrow
9
+ Requires-Dist: pytz
10
+ Requires-Dist: pydantic>=2
11
+ Requires-Dist: typer[all]>=0.9
12
+ Requires-Dist: google-auth>=2.29
13
+ Requires-Dist: google-cloud-storage>=2.16
14
+ Requires-Dist: google-cloud-secret-manager>=2.20
15
+ Requires-Dist: kafka-python>=2.0
16
+ Requires-Dist: google-api-python-client>=2.126
17
+ Requires-Dist: pyyaml>=6.0
18
+ Requires-Dist: mcp>=1.0
19
+ Requires-Dist: duckdb>=1.0
@@ -0,0 +1,209 @@
1
+ # dcf
2
+
3
+ D.ata C.ollection F.ramework
4
+
5
+ It works like this:
6
+ 1. User defines collectors with basic configs in a YAML file (like a dbt model)
7
+ 2. dcf builds and runs the collector
8
+ 3. Data lake has data
9
+
10
+ ## Quickstart
11
+
12
+ This guide walks you from zero to a working data collector. The example ingests the commit history of the dcf GitHub repository.
13
+
14
+ ### 1. Create a project
15
+
16
+ dcf is a tool you depend on, not a repo you clone. Create a fresh directory:
17
+
18
+ ```bash
19
+ mkdir dcf-demo && cd dcf-demo
20
+ ```
21
+
22
+ **`pyproject.toml`:**
23
+
24
+ ```toml
25
+ [project]
26
+ name = "dcf-demo"
27
+ version = "0.1.0"
28
+ requires-python = ">=3.12"
29
+ dependencies = [
30
+ "dcf",
31
+ ]
32
+
33
+ [tool.uv]
34
+ package = false
35
+
36
+ [tool.uv.sources]
37
+ dcf = { git = "https://github.com/zephschafer/dcf.git" }
38
+ ```
39
+
40
+ **`project.yml`**
41
+
42
+ ```yaml
43
+ catalog: local
44
+ ```
45
+
46
+ **`.gitignore`:**
47
+
48
+ ```
49
+ warehouse/
50
+ project.yml
51
+ .venv/
52
+ __pycache__/
53
+ ```
54
+
55
+ ```bash
56
+ mkdir collectors
57
+ uv sync
58
+ ```
59
+
60
+ ---
61
+
62
+ ### 2. Write a collector
63
+
64
+ Create `collectors/dcf_commits.yml`:
65
+
66
+ ```yaml
67
+ name: dcf_commits
68
+ namespace: github
69
+ description: Commits to the dcf repository.
70
+
71
+ source:
72
+ type: http
73
+ url: https://api.github.com/repos/zephschafer/dcf/commits
74
+ method: GET
75
+ params:
76
+ - name: sha
77
+ type: string
78
+ value: main
79
+ - name: per_page
80
+ type: integer
81
+ value: 100
82
+ schema:
83
+ columns:
84
+ - {name: sha, path: sha, type: string}
85
+ - {name: author, path: commit.author.name, type: string}
86
+ - {name: message, path: commit.message, type: string}
87
+ - {name: committed_at, path: commit.author.date, type: timestamp}
88
+
89
+ cadence:
90
+ strategy: incremental
91
+ primary_key: sha
92
+
93
+ deployment:
94
+ schedule: "0 8 * * *"
95
+ ```
96
+
97
+ ---
98
+
99
+ ### 3. Validate
100
+
101
+ ```bash
102
+ uv run dcf validate dcf_commits
103
+ ```
104
+
105
+ ---
106
+
107
+ ### 4. Run
108
+
109
+ ```bash
110
+ uv run dcf run dcf_commits
111
+ ```
112
+
113
+ ---
114
+
115
+ ### 5. Query the warehouse
116
+
117
+ ```bash
118
+ uv run dcf query 'SELECT * FROM github.dcf_commits'
119
+ ```
120
+
121
+ You can also save your SQL to a file and run it with `--file`:
122
+
123
+ ```bash
124
+ uv run dcf query --file my_query.sql
125
+ ```
126
+
127
+ ---
128
+
129
+ ### 6. Deploy
130
+
131
+ ```bash
132
+ uv run dcf deploy dcf_commits
133
+ ```
134
+
135
+ This schedules the collector to run daily at 8 AM UTC, as configured in `deployment.schedule`.
136
+
137
+ ---
138
+
139
+ ## Developing dcf
140
+
141
+ Clone this repo, then create or point to a project for testing:
142
+
143
+ ```bash
144
+ git clone https://github.com/Data-Dispatch/dcf
145
+ cd dcf
146
+ uv sync
147
+
148
+ # Test against the demo project
149
+ git clone https://github.com/Data-Dispatch/quipu-data-generator ../quipu-data-generator
150
+ cd ../quipu-data-generator
151
+ uv sync # picks up dcf from ../dcf via editable path dep
152
+ uv run dcf validate all
153
+ ```
154
+
155
+ Or create a minimal test project:
156
+
157
+ ```bash
158
+ mkdir my-test-project && cd my-test-project
159
+ cat > pyproject.toml << 'EOF'
160
+ [project]
161
+ name = "my-test-project"
162
+ version = "0.1.0"
163
+ requires-python = ">=3.12"
164
+ dependencies = ["dcf"]
165
+
166
+ [tool.uv]
167
+ package = false
168
+
169
+ [tool.uv.sources]
170
+ dcf = { path = "../dcf", editable = true }
171
+ EOF
172
+
173
+ cat > project.yml << 'EOF'
174
+ catalog: local
175
+ EOF
176
+
177
+ mkdir collectors
178
+ uv sync
179
+ uv run dcf validate all # "OK — 0 collector(s)"
180
+ ```
181
+
182
+ ---
183
+
184
+ ## dcf package structure
185
+
186
+ ```
187
+ dcf/
188
+ ├── cli.py Entry point (Typer app)
189
+ ├── project.py Project root discovery (CWD walk / DCF_PROJECT_DIR)
190
+ ├── spark_session.py PySpark + Iceberg session factory
191
+ ├── mcp_server.py MCP server (FastMCP)
192
+ ├── warehouse_reader.py DuckDB-based warehouse query layer
193
+ ├── config/
194
+ │ ├── models.py Pydantic models for collector YAML
195
+ │ └── loader.py YAML loading + env var resolution
196
+ ├── engine/
197
+ │ ├── runner.py Outer loop (expand cadence → fetch → project → write)
198
+ │ ├── fetcher.py HTTP and Python source fetchers
199
+ │ ├── iterator.py Cartesian iteration over date ranges and categoricals
200
+ │ ├── projector.py Schema projection (path extraction, transforms)
201
+ │ └── transforms.py Column transforms (crs_reproject, etc.)
202
+ ├── writer/
203
+ │ └── iceberg.py Iceberg write strategies (incremental / append / full_refresh)
204
+ └── gcp/
205
+ ├── bootstrap.py GCS bucket + service account provisioning
206
+ ├── terraform.py Terraform wrapper for lake infrastructure
207
+ ├── auth.py GCP credential helpers
208
+ └── gcloud.py gcloud CLI wrappers
209
+ ```
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: data-collection-framework
3
+ Version: 0.1.0
4
+ Requires-Python: >=3.12
5
+ Requires-Dist: pyspark==4.0.2
6
+ Requires-Dist: pandas
7
+ Requires-Dist: requests
8
+ Requires-Dist: pyarrow
9
+ Requires-Dist: pytz
10
+ Requires-Dist: pydantic>=2
11
+ Requires-Dist: typer[all]>=0.9
12
+ Requires-Dist: google-auth>=2.29
13
+ Requires-Dist: google-cloud-storage>=2.16
14
+ Requires-Dist: google-cloud-secret-manager>=2.20
15
+ Requires-Dist: kafka-python>=2.0
16
+ Requires-Dist: google-api-python-client>=2.126
17
+ Requires-Dist: pyyaml>=6.0
18
+ Requires-Dist: mcp>=1.0
19
+ Requires-Dist: duckdb>=1.0
@@ -0,0 +1,53 @@
1
+ README.md
2
+ pyproject.toml
3
+ data_collection_framework.egg-info/PKG-INFO
4
+ data_collection_framework.egg-info/SOURCES.txt
5
+ data_collection_framework.egg-info/dependency_links.txt
6
+ data_collection_framework.egg-info/entry_points.txt
7
+ data_collection_framework.egg-info/requires.txt
8
+ data_collection_framework.egg-info/top_level.txt
9
+ dcf/__init__.py
10
+ dcf/cli.py
11
+ dcf/local_deploy.py
12
+ dcf/project.py
13
+ dcf/spark_session.py
14
+ dcf/warehouse_reader.py
15
+ dcf/config/__init__.py
16
+ dcf/config/loader.py
17
+ dcf/config/models.py
18
+ dcf/engine/__init__.py
19
+ dcf/engine/fetcher.py
20
+ dcf/engine/iterator.py
21
+ dcf/engine/projector.py
22
+ dcf/engine/runner.py
23
+ dcf/engine/transforms.py
24
+ dcf/gcp/__init__.py
25
+ dcf/gcp/_collector_utils.py
26
+ dcf/gcp/auth.py
27
+ dcf/gcp/batch_deploy.py
28
+ dcf/gcp/bootstrap.py
29
+ dcf/gcp/gcloud.py
30
+ dcf/gcp/terraform.py
31
+ dcf/infra/modules/batch_collector/gcp/main.tf
32
+ dcf/infra/modules/batch_collector/gcp/outputs.tf
33
+ dcf/infra/modules/batch_collector/gcp/variables.tf
34
+ dcf/infra/modules/batch_collector/gcp/airflow/main.tf
35
+ dcf/infra/modules/batch_collector/gcp/airflow/outputs.tf
36
+ dcf/infra/modules/batch_collector/gcp/airflow/variables.tf
37
+ dcf/infra/modules/batch_collector/local/main.tf
38
+ dcf/infra/modules/batch_collector/local/outputs.tf
39
+ dcf/infra/modules/batch_collector/local/variables.tf
40
+ dcf/infra/modules/batch_collector/local/airflow/main.tf
41
+ dcf/infra/modules/batch_collector/local/airflow/outputs.tf
42
+ dcf/infra/modules/batch_collector/local/airflow/variables.tf
43
+ dcf/infra/templates/airflow.Dockerfile.tftpl
44
+ dcf/infra/templates/batch_collector.Dockerfile.tftpl
45
+ dcf/infra/templates/docker-compose.yml.tftpl
46
+ dcf/writer/__init__.py
47
+ dcf/writer/iceberg.py
48
+ tests/test_deploy_cli.py
49
+ tests/test_deploy_model.py
50
+ tests/test_fetcher.py
51
+ tests/test_runner_errors.py
52
+ tests/test_transforms.py
53
+ tests/test_warehouse_reader.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dcf = dcf.cli:app
@@ -0,0 +1,15 @@
1
+ pyspark==4.0.2
2
+ pandas
3
+ requests
4
+ pyarrow
5
+ pytz
6
+ pydantic>=2
7
+ typer[all]>=0.9
8
+ google-auth>=2.29
9
+ google-cloud-storage>=2.16
10
+ google-cloud-secret-manager>=2.20
11
+ kafka-python>=2.0
12
+ google-api-python-client>=2.126
13
+ pyyaml>=6.0
14
+ mcp>=1.0
15
+ duckdb>=1.0
@@ -0,0 +1,4 @@
1
+ from .engine.runner import run_collector
2
+ from .config import load_collector, load_all_collectors
3
+
4
+ __all__ = ["run_collector", "load_collector", "load_all_collectors"]