bblocks-datacommons-tools 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. bblocks_datacommons_tools-0.0.1/LICENSE +21 -0
  2. bblocks_datacommons_tools-0.0.1/PKG-INFO +224 -0
  3. bblocks_datacommons_tools-0.0.1/README.md +205 -0
  4. bblocks_datacommons_tools-0.0.1/pyproject.toml +35 -0
  5. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/__init__.py +16 -0
  6. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/__main__.py +6 -0
  7. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/cli/__init__.py +5 -0
  8. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/cli/common.py +9 -0
  9. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/cli/csv2mcf.py +93 -0
  10. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/cli/data_load.py +36 -0
  11. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/cli/data_load_pipeline.py +56 -0
  12. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/cli/main.py +44 -0
  13. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/cli/redeploy.py +34 -0
  14. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/cli/upload.py +37 -0
  15. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/__init__.py +5 -0
  16. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/config_utils.py +178 -0
  17. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/data_management.py +1092 -0
  18. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/models/__init__.py +0 -0
  19. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/models/common.py +91 -0
  20. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/models/config_file.py +87 -0
  21. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/models/data_files.py +127 -0
  22. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/models/mcf.py +182 -0
  23. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/models/sources.py +17 -0
  24. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/models/stat_vars.py +132 -0
  25. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/models/topics.py +34 -0
  26. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/custom_data/schema_tools.py +250 -0
  27. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/gcp_utilities/__init__.py +26 -0
  28. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/gcp_utilities/clients.py +33 -0
  29. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/gcp_utilities/jobs.py +121 -0
  30. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/gcp_utilities/pipeline.py +67 -0
  31. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/gcp_utilities/settings.py +129 -0
  32. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/gcp_utilities/storage.py +258 -0
  33. bblocks_datacommons_tools-0.0.1/src/bblocks/datacommons_tools/logger.py +26 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 ONE Campaign
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,224 @@
1
+ Metadata-Version: 2.3
2
+ Name: bblocks-datacommons-tools
3
+ Version: 0.0.1
4
+ Summary: Tools to work with Data Commons. Part of the bblocks projects.
5
+ License: MIT
6
+ Author: ONE Campaign
7
+ Requires-Python: >=3.11
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Requires-Dist: google-cloud-run (>=0.10.17,<0.11.0)
14
+ Requires-Dist: google-cloud-storage (>=3.1.0,<4.0.0)
15
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
16
+ Requires-Dist: pydantic (>=2.11.3,<3.0.0)
17
+ Requires-Dist: pydantic-settings (>=2.9.1,<3.0.0)
18
+ Description-Content-Type: text/markdown
19
+
20
+ # bblocks-datacommons-tools
21
+
22
+ __Manage and load data to custom Data Commons instances__
23
+
24
+ [![PyPI](https://img.shields.io/pypi/v/bblocks_datacommons_tools.svg)](https://pypi.org/project/bblocks_datacommons_tools/)
25
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/bblocks_datacommons_tools.svg)](https://pypi.org/project/bblocks_datacommons_tools/)
26
+ [![Docs](https://img.shields.io/badge/docs-bblocks-blue)](https://docs.one.org/tools/bblocks/datacommons_tools/)
27
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
28
+ [![codecov](https://codecov.io/gh/ONEcampaign/bblocks-datacommons-tools/graph/badge.svg?token=3ONEA8JQTC)](https://codecov.io/gh/ONEcampaign/bblocks-datacommons-tools)
29
+
30
+ Custom [Data Commons](https://docs.datacommons.org/custom_dc/custom_data.html) requires that you provide your data in a specific schema, format, and file structure.
31
+
32
+ At a high level, you need to provide the following:
33
+
34
+ - All observations data must be in CSV format, using a predefined schema.
35
+ - You must also provide a JSON configuration file, named `config.json`, that specifies how to map and resolve the CSV contents to the Data Commons schema knowledge graph.
36
+ - Depending on how you define your statistical variables (metrics), you may need to provide MCF (Meta Content Framework) files.
37
+ - You may also need to define new custom entities.
38
+
39
+ Managing this workflow by hand is tedious and easy to get wrong.
40
+
41
+ The `bblocks.datacommons_tools` package streamlines that process. It provides a Python API and command line utilities for building config files, generating MCF from CSV metadata and running the data load pipeline on Google Cloud.
42
+
43
+ Use this package when you want to:
44
+
45
+ - Manage `config.json` files programmatically.
46
+ - Define statistical variables, entities or groups using MCF files.
47
+ - Programmatically upload CSVs, MCF files, and the `config.json` file to Cloud Storage, trigger the load job and redeploy your custom Data Commons service with code.
48
+
49
+ In short, `datacommons-tools` removes much of the manual work involved in setting up and maintaining a custom Data Commons Knowledge Graph.
50
+
51
+ `bblocks-datacommons-tools` is part of the `bblocks` ecosystem,
52
+ a set of Python packages designed as building blocks for working with data in the international development
53
+ and humanitarian sectors.
54
+
55
+ Read the [documentation](https://docs.one.org/tools/bblocks/datacommons-tools/)
56
+ for more details on how to use the package and the motivation for its creation.
57
+
58
+
59
+ ## Installation
60
+
61
+ The package can be installed in various ways.
62
+
63
+ Directly as
64
+ ```bash
65
+ pip install bblocks-datacommons-tools
66
+ ```
67
+
68
+ Or from the main `bblocks` package with an extra:
69
+
70
+ ```bash
71
+ pip install "bblocks[datacommons-tools]"
72
+ ```
73
+
74
+ It can also be installed from GitHub:
75
+ ```bash
76
+ pip install git+https://github.com/ONEcampaign/bblocks-datacommons-tools
77
+ ```
78
+
79
+ ## Sample Usage
80
+
81
+ Here's a simple example covering how to use the "implicit" Data Commons
82
+ schema to load a single dataset. Please see the full [documentation page](https://docs.one.org/tools/bblocks/datacommons-tools/) for a thorough
83
+ introduction to the package, and to learn how to use it.
84
+
85
+
86
+ ### 1. Create a CustomDataManager object.
87
+
88
+ The CustomDataManager object will handle generating the `config.json` file, as well as (optionally) taking Pandas DataFrames and exporting them as CSVs (in the right format) for loading to the Knowledge Graph.
89
+
90
+ In this example, we assume a `config.json` does not yet exist.
91
+
92
+ ```python title="Instantiate the CustomDataManager class"
93
+ from bblocks.datacommons_tools import CustomDataManager
94
+
95
+ # Create the object and call it "manager"
96
+ manager = CustomDataManager()
97
+
98
+ # Configure it to include subdirectories
99
+ manager.set_includeInputSubdirs(True)
100
+
101
+ ```
102
+
103
+ ### 2. Add the provenance information for our data
104
+ You can add or manage provenance information on the `config.py` file.
105
+
106
+ In this example, we will add a provenance for ONE Data's Climate Finance Files.
107
+
108
+ ```python title="Add provenance and source"
109
+ manager.add_provenance(
110
+ provenance_name="ONE Climate Finance",
111
+ provenance_url="https://datacommons.one.org/data/climate-finance-files",
112
+ source_name="ONE Data",
113
+ source_url="https://data.one.org",
114
+ )
115
+ ```
116
+
117
+ ### 3. Add the data to the CustomDataManager object.
118
+ Next, you need to specify your data on the `config.json` file.
119
+
120
+ Adding actual data to the `CustomDataManager` is an optional step.
121
+
122
+ For this example, we will assume a DataFrame is available via the
123
+ `data` variable.
124
+
125
+ To add to the `CustomDataManager`, using the Implicit Schema:
126
+
127
+ ```python title="Register data"
128
+ manager.add_implicit_schema_file(
129
+ file_name="climate_finance/one_cf_provider_commitments.csv",
130
+ provenance="ONE Climate Finance",
131
+ entityType="Country",
132
+ data=data,
133
+ ignoreColumns=["oecd_provider_code"],
134
+ observationProperties={"unit": "USDollar"},
135
+ )
136
+ ```
137
+
138
+ Adding the data in the step above is optional. You can also create the inputFile in the config and add the data tied to that inputFile at a later stage by running:
139
+
140
+ ```python
141
+ manager.add_data(data=data, file_name='one_cf_provider_commitments.csv')
142
+ ```
143
+
144
+ Or you can manually add the relevant CSV file (matching what you declared as `file_name`).
145
+
146
+ ### 4. Add the indicators to config
147
+ Next, you need to specify information about the StatVars (variables) contained
148
+ in your data file(s).
149
+
150
+ When using the Implicit Schema, you can specify additional information.
151
+
152
+ For convenience, you could loop through a dictionary of indicators and information. For this example we'll add a single indicator.
153
+
154
+ ```python title="Register an indicator"
155
+ manager.add_variable_to_config(
156
+ statVar="climateFinanceProvidedCommitments",
157
+ name="Climate Finance Commitments (bilateral)",
158
+ group="ONE/Environment/Climate finance/Provider perspective/Commitments",
159
+ description="Funding for climate adaptation and mitigation projects",
160
+ searchDescriptions=[
161
+ "Climate finance commitments provided",
162
+ "Adaptation and mitigation finance provided",
163
+ ],
164
+ properties={"measurementMethod": "Commitment"},
165
+ )
166
+ ```
167
+
168
+ ### 5. Export the `config.json` and (optionally) data CSVs
169
+
170
+ Next, once all the data is added and the config is set up, you can export the `config.json` and data. When you export, the `config.json` is validated automatically.
171
+
172
+ ```python title="Export config and data"
173
+ manager.export_all("path/to/output/folder")
174
+ ```
175
+
176
+ ### 6. (Optionally) load to the Knowledge Graph
177
+ You can also programmatically push the data and config to a Google Cloud
178
+ Storage Bucket, trigger the data load job, and redeploy your Data Commons
179
+ instance.
180
+
181
+ To do this, you'll need to load information about your
182
+ project, Storage Bucket, etc. You can use `.env` or `.json` files,
183
+ or simply make the right information available as environment variables.
184
+ A detailed description of the needed information can be found in the documentation.
185
+
186
+ #### Load the settings
187
+ First, load the settings using `get_kg_settings`. In this example, we will load them from a `.env` file available in our working directory.
188
+
189
+ ```python title="Load settings"
190
+ from bblocks.datacommons_tools.gcp_utilities import (
191
+ upload_to_cloud_storage,
192
+ run_data_load,
193
+ redeploy_service,
194
+ get_kg_settings,
195
+ )
196
+
197
+ settings = get_kg_settings(source="env", env_file="customDC.env")
198
+ ```
199
+ Second, we'll upload the directory which contains the `config.json` file and
200
+ any CSV and/or MCF files.
201
+
202
+ ```python title="Upload to GCS"
203
+ upload_to_cloud_storage(settings=settings, directory="path/to/output/folder")
204
+ ```
205
+
206
+ Third, we'll run the data load job on Google Cloud Platform.
207
+ ```python
208
+ run_data_load(settings=settings)
209
+ ```
210
+
211
+ Last, we need to redeploy the Custom Data Commons instance.
212
+
213
+ ```python
214
+ redeploy_service(settings=settings)
215
+ ```
216
+
217
+ ---
218
+
219
+ Visit the [documentation page](https://docs.one.org/tools/bblocks/datacommons-tools/) for the full package documentation and examples.
220
+
221
+ ## Contributing
222
+ Contributions are welcome! Please see the
223
+ [CONTRIBUTING](https://github.com/ONEcampaign/bblocks-datacommons-tools/blob/main/CONTRIBUTING.md)
224
+ page for details on how to get started, report bugs, fix issues, and submit enhancements.
@@ -0,0 +1,205 @@
1
+ # bblocks-datacommons-tools
2
+
3
+ __Manage and load data to custom Data Commons instances__
4
+
5
+ [![PyPI](https://img.shields.io/pypi/v/bblocks_datacommons_tools.svg)](https://pypi.org/project/bblocks_datacommons_tools/)
6
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/bblocks_datacommons_tools.svg)](https://pypi.org/project/bblocks_places/)
7
+ [![Docs](https://img.shields.io/badge/docs-bblocks-blue)](https://docs.one.org/tools/bblocks/datacommons_tools/)
8
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
9
+ [![codecov](https://codecov.io/gh/ONEcampaign/bblocks-datacommons-tools/graph/badge.svg?token=3ONEA8JQTC)](https://codecov.io/gh/ONEcampaign/bblocks-datacommons-tools)
10
+
11
+ Custom [Data Commons](https://docs.datacommons.org/custom_dc/custom_data.html) requires that you provide your data in a specific schema, format, and file structure.
12
+
13
+ At a high level, you need to provide the following:
14
+
15
+ - All observations data must be in CSV format, using a predefined schema.
16
+ - You must also provide a JSON configuration file, named `config.json`, that specifies how to map and resolve the CSV contents to the Data Commons schema knowledge graph.
17
+ - Depending on how you define your statistical variables (metrics), you may need to provide MCF (Meta Content Framework) files.
18
+ - You may also need to define new custom entities.
19
+
20
+ Managing this workflow by hand is tedious and easy to get wrong.
21
+
22
+ The `bblocks.datacommons_tools` package streamlines that process. It provides a Python API and command line utilities for building config files, generating MCF from CSV metadata and running the data load pipeline on Google Cloud.
23
+
24
+ Use this package when you want to:
25
+
26
+ - Manage `config.json` files programmatically.
27
+ - Define statistical variables, entities or groups using MCF files.
28
+ - Programmatically upload CSVs, MCF files, and the `config.json` file to Cloud Storage, trigger the load job and redeploy your custom Data Commons service with code.
29
+
30
+ In short, `datacommons-tools` removes much of the manual work involved in setting up and maintaining a custom Data Commons Knowledge Graph.
31
+
32
+ `bblocks-datacommons-tools` is part of the `bblocks` ecosystem,
33
+ a set of Python packages designed as building blocks for working with data in the international development
34
+ and humanitarian sectors.
35
+
36
+ Read the [documentation](https://docs.one.org/tools/bblocks/datacommons-tools/)
37
+ for more details on how to use the package and the motivation for its creation.
38
+
39
+
40
+ ## Installation
41
+
42
+ The package can be installed in various ways.
43
+
44
+ Directly as
45
+ ```bash
46
+ pip install bblocks-datacommons-tools
47
+ ```
48
+
49
+ Or from the main `bblocks` package with an extra:
50
+
51
+ ```bash
52
+ pip install "bblocks[datacommons-tools]"
53
+ ```
54
+
55
+ It can also be installed from GitHub:
56
+ ```bash
57
+ pip install git+https://github.com/ONEcampaign/bblocks-datacommons-tools
58
+ ```
59
+
60
+ ## Sample Usage
61
+
62
+ Here's a simple example covering how to use the "implicit" Data Commons
63
+ schema to load a single dataset. Please see the full [documentation page](https://docs.one.org/tools/bblocks/datacommons-tools/) for a thorough
64
+ introduction to the package, and to learn how to use it.
65
+
66
+
67
+ ### 1. Create a CustomDataManager object.
68
+
69
+ The CustomDataManager object will handle generating the `config.json` file, as well as (optionally) taking Pandas DataFrames and exporting them as CSVs (in the right format) for loading to the Knowlede Graph.
70
+
71
+ In this example, we assume a `config.json` does not yet exist.
72
+
73
+ ```python title="Instantiate the CustomDataManager class"
74
+ from bblocks.datacommons_tools import CustomDataManager
75
+
76
+ # Create the object and call it "manager"
77
+ manager = CustomDataManager()
78
+
79
+ # Configure it to include subdirectories
80
+ manager.set_includeInputSubdirs(True)
81
+
82
+ ```
83
+
84
+ ### 2. Add the provenance information for our data
85
+ You can add or manage provenance information on the `config.py` file.
86
+
87
+ In this example, we will add a provenance for ONE Data's Climate Finance Files.
88
+
89
+ ```python title="Add provenance and source"
90
+ manager.add_provenance(
91
+ provenance_name="ONE Climate Finance",
92
+ provenance_url="https://datacommons.one.org/data/climate-finance-files",
93
+ source_name="ONE Data",
94
+ source_url="https://data.one.org",
95
+ )
96
+ ```
97
+
98
+ ### 3. Add the data to the CustomDataManager object.
99
+ Next, you need to specify your data on the `config.json` file.
100
+
101
+ Adding actual data data to the `CustomDataManager` is an optional step.
102
+
103
+ For this example, we will assume a DataFrame is available via the
104
+ `data` variable.
105
+
106
+ To add to the `CustomDataManager`, using the Implicit Schema:
107
+
108
+ ```python title="Register data"
109
+ manager.add_implicit_schema_file(
110
+ file_name="climate_finance/one_cf_provider_commitments.csv",
111
+ provenance="ONE Climate Finance",
112
+ entityType="Country",
113
+ data=data,
114
+ ignoreColumns=["oecd_provider_code"],
115
+ observationProperties={"unit": "USDollar"},
116
+ )
117
+ ```
118
+
119
+ Adding the data in the step above is optional. You can also create the inputFile in the config and add the data tied to that inputFile at a later stage by running:
120
+
121
+ ```python
122
+ manager.add_data(data=data, file_name='one_cf_provider_commitments.csv')
123
+ ```
124
+
125
+ Or you can manually add the relevant CSV file (matching what you declared as `file_name`).
126
+
127
+ ### 4. Add the indicators to config
128
+ Next, you need to specify information about the StatVars (variables) contained
129
+ in your data file(s).
130
+
131
+ When using the Implicit Schema, you can specify additional information.
132
+
133
+ For convenience, you could loop through a dictionary of indicators and information. For this example we'll add a single indicator.
134
+
135
+ ```python title="Register an indicator"
136
+ manager.add_variable_to_config(
137
+ statVar="climateFinanceProvidedCommitments",
138
+ name="Climate Finance Commitments (bilateral)",
139
+ group="ONE/Environment/Climate finance/Provider perspective/Commitments",
140
+ description="Funding for climate adaptation and mitigation projects",
141
+ searchDescriptions=[
142
+ "Climate finance commitments provided",
143
+ "Adaptation and mitigation finance provided",
144
+ ],
145
+ properties={"measurementMethod": "Commitment"},
146
+ )
147
+ ```
148
+
149
+ ### 5. Export the `config.json` and (optionally) data CSVs
150
+
151
+ Next, once all the data is added and the config is set up, you can export the `config.json` and data. When you export, the `config.json` is validated automatically
152
+
153
+ ```python title="Export config and data"
154
+ manager.export_all("path/to/output/folder")
155
+ ```
156
+
157
+ ### 6. (Optionally) load to the Knowledge Graph
158
+ You can also programmatically push the data and config to a Google Cloud
159
+ Storage Bucket, trigger the data load job, and redeploy your Data Commons
160
+ instance.
161
+
162
+ To do this, you'll need to load information about your
163
+ project, Storage Bucket, etc. You can use `.env` or `.json` files,
164
+ or simply make the right information available as environment variables.
165
+ A detailed description of the needed information, can be found in the documentation.
166
+
167
+ #### Load the settings
168
+ First, load the settings using `get_kg_settings`. In this example, we will load them from a `.env` file available in our working directory.
169
+
170
+ ```python title="Load settings"
171
+ from bblocks.datacommons_tools.gcp_utilities import (
172
+ upload_to_cloud_storage,
173
+ run_data_load,
174
+ redeploy_service,
175
+ get_kg_settings,
176
+ )
177
+
178
+ settings = get_kg_settings(source="env", env_file="customDC.env")
179
+ ```
180
+ Second, we'll upload the directory which contains the `config.json` file and
181
+ any CSV and/or MCF files.
182
+
183
+ ```python title="Upload to GCS"
184
+ upload_to_cloud_storage(settings=settings, directory="path/to/output/folder")
185
+ ```
186
+
187
+ Third, we'll run the data load job on Google Cloud Platform.
188
+ ```python
189
+ run_data_load(settings=settings)
190
+ ```
191
+
192
+ Last, we need to redeploy the Custom Data Commons instance.
193
+
194
+ ```python
195
+ redeploy_service(settings=settings)
196
+ ```
197
+
198
+ ---
199
+
200
+ Visit the [documentation page](https://docs.one.org/tools/bblocks/datacommons-tools/) for the full package documentation and examples.
201
+
202
+ ## Contributing
203
+ Contributions are welcome! Please see the
204
+ [CONTRIBUTING](https://github.com/ONEcampaign/bblocks-datacommons-tools/blob/main/CONTRIBUTING.md)
205
+ page for details on how to get started, report bugs, fix issues, and submit enhancements.
@@ -0,0 +1,35 @@
1
+ [project]
2
+ name = "bblocks-datacommons-tools"
3
+ version = "0.0.1"
4
+ description = "Tools to work with Data Commons. Part of the bblocks projects."
5
+ authors = [
6
+ {name = "ONE Campaign"},
7
+ {name = "Luca Picci",email = "lpicci96@gmail.com"},
8
+ {name = "Jorge Rivera", email = "jorge.rivera@one.org"}
9
+ ]
10
+ license = {text = "MIT"}
11
+ readme = "README.md"
12
+ requires-python = ">=3.11"
13
+ dependencies = [
14
+ "pydantic (>=2.11.3,<3.0.0)",
15
+ "pandas (>=2.2.3,<3.0.0)",
16
+ "pydantic-settings (>=2.9.1,<3.0.0)",
17
+ "google-cloud-storage (>=3.1.0,<4.0.0)",
18
+ "google-cloud-run (>=0.10.17,<0.11.0)",
19
+ ]
20
+
21
+ [project.scripts]
22
+ bblocks-dc-tools = "bblocks.datacommons_tools.cli:main"
23
+
24
+ [tool.poetry]
25
+ packages = [{include = "bblocks", from = "src"}]
26
+
27
+ [tool.poetry.group.dev.dependencies]
28
+ pytest = "^8.3.5"
29
+ black = {extras = ["d"], version = "^25.1.0"}
30
+ mkdocs-material = "^9.6.14"
31
+ pytest-cov = "^6.2.1"
32
+
33
+ [build-system]
34
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
35
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,16 @@
1
+ from importlib.metadata import version
2
+
3
+ from .custom_data.config_utils import merge_configs_from_directory
4
+ from .custom_data.data_management import CustomDataManager
5
+ from .custom_data.schema_tools import csv_metadata_to_nodes, csv_metadata_to_mfc_file
6
+
7
+
8
+ __version__ = version("bblocks-datacommons-tools")
9
+
10
+
11
+ __all__ = [
12
+ "CustomDataManager",
13
+ "csv_metadata_to_nodes",
14
+ "csv_metadata_to_mfc_file",
15
+ "merge_configs_from_directory",
16
+ ]
@@ -0,0 +1,6 @@
1
+ """Enable ``python -m bblocks.datacommons_tools``."""
2
+
3
+ from bblocks.datacommons_tools.cli.main import main
4
+
5
+ if __name__ == "__main__":
6
+ raise SystemExit(main())
@@ -0,0 +1,5 @@
1
+ """Command line interface for ``bblocks.datacommons_tools`` package."""
2
+
3
+ from bblocks.datacommons_tools.cli.main import main
4
+
5
+ __all__ = ["main"]
@@ -0,0 +1,9 @@
1
+ import argparse
2
+
3
+ from bblocks.datacommons_tools.gcp_utilities import KGSettings, get_kg_settings
4
+
5
+
6
+ def load_settings_from_args(args: argparse.Namespace) -> KGSettings:
7
+ if args.settings_file:
8
+ return get_kg_settings(source="json", file=args.settings_file)
9
+ return get_kg_settings(env_file=args.env_file)
@@ -0,0 +1,93 @@
1
+ """CSV to MCF command implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from pathlib import Path
7
+
8
+ from bblocks.datacommons_tools.custom_data.schema_tools import (
9
+ csv_metadata_to_mfc_file,
10
+ NodeTypes,
11
+ )
12
+
13
+ __all__ = ["add_parser", "run"]
14
+
15
+
16
+ def _kv_pair(value: str) -> tuple[str, str]:
17
+ """Parse a KEY=VALUE string into a tuple."""
18
+ if "=" not in value:
19
+ raise ValueError(f"Invalid key-value pair: {value}")
20
+ key, val = value.split("=", 1)
21
+ return key.strip(), val.strip()
22
+
23
+
24
+ def add_parser(subparsers: argparse._SubParsersAction) -> None:
25
+ """Register the ``csv2mcf`` subcommand with the CLI parser."""
26
+ parser = subparsers.add_parser(
27
+ "csv2mcf", help="Convert a CSV of Node metadata to an MCF file"
28
+ )
29
+
30
+ parser.add_argument("csv", type=Path, help="Path to the input CSV file")
31
+ parser.add_argument("mcf", type=Path, help="Path to write the generated MCF")
32
+
33
+ parser.add_argument(
34
+ "--node-type",
35
+ choices=[node_type.value for node_type in NodeTypes],
36
+ default="Node",
37
+ help="Type of node to create (default: %(default)s)",
38
+ )
39
+
40
+ parser.add_argument(
41
+ "--column-mapping",
42
+ metavar="CSV_COL=MCF_PROP",
43
+ type=_kv_pair,
44
+ action="append",
45
+ help=(
46
+ "Map CSV column names to MCF properties. "
47
+ "May be used multiple times, eg: "
48
+ "--column-mapping description=searchDescription --column-mapping indicator=Node"
49
+ ),
50
+ )
51
+
52
+ parser.add_argument(
53
+ "--csv-option",
54
+ metavar="KEY=VALUE",
55
+ type=_kv_pair,
56
+ action="append",
57
+ help=(
58
+ "Extra keyword arguments forwarded to pandas.read_csv, "
59
+ 'e.g. --csv-option delimiter=";" --csv-option encoding=UTF-8'
60
+ ),
61
+ )
62
+
63
+ parser.add_argument(
64
+ "--ignore-column",
65
+ metavar="COLUMN",
66
+ action="append",
67
+ help="Name of a CSV column to ignore. May be specified multiple times.",
68
+ )
69
+
70
+ parser.add_argument(
71
+ "--override",
72
+ action="store_true",
73
+ help="Overwrite the output file if it exists",
74
+ )
75
+
76
+ parser.set_defaults(func=run)
77
+
78
+
79
+ def run(args: argparse.Namespace) -> int:
80
+ """Execute the ``csv2mcf`` command."""
81
+ column_mapping = dict(args.column_mapping) if args.column_mapping else None
82
+ csv_options = dict(args.csv_option) if args.csv_option else None
83
+
84
+ csv_metadata_to_mfc_file(
85
+ csv_path=args.csv,
86
+ mcf_path=args.mcf,
87
+ node_type=args.node_type,
88
+ column_to_property_mapping=column_mapping,
89
+ csv_options=csv_options,
90
+ ignore_columns=args.ignore_column,
91
+ override=args.override,
92
+ )
93
+ return 0
@@ -0,0 +1,36 @@
1
+ """Run the data load job via CLI."""
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ from bblocks.datacommons_tools.cli.common import load_settings_from_args
7
+ from bblocks.datacommons_tools.gcp_utilities.pipeline import run_data_load
8
+
9
+ __all__ = ["add_parser", "run"]
10
+
11
+
12
+ def add_parser(subparsers: argparse._SubParsersAction) -> None:
13
+ """Register the ``dataload`` subcommand."""
14
+ parser = subparsers.add_parser(
15
+ "dataload", help="Run the Knowledge Graph data load job"
16
+ )
17
+ parser.add_argument(
18
+ "--settings-file", type=Path, help="Path to the KG settings JSON file"
19
+ )
20
+ parser.add_argument(
21
+ "--env-file", type=Path, help="Optional .env file containing KG settings"
22
+ )
23
+ parser.add_argument(
24
+ "--timeout",
25
+ type=int,
26
+ default=6000,
27
+ help="Timeout for the job in seconds (default: %(default)s)",
28
+ )
29
+ parser.set_defaults(func=run)
30
+
31
+
32
+ def run(args: argparse.Namespace) -> int:
33
+ """Execute the ``dataload`` command."""
34
+ settings = load_settings_from_args(args)
35
+ run_data_load(settings=settings, timeout=args.timeout)
36
+ return 0