syspop-v2 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- syspop_v2-0.4.0/PKG-INFO +145 -0
- syspop_v2-0.4.0/README.md +124 -0
- syspop_v2-0.4.0/pyproject.toml +35 -0
- syspop_v2-0.4.0/setup.cfg +4 -0
- syspop_v2-0.4.0/setup.py +17 -0
- syspop_v2-0.4.0/syspop/__init__.py +0 -0
- syspop_v2-0.4.0/syspop/data/__init__.py +0 -0
- syspop_v2-0.4.0/syspop/data/data.py +93 -0
- syspop_v2-0.4.0/syspop/data/query.py +73 -0
- syspop_v2-0.4.0/syspop/data/sample.py +80 -0
- syspop_v2-0.4.0/syspop/data/utils.py +81 -0
- syspop_v2-0.4.0/syspop/model/__init__.py +1 -0
- syspop_v2-0.4.0/syspop/model/stochastic_impute.py +143 -0
- syspop_v2-0.4.0/syspop/model/utils.py +39 -0
- syspop_v2-0.4.0/syspop/postp/__init__.py +0 -0
- syspop_v2-0.4.0/syspop/postp/vis.py +67 -0
- syspop_v2-0.4.0/syspop_v2.egg-info/PKG-INFO +145 -0
- syspop_v2-0.4.0/syspop_v2.egg-info/SOURCES.txt +22 -0
- syspop_v2-0.4.0/syspop_v2.egg-info/dependency_links.txt +1 -0
- syspop_v2-0.4.0/syspop_v2.egg-info/requires.txt +5 -0
- syspop_v2-0.4.0/syspop_v2.egg-info/top_level.txt +1 -0
- syspop_v2-0.4.0/test/test_ipf.py +104 -0
- syspop_v2-0.4.0/test/test_merge.py +245 -0
- syspop_v2-0.4.0/test/test_sdv.py +110 -0
syspop_v2-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: syspop_v2
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: A package for stochastic population imputation
|
|
5
|
+
Author: Sijin Zhang
|
|
6
|
+
Author-email: Sijin Zhang <zsjzyhzp@gmail.com>
|
|
7
|
+
Project-URL: Homepage, https://github.com/jzanetti/Syspop
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/jzanetti/Syspop/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: pandas
|
|
15
|
+
Requires-Dist: numpy
|
|
16
|
+
Requires-Dist: matplotlib
|
|
17
|
+
Requires-Dist: pyyaml
|
|
18
|
+
Requires-Dist: pyarrow
|
|
19
|
+
Dynamic: author
|
|
20
|
+
Dynamic: requires-python
|
|
21
|
+
|
|
22
|
+
# Stochastic Impute: Synthetic Data Generation Engine
|
|
23
|
+
|
|
24
|
+
A Python utility designed to generate **integrated synthetic unit-record population data** using aggregated public data sources (e.g., from [Stats NZ Data Explorer](https://explore.data.stats.govt.nz)).
|
|
25
|
+
|
|
26
|
+
This tool is optimized for large-scale microdata generation where individual attributes are assigned based on conditional probability distributions.
|
|
27
|
+
|
|
28
|
+
The New Zealand unit records of synthetic population can be explored at [here](https://jzanetti1985.pythonanywhere.com)
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## 🚀 Key Features
|
|
33
|
+
|
|
34
|
+
* **Synthetic Unit Record Generation**: Transforms multiple aggregated data sources into a unified, granular unit-record dataset.
|
|
35
|
+
* **Dynamic Column Matching**: Automatically identifies shared features between the base population and reference data.
|
|
36
|
+
* **Missingness-Aware Logic**: Handles rows with `NaN` values by dynamically re-calculating probabilities based only on the available non-null features.
|
|
37
|
+
* **Stochastic Selection**: Uses weighted random sampling to preserve the natural variance and distribution of the source data.
|
|
38
|
+
* **Optimized Performance**: Processes millions of rows in seconds by grouping identical "missingness patterns" rather than iterating row-by-row.
|
|
39
|
+
|
|
40
|
+
> Please see the [FAQ](#faq) for more details.
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## 📋 Data Requirements
|
|
45
|
+
|
|
46
|
+
### 1. Population Seed
|
|
47
|
+
The starting point for your synthetic data.
|
|
48
|
+
* Contain existing columns you wish to "expand".
|
|
49
|
+
|
|
50
|
+
### 2. Reference Distributions
|
|
51
|
+
Each entry in the dictionary must be a DataFrame containing:
|
|
52
|
+
* **Shared Features**: (e.g., `age`, `location`) to match against the seed.
|
|
53
|
+
* **Target Column**: The attribute being generated.
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## 💻 Example Usage
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from pandas import DataFrame
|
|
61
|
+
from numpy import nan
|
|
62
|
+
from syspop.model.stochastic_impute import stochastic_impute
|
|
63
|
+
from syspop.postp.vis import plot_distribution
|
|
64
|
+
|
|
65
|
+
# ---------------------------------
|
|
66
|
+
# 1. Define base aggregated population data (e.g., from a census)
|
|
67
|
+
# For example, a total of 50 + 60 + 70 people in this example
|
|
68
|
+
# ---------------------------------
|
|
69
|
+
base_population_data = DataFrame(
|
|
70
|
+
{
|
|
71
|
+
"gender": [1, 2, 1],
|
|
72
|
+
"age": [25, 30, 40],
|
|
73
|
+
"value": [50, 60, 70],
|
|
74
|
+
}
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# ---------------------------------
|
|
78
|
+
# 2. Define reference aggregated data (e.g., Work Status distribution)
|
|
79
|
+
# For example, a total of 8 + 2 + 6 + 4 people in this reference data
|
|
80
|
+
# ---------------------------------
|
|
81
|
+
income_data = DataFrame(
|
|
82
|
+
{
|
|
83
|
+
"gender": [1, 1, 2, 2],
|
|
84
|
+
"age": [25, 30, 25, nan],
|
|
85
|
+
"work_status": [1, 2, 1, 2],
|
|
86
|
+
"income": [50000, 60000, 55000, 45000],
|
|
87
|
+
"value": [8, 2, 6, 4],
|
|
88
|
+
}
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
# ---------------------------------
|
|
92
|
+
# 3. Combine data into a dictionary for the imputation process
|
|
93
|
+
# ---------------------------------
|
|
94
|
+
data = {"seed": base_population_data, "income": income_data}
|
|
95
|
+
|
|
96
|
+
# -------------------------------------------------------------------------
|
|
97
|
+
# 4. Imputation Task Configuration
|
|
98
|
+
#
|
|
99
|
+
# OBJECTIVE:
|
|
100
|
+
# Define imputation models where 'features' (age, gender)
|
|
101
|
+
# predict 'targets' (work_status and income).
|
|
102
|
+
#
|
|
103
|
+
# HANDLING SAMPLE SIZE DISCREPANCIES (Sparsity Alignment):
|
|
104
|
+
# The source datasets (e.g., income) sometimes have smaller populations
|
|
105
|
+
# than the seed population (180 records). To maintain statistical
|
|
106
|
+
# consistency, we introduce NaNs into the output for matching
|
|
107
|
+
# reference data in 'drop_list'.
|
|
108
|
+
#
|
|
109
|
+
# Example: If the income dataset only contains 20 records, we randomly
|
|
110
|
+
# retain only 20 income/work_status in the output population and set the remaining
|
|
111
|
+
# 160 to NaN
|
|
112
|
+
# -------------------------------------------------------------------------
|
|
113
|
+
task_list = {
|
|
114
|
+
"income": {
|
|
115
|
+
"targets": {"work_status": "category", "income": "numeric"},
|
|
116
|
+
"features": ["age", "gender"],
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
drop_list = ["income"]
|
|
120
|
+
# ---------------------------------
|
|
121
|
+
# 5. Run the stochastic imputation process
|
|
122
|
+
# ---------------------------------
|
|
123
|
+
syn_pop = stochastic_impute(data, task_list)
|
|
124
|
+
|
|
125
|
+
# ---------------------------------
|
|
126
|
+
# 6. Plot distribution
|
|
127
|
+
# ---------------------------------
|
|
128
|
+
# 6.1 Plot distribution for age
|
|
129
|
+
plot_distribution(syn_pop, ["age"])
|
|
130
|
+
# 6.2 Plot joint distribution for gender + age
|
|
131
|
+
plot_distribution(syn_pop, ["gender", "age"])
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
<a name="faq"></a>
|
|
135
|
+
## 🧠FAQ
|
|
136
|
+
### Is generating synthetic unit-record data in this way actually accurate?
|
|
137
|
+
Well, it depends on your use case and the quality of your inputs. Ideally, you should work with real unit-record data. However, in practice, this isn't always feasible (i.e., in New Zealand, if you live far away from a Stats NZ IDI data lab). This utility provides a method to statistically link different aggregated, published population benchmarks together. This allows you to build out full data pipelines and prototype products locally before taking your code into a restricted environment.
|
|
138
|
+
|
|
139
|
+
### Maximizing accuracy?
|
|
140
|
+
The accuracy of the synthetic output depends heavily on task design. The more shared variables (covariates) present in your reference data to condition the probabilities, the closer the synthetic distribution will mirror reality.
|
|
141
|
+
|
|
142
|
+
Also, you can run the process multiple times to capture inherent uncertainties.
|
|
143
|
+
|
|
144
|
+
### Are we doing any prediction modelling here ?
|
|
145
|
+
It is out of scope at the moment. If certain covariate values are missing from the reference data to condition the probabilities, the process simply ignores those missing values when linking the reference data to the seed data (for example, if the reference data does not contain income information for children, when integrating the reference data into the seed population data, the integrated data will just set the income for children as NaN). The reason is that many covariates in these datasets are categorical, and applying simple prediction models can struggle to capture the nuances and introduce unwanted noise or uncertainty into the output data. However, you are welcome to apply your own predictive models to handle missing data prior to running this process if your use case requires it.
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
# Stochastic Impute: Synthetic Data Generation Engine
|
|
2
|
+
|
|
3
|
+
A Python utility designed to generate **integrated synthetic unit-record population data** using aggregated public data sources (e.g., from [Stats NZ Data Explorer](https://explore.data.stats.govt.nz)).
|
|
4
|
+
|
|
5
|
+
This tool is optimized for large-scale microdata generation where individual attributes are assigned based on conditional probability distributions.
|
|
6
|
+
|
|
7
|
+
The New Zealand unit records of synthetic population can be explored at [here](https://jzanetti1985.pythonanywhere.com)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## 🚀 Key Features
|
|
12
|
+
|
|
13
|
+
* **Synthetic Unit Record Generation**: Transforms multiple aggregated data sources into a unified, granular unit-record dataset.
|
|
14
|
+
* **Dynamic Column Matching**: Automatically identifies shared features between the base population and reference data.
|
|
15
|
+
* **Missingness-Aware Logic**: Handles rows with `NaN` values by dynamically re-calculating probabilities based only on the available non-null features.
|
|
16
|
+
* **Stochastic Selection**: Uses weighted random sampling to preserve the natural variance and distribution of the source data.
|
|
17
|
+
* **Optimized Performance**: Processes millions of rows in seconds by grouping identical "missingness patterns" rather than iterating row-by-row.
|
|
18
|
+
|
|
19
|
+
> Please see the [FAQ](#faq) for more details.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 📋 Data Requirements
|
|
24
|
+
|
|
25
|
+
### 1. Population Seed
|
|
26
|
+
The starting point for your synthetic data.
|
|
27
|
+
* Contain existing columns you wish to "expand".
|
|
28
|
+
|
|
29
|
+
### 2. Reference Distributions
|
|
30
|
+
Each entry in the dictionary must be a DataFrame containing:
|
|
31
|
+
* **Shared Features**: (e.g., `age`, `location`) to match against the seed.
|
|
32
|
+
* **Target Column**: The attribute being generated.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## 💻 Example Usage
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from pandas import DataFrame
|
|
40
|
+
from numpy import nan
|
|
41
|
+
from syspop.model.stochastic_impute import stochastic_impute
|
|
42
|
+
from syspop.postp.vis import plot_distribution
|
|
43
|
+
|
|
44
|
+
# ---------------------------------
|
|
45
|
+
# 1. Define base aggregated population data (e.g., from a census)
|
|
46
|
+
# For example, a total of 50 + 60 + 70 people in this example
|
|
47
|
+
# ---------------------------------
|
|
48
|
+
base_population_data = DataFrame(
|
|
49
|
+
{
|
|
50
|
+
"gender": [1, 2, 1],
|
|
51
|
+
"age": [25, 30, 40],
|
|
52
|
+
"value": [50, 60, 70],
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# ---------------------------------
|
|
57
|
+
# 2. Define reference aggregated data (e.g., Work Status distribution)
|
|
58
|
+
# For example, a total of 8 + 2 + 6 + 4 people in this reference data
|
|
59
|
+
# ---------------------------------
|
|
60
|
+
income_data = DataFrame(
|
|
61
|
+
{
|
|
62
|
+
"gender": [1, 1, 2, 2],
|
|
63
|
+
"age": [25, 30, 25, nan],
|
|
64
|
+
"work_status": [1, 2, 1, 2],
|
|
65
|
+
"income": [50000, 60000, 55000, 45000],
|
|
66
|
+
"value": [8, 2, 6, 4],
|
|
67
|
+
}
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# ---------------------------------
|
|
71
|
+
# 3. Combine data into a dictionary for the imputation process
|
|
72
|
+
# ---------------------------------
|
|
73
|
+
data = {"seed": base_population_data, "income": income_data}
|
|
74
|
+
|
|
75
|
+
# -------------------------------------------------------------------------
|
|
76
|
+
# 4. Imputation Task Configuration
|
|
77
|
+
#
|
|
78
|
+
# OBJECTIVE:
|
|
79
|
+
# Define imputation models where 'features' (age, gender)
|
|
80
|
+
# predict 'targets' (work_status and income).
|
|
81
|
+
#
|
|
82
|
+
# HANDLING SAMPLE SIZE DISCREPANCIES (Sparsity Alignment):
|
|
83
|
+
# The source datasets (e.g., income) sometimes have smaller populations
|
|
84
|
+
# than the seed population (180 records). To maintain statistical
|
|
85
|
+
# consistency, we introduce NaNs into the output for matching
|
|
86
|
+
# reference data in 'drop_list'.
|
|
87
|
+
#
|
|
88
|
+
# Example: If the income dataset only contains 20 records, we randomly
|
|
89
|
+
# retain only 20 income/work_status in the output population and set the remaining
|
|
90
|
+
# 160 to NaN
|
|
91
|
+
# -------------------------------------------------------------------------
|
|
92
|
+
task_list = {
|
|
93
|
+
"income": {
|
|
94
|
+
"targets": {"work_status": "category", "income": "numeric"},
|
|
95
|
+
"features": ["age", "gender"],
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
drop_list = ["income"]
|
|
99
|
+
# ---------------------------------
|
|
100
|
+
# 5. Run the stochastic imputation process
|
|
101
|
+
# ---------------------------------
|
|
102
|
+
syn_pop = stochastic_impute(data, task_list)
|
|
103
|
+
|
|
104
|
+
# ---------------------------------
|
|
105
|
+
# 6. Plot distribution
|
|
106
|
+
# ---------------------------------
|
|
107
|
+
# 6.1 Plot distribution for age
|
|
108
|
+
plot_distribution(syn_pop, ["age"])
|
|
109
|
+
# 6.2 Plot joint distribution for gender + age
|
|
110
|
+
plot_distribution(syn_pop, ["gender", "age"])
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
<a name="faq"></a>
|
|
114
|
+
## 🧠FAQ
|
|
115
|
+
### Is generating synthetic unit-record data in this way actually accurate?
|
|
116
|
+
Well, it depends on your use case and the quality of your inputs. Ideally, you should work with real unit-record data. However, in practice, this isn't always feasible (i.e., in New Zealand, if you live far away from a Stats NZ IDI data lab). This utility provides a method to statistically link different aggregated, published population benchmarks together. This allows you to build out full data pipelines and prototype products locally before taking your code into a restricted environment.
|
|
117
|
+
|
|
118
|
+
### Maximizing accuracy?
|
|
119
|
+
The accuracy of the synthetic output depends heavily on task design. The more shared variables (covariates) present in your reference data to condition the probabilities, the closer the synthetic distribution will mirror reality.
|
|
120
|
+
|
|
121
|
+
Also, you can run the process multiple times to capture inherent uncertainties.
|
|
122
|
+
|
|
123
|
+
### Are we doing any prediction modelling here ?
|
|
124
|
+
It is out of scope at the moment. If certain covariate values are missing from the reference data to condition the probabilities, the process simply ignores those missing values when linking the reference data to the seed data (for example, if the reference data does not contain income information for children, when integrating the reference data into the seed population data, the integrated data will just set the income for children as NaN). The reason is that many covariates in these datasets are categorical, and applying simple prediction models can struggle to capture the nuances and introduce unwanted noise or uncertainty into the output data. However, you are welcome to apply your own predictive models to handle missing data prior to running this process if your use case requires it.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "syspop_v2"
|
|
7
|
+
version = "0.4.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Sijin Zhang", email="zsjzyhzp@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "A package for stochastic population imputation"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
]
|
|
19
|
+
# List the external libraries your package needs to run
|
|
20
|
+
dependencies = [
|
|
21
|
+
"pandas",
|
|
22
|
+
"numpy",
|
|
23
|
+
"matplotlib",
|
|
24
|
+
"pyyaml",
|
|
25
|
+
"pyarrow"
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
"Homepage" = "https://github.com/jzanetti/Syspop"
|
|
30
|
+
"Bug Tracker" = "https://github.com/jzanetti/Syspop/issues"
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
where = ["."] # Tells it to look in the current root directory
|
|
34
|
+
include = ["syspop*"] # Grabs your 'syspop' folder and everything inside it
|
|
35
|
+
exclude = ["examples*"] # Ensures the 'examples' folder doesn't get published
|
syspop_v2-0.4.0/setup.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

setup(
    name="syspop_v2",
    version="0.4.0",  # Keep in sync with pyproject.toml
    packages=find_packages(),
    install_requires=[
        "pandas",
        "numpy",
        # BUG FIX: a missing comma after "pyarrow" previously made Python
        # concatenate the adjacent string literals into a single bogus
        # requirement "pyarrowpyyaml", so neither package was declared.
        "pyarrow",
        "pyyaml",
        "matplotlib",
    ],
    author="Sijin Zhang",
    description="Creating synthetic population",
    python_requires=">=3.8",
)
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from syspop.data.query import obtain_stats_data
|
|
2
|
+
from syspop.data.utils import stats_data_proc
|
|
3
|
+
from pandas import DataFrame as pdDataFrame
|
|
4
|
+
from sklearn.preprocessing import LabelEncoder
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def obtain_data(cfg: dict, api_key: str):
    """Fetch population statistics and filter/map them according to *cfg*.

    The function performs the following steps:
        1. Fetches raw statistics data using cfg["api"] and *api_key*.
        2. Cleans/renames the raw data via ``stats_data_proc``.
        3. Keeps rows matching cfg["inclusion"] and drops rows matching
           cfg["exclusion"] (either may be ``None`` to skip that filter).
        4. Returns only the columns listed in cfg["map"].

    Args:
        cfg (dict): Configuration with "api", "map", "inclusion" and
            "exclusion" entries.
        api_key (str): API key for accessing the statistics data.

    Returns:
        pandas.DataFrame: Filtered frame restricted to cfg["map"] columns.
    """

    def _resolve_column(criterion):
        # A filter criterion may be given either as a raw column name or as
        # a key of cfg["map"]; prefer the mapped column name when present.
        return cfg["map"].get(criterion, criterion)

    data_pop = obtain_stats_data(cfg["api"], api_key=api_key)
    data_pop = stats_data_proc(data_pop, cfg)

    # Explicit None checks replace the previous `try/except TypeError: pass`,
    # which silently swallowed *any* TypeError raised while filtering (the
    # intent was only to tolerate cfg["inclusion"]/cfg["exclusion"] being
    # null in the YAML config).
    if cfg["inclusion"] is not None:
        for criterion, allowed in cfg["inclusion"].items():
            data_pop = data_pop[data_pop[_resolve_column(criterion)].isin(allowed)]

    if cfg["exclusion"] is not None:
        for criterion, banned in cfg["exclusion"].items():
            data_pop = data_pop[~data_pop[_resolve_column(criterion)].isin(banned)]

    return data_pop[list(cfg["map"].values())]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def encode_weights(data_dict) -> dict:
    """Turn aggregated counts into unit records (seed) or probabilities (rest).

    The "seed" frame is expanded so each row is repeated 'value' times
    (one row per person); every other frame is collapsed by summing 'value'
    over its remaining columns and converting the sums into a 'probability'
    column. The 'value' column is dropped in both cases. The input dict is
    updated in place and also returned.
    """
    for name, frame in data_dict.items():
        if name == "seed":
            # Repeat each row 'value' times to obtain unit records.
            expanded = frame.loc[frame.index.repeat(frame["value"])].copy()
            data_dict[name] = expanded.reset_index(drop=True).drop(columns=["value"])
        else:
            keys = frame.columns.drop("value").tolist()
            summed = frame.groupby(keys, as_index=False)["value"].sum()
            # Normalise the counts into sampling probabilities.
            summed["probability"] = summed["value"] / summed["value"].sum()
            data_dict[name] = summed.reset_index(drop=True).drop(columns=["value"])

    return data_dict
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def encode_sum(data_dict, drop_list) -> dict:
    """Return the total of the 'value' column for each dataset in *drop_list*.

    Datasets not listed in *drop_list* are omitted from the result.
    """
    return {
        name: frame["value"].sum()
        for name, frame in data_dict.items()
        if name in drop_list
    }
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from requests import get, exceptions
|
|
2
|
+
import xml.etree.ElementTree as ET
|
|
3
|
+
from pandas import DataFrame
|
|
4
|
+
from pandas import to_numeric
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def stats_data_proc(data):
    """Return *data* with the 'value' column coerced to int.

    Rows whose 'value' cannot be parsed as a number (and any other rows
    containing NaN) are dropped.

    Args:
        data (pandas.DataFrame): Frame with a 'value' column.

    Returns:
        pandas.DataFrame: A new frame; the input is left untouched.
    """
    # BUG FIX: operate on a copy — the previous version assigned the coerced
    # column directly onto the caller's frame, mutating it as a side effect.
    data = data.copy()
    data["value"] = to_numeric(data["value"], errors="coerce")
    data = data.dropna()
    data["value"] = data["value"].astype(int)
    return data
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def obtain_stats_data(api_url: str, api_key: "str | None" = None):
    """Fetch an SDMX-ML dataset from the Stats NZ API as a DataFrame.

    Performs a GET request against *api_url* with the subscription key,
    parses the SDMX-ML 2.1 "generic" XML response, and returns one row per
    observation: all ObsKey dimension id/value pairs plus an "OBS_VALUE"
    column holding the observed value.

    Args:
        api_url: Full Stats NZ API query URL.
        api_key: Ocp-Apim subscription key; required (no anonymous access).

    Returns:
        pandas.DataFrame: One row per <generic:Obs> element.

    Raises:
        Exception: If no API key is supplied.

    NOTE(review): on HTTP/network errors this function prints a message and
    calls exit(), terminating the whole process — consider raising an
    exception instead so library callers can recover.
    """

    if api_key is None:
        raise Exception("No proper Stats API is provided")

    # Set headers with the API key
    headers = {"Ocp-Apim-Subscription-Key": api_key}
    # Make the GET request
    try:
        response = get(api_url, headers=headers)
        response.raise_for_status()  # Check for HTTP errors
    except exceptions.HTTPError as e:
        # A response object exists here (HTTPError is raised by
        # raise_for_status after a response arrived).
        print(f"HTTP Error: {e}")
        if response.status_code == 401:
            print(
                "Authentication failed. Please check your API key or obtain a valid one from https://api.data.stats.govt.nz/"
            )
        elif response.status_code == 400:
            print(
                "Bad Request. The URL or dimensions may be invalid. Check the API documentation or simplify the query."
            )
        print(f"Raw response: {response.text}")
        exit()
    except exceptions.RequestException as e:
        # Connection errors, timeouts, etc. — no usable response object.
        print(f"Error fetching data: {e}")
        exit()

    # Check if response is empty
    if not response.text:
        print("Error: Empty response received from the API.")
        exit()
    xml_text = response.text  # your XML string

    # Parse XML
    root = ET.fromstring(xml_text)

    # Namespaces (SDMX-ML 2.1 "generic" data format)
    ns = {
        "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
        "message": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message",
    }

    rows = []
    # Iterate over all Obs elements
    for obs in root.findall(".//generic:Obs", ns):
        row = {}

        # Extract all key/value pairs from ObsKey (the dimension values)
        for val in obs.findall("./generic:ObsKey/generic:Value", ns):
            row[val.attrib["id"]] = val.attrib["value"]

        # Extract the observed value (may be absent for some observations)
        obs_value = obs.find("./generic:ObsValue", ns)
        if obs_value is not None:
            row["OBS_VALUE"] = obs_value.attrib["value"]

        rows.append(row)

    # Convert to DataFrame
    return DataFrame(rows)
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
from process.data.data import obtain_data
|
|
2
|
+
from logging import info as log_info
|
|
3
|
+
from etc.sample_data.api_keys import STATS_API
|
|
4
|
+
from yaml import safe_load
|
|
5
|
+
from pickle import dump as pickle_dump
|
|
6
|
+
from pickle import load as pickle_load
|
|
7
|
+
from process.model.utils import obtain_all_tasks
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def obtain_sample_data_cfg():
    """Load the sample-data table configuration from the bundled YAML file.

    Returns:
        dict: The "tables" section of etc/sample_data/sample_data_cfg.yml.
    """
    with open("etc/sample_data/sample_data_cfg.yml", "r") as fid:
        return safe_load(fid)["tables"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def obtain_sample_api_key(api_key: "str | None" = None):
    """Return *api_key*, falling back to the bundled ``STATS_API`` key.

    Args:
        api_key: Explicit Stats API key; when ``None`` the default key from
            ``etc.sample_data.api_keys`` is used.

    Returns:
        str: The API key to use.
    """
    # TYPE FIX: the annotation was previously `str or None`, which Python
    # evaluates to plain `str` at definition time; the string form keeps
    # the intended "optional" meaning without a typing import (py3.8-safe).
    if api_key is None:
        api_key = STATS_API

    return api_key
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def load_sample_data(
    data_types: "list | None" = None,
    refresh: bool = False,
):
    """Load (or rebuild) the cached sample data and the model task list.

    When *refresh* is False the sample data is read from the local pickle
    cache at ``etc/sample_data/sample_data.pkl``. When True, each requested
    data type is re-fetched from the Stats API (using the bundled API key
    and table configuration) and the cache is rewritten. In both cases the
    sample model configuration is read and converted into a task list.

    Args:
        data_types: Data type names to fetch on refresh. ``None`` (the
            default) selects the full sample set; this replaces the
            previous mutable-list default argument.
        refresh: Re-fetch from the API instead of reading the cache.

    Returns:
        tuple: ``(data_dict, task_list)`` where ``data_dict`` maps each
        data type to its fetched data and ``task_list`` is produced by
        ``obtain_all_tasks`` from the sample model configuration.
    """
    # Build the default here to avoid the mutable-default-argument pitfall.
    if data_types is None:
        data_types = [
            "seed",
            "industry",
            "occupation",
            "occupation_income",
            "industry_income",
            "travel_to_work",
            "work_hours",
        ]

    if not refresh:
        # Context manager closes the cache file handle promptly (the
        # previous code leaked the handle returned by open()).
        with open("etc/sample_data/sample_data.pkl", "rb") as fid:
            data_dict = pickle_load(fid)
    else:
        api_key = obtain_sample_api_key()
        cfg_data = obtain_sample_data_cfg()

        data_dict = {}
        for data_type in data_types:
            log_info(f"Obtaining data for type: {data_type}")
            data_dict[data_type] = obtain_data(cfg_data[data_type], api_key)

        with open("etc/sample_data/sample_data.pkl", "wb") as fid:
            pickle_dump(data_dict, fid)

    with open("etc/sample_data/sample_model_cfg.yml", "r") as fid:
        model_cfg = safe_load(fid)

    task_list = obtain_all_tasks(model_cfg["tasks"], model_cfg["cfg"])

    return data_dict, task_list
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
from pandas import to_numeric
|
|
2
|
+
from pandas import DataFrame
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def stats_data_proc(data: DataFrame, cfg: dict):
    """Rename columns per cfg['map'] and coerce the 'value' column to int.

    Rows whose 'value' fails numeric conversion (or that contain NaN) are
    dropped. The rename produces a new frame, so the caller's frame is not
    modified.
    """
    renamed = data.rename(columns=cfg["map"])

    renamed["value"] = to_numeric(renamed["value"], errors="coerce")
    renamed = renamed.dropna()
    renamed["value"] = renamed["value"].astype(int)

    return renamed
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def check_data_consistency(
    data_dict: dict,
    check_err: bool = True,
    throw_err: bool = False,
    output_dir: "str | None" = None,
):
    """Summarise and cross-check column unique values across datasets.

    Builds a summary table with one row per (dataset, column) recording the
    column's unique-value count and a sorted, comma-joined string of the
    unique values. When *check_err* is True, columns appearing in several
    datasets are compared: on a mismatch the offending summary rows are
    printed and the run either raises immediately (*throw_err* True) or
    interactively asks the user whether to continue.

    Args:
        data_dict: Mapping of dataset name -> DataFrame.
        check_err: Compare unique-value sets across datasets.
        throw_err: Raise ValueError on a mismatch instead of prompting.
        output_dir: If given, write the summary table to
            ``<output_dir>/data_consistency.csv``.

    Raises:
        ValueError: On a cross-dataset mismatch when *throw_err* is True,
            or when the user answers anything but "Y" at the prompt.
    """
    rows = []

    for key, df in data_dict.items():
        for col in df.columns:
            unique_list = df[col].unique()
            # Round-trip through a joined string so values can be sorted as
            # text regardless of their original dtype.
            # NOTE(review): values that themselves contain commas would be
            # split apart here — assumes comma-free values; confirm upstream.
            unique_str = ", ".join(map(str, unique_list))
            unique_str = [item.strip() for item in unique_str.split(",")]
            unique_str.sort()
            unique_str = ", ".join(unique_str)

            unique_count = len(unique_list)
            rows.append(
                {
                    "data_key": key,
                    "cols": col,
                    "unique_count": unique_count,
                    "unique_value": unique_str,
                }
            )

    summary_df = DataFrame(rows)
    if output_dir is not None:
        summary_df.to_csv(f"{output_dir}/data_consistency.csv", index=False)

    if check_err:
        all_cols = summary_df["cols"].unique()

        for proc_col in all_cols:
            # All "unique_value" strings recorded for this column name —
            # one entry per dataset that contains the column.
            proc_series = summary_df[summary_df["cols"] == proc_col]["unique_value"]

            # Rebuild the value sets from the sorted strings for comparison.
            proc_series = [set(val.split(", ")) for val in proc_series]

            all_same = all(s == proc_series[0] for s in proc_series)

            if not all_same:
                print(
                    summary_df[summary_df["cols"] == proc_col][
                        ["data_key", "cols", "unique_value"]
                    ]
                )

                if throw_err:
                    raise ValueError(
                        f"Column '{proc_col}' has different unique values across datasets."
                    )

                # Ask the user if they want to continue
                user_choice = (
                    input(
                        f"\nColumn '{proc_col}' has different unique values "
                        + "across datasets. Continue? [Y/N]: "
                    )
                    .strip()
                    .upper()
                )
                if user_choice != "Y":
                    raise ValueError(
                        f"Execution halted by user. Column '{proc_col}' has "
                        + "different unique values across datasets."
                    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Fraction of records reserved for testing during model training
# (presumably a 10% hold-out split) — TODO confirm against the code
# that consumes this constant.
MODEL_TRAINING_TEST_RATIO = 0.1
|