syspop-v2 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
1
+ Metadata-Version: 2.4
2
+ Name: syspop_v2
3
+ Version: 0.4.0
4
+ Summary: A package for stochastic population imputation
5
+ Author: Sijin Zhang
6
+ Author-email: Sijin Zhang <zsjzyhzp@gmail.com>
7
+ Project-URL: Homepage, https://github.com/jzanetti/Syspop
8
+ Project-URL: Bug Tracker, https://github.com/jzanetti/Syspop/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: pandas
15
+ Requires-Dist: numpy
16
+ Requires-Dist: matplotlib
17
+ Requires-Dist: pyyaml
18
+ Requires-Dist: pyarrow
19
+ Dynamic: author
20
+ Dynamic: requires-python
21
+
22
+ # Stochastic Impute: Synthetic Data Generation Engine
23
+
24
+ A Python utility designed to generate an **integrated synthetic unit-record population dataset** using aggregated public data sources (e.g., from [Stats NZ Data Explorer](https://explore.data.stats.govt.nz)).
25
+
26
+ This tool is optimized for large-scale microdata generation where individual attributes are assigned based on conditional probability distributions.
27
+
28
+ The New Zealand unit records of the synthetic population can be explored [here](https://jzanetti1985.pythonanywhere.com).
29
+
30
+ ---
31
+
32
+ ## 🚀 Key Features
33
+
34
+ * **Synthetic Unit Record Generation**: Transforms multiple aggregated data sources into a unified, granular unit-record dataset.
35
+ * **Dynamic Column Matching**: Automatically identifies shared features between the base population and reference data.
36
+ * **Missingness-Aware Logic**: Handles rows with `NaN` values by dynamically re-calculating probabilities based only on the available non-null features.
37
+ * **Stochastic Selection**: Uses weighted random sampling to preserve the natural variance and distribution of the source data.
38
+ * **Optimized Performance**: Processes millions of rows in seconds by grouping identical "missingness patterns" rather than iterating row-by-row.
39
+
40
+ > Please see the [FAQ](#-faq) for more details.
41
+
42
+ ---
43
+
44
+ ## 📋 Data Requirements
45
+
46
+ ### 1. Population Seed
47
+ The starting point for your synthetic data.
48
+ * Contains the existing columns you wish to "expand".
49
+
50
+ ### 2. Reference Distributions
51
+ Each entry in the dictionary must be a DataFrame containing:
52
+ * **Shared Features**: (e.g., `age`, `location`) to match against the seed.
53
+ * **Target Column**: The attribute being generated.
54
+
55
+ ---
56
+
57
+ ## 💻 Example Usage
58
+
59
+ ```python
60
+ from pandas import DataFrame
61
+ from numpy import nan
62
+ from syspop.model.stochastic_impute import stochastic_impute
63
+ from syspop.postp.vis import plot_distribution
64
+
65
+ # ---------------------------------
66
+ # 1. Define base aggregated population data (e.g., from a census)
67
+ # For example, a total of 50 + 60 + 70 people in this example
68
+ # ---------------------------------
69
+ base_population_data = DataFrame(
70
+ {
71
+ "gender": [1, 2, 1],
72
+ "age": [25, 30, 40],
73
+ "value": [50, 60, 70],
74
+ }
75
+ )
76
+
77
+ # ---------------------------------
78
+ # 2. Define reference aggregated data (e.g., Work Status distribution)
79
+ # For example, a total of 8 + 2 + 6 + 4 people in this reference data
80
+ # ---------------------------------
81
+ income_data = DataFrame(
82
+ {
83
+ "gender": [1, 1, 2, 2],
84
+ "age": [25, 30, 25, nan],
85
+ "work_status": [1, 2, 1, 2],
86
+ "income": [50000, 60000, 55000, 45000],
87
+ "value": [8, 2, 6, 4],
88
+ }
89
+ )
90
+
91
+ # ---------------------------------
92
+ # 3. Combine data into a dictionary for the imputation process
93
+ # ---------------------------------
94
+ data = {"seed": base_population_data, "income": income_data}
95
+
96
+ # -------------------------------------------------------------------------
97
+ # 4. Imputation Task Configuration
98
+ #
99
+ # OBJECTIVE:
100
+ # Define imputation models where 'features' (age, gender)
101
+ # predict 'targets' (work_status and income).
102
+ #
103
+ # HANDLING SAMPLE SIZE DISCREPANCIES (Sparsity Alignment):
104
+ # The source datasets (e.g., income) sometimes have smaller populations
105
+ # than the seed population (180 records). To maintain statistical
106
+ # consistency, we introduce NaNs into the output for matching
107
+ # reference data in 'drop_list'.
108
+ #
109
+ # Example: If the income dataset only contains 20 records, we randomly
110
+ # retain only 20 income/work_status in the output population and set the remaining
111
+ # 160 to NaN
112
+ # -------------------------------------------------------------------------
113
+ task_list = {
114
+ "income": {
115
+ "targets": {"work_status": "category", "income": "numeric"},
116
+ "features": ["age", "gender"],
117
+ }
118
+ }
119
+ drop_list = ["income"]
120
+ # ---------------------------------
121
+ # 5. Run the stochastic imputation process
122
+ # ---------------------------------
123
+ syn_pop = stochastic_impute(data, task_list)
124
+
125
+ # ---------------------------------
126
+ # 6. Plot distribution
127
+ # ---------------------------------
128
+ # 6.1 Plot distribution for age
129
+ plot_distribution(syn_pop, ["age"])
130
+ # 6.2 Plot joint distribution for gender + age
131
+ plot_distribution(syn_pop, ["gender", "age"])
132
+ ```
133
+
134
+ <a name="faq"></a>
135
+ ## 🧠 FAQ
136
+ ### Is generating synthetic unit-record data in this way actually accurate?
137
+ Well, it depends on your use case and the quality of your inputs. Ideally, you should work with real unit-record data. However, in practice, this isn't always feasible (i.e., in New Zealand, if you live far away from a Stats NZ IDI data lab). This utility provides a method to statistically link different aggregated, published population benchmarks together. This allows you to build out full data pipelines and prototype products locally before taking your code into a restricted environment.
138
+
139
+ ### Maximizing accuracy?
140
+ The accuracy of the synthetic output depends heavily on task design. The more shared variables (covariates) present in your reference data to condition the probabilities, the closer the synthetic distribution will mirror reality.
141
+
142
+ Also, you can run the process multiple times to capture inherent uncertainties.
143
+
144
+ ### Are we doing any prediction modelling here ?
145
+ It is out of scope at the moment. If certain covariate values are missing from the reference data to condition the probabilities, the process simply ignores those missing values when linking the reference data to the seed data (for example, if the reference data does not contain income information for children, when integrating the reference data into the seed population data, the integrated data will just set the income for children as NaN). The reason is that many covariates in these datasets are categorical, and applying simple prediction models can struggle to capture the nuances and introduce unwanted noise or uncertainty into the output data. However, you are welcome to apply your own predictive models to handle missing data prior to running this process if your use case requires it.
@@ -0,0 +1,124 @@
1
+ # Stochastic Impute: Synthetic Data Generation Engine
2
+
3
+ A Python utility designed to generate an **integrated synthetic unit-record population dataset** using aggregated public data sources (e.g., from [Stats NZ Data Explorer](https://explore.data.stats.govt.nz)).
4
+
5
+ This tool is optimized for large-scale microdata generation where individual attributes are assigned based on conditional probability distributions.
6
+
7
+ The New Zealand unit records of the synthetic population can be explored [here](https://jzanetti1985.pythonanywhere.com).
8
+
9
+ ---
10
+
11
+ ## 🚀 Key Features
12
+
13
+ * **Synthetic Unit Record Generation**: Transforms multiple aggregated data sources into a unified, granular unit-record dataset.
14
+ * **Dynamic Column Matching**: Automatically identifies shared features between the base population and reference data.
15
+ * **Missingness-Aware Logic**: Handles rows with `NaN` values by dynamically re-calculating probabilities based only on the available non-null features.
16
+ * **Stochastic Selection**: Uses weighted random sampling to preserve the natural variance and distribution of the source data.
17
+ * **Optimized Performance**: Processes millions of rows in seconds by grouping identical "missingness patterns" rather than iterating row-by-row.
18
+
19
+ > Please see the [FAQ](#-faq) for more details.
20
+
21
+ ---
22
+
23
+ ## 📋 Data Requirements
24
+
25
+ ### 1. Population Seed
26
+ The starting point for your synthetic data.
27
+ * Contains the existing columns you wish to "expand".
28
+
29
+ ### 2. Reference Distributions
30
+ Each entry in the dictionary must be a DataFrame containing:
31
+ * **Shared Features**: (e.g., `age`, `location`) to match against the seed.
32
+ * **Target Column**: The attribute being generated.
33
+
34
+ ---
35
+
36
+ ## 💻 Example Usage
37
+
38
+ ```python
39
+ from pandas import DataFrame
40
+ from numpy import nan
41
+ from syspop.model.stochastic_impute import stochastic_impute
42
+ from syspop.postp.vis import plot_distribution
43
+
44
+ # ---------------------------------
45
+ # 1. Define base aggregated population data (e.g., from a census)
46
+ # For example, a total of 50 + 60 + 70 people in this example
47
+ # ---------------------------------
48
+ base_population_data = DataFrame(
49
+ {
50
+ "gender": [1, 2, 1],
51
+ "age": [25, 30, 40],
52
+ "value": [50, 60, 70],
53
+ }
54
+ )
55
+
56
+ # ---------------------------------
57
+ # 2. Define reference aggregated data (e.g., Work Status distribution)
58
+ # For example, a total of 8 + 2 + 6 + 4 people in this reference data
59
+ # ---------------------------------
60
+ income_data = DataFrame(
61
+ {
62
+ "gender": [1, 1, 2, 2],
63
+ "age": [25, 30, 25, nan],
64
+ "work_status": [1, 2, 1, 2],
65
+ "income": [50000, 60000, 55000, 45000],
66
+ "value": [8, 2, 6, 4],
67
+ }
68
+ )
69
+
70
+ # ---------------------------------
71
+ # 3. Combine data into a dictionary for the imputation process
72
+ # ---------------------------------
73
+ data = {"seed": base_population_data, "income": income_data}
74
+
75
+ # -------------------------------------------------------------------------
76
+ # 4. Imputation Task Configuration
77
+ #
78
+ # OBJECTIVE:
79
+ # Define imputation models where 'features' (age, gender)
80
+ # predict 'targets' (work_status and income).
81
+ #
82
+ # HANDLING SAMPLE SIZE DISCREPANCIES (Sparsity Alignment):
83
+ # The source datasets (e.g., income) sometimes have smaller populations
84
+ # than the seed population (180 records). To maintain statistical
85
+ # consistency, we introduce NaNs into the output for matching
86
+ # reference data in 'drop_list'.
87
+ #
88
+ # Example: If the income dataset only contains 20 records, we randomly
89
+ # retain only 20 income/work_status in the output population and set the remaining
90
+ # 160 to NaN
91
+ # -------------------------------------------------------------------------
92
+ task_list = {
93
+ "income": {
94
+ "targets": {"work_status": "category", "income": "numeric"},
95
+ "features": ["age", "gender"],
96
+ }
97
+ }
98
+ drop_list = ["income"]
99
+ # ---------------------------------
100
+ # 5. Run the stochastic imputation process
101
+ # ---------------------------------
102
+ syn_pop = stochastic_impute(data, task_list)
103
+
104
+ # ---------------------------------
105
+ # 6. Plot distribution
106
+ # ---------------------------------
107
+ # 6.1 Plot distribution for age
108
+ plot_distribution(syn_pop, ["age"])
109
+ # 6.2 Plot joint distribution for gender + age
110
+ plot_distribution(syn_pop, ["gender", "age"])
111
+ ```
112
+
113
+ <a name="faq"></a>
114
+ ## 🧠 FAQ
115
+ ### Is generating synthetic unit-record data in this way actually accurate?
116
+ Well, it depends on your use case and the quality of your inputs. Ideally, you should work with real unit-record data. However, in practice, this isn't always feasible (i.e., in New Zealand, if you live far away from a Stats NZ IDI data lab). This utility provides a method to statistically link different aggregated, published population benchmarks together. This allows you to build out full data pipelines and prototype products locally before taking your code into a restricted environment.
117
+
118
+ ### Maximizing accuracy?
119
+ The accuracy of the synthetic output depends heavily on task design. The more shared variables (covariates) present in your reference data to condition the probabilities, the closer the synthetic distribution will mirror reality.
120
+
121
+ Also, you can run the process multiple times to capture inherent uncertainties.
122
+
123
+ ### Are we doing any prediction modelling here ?
124
+ It is out of scope at the moment. If certain covariate values are missing from the reference data to condition the probabilities, the process simply ignores those missing values when linking the reference data to the seed data (for example, if the reference data does not contain income information for children, when integrating the reference data into the seed population data, the integrated data will just set the income for children as NaN). The reason is that many covariates in these datasets are categorical, and applying simple prediction models can struggle to capture the nuances and introduce unwanted noise or uncertainty into the output data. However, you are welcome to apply your own predictive models to handle missing data prior to running this process if your use case requires it.
@@ -0,0 +1,35 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "syspop_v2"
7
+ version = "0.4.0"
8
+ authors = [
9
+ { name="Sijin Zhang", email="zsjzyhzp@gmail.com" },
10
+ ]
11
+ description = "A package for stochastic population imputation"
12
+ readme = "README.md"
13
+ requires-python = ">=3.8"
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Operating System :: OS Independent",
18
+ ]
19
+ # List the external libraries your package needs to run
20
+ dependencies = [
21
+ "pandas",
22
+ "numpy",
23
+ "matplotlib",
24
+ "pyyaml",
25
+ "pyarrow"
26
+ ]
27
+
28
+ [project.urls]
29
+ "Homepage" = "https://github.com/jzanetti/Syspop"
30
+ "Bug Tracker" = "https://github.com/jzanetti/Syspop/issues"
31
+
32
+ [tool.setuptools.packages.find]
33
+ where = ["."] # Tells it to look in the current root directory
34
+ include = ["syspop*"] # Grabs your 'syspop' folder and everything inside it
35
+ exclude = ["examples*"] # Ensures the 'examples' folder doesn't get published
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,17 @@
1
from setuptools import setup, find_packages

setup(
    name="syspop_v2",
    version="0.4.0",  # Keep in sync with pyproject.toml
    packages=find_packages(),
    install_requires=[
        "pandas",
        "numpy",
        # BUG FIX: the original list was missing the comma after "pyarrow",
        # so "pyarrow" "pyyaml" concatenated into the bogus requirement
        # "pyarrowpyyaml" and pyyaml was never declared.
        "pyarrow",
        "pyyaml",
        "matplotlib",
    ],
    author="Sijin Zhang",
    description="Creating synthetic population",
    python_requires=">=3.8",
)
File without changes
File without changes
@@ -0,0 +1,93 @@
1
+ from syspop.data.query import obtain_stats_data
2
+ from syspop.data.utils import stats_data_proc
3
+ from pandas import DataFrame as pdDataFrame
4
+ from sklearn.preprocessing import LabelEncoder
5
+
6
+
7
def obtain_data(cfg: dict, api_key: str):
    """
    Obtain and filter population statistics data for one configured table.

    The function performs the following steps:
    1. Fetches raw statistics data using the API configuration and key.
    2. Renames/cleans the raw data according to the configuration.
    3. Keeps only rows matching the "inclusion" criteria (if provided).
    4. Drops rows matching the "exclusion" criteria (if provided).
    5. Returns only the columns listed in ``cfg["map"]``.

    Args:
        cfg (dict): Configuration dictionary containing API settings ("api"),
            a column-name mapping ("map"), and optional "inclusion" /
            "exclusion" criteria (each may be None to skip filtering).
        api_key (str): API key for accessing the statistics data.

    Returns:
        pandas.DataFrame: Filtered DataFrame restricted to the mapped columns
        (including the "value" column when it is part of the mapping).
    """

    def _obtain_qc_key(cfg, proc_qc_type):
        # Translate a QC criterion name through the column mapping when
        # possible; otherwise assume it is already a column name.
        if proc_qc_type in cfg["map"]:
            return cfg["map"][proc_qc_type]
        return proc_qc_type

    data_pop = obtain_stats_data(cfg["api"], api_key=api_key)
    data_pop = stats_data_proc(data_pop, cfg)

    # cfg["inclusion"] may be None; iterating None raises TypeError,
    # which we treat as "no inclusion filter configured".
    try:
        for proc_qc_type in cfg["inclusion"]:
            proc_qc_key = _obtain_qc_key(cfg, proc_qc_type)
            data_pop = data_pop[
                data_pop[proc_qc_key].isin(cfg["inclusion"][proc_qc_type])
            ]
    except TypeError:
        pass

    # Same convention for cfg["exclusion"]: None means "no exclusion filter".
    try:
        for proc_qc_type in cfg["exclusion"]:
            proc_qc_key = _obtain_qc_key(cfg, proc_qc_type)
            data_pop = data_pop[
                ~data_pop[proc_qc_key].isin(cfg["exclusion"][proc_qc_type])
            ]
    except TypeError:
        pass

    return data_pop[list(cfg["map"].values())]
57
+
58
+
59
def encode_weights(data_dict) -> dict:
    """
    Normalise every table in *data_dict* for downstream imputation.

    The "seed" table is expanded into unit records by repeating each row
    ``value`` times; every other table is aggregated over its non-"value"
    columns and given a ``probability`` column proportional to the summed
    counts. The dictionary is updated in place and also returned.
    """
    for name in data_dict:
        table = data_dict[name]

        if name == "seed":
            # Expand aggregated counts into one row per individual.
            expanded = table.loc[table.index.repeat(table["value"])].copy()
            data_dict[name] = expanded.reset_index(drop=True).drop(
                columns=["value"]
            )
        else:
            # Collapse duplicate feature rows, then turn counts into weights.
            feature_cols = [col for col in table.columns if col != "value"]
            summed = table.groupby(feature_cols, as_index=False)["value"].sum()
            summed["probability"] = summed["value"] / summed["value"].sum()
            data_dict[name] = summed.reset_index(drop=True).drop(
                columns=["value"]
            )

    return data_dict
81
+
82
+
83
def encode_sum(data_dict, drop_list) -> dict:
    """
    Return the total of the "value" column for every table named in
    *drop_list*; tables not listed are ignored.
    """
    totals = {}

    for name in data_dict:
        if name in drop_list:
            totals[name] = data_dict[name]["value"].sum()

    return totals
@@ -0,0 +1,73 @@
1
+ from requests import get, exceptions
2
+ import xml.etree.ElementTree as ET
3
+ from pandas import DataFrame
4
+ from pandas import to_numeric
5
+
6
+
7
def stats_data_proc(data):
    """
    Clean the "value" column: coerce it to numeric (non-numeric entries
    become NaN), drop rows with any missing field, and cast the surviving
    counts to int.

    Operates on a copy so the caller's DataFrame is left untouched.

    Args:
        data (pandas.DataFrame): Raw statistics data with a "value" column.

    Returns:
        pandas.DataFrame: Cleaned data with an integer "value" column.
    """
    # BUG FIX: the original wrote the coerced column back into the caller's
    # frame before dropna(), mutating the input as a side effect.
    data = data.copy()
    data["value"] = to_numeric(data["value"], errors="coerce")
    data = data.dropna()
    data["value"] = data["value"].astype(int)
    return data
12
+
13
+
14
def obtain_stats_data(api_url: str, api_key: "str | None" = None):
    """
    Fetch SDMX 2.1 generic XML data from the Stats NZ API and return it as a
    DataFrame.

    Args:
        api_url: Full query URL for the statistics API.
        api_key: Stats NZ subscription key; required. (The original
            annotation ``str or None`` evaluates to plain ``str`` at runtime;
            the quoted union form is correct and Python 3.8 compatible.)

    Returns:
        pandas.DataFrame: One row per SDMX observation, with one column per
        dimension in the observation key plus an "OBS_VALUE" column.

    Raises:
        Exception: If no API key is provided.

    Note:
        On HTTP/request failures or an empty response this function prints a
        diagnostic and calls ``exit()`` (raising SystemExit) instead of
        returning.
    """

    if api_key is None:
        raise Exception("No proper Stats API is provided")

    # Set headers with the API key
    headers = {"Ocp-Apim-Subscription-Key": api_key}
    # Make the GET request
    try:
        response = get(api_url, headers=headers)
        response.raise_for_status()  # Check for HTTP errors
    except exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        if response.status_code == 401:
            print(
                "Authentication failed. Please check your API key or obtain a valid one from https://api.data.stats.govt.nz/"
            )
        elif response.status_code == 400:
            print(
                "Bad Request. The URL or dimensions may be invalid. Check the API documentation or simplify the query."
            )
        print(f"Raw response: {response.text}")
        exit()
    except exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        exit()

    # Check if response is empty
    if not response.text:
        print("Error: Empty response received from the API.")
        exit()
    xml_text = response.text

    # Parse XML
    root = ET.fromstring(xml_text)

    # SDMX 2.1 namespaces used by the generic data format
    ns = {
        "generic": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic",
        "message": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message",
    }

    rows = []
    # Iterate over all Obs elements
    for obs in root.findall(".//generic:Obs", ns):
        row = {}

        # Extract all key/value pairs from ObsKey
        for val in obs.findall("./generic:ObsKey/generic:Value", ns):
            row[val.attrib["id"]] = val.attrib["value"]

        # Extract the observed value
        obs_value = obs.find("./generic:ObsValue", ns)
        if obs_value is not None:
            row["OBS_VALUE"] = obs_value.attrib["value"]

        rows.append(row)

    # Convert to DataFrame
    return DataFrame(rows)
@@ -0,0 +1,80 @@
1
+ from process.data.data import obtain_data
2
+ from logging import info as log_info
3
+ from etc.sample_data.api_keys import STATS_API
4
+ from yaml import safe_load
5
+ from pickle import dump as pickle_dump
6
+ from pickle import load as pickle_load
7
+ from process.model.utils import obtain_all_tasks
8
+
9
+
10
def obtain_sample_data_cfg():
    """Read the sample-data YAML configuration and return its "tables" section."""
    with open("etc/sample_data/sample_data_cfg.yml", "r") as config_file:
        full_cfg = safe_load(config_file)
    return full_cfg["tables"]
15
+
16
+
17
def obtain_sample_api_key(api_key: "str | None" = None):
    """
    Return the API key to use, falling back to the bundled STATS_API key.

    (The original annotation ``str or None`` evaluates to plain ``str`` at
    runtime; the quoted union form is correct and Python 3.8 compatible.)

    Args:
        api_key: Explicit API key, or None to use the packaged default.

    Returns:
        str: The resolved API key.
    """
    if api_key is None:
        api_key = STATS_API

    return api_key
22
+
23
+
24
def load_sample_data(
    # Tuple default avoids the mutable-default-argument pitfall; callers may
    # still pass a list.
    data_types=(
        "seed",
        "industry",
        "occupation",
        "occupation_income",
        "industry_income",
        "travel_to_work",
        "work_hours",
    ),
    refresh: bool = False,
):
    """
    Load (or rebuild) the bundled sample data and the matching task list.

    Args:
        data_types: Data type names to retrieve when refreshing. Defaults to
            every table shipped with the sample configuration.
        refresh (bool): If False, read the cached pickle at
            ``etc/sample_data/sample_data.pkl``. If True, re-download every
            requested table through the Stats API and overwrite the cache.

    Returns:
        tuple: ``(data_dict, task_list)`` where ``data_dict`` maps each data
        type to its retrieved data, and ``task_list`` is built from
        ``etc/sample_data/sample_model_cfg.yml``.

    Raises:
        KeyError: If a requested data type is missing from the configuration.
        Exception: Propagates errors from API-key retrieval or data fetching.
    """

    if not refresh:
        # Use context managers so the file handles are closed promptly
        # (the original passed open(...) directly to pickle, leaking handles).
        with open("etc/sample_data/sample_data.pkl", "rb") as fid:
            data_dict = pickle_load(fid)
    else:
        api_key = obtain_sample_api_key()
        cfg_data = obtain_sample_data_cfg()

        data_dict = {}
        for data_type in data_types:

            log_info(f"Obtaining data for type: {data_type}")

            data_dict[data_type] = obtain_data(cfg_data[data_type], api_key)

        with open("etc/sample_data/sample_data.pkl", "wb") as fid:
            pickle_dump(data_dict, fid)

    with open("etc/sample_data/sample_model_cfg.yml", "r") as fid:
        model_cfg = safe_load(fid)

    task_list = obtain_all_tasks(model_cfg["tasks"], model_cfg["cfg"])

    return data_dict, task_list
@@ -0,0 +1,81 @@
1
+ from pandas import to_numeric
2
+ from pandas import DataFrame
3
+
4
+
5
def stats_data_proc(data: DataFrame, cfg: dict):
    """
    Rename the raw columns via ``cfg["map"]`` and clean the "value" column:
    non-numeric entries become NaN, rows with any missing field are dropped,
    and the surviving counts are cast to int.
    """
    renamed = data.rename(columns=cfg["map"])

    renamed["value"] = to_numeric(renamed["value"], errors="coerce")
    cleaned = renamed.dropna()
    cleaned["value"] = cleaned["value"].astype(int)

    return cleaned
14
+
15
+
16
def check_data_consistency(
    data_dict: dict,
    check_err: bool = True,
    throw_err: bool = False,
    output_dir: "str | None" = None,
):
    """
    Compare the unique values of every column across all datasets.

    For each dataset/column pair, the sorted unique values are recorded in a
    summary table (optionally written to ``<output_dir>/data_consistency.csv``).
    When ``check_err`` is True, any column whose set of unique values differs
    between datasets is reported: with ``throw_err`` a ValueError is raised
    immediately, otherwise the user is asked interactively whether to continue.

    Args:
        data_dict (dict): Mapping of dataset name -> DataFrame.
        check_err (bool): Whether to verify cross-dataset consistency.
        throw_err (bool): Raise instead of prompting on inconsistency.
        output_dir: Directory for the CSV summary, or None to skip writing.
            (The original annotation ``str or None`` evaluates to plain
            ``str``; the quoted union form is correct on Python 3.8.)

    Raises:
        ValueError: If a column is inconsistent and either ``throw_err`` is
            set or the user declines to continue.
    """
    rows = []

    for key, df in data_dict.items():
        for col in df.columns:
            unique_list = df[col].unique()
            # Build a canonical, sorted, comma-separated string of the unique
            # values so identical value sets compare equal as text.
            # NOTE(review): values containing "," would be split apart here —
            # assumed not to occur in this data; confirm upstream.
            unique_str = ", ".join(map(str, unique_list))
            unique_str = [item.strip() for item in unique_str.split(",")]
            unique_str.sort()
            unique_str = ", ".join(unique_str)

            unique_count = len(unique_list)
            rows.append(
                {
                    "data_key": key,
                    "cols": col,
                    "unique_count": unique_count,
                    "unique_value": unique_str,
                }
            )

    summary_df = DataFrame(rows)
    if output_dir is not None:
        summary_df.to_csv(f"{output_dir}/data_consistency.csv", index=False)

    if check_err:
        all_cols = summary_df["cols"].unique()

        for proc_col in all_cols:
            proc_series = summary_df[summary_df["cols"] == proc_col]["unique_value"]

            # Compare value *sets* rather than raw strings so ordering
            # differences cannot trigger false positives.
            proc_series = [set(val.split(", ")) for val in proc_series]

            all_same = all(s == proc_series[0] for s in proc_series)

            if not all_same:
                print(
                    summary_df[summary_df["cols"] == proc_col][
                        ["data_key", "cols", "unique_value"]
                    ]
                )

                if throw_err:
                    raise ValueError(
                        f"Column '{proc_col}' has different unique values across datasets."
                    )

                # Ask the user if they want to continue
                user_choice = (
                    input(
                        f"\nColumn '{proc_col}' has different unique values "
                        + "across datasets. Continue? [Y/N]: "
                    )
                    .strip()
                    .upper()
                )
                if user_choice != "Y":
                    raise ValueError(
                        f"Execution halted by user. Column '{proc_col}' has "
                        + "different unique values across datasets."
                    )
@@ -0,0 +1 @@
1
# Presumably the fraction of records held out for testing (10%) when fitting
# models — TODO confirm against the model-training code that reads it.
MODEL_TRAINING_TEST_RATIO = 0.1