globalmind 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
1
+ # Byte-compiled / cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .pytest_cache/
6
+
7
+ # Build
8
+ dist/
9
+ build/
10
+ *.egg
11
+ *.whl
12
+
13
+ # Virtual environments
14
+ .venv/
15
+ venv/
16
+ env/
17
+
18
+ # Hatch
19
+ .hatch/
20
+
21
+ # Coverage
22
+ htmlcov/
23
+ .coverage
24
+ .coverage.*
25
+ coverage.xml
26
+
27
+ # IDE & editor
28
+ .vscode/
29
+ .idea/
30
+ *.swp
31
+ *.swo
32
+ *~
33
+
34
+ # OS
35
+ .DS_Store
36
+ Thumbs.db
37
+ .directory
38
+
39
+ # Jupyter
40
+ .ipynb_checkpoints/
41
+ *.ipynb
42
+
43
+ # Environment
44
+ .env
45
+ .env.*
46
+
47
+ # Data (large CSV / parquet — do not commit raw or processed data)
48
+ *.csv
49
+ *.parquet
50
+ *.processed.parquet
51
+
52
+ # Tests — not included in distribution
53
+ tests/
54
+
55
+ # Python type stubs
56
+ *.pyi
57
+
58
+ # Logs
59
+ *.log
@@ -0,0 +1,18 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026-present Nianyu Su <mirakelor@outlook.com>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
6
+ associated documentation files (the "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the
9
+ following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be included in all copies or substantial
12
+ portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
15
+ LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
16
+ EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18
+ USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.4
2
+ Name: globalmind
3
+ Version: 0.0.1
4
+ Summary: GMP (Global Mind Project) mental health data analysis toolkit
5
+ Project-URL: Documentation, https://github.com/Mirakelor/globalmind#readme
6
+ Project-URL: Issues, https://github.com/Mirakelor/globalmind/issues
7
+ Project-URL: Source, https://github.com/Mirakelor/globalmind
8
+ Author-email: Nianyu Su <mirakelor@outlook.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE.txt
11
+ Keywords: DSM-5,GMP,Global-Mind-Project,MHQ,Mind-Health-Quotient,data-science,mental-health,polars,psychometrics,public-health,questionnaire,survey-analysis,wellbeing
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Programming Language :: Python
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: Implementation :: CPython
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: polars>=1.0.0
20
+ Description-Content-Type: text/markdown
21
+
22
+ # globalmind
23
+
24
+ [![PyPI - Version](https://img.shields.io/pypi/v/globalmind.svg)](https://pypi.org/project/globalmind)
25
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/globalmind.svg)](https://pypi.org/project/globalmind)
26
+
27
+ -----
28
+
29
+ Python pipeline for the [Global Mind Project (GMP)](https://sapienlabs.org/global-mind-project/) — ingest, filter, profile, and
30
+ classify mental health data collected through the Mind Health Quotient (MHQ).
31
+
32
+ ## Background
33
+
34
+ The Global Mind database contains mental health profiles from nearly 2 million
35
+ internet-enabled respondents across 130+ countries in 17+ languages, with
36
+ 1,000–2,000 new responses added each day. Data are collected using the **Mind
37
+ Health Quotient (MHQ)** — an online assessment developed from a review of over
38
+ 10,000 questions drawn from 126 commonly used mental health tools spanning 10
39
+ disorders. The MHQ consists of 47 items that rate individual aspects of mind
40
+ health on a 1–9 scale, together with aggregate scores, demographics, and
41
+ lifestyle/life‑context factors.
42
+
43
+ `globalmind` provides a pure‑Polars pipeline to go from raw CSV exports to
44
+ DSM‑5 diagnostic classifications in a few lines of code. All operations are
45
+ **lazy** (build a query plan, `.collect()` once at the end) for memory‑safe
46
+ processing of the full dataset.
47
+
48
+ ## Features
49
+
50
+ ### Data loading & cleaning
51
+ - **`read_table(path)`** — scan CSV with automatic N/A → null conversion,
52
+ pipe‑delimited multi‑select splitting (21 columns), and stray‑pipe cleanup on
53
+ single‑select categoricals.
54
+ - **`clean_data(df)`** — apply four quality filters:
55
+ - completion time ≥ 7 minutes
56
+ - response variance across 47 rating items (std dev ≥ 0.2)
57
+ - comprehension check (`understanding` ≠ "No")
58
+ - countries with ≥ 1,000 responses
59
+
60
+ ### 205‑column schema
61
+ - **`COLUMN_DESCRIPTIONS`** — dictionary mapping every column name to an
62
+ English description with a Chinese gloss.
63
+ - **`describe_column(name)`** — lookup helper.
64
+
65
+ ### Symptom identification & DSM‑5 mapping
66
+ - **`identify_symptoms(df)`** — flags each of the 47 MHQ items as a clinical
67
+ symptom per DSM‑5 thresholds:
68
+ - *Problem items* (20): threshold ≥ 8 on a 1–9 severity scale
69
+ - *Spectrum items* (27): threshold ≤ 1 (challenge end of the spectrum)
70
+ - Adds 47 `_symptom` boolean columns + a `symptom_count` column.
71
+ - **`mapping_to_DSM5(df)`** — data‑driven rule engine classifying 10 disorder
72
+ categories: Depression, Anxiety, Bipolar, PTSD, OCD, Schizophrenia, Eating
73
+ Disorder, Addiction, ADHD, ASD.
74
+
75
+ ## Installation
76
+
77
+ ```console
78
+ pip install globalmind
79
+ ```
80
+
81
+ Requires Python ≥ 3.10 and `polars ≥ 1.0`.
82
+
83
+ ## Quick start
84
+
85
+ ```python
86
+ from globalmind import (
87
+ read_table, clean_data,
88
+ identify_symptoms, mapping_to_DSM5,
89
+ )
90
+
91
+ df = read_table("gmp_data.csv")
92
+ df = clean_data(df)
93
+ df = identify_symptoms(df)
94
+ df = mapping_to_DSM5(df)
95
+ df.collect() # all operations are lazy
96
+ ```
97
+
98
+ ## References
99
+
100
+ - **Data cleaning criteria** — Bala, Jerzy, Oleksii Sukhoi, Jennifer Jane
101
+ Newson, Priscila Pereira Machado, Mark Lawrence, and Tara C. Thiagarajan.
102
+ "Estimation of the Nature and Magnitude of Mental Distress in the Population
103
+ Associated with Ultra-Processed Food Consumption." *Frontiers in Nutrition* 12
104
+ (November 2025): 1562286.
105
+ [https://doi.org/10.3389/fnut.2025.1562286](https://doi.org/10.3389/fnut.2025.1562286)
106
+ - **Symptom thresholds & DSM‑5 mapping** — Newson, Jennifer Jane, Vladyslav
107
+ Pastukh, and Tara C. Thiagarajan. "Poor Separation of Clinical Symptom
108
+ Profiles by DSM-5 Disorder Criteria." *Frontiers in Psychiatry* 12 (November
109
+ 2021): 775762.
110
+ [https://doi.org/10.3389/fpsyt.2021.775762](https://doi.org/10.3389/fpsyt.2021.775762)
111
+
112
+ ## License
113
+
114
+ `globalmind` is distributed under the terms of the
115
+ [MIT](https://spdx.org/licenses/MIT.html) license.
@@ -0,0 +1,94 @@
1
+ # globalmind
2
+
3
+ [![PyPI - Version](https://img.shields.io/pypi/v/globalmind.svg)](https://pypi.org/project/globalmind)
4
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/globalmind.svg)](https://pypi.org/project/globalmind)
5
+
6
+ -----
7
+
8
+ Python pipeline for the [Global Mind Project (GMP)](https://sapienlabs.org/global-mind-project/) — ingest, filter, profile, and
9
+ classify mental health data collected through the Mind Health Quotient (MHQ).
10
+
11
+ ## Background
12
+
13
+ The Global Mind database contains mental health profiles from nearly 2 million
14
+ internet-enabled respondents across 130+ countries in 17+ languages, with
15
+ 1,000–2,000 new responses added each day. Data are collected using the **Mind
16
+ Health Quotient (MHQ)** — an online assessment developed from a review of over
17
+ 10,000 questions drawn from 126 commonly used mental health tools spanning 10
18
+ disorders. The MHQ consists of 47 items that rate individual aspects of mind
19
+ health on a 1–9 scale, together with aggregate scores, demographics, and
20
+ lifestyle/life‑context factors.
21
+
22
+ `globalmind` provides a pure‑Polars pipeline to go from raw CSV exports to
23
+ DSM‑5 diagnostic classifications in a few lines of code. All operations are
24
+ **lazy** (build a query plan, `.collect()` once at the end) for memory‑safe
25
+ processing of the full dataset.
26
+
27
+ ## Features
28
+
29
+ ### Data loading & cleaning
30
+ - **`read_table(path)`** — scan CSV with automatic N/A → null conversion,
31
+ pipe‑delimited multi‑select splitting (21 columns), and stray‑pipe cleanup on
32
+ single‑select categoricals.
33
+ - **`clean_data(df)`** — apply four quality filters:
34
+ - completion time ≥ 7 minutes
35
+ - response variance across 47 rating items (std dev ≥ 0.2)
36
+ - comprehension check (`understanding` ≠ "No")
37
+ - countries with ≥ 1,000 responses
38
+
39
+ ### 205‑column schema
40
+ - **`COLUMN_DESCRIPTIONS`** — dictionary mapping every column name to an
41
+ English description with a Chinese gloss.
42
+ - **`describe_column(name)`** — lookup helper.
43
+
44
+ ### Symptom identification & DSM‑5 mapping
45
+ - **`identify_symptoms(df)`** — flags each of the 47 MHQ items as a clinical
46
+ symptom per DSM‑5 thresholds:
47
+ - *Problem items* (20): threshold ≥ 8 on a 1–9 severity scale
48
+ - *Spectrum items* (27): threshold ≤ 1 (challenge end of the spectrum)
49
+ - Adds 47 `_symptom` boolean columns + a `symptom_count` column.
50
+ - **`mapping_to_DSM5(df)`** — data‑driven rule engine classifying 10 disorder
51
+ categories: Depression, Anxiety, Bipolar, PTSD, OCD, Schizophrenia, Eating
52
+ Disorder, Addiction, ADHD, ASD.
53
+
54
+ ## Installation
55
+
56
+ ```console
57
+ pip install globalmind
58
+ ```
59
+
60
+ Requires Python ≥ 3.10 and `polars ≥ 1.0`.
61
+
62
+ ## Quick start
63
+
64
+ ```python
65
+ from globalmind import (
66
+ read_table, clean_data,
67
+ identify_symptoms, mapping_to_DSM5,
68
+ )
69
+
70
+ df = read_table("gmp_data.csv")
71
+ df = clean_data(df)
72
+ df = identify_symptoms(df)
73
+ df = mapping_to_DSM5(df)
74
+ df.collect() # all operations are lazy
75
+ ```
76
+
77
+ ## References
78
+
79
+ - **Data cleaning criteria** — Bala, Jerzy, Oleksii Sukhoi, Jennifer Jane
80
+ Newson, Priscila Pereira Machado, Mark Lawrence, and Tara C. Thiagarajan.
81
+ "Estimation of the Nature and Magnitude of Mental Distress in the Population
82
+ Associated with Ultra-Processed Food Consumption." *Frontiers in Nutrition* 12
83
+ (November 2025): 1562286.
84
+ [https://doi.org/10.3389/fnut.2025.1562286](https://doi.org/10.3389/fnut.2025.1562286)
85
+ - **Symptom thresholds & DSM‑5 mapping** — Newson, Jennifer Jane, Vladyslav
86
+ Pastukh, and Tara C. Thiagarajan. "Poor Separation of Clinical Symptom
87
+ Profiles by DSM-5 Disorder Criteria." *Frontiers in Psychiatry* 12 (November
88
+ 2021): 775762.
89
+ [https://doi.org/10.3389/fpsyt.2021.775762](https://doi.org/10.3389/fpsyt.2021.775762)
90
+
91
+ ## License
92
+
93
+ `globalmind` is distributed under the terms of the
94
+ [MIT](https://spdx.org/licenses/MIT.html) license.
@@ -0,0 +1,73 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "globalmind"
7
+ dynamic = ["version"]
8
+ description = 'GMP (Global Mind Project) mental health data analysis toolkit'
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ keywords = [
13
+ "GMP", "Global-Mind-Project", "MHQ", "Mind-Health-Quotient",
14
+ "mental-health", "wellbeing", "DSM-5", "psychometrics",
15
+ "questionnaire", "survey-analysis", "public-health",
16
+ "polars", "data-science",
17
+ ]
18
+ authors = [
19
+ { name = "Nianyu Su", email = "mirakelor@outlook.com" },
20
+ ]
21
+ classifiers = [
22
+ "Development Status :: 4 - Beta",
23
+ "Programming Language :: Python",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ "Programming Language :: Python :: Implementation :: CPython",
28
+ ]
29
+ dependencies = [
30
+ "polars>=1.0.0"
31
+ ]
32
+
33
+ [project.urls]
34
+ Documentation = "https://github.com/Mirakelor/globalmind#readme"
35
+ Issues = "https://github.com/Mirakelor/globalmind/issues"
36
+ Source = "https://github.com/Mirakelor/globalmind"
37
+
38
+ [tool.hatch.version]
39
+ path = "src/globalmind/__about__.py"
40
+
41
+ [tool.hatch.envs.types]
42
+ extra-dependencies = [
43
+ "mypy>=1.0.0",
44
+ ]
45
+ [tool.hatch.envs.types.scripts]
46
+ check = "mypy --install-types --non-interactive {args:src/globalmind tests}"
47
+ [tool.hatch.envs.test]
48
+ dependencies = [
49
+ "pytest>=8.0.0",
50
+ "pytest-cov>=5.0.0",
51
+ ]
52
+ [tool.hatch.envs.test.scripts]
53
+ test = "pytest -v {args:tests}"
54
+ cov = "pytest --cov=src/globalmind --cov-report=term-missing {args:tests}"
55
+
56
+ [tool.coverage.run]
57
+ source_pkgs = ["globalmind", "tests"]
58
+ branch = true
59
+ parallel = true
60
+ omit = [
61
+ "src/globalmind/__about__.py",
62
+ ]
63
+
64
+ [tool.coverage.paths]
65
+ globalmind = ["src/globalmind", "*/globalmind/src/globalmind"]
66
+ tests = ["tests", "*/globalmind/tests"]
67
+
68
+ [tool.coverage.report]
69
+ exclude_lines = [
70
+ "no cov",
71
+ "if __name__ == .__main__.:",
72
+ "if TYPE_CHECKING:",
73
+ ]
@@ -0,0 +1,4 @@
1
+ # SPDX-FileCopyrightText: 2026-present Nianyu Su <mirakelor@outlook.com>
2
+ #
3
+ # SPDX-License-Identifier: MIT
4
+ __version__ = "0.0.1"
@@ -0,0 +1,20 @@
1
+ # SPDX-FileCopyrightText: 2026-present Nianyu Su <mirakelor@outlook.com>
2
+ #
3
+ # SPDX-License-Identifier: MIT
4
+
5
+ """GlobalMind: GMP (Global Mind Project) data analysis toolkit."""
6
+
7
+ from globalmind.data import read_table, clean_data
8
+ from globalmind.schema import COLUMN_DESCRIPTIONS, describe_column
9
+ from globalmind.symptom import identify_symptoms, mapping_to_DSM5
10
+ from globalmind.__about__ import __version__
11
+
12
+ __all__ = [
13
+ "read_table",
14
+ "clean_data",
15
+ "COLUMN_DESCRIPTIONS",
16
+ "describe_column",
17
+ "identify_symptoms",
18
+ "mapping_to_DSM5",
19
+ "__version__",
20
+ ]
@@ -0,0 +1,123 @@
1
+ from pathlib import Path
2
+ import polars as pl
3
+
4
+ # Columns whose values are pipe-delimited multi-select strings.
5
# Multi-select answer columns: each cell stores every chosen option in a single
# string, with the options separated by "|". read_table() splits these into
# lists. The order below is the order in which the columns are processed.
_PIPE_DELIMITED_COLS: list[str] = [
    # nature, sleep, work
    "live_close_nature", "time_nature", "sleep_problem_type", "work_factors",
    # substances, medical conditions, treatment
    "substance_use", "medical_condition_type", "treatment_type_new", "treatment_type",
    # trauma, friends, family, help
    "trauma_childhood", "trauma_adulthood", "friendship_type", "parental_support",
    "help_seeking", "mental_health_disorder",
    # internet, social media, AI
    "internet_restrictions", "sm_impact", "ai_use_general", "ai_use_social",
    "ai_impact_personal", "ai_impact_work", "immersion_nature",
]
28
+
29
+ # Single-select categorical columns whose values may contain stray "|" artifacts that need cleaning.
30
# Single-select categorical columns that are scrubbed of "|" characters by
# read_table(): leading/trailing pipes are stripped and any remaining run of
# pipes is replaced with a comma.
_SINGLE_SELECT_PIPE_COLS: list[str] = [
    "ethnicity", "income_household", "city", "country",
    "employment_sector", "job_sector", "work_activity", "family_situation",
]
40
+
41
+ # Rating questions (1-9 scale): the 47 MHQ items, i.e. every MHQ column except `understanding` and the categorical _type cols
42
# The 47 rated MHQ item columns. read_table() parses them as Float64 and
# clean_data() computes the per-respondent standard deviation across them.
_RATING_COLS: list[str] = [
    "adapt_to_change",
    "self_worth_confidence",
    "creativity_problem_solving",
    "drive_motivation",
    "stability_calmness",
    "sleep_quality",
    "self_control_impulsivity",
    "ability_learn",
    "coordination",
    "relationships",
    "emotional_resilience",
    "planning_organization",
    "physical_intimacy",
    "speech_language",
    "memory",
    "social_cooperation",
    "decision_risk",
    "curiosity_enthusiasm",
    "energy",
    "emotional_control",
    "focus_concentration",
    "appetite_regulation",
    "empathy",
    "sensory_sensitivity",
    "self_image",
    "outlook_optimism",
    "selective_attention",
    "restless_hyperactive",
    "fear_anxiety",
    "infections",
    "aggression",
    "avoidance",
    "obsessive_thoughts",
    "mood_swings",
    "detached_reality",
    "nightmares",
    "addictions",
    "anger",
    "suicidal_thoughts",
    "pain",
    "guilt_blame",
    "hallucinations",
    # NOTE(review): the schema mapping uses this same "flashblacks" key, so the
    # spelling presumably mirrors the raw export header — confirm before renaming.
    "flashblacks",
    "repetitive_actions",
    "sad_hopeless",
    "physical_health",
    "confusion",
]
56
+
57
def read_table(path: str) -> pl.LazyFrame:
    """Lazily scan a GMP CSV export and normalise its pipe-encoded columns.

    The scan reads every column as a string (``infer_schema_length=0``) except
    the rating and aggregate-score columns, which are forced to ``Float64``.
    The markers "N/A", "NA", "null", "" and "Prefer not to say" become nulls.
    Multi-select columns are split on "|" into lists with blank entries
    removed, and single-select categorical columns have stray pipes cleaned.

    Args:
        path (str): The path to the CSV file.

    Returns:
        pl.LazyFrame: The lazy query plan; nothing is read until it is collected.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        NotImplementedError: If the file extension is not ``.csv``.
    """
    source = Path(path)

    # Guard clauses: existence is checked before the format, so a missing
    # non-CSV path reports FileNotFoundError.
    if not source.exists():
        raise FileNotFoundError(f"File not found: {path}")
    if source.suffix.lower() != ".csv":
        raise NotImplementedError(f"Unsupported file format: {source.suffix}")

    score_cols = [
        "overall_mhq_score",
        "cognition_score",
        "adapt_resilience_score",
        "drive_motivation_score",
        "mood_outlook_score",
        "social_self_score",
        "mind_body_score",
    ]
    numeric_overrides = {name: pl.Float64 for name in _RATING_COLS + score_cols}

    frame = pl.scan_csv(
        path,
        infer_schema_length=0,
        null_values=["N/A", "NA", "null", "", "Prefer not to say"],
        schema_overrides=numeric_overrides,
    )

    # Multi-select columns: "a|b|c" -> ["a", "b", "c"], dropping blank pieces.
    multi_select_exprs = []
    for name in _PIPE_DELIMITED_COLS:
        piece = pl.element()
        if name == "substance_use":
            # Options in this column carry "##PIPE##" where a literal "|" is
            # meant; restore it once the split has been done.
            piece = piece.str.replace_all("##PIPE##", "|")
        non_blank = piece.filter(pl.element().str.strip_chars() != "")
        multi_select_exprs.append(
            pl.col(name).str.split("|").list.eval(non_blank).alias(name)
        )
    frame = frame.with_columns(multi_select_exprs)

    # Single-select columns: trim edge pipes, turn inner runs of pipes into ",".
    single_select_exprs = [
        pl.col(name).str.strip_chars("|").str.replace_all(r"\|+", ",").alias(name)
        for name in _SINGLE_SELECT_PIPE_COLS
    ]
    return frame.with_columns(single_select_exprs)
95
+
96
def clean_data(
    df: pl.LazyFrame,
    *,
    min_minutes: int = 7,
    min_rating_std: float = 0.2,
    min_country_responses: int = 1000,
) -> pl.LazyFrame:
    """Apply the GMP data quality filters.

    Removes records if:
    - time_to_complete < ``min_minutes`` (default 7 minutes)
    - same option selected for all rating questions, i.e. the standard
      deviation across the rating columns is < ``min_rating_std`` (default 0.2)
    - responded 'No' to 'Did you find this assessment easy to understand?'
    - country has fewer than ``min_country_responses`` responses (default
      1,000), counted after the three filters above have been applied

    Records whose ``understanding`` answer is missing are kept: only an
    explicit "No" removes a record. Records with a missing completion time,
    no rating answers at all, or a missing country are removed, because the
    corresponding comparison or join key is null.

    Args:
        df: Lazy frame with the ``time_to_complete``, ``understanding`` and
            ``country`` columns plus all rating columns.
        min_minutes: Minimum completion time to keep a record.
        min_rating_std: Minimum standard deviation of the rating answers.
        min_country_responses: Minimum number of remaining records a country
            needs for its records to be kept.

    Returns:
        pl.LazyFrame: The filtered lazy frame with the same columns as ``df``.
    """
    # Completion time: first ":"-separated field * 60 + second field.
    # NOTE(review): this is minutes only if the value is formatted HH:MM[:SS];
    # for an MM:SS value it would be seconds — confirm against the raw export.
    time_parts = pl.col("time_to_complete").str.split(":")
    minutes = (
        time_parts.list.get(0).cast(pl.Int64) * 60
        + time_parts.list.get(1).cast(pl.Int64)
    )
    # Filtering on the expression directly avoids creating (and then dropping)
    # a helper column that could clobber a same-named column in the input.
    df = df.filter(minutes >= min_minutes)

    # Low-variance responders: near-identical answers across the rating items.
    ratings_std = pl.concat_list([pl.col(col) for col in _RATING_COLS]).list.std()
    df = df.filter(ratings_std >= min_rating_std)

    # Comprehension check. A plain `!= "No"` evaluates to null for null
    # answers and filter() would silently drop them; ne_missing treats null
    # as an ordinary value so only an explicit "No" is removed.
    df = df.filter(pl.col("understanding").ne_missing("No"))

    # Keep only countries that still have enough records after the filters above.
    country_counts = (
        df.group_by("country")
        .len()
        .filter(pl.col("len") >= min_country_responses)
        .select("country")
    )
    return df.join(country_counts, on="country", how="semi")
@@ -0,0 +1,348 @@
1
+ """Column name → human-readable description mapping for the GMP (Global Mind Project) dataset.
2
+
3
+ Grouped by topic for easy browsing and documentation generation.
4
+ """
5
+
6
+ # =============================================================================
7
+ # Meta & timestamps
8
+ # =============================================================================
9
+ META = {
10
+ "language": "Survey language (问卷语言)",
11
+ "start_date_utc": "Assessment start time in UTC (评估开始时间 UTC)",
12
+ "day": "Day of assessment (评估日期 日)",
13
+ "month": "Month of assessment (评估日期 月)",
14
+ "year": "Year of assessment (评估日期 年)",
15
+ "submit_data_utc": "Submission time in UTC (提交时间 UTC)",
16
+ "time_to_complete": "Time taken to complete the assessment (完成耗时)",
17
+ }
18
+
19
+ # =============================================================================
20
+ # MHQ aggregate scores (7)
21
+ # =============================================================================
22
+ MHQ_SCORES = {
23
+ "overall_mhq_score": "Overall MHQ score (MHQ 总得分)",
24
+ "cognition_score": "Cognition score (认知能力得分)",
25
+ "adapt_resilience_score": "Adaptation & resilience score (适应与韧性得分)",
26
+ "drive_motivation_score": "Drive & motivation score (驱动力与动机得分)",
27
+ "mood_outlook_score": "Mood & outlook score (情绪与展望得分)",
28
+ "social_self_score": "Social self score (社交自我得分)",
29
+ "mind_body_score": "Mind-body connection score (身心连接得分)",
30
+ }
31
+
32
+ # =============================================================================
33
+ # MHQ individual items: the 47 rated items plus `understanding` and the two categorical `_type` columns (50 keys)
34
+ # =============================================================================
35
+ MHQ_ITEMS = {
36
+ # --- Cognition ---
37
+ "understanding": "Understanding (理解能力)",
38
+ "ability_learn": "Ability to learn (学习能力)",
39
+ "memory": "Memory (记忆力)",
40
+ "creativity_problem_solving": "Creativity & problem solving (创造力与解决问题)",
41
+ "planning_organization": "Planning & organization (计划与组织)",
42
+ "selective_attention": "Selective attention (选择性注意力)",
43
+ "focus_concentration": "Focus & concentration (专注力)",
44
+ "speech_language": "Speech & language (言语与语言)",
45
+ "confusion": "Confusion (困惑感)",
46
+
47
+ # --- Mood & emotion ---
48
+ "stability_calmness": "Stability & calmness (情绪稳定与冷静)",
49
+ "emotional_resilience": "Emotional resilience (情绪韧性)",
50
+ "emotional_control": "Emotional control (情绪控制)",
51
+ "mood_swings": "Mood swings (情绪波动)",
52
+ "sad_hopeless": "Sadness & hopelessness (悲伤/绝望)",
53
+ "outlook_optimism": "Outlook & optimism (乐观展望)",
54
+ "fear_anxiety": "Fear & anxiety (恐惧/焦虑)",
55
+ "anger": "Anger (愤怒)",
56
+ "guilt_blame": "Guilt & self-blame (内疚/自责)",
57
+ "guilt_blame_type": "Guilt/blame trigger type (内疚/自责触发类型)",
58
+
59
+ # --- Self ---
60
+ "self_worth_confidence": "Self-worth & confidence (自我价值/自信)",
61
+ "self_image": "Self-image (自我形象)",
62
+ "drive_motivation": "Drive & motivation (驱动力与动机)",
63
+ "curiosity_enthusiasm": "Curiosity & enthusiasm (好奇心与热情)",
64
+ "energy": "Energy level (精力水平)",
65
+ "adapt_to_change": "Adaptability to change (适应变化能力)",
66
+ "decision_risk": "Decision-making & risk assessment (决策与风险评估)",
67
+
68
+ # --- Social ---
69
+ "relationships": "Relationship quality (人际关系质量)",
70
+ "social_cooperation": "Social cooperation (社交合作能力)",
71
+ "empathy": "Empathy (同理心)",
72
+ "coordination": "Coordination (协调能力)",
73
+ "physical_intimacy": "Physical intimacy (身体亲密)",
74
+ "aggression": "Aggression (攻击性)",
75
+ "avoidance": "Social avoidance (社交回避)",
76
+
77
+ # --- Mind-body ---
78
+ "sleep_quality": "Sleep quality (睡眠质量)",
79
+ "appetite_regulation": "Appetite regulation (食欲调节)",
80
+ "physical_health": "Physical health perception (身体健康感受)",
81
+ "self_control_impulsivity": "Self-control & impulsivity (自控力/冲动控制)",
82
+ "sensory_sensitivity": "Sensory sensitivity (感官敏感度)",
83
+ "pain": "Physical pain (身体疼痛)",
84
+
85
+ # --- Clinical ---
86
+ "restless_hyperactive": "Restlessness & hyperactivity (烦躁/多动)",
87
+ "obsessive_thoughts": "Obsessive thoughts (强迫思维)",
88
+ "obsessive_thoughts_type": "Obsessive thought type (强迫思维类型)",
89
+ "detached_reality": "Detached reality (现实感脱离)",
90
+ "nightmares": "Nightmares (噩梦)",
91
+ "hallucinations": "Hallucinations (幻觉)",
92
+ "flashblacks": "Flashbacks (闪回)",
93
+ "repetitive_actions": "Repetitive actions (重复行为)",
94
+ "addictions": "Addictive behaviors (成瘾行为)",
95
+ "infections": "Infections (感染)",
96
+ "suicidal_thoughts": "Suicidal thoughts (自杀念头)",
97
+ }
98
+
99
+ # =============================================================================
100
+ # Core demographics
101
+ # =============================================================================
102
+ DEMOGRAPHICS_CORE = {
103
+ "age": "Age group (年龄段)",
104
+ "biological_sex": "Biological sex (生理性别)",
105
+ "gender_diff": "Gender identity vs. biological sex difference (性别认同与生理性别差异)",
106
+ "gender_identity": "Gender identity (性别认同)",
107
+ "gender": "Gender (性别)",
108
+ "ethnicity": "Ethnicity (种族/民族)",
109
+ "country": "Country (国家)",
110
+ "state": "State / province / region (州/省/地区)",
111
+ "rural_urban": "Rural vs. urban classification (城乡分类)",
112
+ "city": "City (城市)",
113
+ "education": "Education level (教育程度)",
114
+ "employment": "Employment status (就业状态)",
115
+ }
116
+
117
+ # =============================================================================
118
+ # Work & employment
119
+ # =============================================================================
120
+ WORK = {
121
+ "employment_sector": "Employment sector (就业行业)",
122
+ "job_role": "Job role / position (职位/角色)",
123
+ "income_household": "Household income level (家庭收入水平)",
124
+ "veteran_status_US": "US veteran status (美国退伍军人身份)",
125
+ "productivity_absent": "Days absent from work — past 4 weeks (缺勤天数 过去4周)",
126
+ "productivity_unproductive": "Unproductive work days — past 4 weeks (低效工作天数 过去4周)",
127
+ "team_situation": "Team working arrangement (团队工作模式)",
128
+ "job_features": "Job features description (工作特征描述)",
129
+ "work_situation": "Work mode — remote / on-site / hybrid (工作模式 远程/现场/混合)",
130
+ "organization_size": "Organization size (组织规模)",
131
+ "job_duration": "Time in current role (在当前岗位时长)",
132
+ "work_control_time": "Control over work schedule (对工作时间安排的控制)",
133
+ "work_control_job": "Autonomy over how work is done (对工作方式的自主权)",
134
+ "work_amount": "Workload pressure (工作量压力)",
135
+ "work_purpose": "Sense of purpose & meaning at work (工作目标感/意义感)",
136
+ "work_learning": "Learning opportunities at work (工作中学习机会)",
137
+ "work_colleagues": "Relationship with colleagues (与同事关系)",
138
+ "work_manager": "Relationship with manager/supervisor (与上司关系)",
139
+ "work_informed": "Being kept informed at work (工作信息知情度)",
140
+ "work_recognition": "Recognition at work (工作认可度)",
141
+ "work_factors": "Work stress factors — multi-select (工作压力因素 多选)",
142
+ "job_sector": "Industry sector (所属行业)",
143
+ "work_activity": "Type of work activity (工作活动类型)",
144
+ }
145
+
146
+ # =============================================================================
147
+ # Lifestyle — diet, exercise, sleep, socialising
148
+ # =============================================================================
149
+ LIFESTYLE = {
150
+ "cantril": "Cantril ladder — overall life evaluation 1-9 (Cantril 阶梯 总体生活评价 1-9)",
151
+ "sleep_freq": "Frequency of getting enough sleep (充足睡眠频率)",
152
+ "sleep_problem_type": "Sleep problem type — multi-select (睡眠问题类型 多选)",
153
+ "exercise_freq": "Exercise frequency — ≥30 min/session (运动频率 ≥30分钟/次)",
154
+ "UPF_freq": "Ultra-processed food consumption frequency (超加工食品摄入频率)",
155
+ "fruit_veg_freq": "Fresh fruit & vegetable intake frequency (新鲜蔬果摄入频率)",
156
+ "organic_fruit_veg_freq": "Organic fruit & vegetable consumption (有机蔬果食用频率)",
157
+ "sugary_food_freq": "Sweet/sugary food or dessert frequency (甜食/含糖食物频率)",
158
+ "meat_diet": "Meat consumption habits (肉类摄入习惯)",
159
+ "fish_diet": "Fish/shellfish consumption habits (鱼类/贝类摄入习惯)",
160
+ "plastic_food": "Food/drinks from plastic containers (塑料容器装食物/饮品频率)",
161
+ "plastic_hot_food": "Hot food from plastic containers (塑料容器装热食频率)",
162
+ "plastic_hot_drink": "Hot drinks in paper cups (纸杯热饮频率)",
163
+ "social_freq": "In-person socializing frequency (线下社交频率)",
164
+ }
165
+
166
+ # =============================================================================
167
+ # Substance use, medical conditions & treatment
168
+ # =============================================================================
169
+ SUBSTANCE_AND_MEDICAL = {
170
+ "substance_use": "Substance use — multi-select (物质使用 多选)",
171
+ "medical_condition_presence": "Whether diagnosed with a medical condition (是否有确诊疾病)",
172
+ "medical_condition_type": "Medical condition type — multi-select (疾病类型 多选)",
173
+ "treatment_status": "Whether currently receiving treatment (是否正在接受治疗)",
174
+ "help_seeking": "Whether sought help (是否寻求过帮助)",
175
+ "treatment_type_new": "Treatment type — new version, multi-select (治疗类型 新版 多选)",
176
+ "treatment_type": "Treatment type — old version, multi-select (治疗类型 旧版 多选)",
177
+ "therapy_efficacy": "Perceived effectiveness of psychological therapy (心理治疗有效性评价)",
178
+ "medication_efficacy": "Perceived effectiveness of medication (药物治疗有效性评价)",
179
+ "brain_stim_efficacy": "Perceived effectiveness of brain stimulation (脑刺激治疗效果)",
180
+ "neurofeedback_efficacy": "Perceived effectiveness of neurofeedback (神经反馈治疗效果)",
181
+ "mental_health_disorder": "Mental health disorder diagnosis (心理健康障碍诊断)",
182
+ }
183
+
184
+ # =============================================================================
185
+ # Trauma & adversity
186
+ # =============================================================================
187
+ TRAUMA = {
188
+ "trauma_childhood": "Childhood trauma experiences — before age 18, multi-select (童年创伤经历 18岁前 多选)",
189
+ "trauma_adulthood": "Adult trauma experiences — after age 18, multi-select (成年创伤经历 18岁后 多选)",
190
+ "trauma_life_old": "Life trauma — archived old version (人生创伤经历 旧版 已归档)",
191
+ }
192
+
193
+ # =============================================================================
194
+ # Family, friendship & community
195
+ # =============================================================================
196
+ FAMILY_AND_FRIENDS = {
197
+ "family_situation": "Current family situation (当前家庭状况)",
198
+ "children_num": "Number of children (子女数量)",
199
+ "household_size": "Number of people sharing household (共同居住人数)",
200
+ "siblings_num": "Number of siblings growing up — archived (成长中兄弟姐妹数 已归档)",
201
+ "friends_num": "Number of close friends — archived (密友数量 已归档)",
202
+ "friends_childhood": "Friends known since childhood — archived (童年至今朋友数量 已归档)",
203
+ "friends_proximity": "Close friends live nearby — archived (密友是否住附近 已归档)",
204
+ "friendship_type": "Mode of interaction with friends — archived (与朋友的互动方式 已归档)",
205
+ "friends_help_out": "Whether friends would help out when in need (是否有朋友能帮忙)",
206
+ "friends_confide_in": "Whether has friends to confide in — archived (是否有朋友可倾诉 已归档)",
207
+ "household_nature": "Nature of household growing up — conflict/stable, archived (成长家庭冲突/稳定性 已归档)",
208
+ "household_description": "Household growing up — warm/distant, archived (成长家庭温暖/疏离程度 已归档)",
209
+ "parental_support": "Type of parental/caregiver support — archived (父母/照顾者支持类型 已归档)",
210
+ "family_proximity": "Adult family living nearby — archived (成年家人是否住附近 已归档)",
211
+ "family_relationships": "Quality of relationships with adult family (与成年家人关系质量)",
212
+ }
213
+
214
+ # =============================================================================
215
+ # Faith & religion
216
+ # =============================================================================
217
# Descriptions for the faith and religion columns, keyed by column name.
# Values follow the "English label (Chinese label)" pattern; "archived" in a
# label marks an item flagged as archived.
FAITH: dict[str, str] = {
    "spirituality_connection": "Spiritual/transcendent connection — archived (灵性/超越连接感 已归档)",
    "love_feelings": "Extent of loving feelings towards others (对他人的关爱程度)",
    "religious_identity": "Religious identity — archived (宗教身份认同 已归档)",
    "religious_practice": "Whether actively practices religion — archived (是否践行宗教活动 已归档)",
    "individual_collective": "Individualism vs. collectivism orientation — archived (个人主义 vs 集体主义倾向 已归档)",
}
224
+
225
+ # =============================================================================
226
+ # Technology — smartphones, social media, AI, VR, gaming, nature
227
+ # =============================================================================
228
# Descriptions for the technology-related columns (devices, social media, AI,
# VR, gaming, nature), keyed by column name. Values follow the
# "English label (Chinese label)" pattern; "multi-select" and "archived" in a
# label are part of the description text.
TECHNOLOGY: dict[str, str] = {
    # Smartphone & tablet
    "smartphone_own_old": "Owns a smartphone — old version, archived (是否拥有手机 旧版 已归档)",
    "smartphone_age_access": "Age first obtained a smartphone (首次获得手机年龄)",
    "smartphone_school_old": "Smartphone use at school — old version, archived (学校使用手机情况 旧版 已归档)",
    "smartphone_class_old": "Smartphone use in class — old version, archived (课堂上使用手机 旧版 已归档)",
    # NOTE(review): the next key contains a space before "_age". Presumably it
    # mirrors the dataset's column name verbatim — confirm before normalising.
    "smartphone_ownership _age": "Age of smartphone ownership (拥有手机年龄)",
    "smartphone_friends": "Number of friends with smartphones (手机朋友数量)",
    "smartphone_school_age": "Age school provided a smartphone (学校提供手机年龄)",
    "smartphone_class": "Smartphone use in class — new version (课堂使用手机 新版)",
    "smartphone_recess": "Smartphone use during recess (课间使用手机)",
    "tablet_ownership_age": "Age obtained a tablet (获得平板电脑年龄)",
    "smartphone_tablet_age": "Age obtained a smartphone/tablet (获得手机/平板年龄)",
    "laptop_school_age": "Age school provided a laptop (学校提供笔记本年龄)",
    "laptop_class": "Laptop use in class (课堂使用笔记本)",
    "internet_restrictions": "Internet access restrictions (上网限制)",

    # Social media
    "social_media_age": "Age started using social media (开始使用社交媒体年龄)",
    "social_media_freq": "Social media use frequency (社交媒体使用频率)",
    "sm_freq_new": "Social media use frequency — new version (社交媒体使用频率 新版)",
    "sm_impact": "Social media impact on life — multi-select (社交媒体对生活的影响 多选)",

    # AI
    "ai_freq": "AI tool usage frequency (AI 工具使用频率)",
    "ai_use_general": "AI general use cases — multi-select (AI 通用用途 多选)",
    "ai_use_social": "AI social/emotional use cases — multi-select (AI 社交/情感用途 多选)",
    "ai_impact_personal": "AI impact on personal life — multi-select (AI 对个人的影响 多选)",
    "ai_impact_work": "AI impact on work — multi-select (AI 对工作的影响 多选)",

    # VR, gaming & nature
    "vr_freq": "VR headset usage frequency (VR 头显使用频率)",
    "gaming_freq": "Video gaming frequency (电子游戏频率)",
    "time_nature": "Time spent in natural environments (在自然环境中时长)",
    "live_close_nature": "Whether lives close to nature (是否居住在自然附近)",
    "immersion_nature": "Most-frequented natural environment types — multi-select (最常接触的自然环境类型 多选)",
}
265
+
266
+ # =============================================================================
267
+ # Benchmarking scales — PHQ-9, GAD-7, life satisfaction
268
+ # =============================================================================
269
# Descriptions for the benchmarking-scale columns (PHQ-9, GAD-7 and overall
# life satisfaction), keyed by column name. Values follow the
# "English label (Chinese label)" pattern. Key order is kept as in the
# original table so that merged lookups iterate in the same sequence.
BENCHMARKING: dict[str, str] = {
    # PHQ-9 (depression screening)
    "PHQ9_interest": "PHQ-9: Little interest or pleasure in doing things (做事缺乏兴趣或乐趣)",
    "PHQ9_depressed": "PHQ-9: Feeling down, depressed, or hopeless (感到沮丧、抑郁或绝望)",
    "PHQ9_sleep": "PHQ-9: Trouble falling/staying asleep or sleeping too much (入睡困难、易醒或嗜睡)",
    "PHQ9_energy": "PHQ-9: Feeling tired or having little energy (感到疲倦或缺乏精力)",
    # The two descriptions below were previously attached to each other's key
    # (the appetite text sat on PHQ9_failure and vice versa).
    "PHQ9_failure": "PHQ-9: Feeling bad about yourself — failure, letting family down (自我否定/觉得自己失败)",
    "PHQ9_appetite": "PHQ-9: Poor appetite or overeating (食欲不振或暴饮暴食)",
    "PHQ9_concentration": "PHQ-9: Trouble concentrating on things (注意力难以集中)",
    "PHQ9_movement": "PHQ-9: Moving/speaking slowly or being fidgety/restless (动作迟缓或焦躁不安)",
    "PHQ9_self_harm": "PHQ-9: Thoughts of self-harm or being better off dead (自伤或自杀念头)",
    # GAD-7 (anxiety screening)
    "GAD7_nervous": "GAD-7: Feeling nervous, anxious, or on edge (感到紧张、焦虑)",
    # NOTE(review): the texts for GAD7_worry and GAD7_self_control may also be
    # attached to each other's key ("control worrying" reads like the
    # self_control item) — verify against the dataset codebook.
    "GAD7_worry": "GAD-7: Not being able to stop or control worrying (无法停止或控制担忧)",
    "GAD7_self_control": "GAD-7: Worrying too much about different things (对很多事情过度担忧)",
    "GAD7_relax": "GAD-7: Trouble relaxing (难以放松)",
    "GAD7_restless": "GAD-7: Being so restless that it is hard to sit still (坐立不安)",
    "GAD7_irritable": "GAD-7: Becoming easily annoyed or irritable (容易烦躁或易怒)",
    "GAD7_afraid": "GAD-7: Feeling afraid as if something awful might happen (感到害怕)",
    "GAD7_impact": "GAD-7: How difficult these problems made daily life (上述问题对生活的影响程度)",
    # Life satisfaction
    "life_satisfaction": "Overall life satisfaction 1-9 (总体生活满意度 1-9)",
}
292
+
293
+ # =============================================================================
294
+ # Momentary assessments — archived
295
+ # =============================================================================
296
# Descriptions for the momentary-assessment columns, keyed by column name.
# Every label in this group carries the "archived" marker. Values follow the
# "English label (Chinese label)" pattern.
MOMENTARY: dict[str, str] = {
    "time_day": "Time of day when assessment taken — archived (评估时间段 已归档)",
    "mood_current": "Current mood at assessment time — archived (当前情绪 已归档)",
    "alertness_current": "Current alertness level — archived (当前清醒程度 已归档)",
    # NOTE(review): "prevous" looks like a misspelling of "previous";
    # presumably it mirrors the dataset column name — confirm before renaming.
    "sleep_prevous_night": "Hours slept the previous night — archived (前一晚睡眠时长 已归档)",
    "time_last_meal": "Time since last meal — archived (距上次进食时间 已归档)",
    "physical_complaints": "Physical complaints at assessment time — archived (当前身体不适症状 已归档)",
    "pregnancy": "Whether currently pregnant — archived (是否怀孕 已归档)",
}
305
+
306
+ # =============================================================================
307
+ # COVID-19 impact — archived
308
+ # =============================================================================
309
# Descriptions for the COVID-19 impact columns, keyed by column name.
# Both labels carry the "archived" marker.
COVID: dict[str, str] = {
    "covid_health": "COVID-19: health & social impact — archived (新冠疫情 健康与社交影响 已归档)",
    "covid_finance": "COVID-19: financial impact — archived (新冠疫情 财务影响 已归档)",
}
313
+
314
+ # =============================================================================
315
+ # Repeat respondent identifiers (anonymous, derived from email)
316
+ # =============================================================================
317
# Descriptions for the repeat-respondent identifier columns. Unlike the other
# groups, these keys contain spaces and a capital letter; they are looked up
# verbatim, so they must not be normalised.
DEDUP: dict[str, str] = {
    "Repeat identifier": "Repeat respondent identifier — anonymous, derived from email (重复受访者标识符, 由邮箱匿名化生成)",
    "Repeat identifier 2": "Repeat respondent identifier — backup (重复受访者标识符 备用)",
}
321
+
322
+ # =============================================================================
323
+ # Flat aggregated dict for direct column-name lookup
324
+ # =============================================================================
325
# Merge every per-topic table into one flat mapping. Groups are applied in the
# order listed, so a key that appears in more than one group keeps the
# description from the group listed last.
COLUMN_DESCRIPTIONS: dict[str, str] = {}
for _group in (
    META,
    MHQ_SCORES,
    MHQ_ITEMS,
    DEMOGRAPHICS_CORE,
    WORK,
    LIFESTYLE,
    SUBSTANCE_AND_MEDICAL,
    TRAUMA,
    FAMILY_AND_FRIENDS,
    FAITH,
    TECHNOLOGY,
    BENCHMARKING,
    MOMENTARY,
    COVID,
    DEDUP,
):
    COLUMN_DESCRIPTIONS |= _group
344
+
345
+
346
def describe_column(name: str) -> str:
    """Look up the human-readable description of a column.

    Falls back to returning ``name`` unchanged when the column has no entry
    in ``COLUMN_DESCRIPTIONS``.
    """
    try:
        return COLUMN_DESCRIPTIONS[name]
    except KeyError:
        return name
@@ -0,0 +1,244 @@
1
+ import polars as pl
2
+
3
+ # 20 Problem items — clinical threshold ≥8
4
+ _PROBLEM_COLS = [
5
+ "restless_hyperactive", # Restlessness & Hyperactivity
6
+ "fear_anxiety", # Fear & Anxiety
7
+ "infections", # Susceptibility to Infections
8
+ "aggression", # Aggression Towards Others
9
+ "avoidance", # Avoidance & Withdrawal
10
+ "obsessive_thoughts", # Unwanted, Strange or Obsessive Thoughts
11
+ "mood_swings", # Mood Swings
12
+ "detached_reality", # Sense of being detached from reality
13
+ "nightmares", # Nightmares
14
+ "addictions", # Addictions
15
+ "anger", # Anger & Irritability
16
+ "suicidal_thoughts", # Suicidal Thoughts or Intentions
17
+ "pain", # Experience of Pain
18
+ "guilt_blame", # Guilt & Blame
19
+ "hallucinations", # Hallucinations
20
+ "flashblacks", # Traumatic Flashbacks
21
+ "repetitive_actions", # Repetitive or Compulsive Actions
22
+ "sad_hopeless", # Feelings of Sadness, Distress or Hopelessness
23
+ "physical_health", # Physical Health Issues
24
+ "confusion", # Confusion or Slowed Thinking
25
+ ]
26
+
27
+ # 27 Spectrum items — clinical threshold ≤1
28
+ _SPECTRUM_COLS = [
29
+ "adapt_to_change", # Adaptability to Change
30
+ "self_worth_confidence", # Self Worth & Confidence
31
+ "creativity_problem_solving", # Creativity & Problem Solving
32
+ "drive_motivation", # Drive & Motivation
33
+ "stability_calmness", # Stability & Calmness
34
+ "sleep_quality", # Sleep Quality
35
+ "self_control_impulsivity", # Self Control & Impulsivity
36
+ "ability_learn", # Ability to Learn
37
+ "coordination", # Coordination
38
+ "relationships", # Relationships with others
39
+ "emotional_resilience", # Emotional Resilience
40
+ "planning_organization", # Planning & Organisation
41
+ "physical_intimacy", # Physical Intimacy
42
+ "speech_language", # Speech & Language
43
+ "memory", # Memory
44
+ "social_cooperation", # Social interactions & Cooperation
45
+ "decision_risk", # Decision-making & Risk-taking
46
+ "curiosity_enthusiasm", # Curiosity, Interest & Enthusiasm
47
+ "energy", # Energy Level
48
+ "emotional_control", # Emotional Control
49
+ "focus_concentration", # Focus & Concentration
50
+ "appetite_regulation", # Appetite Regulation
51
+ "empathy", # Empathy
52
+ "sensory_sensitivity", # Sensory Sensitivity
53
+ "self_image", # Self-Image
54
+ "outlook_optimism", # Outlook & Optimism
55
+ "selective_attention", # Selective Attention
56
+ ]
57
+
58
+ # DSM‑5 mapping rules
59
+ _SYM = "_symptom"
60
+
61
+ _DSM5_RULES: dict[str, dict] = {
62
+ "DSM5_depression": {
63
+ "core_groups": [0, 1], "min_groups": 5,
64
+ "groups": [
65
+ [f"drive_motivation{_SYM}", f"curiosity_enthusiasm{_SYM}"],
66
+ [f"sad_hopeless{_SYM}", f"outlook_optimism{_SYM}"],
67
+ [f"appetite_regulation{_SYM}"], [f"confusion{_SYM}"], [f"energy{_SYM}"],
68
+ [f"self_worth_confidence{_SYM}", f"self_image{_SYM}", f"guilt_blame{_SYM}"],
69
+ [f"focus_concentration{_SYM}", f"selective_attention{_SYM}", f"decision_risk{_SYM}"],
70
+ [f"suicidal_thoughts{_SYM}"],
71
+ ],
72
+ },
73
+ "DSM5_anxiety": {
74
+ "required": [f"fear_anxiety{_SYM}"], "required_groups": [0], "min_groups": 3,
75
+ "groups": [
76
+ [f"stability_calmness{_SYM}", f"emotional_control{_SYM}"],
77
+ [f"restless_hyperactive{_SYM}"], [f"energy{_SYM}"],
78
+ [f"focus_concentration{_SYM}", f"selective_attention{_SYM}"],
79
+ [f"anger{_SYM}"], [f"pain{_SYM}"], [f"sleep_quality{_SYM}"], [f"avoidance{_SYM}"],
80
+ ],
81
+ },
82
+ "DSM5_bipolar": {
83
+ "required": [f"mood_swings{_SYM}"], "core_groups": [0, 1], "min_groups": 5,
84
+ "groups": [
85
+ [f"drive_motivation{_SYM}", f"curiosity_enthusiasm{_SYM}"],
86
+ [f"sad_hopeless{_SYM}", f"outlook_optimism{_SYM}"],
87
+ [f"appetite_regulation{_SYM}"], [f"confusion{_SYM}"], [f"energy{_SYM}"],
88
+ [f"self_worth_confidence{_SYM}", f"self_image{_SYM}", f"guilt_blame{_SYM}"],
89
+ [f"focus_concentration{_SYM}", f"selective_attention{_SYM}", f"decision_risk{_SYM}"],
90
+ [f"suicidal_thoughts{_SYM}"],
91
+ ],
92
+ },
93
+ "DSM5_ptsd": {
94
+ "trauma_required": True, "core_groups": [0],
95
+ "required": [f"avoidance{_SYM}"], "min_groups": 2,
96
+ "groups": [
97
+ [f"flashblacks{_SYM}", f"nightmares{_SYM}", f"obsessive_thoughts{_SYM}"],
98
+ [f"memory{_SYM}"],
99
+ [f"self_worth_confidence{_SYM}", f"self_image{_SYM}", f"outlook_optimism{_SYM}"],
100
+ [f"guilt_blame{_SYM}"], [f"sad_hopeless{_SYM}"],
101
+ [f"curiosity_enthusiasm{_SYM}", f"drive_motivation{_SYM}"],
102
+ [f"relationships{_SYM}"],
103
+ ],
104
+ },
105
+ "DSM5_ocd": {
106
+ "required": [f"obsessive_thoughts{_SYM}", f"repetitive_actions{_SYM}", f"fear_anxiety{_SYM}"],
107
+ "min_groups": 1,
108
+ "groups": [
109
+ [f"stability_calmness{_SYM}"], [f"self_control_impulsivity{_SYM}"],
110
+ [f"emotional_control{_SYM}"],
111
+ ],
112
+ },
113
+ "DSM5_schizophrenia": {
114
+ "required": [f"obsessive_thoughts{_SYM}", f"hallucinations{_SYM}"], "min_groups": 1,
115
+ "groups": [
116
+ [f"speech_language{_SYM}"], [f"repetitive_actions{_SYM}"],
117
+ [f"drive_motivation{_SYM}", f"relationships{_SYM}",
118
+ f"social_cooperation{_SYM}", f"curiosity_enthusiasm{_SYM}"],
119
+ ],
120
+ },
121
+ "DSM5_eating": {
122
+ "required": [f"appetite_regulation{_SYM}", f"fear_anxiety{_SYM}", f"self_image{_SYM}"],
123
+ "groups": [],
124
+ },
125
+ "DSM5_addiction": {
126
+ "required": [f"addictions{_SYM}"], "min_groups": 2,
127
+ "groups": [
128
+ [f"decision_risk{_SYM}"], [f"emotional_control{_SYM}"], [f"avoidance{_SYM}"],
129
+ [f"relationships{_SYM}"], [f"self_control_impulsivity{_SYM}"],
130
+ ],
131
+ },
132
+ "DSM5_adhd": {
133
+ "min_groups": 4,
134
+ "groups": [
135
+ [f"focus_concentration{_SYM}"], [f"selective_attention{_SYM}"],
136
+ [f"drive_motivation{_SYM}"], [f"planning_organization{_SYM}"], [f"memory{_SYM}"],
137
+ ],
138
+ },
139
+ "DSM5_asd": {
140
+ "min_groups": 3,
141
+ "groups": [
142
+ [f"social_cooperation{_SYM}"], [f"relationships{_SYM}"],
143
+ [f"repetitive_actions{_SYM}"], [f"adapt_to_change{_SYM}"],
144
+ [f"sensory_sensitivity{_SYM}"],
145
+ [f"selective_attention{_SYM}", f"focus_concentration{_SYM}"],
146
+ ],
147
+ },
148
+ }
149
+
150
+ def _group_flags(df: pl.LazyFrame, groups: list[list[str]]) -> list[pl.Expr]:
151
+ """Return one boolean expression per group (True if any symptom in group is True)."""
152
+ return [
153
+ pl.any_horizontal(
154
+ [
155
+ pl.col(c).fill_null(False)
156
+ for c in grp
157
+ ]
158
+ )
159
+ for grp in groups
160
+ ]
161
+
162
+ def identify_symptoms(df: pl.LazyFrame) -> pl.LazyFrame:
163
+ """Identify symptoms based on clinical thresholds for problem and spectrum items.
164
+ Args:
165
+ df (pl.LazyFrame): Input Polars LazyFrame containing the data.
166
+ Returns:
167
+ pl.LazyFrame: A new Polars LazyFrame with additional columns indicating the presence of symptoms.
168
+ """
169
+ # Identify problem symptoms (clinical threshold ≥8)
170
+ for col in _PROBLEM_COLS:
171
+ df = df.with_columns(
172
+ (pl.col(col) >= 8).alias(f"{col}_symptom")
173
+ )
174
+
175
+ # Identify spectrum symptoms (clinical threshold ≤1)
176
+ for col in _SPECTRUM_COLS:
177
+ df = df.with_columns(
178
+ (pl.col(col) <= 1).alias(f"{col}_symptom")
179
+ )
180
+
181
+ df = df.with_columns(
182
+ pl.sum_horizontal(
183
+ [pl.col(f"{c}_symptom") for c in _PROBLEM_COLS + _SPECTRUM_COLS]
184
+ ).alias("symptom_count")
185
+ )
186
+
187
+ return df
188
+
189
+ def mapping_to_DSM5(df: pl.LazyFrame) -> pl.LazyFrame:
190
+ """Map the identified symptoms to DSM-5 categories.
191
+ Args:
192
+ df (pl.LazyFrame): Input Polars LazyFrame containing the data with symptom indicators.
193
+ Returns:
194
+ pl.LazyFrame: A new Polars LazyFrame with additional columns indicating DSM-5 categories.
195
+ """
196
+
197
+ # Pre‑compute a "has trauma" flag for PTSD
198
+ df = df.with_columns(
199
+ pl.when(
200
+ pl.col("trauma_childhood").list.len() > 0
201
+ & ~pl.col("trauma_childhood").list.contains(
202
+ "I did not experience any of the above during my childhood"
203
+ )
204
+ )
205
+ .then(True)
206
+ .when(
207
+ pl.col("trauma_adulthood").list.len() > 0
208
+ & ~pl.col("trauma_adulthood").list.contains(
209
+ "I did not experience any of the above"
210
+ )
211
+ )
212
+ .then(True)
213
+ .otherwise(False)
214
+ .alias("_has_trauma")
215
+ )
216
+
217
+ for label, rule in _DSM5_RULES.items():
218
+ g = _group_flags(df, rule["groups"])
219
+ ok = pl.lit(True)
220
+
221
+ # Required single columns (ALL must be True)
222
+ for c in rule.get("required", []):
223
+ ok = ok & pl.col(c).fill_null(False)
224
+
225
+ # Required groups (each must be True)
226
+ for gi in rule.get("required_groups", []):
227
+ ok = ok & g[gi]
228
+
229
+ # Core group check (≥1 of this set must be True)
230
+ if "core_groups" in rule:
231
+ ok = ok & pl.any_horizontal([g[i] for i in rule["core_groups"]])
232
+
233
+ # Min total groups check
234
+ if rule.get("min_groups"):
235
+ total = pl.sum_horizontal([*g])
236
+ ok = ok & (total >= rule["min_groups"])
237
+
238
+ # Special: trauma flag
239
+ if rule.get("trauma_required"):
240
+ ok = ok & pl.col("_has_trauma")
241
+
242
+ df = df.with_columns(ok.alias(label))
243
+
244
+ return df.drop("_has_trauma")