globalmind 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- globalmind-0.0.1/.gitignore +59 -0
- globalmind-0.0.1/LICENSE.txt +18 -0
- globalmind-0.0.1/PKG-INFO +115 -0
- globalmind-0.0.1/README.md +94 -0
- globalmind-0.0.1/pyproject.toml +73 -0
- globalmind-0.0.1/src/globalmind/__about__.py +4 -0
- globalmind-0.0.1/src/globalmind/__init__.py +20 -0
- globalmind-0.0.1/src/globalmind/data.py +123 -0
- globalmind-0.0.1/src/globalmind/schema.py +348 -0
- globalmind-0.0.1/src/globalmind/symptom.py +244 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Byte-compiled / cache
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.egg-info/
|
|
5
|
+
.pytest_cache/
|
|
6
|
+
|
|
7
|
+
# Build
|
|
8
|
+
dist/
|
|
9
|
+
build/
|
|
10
|
+
*.egg
|
|
11
|
+
*.whl
|
|
12
|
+
|
|
13
|
+
# Virtual environments
|
|
14
|
+
.venv/
|
|
15
|
+
venv/
|
|
16
|
+
env/
|
|
17
|
+
|
|
18
|
+
# Hatch
|
|
19
|
+
.hatch/
|
|
20
|
+
|
|
21
|
+
# Coverage
|
|
22
|
+
htmlcov/
|
|
23
|
+
.coverage
|
|
24
|
+
.coverage.*
|
|
25
|
+
coverage.xml
|
|
26
|
+
|
|
27
|
+
# IDE & editor
|
|
28
|
+
.vscode/
|
|
29
|
+
.idea/
|
|
30
|
+
*.swp
|
|
31
|
+
*.swo
|
|
32
|
+
*~
|
|
33
|
+
|
|
34
|
+
# OS
|
|
35
|
+
.DS_Store
|
|
36
|
+
Thumbs.db
|
|
37
|
+
.directory
|
|
38
|
+
|
|
39
|
+
# Jupyter
|
|
40
|
+
.ipynb_checkpoints/
|
|
41
|
+
*.ipynb
|
|
42
|
+
|
|
43
|
+
# Environment
|
|
44
|
+
.env
|
|
45
|
+
.env.*
|
|
46
|
+
|
|
47
|
+
# Data (large CSV / parquet — do not commit raw or processed data)
|
|
48
|
+
*.csv
|
|
49
|
+
*.parquet
|
|
50
|
+
*.processed.parquet
|
|
51
|
+
|
|
52
|
+
# Tests — not included in distribution
|
|
53
|
+
tests/
|
|
54
|
+
|
|
55
|
+
# Python type stubs
|
|
56
|
+
*.pyi
|
|
57
|
+
|
|
58
|
+
# Logs
|
|
59
|
+
*.log
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026-present Nianyu Su <mirakelor@outlook.com>
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
|
|
6
|
+
associated documentation files (the "Software"), to deal in the Software without restriction, including
|
|
7
|
+
without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
8
|
+
copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the
|
|
9
|
+
following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial
|
|
12
|
+
portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
|
|
15
|
+
LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
|
|
16
|
+
EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
|
|
18
|
+
USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: globalmind
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: GMP (Global Mind Project) mental health data analysis toolkit
|
|
5
|
+
Project-URL: Documentation, https://github.com/Mirakelor/globalmind#readme
|
|
6
|
+
Project-URL: Issues, https://github.com/Mirakelor/globalmind/issues
|
|
7
|
+
Project-URL: Source, https://github.com/Mirakelor/globalmind
|
|
8
|
+
Author-email: Nianyu Su <mirakelor@outlook.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE.txt
|
|
11
|
+
Keywords: DSM-5,GMP,Global-Mind-Project,MHQ,Mind-Health-Quotient,data-science,mental-health,polars,psychometrics,public-health,questionnaire,survey-analysis,wellbeing
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Programming Language :: Python
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: polars>=1.0.0
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# globalmind
|
|
23
|
+
|
|
24
|
+
[![PyPI - Version](https://img.shields.io/pypi/v/globalmind.svg)](https://pypi.org/project/globalmind)
|
|
25
|
+
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/globalmind.svg)](https://pypi.org/project/globalmind)
|
|
26
|
+
|
|
27
|
+
-----
|
|
28
|
+
|
|
29
|
+
Python pipeline for the [Global Mind Project (GMP)](https://sapienlabs.org/global-mind-project/) — ingest, filter, profile, and
|
|
30
|
+
classify mental health data collected through the Mind Health Quotient (MHQ).
|
|
31
|
+
|
|
32
|
+
## Background
|
|
33
|
+
|
|
34
|
+
The Global Mind database contains mental health profiles from nearly 2 million
|
|
35
|
+
internet-enabled respondents across 130+ countries in 17+ languages, with
|
|
36
|
+
1,000–2,000 new responses added each day. Data are collected using the **Mind
|
|
37
|
+
Health Quotient (MHQ)** — an online assessment developed from a review of over
|
|
38
|
+
10,000 questions drawn from 126 commonly used mental health tools spanning 10
|
|
39
|
+
disorders. The MHQ consists of 47 items that rate individual aspects of mind
|
|
40
|
+
health on a 1–9 scale, together with aggregate scores, demographics, and
|
|
41
|
+
lifestyle/life‑context factors.
|
|
42
|
+
|
|
43
|
+
`globalmind` provides a pure‑Polars pipeline to go from raw CSV exports to
|
|
44
|
+
DSM‑5 diagnostic classifications in a few lines of code. All operations are
|
|
45
|
+
**lazy** (build a query plan, `.collect()` once at the end) for memory‑safe
|
|
46
|
+
processing of the full dataset.
|
|
47
|
+
|
|
48
|
+
## Features
|
|
49
|
+
|
|
50
|
+
### Data loading & cleaning
|
|
51
|
+
- **`read_table(path)`** — scan CSV with automatic N/A → null conversion,
|
|
52
|
+
pipe‑delimited multi‑select splitting (21 columns), and stray‑pipe cleanup on
|
|
53
|
+
single‑select categoricals.
|
|
54
|
+
- **`clean_data(df)`** — apply four quality filters:
|
|
55
|
+
- completion time ≥ 7 minutes
|
|
56
|
+
- response variance across 47 rating items (std dev ≥ 0.2)
|
|
57
|
+
- comprehension check (`understanding` ≠ "No")
|
|
58
|
+
- countries with ≥ 1,000 responses
|
|
59
|
+
|
|
60
|
+
### 205‑column schema
|
|
61
|
+
- **`COLUMN_DESCRIPTIONS`** — dictionary mapping every column name to an
|
|
62
|
+
English description with a Chinese gloss.
|
|
63
|
+
- **`describe_column(name)`** — lookup helper.
|
|
64
|
+
|
|
65
|
+
### Symptom identification & DSM‑5 mapping
|
|
66
|
+
- **`identify_symptoms(df)`** — flags each of the 47 MHQ items as a clinical
|
|
67
|
+
symptom per DSM‑5 thresholds:
|
|
68
|
+
- *Problem items* (20): threshold ≥ 8 on a 1–9 severity scale
|
|
69
|
+
- *Spectrum items* (27): threshold ≤ 1 (challenge end of the spectrum)
|
|
70
|
+
- Adds 47 `_symptom` boolean columns + a `symptom_count` column.
|
|
71
|
+
- **`mapping_to_DSM5(df)`** — data‑driven rule engine classifying 10 disorder
|
|
72
|
+
categories: Depression, Anxiety, Bipolar, PTSD, OCD, Schizophrenia, Eating
|
|
73
|
+
Disorder, Addiction, ADHD, ASD.
|
|
74
|
+
|
|
75
|
+
## Installation
|
|
76
|
+
|
|
77
|
+
```console
|
|
78
|
+
pip install globalmind
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Requires Python ≥ 3.10 and `polars ≥ 1.0`.
|
|
82
|
+
|
|
83
|
+
## Quick start
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from globalmind import (
|
|
87
|
+
read_table, clean_data,
|
|
88
|
+
identify_symptoms, mapping_to_DSM5,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
df = read_table("gmp_data.csv")
|
|
92
|
+
df = clean_data(df)
|
|
93
|
+
df = identify_symptoms(df)
|
|
94
|
+
df = mapping_to_DSM5(df)
|
|
95
|
+
df.collect() # all operations are lazy
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## References
|
|
99
|
+
|
|
100
|
+
- **Data cleaning criteria** — Bala, Jerzy, Oleksii Sukhoi, Jennifer Jane
|
|
101
|
+
Newson, Priscila Pereira Machado, Mark Lawrence, and Tara C. Thiagarajan.
|
|
102
|
+
"Estimation of the Nature and Magnitude of Mental Distress in the Population
|
|
103
|
+
Associated with Ultra-Processed Food Consumption." *Frontiers in Nutrition* 12
|
|
104
|
+
(November 2025): 1562286.
|
|
105
|
+
[https://doi.org/10.3389/fnut.2025.1562286](https://doi.org/10.3389/fnut.2025.1562286)
|
|
106
|
+
- **Symptom thresholds & DSM‑5 mapping** — Newson, Jennifer Jane, Vladyslav
|
|
107
|
+
Pastukh, and Tara C. Thiagarajan. "Poor Separation of Clinical Symptom
|
|
108
|
+
Profiles by DSM-5 Disorder Criteria." *Frontiers in Psychiatry* 12 (November
|
|
109
|
+
2021): 775762.
|
|
110
|
+
[https://doi.org/10.3389/fpsyt.2021.775762](https://doi.org/10.3389/fpsyt.2021.775762)
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
`globalmind` is distributed under the terms of the
|
|
115
|
+
[MIT](https://spdx.org/licenses/MIT.html) license.
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# globalmind
|
|
2
|
+
|
|
3
|
+
[![PyPI - Version](https://img.shields.io/pypi/v/globalmind.svg)](https://pypi.org/project/globalmind)
|
|
4
|
+
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/globalmind.svg)](https://pypi.org/project/globalmind)
|
|
5
|
+
|
|
6
|
+
-----
|
|
7
|
+
|
|
8
|
+
Python pipeline for the [Global Mind Project (GMP)](https://sapienlabs.org/global-mind-project/) — ingest, filter, profile, and
|
|
9
|
+
classify mental health data collected through the Mind Health Quotient (MHQ).
|
|
10
|
+
|
|
11
|
+
## Background
|
|
12
|
+
|
|
13
|
+
The Global Mind database contains mental health profiles from nearly 2 million
|
|
14
|
+
internet-enabled respondents across 130+ countries in 17+ languages, with
|
|
15
|
+
1,000–2,000 new responses added each day. Data are collected using the **Mind
|
|
16
|
+
Health Quotient (MHQ)** — an online assessment developed from a review of over
|
|
17
|
+
10,000 questions drawn from 126 commonly used mental health tools spanning 10
|
|
18
|
+
disorders. The MHQ consists of 47 items that rate individual aspects of mind
|
|
19
|
+
health on a 1–9 scale, together with aggregate scores, demographics, and
|
|
20
|
+
lifestyle/life‑context factors.
|
|
21
|
+
|
|
22
|
+
`globalmind` provides a pure‑Polars pipeline to go from raw CSV exports to
|
|
23
|
+
DSM‑5 diagnostic classifications in a few lines of code. All operations are
|
|
24
|
+
**lazy** (build a query plan, `.collect()` once at the end) for memory‑safe
|
|
25
|
+
processing of the full dataset.
|
|
26
|
+
|
|
27
|
+
## Features
|
|
28
|
+
|
|
29
|
+
### Data loading & cleaning
|
|
30
|
+
- **`read_table(path)`** — scan CSV with automatic N/A → null conversion,
|
|
31
|
+
pipe‑delimited multi‑select splitting (21 columns), and stray‑pipe cleanup on
|
|
32
|
+
single‑select categoricals.
|
|
33
|
+
- **`clean_data(df)`** — apply four quality filters:
|
|
34
|
+
- completion time ≥ 7 minutes
|
|
35
|
+
- response variance across 47 rating items (std dev ≥ 0.2)
|
|
36
|
+
- comprehension check (`understanding` ≠ "No")
|
|
37
|
+
- countries with ≥ 1,000 responses
|
|
38
|
+
|
|
39
|
+
### 205‑column schema
|
|
40
|
+
- **`COLUMN_DESCRIPTIONS`** — dictionary mapping every column name to an
|
|
41
|
+
English description with a Chinese gloss.
|
|
42
|
+
- **`describe_column(name)`** — lookup helper.
|
|
43
|
+
|
|
44
|
+
### Symptom identification & DSM‑5 mapping
|
|
45
|
+
- **`identify_symptoms(df)`** — flags each of the 47 MHQ items as a clinical
|
|
46
|
+
symptom per DSM‑5 thresholds:
|
|
47
|
+
- *Problem items* (20): threshold ≥ 8 on a 1–9 severity scale
|
|
48
|
+
- *Spectrum items* (27): threshold ≤ 1 (challenge end of the spectrum)
|
|
49
|
+
- Adds 47 `_symptom` boolean columns + a `symptom_count` column.
|
|
50
|
+
- **`mapping_to_DSM5(df)`** — data‑driven rule engine classifying 10 disorder
|
|
51
|
+
categories: Depression, Anxiety, Bipolar, PTSD, OCD, Schizophrenia, Eating
|
|
52
|
+
Disorder, Addiction, ADHD, ASD.
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```console
|
|
57
|
+
pip install globalmind
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Requires Python ≥ 3.10 and `polars ≥ 1.0`.
|
|
61
|
+
|
|
62
|
+
## Quick start
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from globalmind import (
|
|
66
|
+
read_table, clean_data,
|
|
67
|
+
identify_symptoms, mapping_to_DSM5,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
df = read_table("gmp_data.csv")
|
|
71
|
+
df = clean_data(df)
|
|
72
|
+
df = identify_symptoms(df)
|
|
73
|
+
df = mapping_to_DSM5(df)
|
|
74
|
+
df.collect() # all operations are lazy
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## References
|
|
78
|
+
|
|
79
|
+
- **Data cleaning criteria** — Bala, Jerzy, Oleksii Sukhoi, Jennifer Jane
|
|
80
|
+
Newson, Priscila Pereira Machado, Mark Lawrence, and Tara C. Thiagarajan.
|
|
81
|
+
"Estimation of the Nature and Magnitude of Mental Distress in the Population
|
|
82
|
+
Associated with Ultra-Processed Food Consumption." *Frontiers in Nutrition* 12
|
|
83
|
+
(November 2025): 1562286.
|
|
84
|
+
[https://doi.org/10.3389/fnut.2025.1562286](https://doi.org/10.3389/fnut.2025.1562286)
|
|
85
|
+
- **Symptom thresholds & DSM‑5 mapping** — Newson, Jennifer Jane, Vladyslav
|
|
86
|
+
Pastukh, and Tara C. Thiagarajan. "Poor Separation of Clinical Symptom
|
|
87
|
+
Profiles by DSM-5 Disorder Criteria." *Frontiers in Psychiatry* 12 (November
|
|
88
|
+
2021): 775762.
|
|
89
|
+
[https://doi.org/10.3389/fpsyt.2021.775762](https://doi.org/10.3389/fpsyt.2021.775762)
|
|
90
|
+
|
|
91
|
+
## License
|
|
92
|
+
|
|
93
|
+
`globalmind` is distributed under the terms of the
|
|
94
|
+
[MIT](https://spdx.org/licenses/MIT.html) license.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "globalmind"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = 'GMP (Global Mind Project) mental health data analysis toolkit'
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
keywords = [
|
|
13
|
+
"GMP", "Global-Mind-Project", "MHQ", "Mind-Health-Quotient",
|
|
14
|
+
"mental-health", "wellbeing", "DSM-5", "psychometrics",
|
|
15
|
+
"questionnaire", "survey-analysis", "public-health",
|
|
16
|
+
"polars", "data-science",
|
|
17
|
+
]
|
|
18
|
+
authors = [
|
|
19
|
+
{ name = "Nianyu Su", email = "mirakelor@outlook.com" },
|
|
20
|
+
]
|
|
21
|
+
classifiers = [
|
|
22
|
+
"Development Status :: 4 - Beta",
|
|
23
|
+
"Programming Language :: Python",
|
|
24
|
+
"Programming Language :: Python :: 3.10",
|
|
25
|
+
"Programming Language :: Python :: 3.11",
|
|
26
|
+
"Programming Language :: Python :: 3.12",
|
|
27
|
+
"Programming Language :: Python :: Implementation :: CPython",
|
|
28
|
+
]
|
|
29
|
+
dependencies = [
|
|
30
|
+
"polars>=1.0.0"
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Documentation = "https://github.com/Mirakelor/globalmind#readme"
|
|
35
|
+
Issues = "https://github.com/Mirakelor/globalmind/issues"
|
|
36
|
+
Source = "https://github.com/Mirakelor/globalmind"
|
|
37
|
+
|
|
38
|
+
[tool.hatch.version]
|
|
39
|
+
path = "src/globalmind/__about__.py"
|
|
40
|
+
|
|
41
|
+
[tool.hatch.envs.types]
|
|
42
|
+
extra-dependencies = [
|
|
43
|
+
"mypy>=1.0.0",
|
|
44
|
+
]
|
|
45
|
+
[tool.hatch.envs.types.scripts]
|
|
46
|
+
check = "mypy --install-types --non-interactive {args:src/globalmind tests}"
|
|
47
|
+
[tool.hatch.envs.test]
|
|
48
|
+
dependencies = [
|
|
49
|
+
"pytest>=8.0.0",
|
|
50
|
+
"pytest-cov>=5.0.0",
|
|
51
|
+
]
|
|
52
|
+
[tool.hatch.envs.test.scripts]
|
|
53
|
+
test = "pytest -v {args:tests}"
|
|
54
|
+
cov = "pytest --cov=src/globalmind --cov-report=term-missing {args:tests}"
|
|
55
|
+
|
|
56
|
+
[tool.coverage.run]
|
|
57
|
+
source_pkgs = ["globalmind", "tests"]
|
|
58
|
+
branch = true
|
|
59
|
+
parallel = true
|
|
60
|
+
omit = [
|
|
61
|
+
"src/globalmind/__about__.py",
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
[tool.coverage.paths]
|
|
65
|
+
globalmind = ["src/globalmind", "*/globalmind/src/globalmind"]
|
|
66
|
+
tests = ["tests", "*/globalmind/tests"]
|
|
67
|
+
|
|
68
|
+
[tool.coverage.report]
|
|
69
|
+
exclude_lines = [
|
|
70
|
+
"no cov",
|
|
71
|
+
"if __name__ == .__main__.:",
|
|
72
|
+
"if TYPE_CHECKING:",
|
|
73
|
+
]
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2026-present Nianyu Su <mirakelor@outlook.com>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
|
|
5
|
+
"""GlobalMind: GMP (Global Mind Project) data analysis toolkit."""
|
|
6
|
+
|
|
7
|
+
from globalmind.data import read_table, clean_data
|
|
8
|
+
from globalmind.schema import COLUMN_DESCRIPTIONS, describe_column
|
|
9
|
+
from globalmind.symptom import identify_symptoms, mapping_to_DSM5
|
|
10
|
+
from globalmind.__about__ import __version__
|
|
11
|
+
|
|
12
|
+
# Names re-exported at package level: the data loaders, the schema lookup
# helpers, the symptom/DSM-5 functions, and the package version string.
__all__ = [
    "read_table",
    "clean_data",
    "COLUMN_DESCRIPTIONS",
    "describe_column",
    "identify_symptoms",
    "mapping_to_DSM5",
    "__version__",
]
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import polars as pl
|
|
3
|
+
|
|
4
|
+
# Columns whose values are pipe-delimited multi-select strings.
# read_table() splits each of these on "|" into a list and drops entries that
# are empty after stripping whitespace. "substance_use" additionally has the
# placeholder "##PIPE##" turned back into a literal "|" inside each entry.
_PIPE_DELIMITED_COLS: list[str] = [
    "live_close_nature",
    "time_nature",
    "sleep_problem_type",
    "work_factors",
    "substance_use",
    "medical_condition_type",
    "treatment_type_new",
    "treatment_type",
    "trauma_childhood",
    "trauma_adulthood",
    "friendship_type",
    "parental_support",
    "help_seeking",
    "mental_health_disorder",
    "internet_restrictions",
    "sm_impact",
    "ai_use_general",
    "ai_use_social",
    "ai_impact_personal",
    "ai_impact_work",
    "immersion_nature",
]
|
|
28
|
+
|
|
29
|
+
# Clean | artifacts from single-select categorical columns.
# read_table() strips leading/trailing "|" from these columns and replaces each
# remaining run of one or more "|" with a single ",". The values stay strings
# (they are not split into lists).
_SINGLE_SELECT_PIPE_COLS: list[str] = [
    "ethnicity",
    "income_household",
    "city",
    "country",
    "employment_sector",
    "job_sector",
    "work_activity",
    "family_situation",
]
|
|
40
|
+
|
|
41
|
+
# Rating questions (1-9 scale, all MHQ items except categorical _type cols).
# read_table() reads these 47 columns as Float64, and clean_data() computes the
# per-row standard deviation across them for its low-variance filter.
# NOTE(review): "flashblacks" is spelled this way in schema.py as well —
# presumably it mirrors the raw CSV header; confirm before "fixing" it.
_RATING_COLS: list[str] = [
    "adapt_to_change", "self_worth_confidence", "creativity_problem_solving",
    "drive_motivation", "stability_calmness", "sleep_quality",
    "self_control_impulsivity", "ability_learn", "coordination", "relationships",
    "emotional_resilience", "planning_organization", "physical_intimacy",
    "speech_language", "memory", "social_cooperation", "decision_risk",
    "curiosity_enthusiasm", "energy", "emotional_control", "focus_concentration",
    "appetite_regulation", "empathy", "sensory_sensitivity", "self_image",
    "outlook_optimism", "selective_attention", "restless_hyperactive",
    "fear_anxiety", "infections", "aggression", "avoidance", "obsessive_thoughts",
    "mood_swings", "detached_reality", "nightmares", "addictions", "anger",
    "suicidal_thoughts", "pain", "guilt_blame", "hallucinations", "flashblacks",
    "repetitive_actions", "sad_hopeless", "physical_health", "confusion",
]
|
|
56
|
+
|
|
57
|
+
def read_table(path: str) -> pl.LazyFrame:
    """Lazily scan a GMP CSV export and normalise its pipe-delimited columns.

    Every column is read as a string, except the rating items listed in
    ``_RATING_COLS`` and the seven aggregate score columns, which are read as
    ``Float64``. The literals ``"N/A"``, ``"NA"``, ``"null"``, ``""`` and
    ``"Prefer not to say"`` are read as null.

    Args:
        path (str): The path to the CSV file.

    Returns:
        pl.LazyFrame: A lazy query in which the multi-select columns are lists
        of non-blank strings and the single-select columns have stray pipes
        removed.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        NotImplementedError: If the file suffix is not ``.csv``
            (compared case-insensitively).
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {path}")
    if source.suffix.lower() != ".csv":
        raise NotImplementedError(f"Unsupported file format: {source.suffix}")

    aggregate_scores = [
        "overall_mhq_score", "cognition_score", "adapt_resilience_score",
        "drive_motivation_score", "mood_outlook_score", "social_self_score",
        "mind_body_score",
    ]
    float_schema = {name: pl.Float64 for name in _RATING_COLS + aggregate_scores}
    frame = pl.scan_csv(
        path,
        # 0 disables type inference: anything not overridden stays a string.
        infer_schema_length=0,
        null_values=["N/A", "NA", "null", "", "Prefer not to say"],
        schema_overrides=float_schema,
    )

    # Multi-select columns: split on "|" and keep only entries that are not
    # blank once surrounding whitespace is stripped.
    not_blank = pl.element().str.strip_chars() != ""
    multi_select = []
    for name in _PIPE_DELIMITED_COLS:
        item = pl.element()
        if name == "substance_use":
            # Restore the "##PIPE##" placeholder to a literal pipe after the
            # split (presumably an escape for labels that contain "|").
            item = item.str.replace_all("##PIPE##", "|")
        multi_select.append(
            pl.col(name).str.split("|").list.eval(item.filter(not_blank)).alias(name)
        )
    frame = frame.with_columns(multi_select)

    # Single-select columns: trim edge pipes, collapse inner pipe runs to ",".
    single_select = [
        pl.col(name).str.strip_chars("|").str.replace_all(r"\|+", ",").alias(name)
        for name in _SINGLE_SELECT_PIPE_COLS
    ]
    return frame.with_columns(single_select)
|
|
95
|
+
|
|
96
|
+
def clean_data(df: pl.LazyFrame) -> pl.LazyFrame:
    """Apply GMP data quality filters.

    Removes records if:
    - time_to_complete < 7 minutes
    - same option selected for all rating questions (std dev < 0.2)
    - responded 'No' to 'Did you find this assessment easy to understand?'
    - country has fewer than 1,000 responses

    Records whose ``understanding`` answer is missing (null) are kept: only an
    explicit ``"No"`` excludes a record. Records whose completion time or
    rating standard deviation cannot be computed (null) are removed, because
    the corresponding check cannot be confirmed.

    Args:
        df (pl.LazyFrame): Lazy frame with at least the columns
            ``time_to_complete``, ``understanding``, ``country`` and every
            column in ``_RATING_COLS``.

    Returns:
        pl.LazyFrame: The lazy frame with the four filters applied; no columns
        are added or removed.
    """
    # Completion time: first ":"-separated field * 60 + second field.
    # NOTE(review): this yields minutes only if the value is "HH:MM[:SS]" —
    # confirm against the raw export format.
    time_fields = pl.col("time_to_complete").str.split(":")
    elapsed_minutes = (
        time_fields.list.get(0).cast(pl.Int64) * 60
        + time_fields.list.get(1).cast(pl.Int64)
    )
    df = df.filter(elapsed_minutes >= 7)

    # Low-variance responders: per-row std dev across all rating items.
    ratings_std = pl.concat_list([pl.col(col) for col in _RATING_COLS]).list.std()
    df = df.filter(ratings_std >= 0.2)

    # Comprehension check. A plain `!= "No"` evaluates to null for null
    # answers and filter() discards nulls, which would silently drop everyone
    # who skipped the question; ne_missing() treats null as "not 'No'".
    df = df.filter(pl.col("understanding").ne_missing("No"))

    # Countries with fewer than 1,000 responses, counted after the filters
    # above have been applied.
    country_counts = df.group_by("country").len().filter(pl.col("len") >= 1000).select("country")
    df = df.join(country_counts, on="country", how="semi")

    return df
|
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
"""Column name → human-readable description mapping for the GMP (Global Mind Project) dataset.
|
|
2
|
+
|
|
3
|
+
Grouped by topic for easy browsing and documentation generation.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
# =============================================================================
# Meta & timestamps
# =============================================================================
# Maps column name -> "English description (Chinese gloss)".
# NOTE(review): the key "submit_data_utc" reads "data", not "date" —
# presumably it mirrors the raw column header; confirm before renaming.
META: dict[str, str] = {
    "language": "Survey language (问卷语言)",
    "start_date_utc": "Assessment start time in UTC (评估开始时间 UTC)",
    "day": "Day of assessment (评估日期 日)",
    "month": "Month of assessment (评估日期 月)",
    "year": "Year of assessment (评估日期 年)",
    "submit_data_utc": "Submission time in UTC (提交时间 UTC)",
    "time_to_complete": "Time taken to complete the assessment (完成耗时)",
}
|
|
18
|
+
|
|
19
|
+
# =============================================================================
# MHQ aggregate scores (7)
# =============================================================================
# Maps column name -> "English description (Chinese gloss)".
# These are the same seven score columns that data.read_table() reads as
# Float64 alongside the rating items.
MHQ_SCORES: dict[str, str] = {
    "overall_mhq_score": "Overall MHQ score (MHQ 总得分)",
    "cognition_score": "Cognition score (认知能力得分)",
    "adapt_resilience_score": "Adaptation & resilience score (适应与韧性得分)",
    "drive_motivation_score": "Drive & motivation score (驱动力与动机得分)",
    "mood_outlook_score": "Mood & outlook score (情绪与展望得分)",
    "social_self_score": "Social self score (社交自我得分)",
    "mind_body_score": "Mind-body connection score (身心连接得分)",
}
|
|
31
|
+
|
|
32
|
+
# =============================================================================
# MHQ 47 individual assessment items
# =============================================================================
# Maps column name -> "English description (Chinese gloss)".
# The dict has 50 keys: the 47 rating columns listed in data._RATING_COLS plus
# three columns that are not rating items there — "understanding" (used by
# data.clean_data() as a "No" comprehension check), "guilt_blame_type" and
# "obsessive_thoughts_type".
# NOTE(review): "flashblacks" is spelled the same way in data._RATING_COLS —
# presumably it mirrors the raw CSV header; confirm before renaming.
MHQ_ITEMS: dict[str, str] = {
    # --- Cognition ---
    "understanding": "Understanding (理解能力)",
    "ability_learn": "Ability to learn (学习能力)",
    "memory": "Memory (记忆力)",
    "creativity_problem_solving": "Creativity & problem solving (创造力与解决问题)",
    "planning_organization": "Planning & organization (计划与组织)",
    "selective_attention": "Selective attention (选择性注意力)",
    "focus_concentration": "Focus & concentration (专注力)",
    "speech_language": "Speech & language (言语与语言)",
    "confusion": "Confusion (困惑感)",

    # --- Mood & emotion ---
    "stability_calmness": "Stability & calmness (情绪稳定与冷静)",
    "emotional_resilience": "Emotional resilience (情绪韧性)",
    "emotional_control": "Emotional control (情绪控制)",
    "mood_swings": "Mood swings (情绪波动)",
    "sad_hopeless": "Sadness & hopelessness (悲伤/绝望)",
    "outlook_optimism": "Outlook & optimism (乐观展望)",
    "fear_anxiety": "Fear & anxiety (恐惧/焦虑)",
    "anger": "Anger (愤怒)",
    "guilt_blame": "Guilt & self-blame (内疚/自责)",
    "guilt_blame_type": "Guilt/blame trigger type (内疚/自责触发类型)",

    # --- Self ---
    "self_worth_confidence": "Self-worth & confidence (自我价值/自信)",
    "self_image": "Self-image (自我形象)",
    "drive_motivation": "Drive & motivation (驱动力与动机)",
    "curiosity_enthusiasm": "Curiosity & enthusiasm (好奇心与热情)",
    "energy": "Energy level (精力水平)",
    "adapt_to_change": "Adaptability to change (适应变化能力)",
    "decision_risk": "Decision-making & risk assessment (决策与风险评估)",

    # --- Social ---
    "relationships": "Relationship quality (人际关系质量)",
    "social_cooperation": "Social cooperation (社交合作能力)",
    "empathy": "Empathy (同理心)",
    "coordination": "Coordination (协调能力)",
    "physical_intimacy": "Physical intimacy (身体亲密)",
    "aggression": "Aggression (攻击性)",
    "avoidance": "Social avoidance (社交回避)",

    # --- Mind-body ---
    "sleep_quality": "Sleep quality (睡眠质量)",
    "appetite_regulation": "Appetite regulation (食欲调节)",
    "physical_health": "Physical health perception (身体健康感受)",
    "self_control_impulsivity": "Self-control & impulsivity (自控力/冲动控制)",
    "sensory_sensitivity": "Sensory sensitivity (感官敏感度)",
    "pain": "Physical pain (身体疼痛)",

    # --- Clinical ---
    "restless_hyperactive": "Restlessness & hyperactivity (烦躁/多动)",
    "obsessive_thoughts": "Obsessive thoughts (强迫思维)",
    "obsessive_thoughts_type": "Obsessive thought type (强迫思维类型)",
    "detached_reality": "Detached reality (现实感脱离)",
    "nightmares": "Nightmares (噩梦)",
    "hallucinations": "Hallucinations (幻觉)",
    "flashblacks": "Flashbacks (闪回)",
    "repetitive_actions": "Repetitive actions (重复行为)",
    "addictions": "Addictive behaviors (成瘾行为)",
    "infections": "Infections (感染)",
    "suicidal_thoughts": "Suicidal thoughts (自杀念头)",
}
|
|
98
|
+
|
|
99
|
+
# =============================================================================
# Core demographics
# =============================================================================
# Maps column name -> "English description (Chinese gloss)".
# "ethnicity", "country" and "city" are among the single-select columns that
# data.read_table() cleans of stray "|" characters.
DEMOGRAPHICS_CORE: dict[str, str] = {
    "age": "Age group (年龄段)",
    "biological_sex": "Biological sex (生理性别)",
    "gender_diff": "Gender identity vs. biological sex difference (性别认同与生理性别差异)",
    "gender_identity": "Gender identity (性别认同)",
    "gender": "Gender (性别)",
    "ethnicity": "Ethnicity (种族/民族)",
    "country": "Country (国家)",
    "state": "State / province / region (州/省/地区)",
    "rural_urban": "Rural vs. urban classification (城乡分类)",
    "city": "City (城市)",
    "education": "Education level (教育程度)",
    "employment": "Employment status (就业状态)",
}
|
|
116
|
+
|
|
117
|
+
# =============================================================================
# Work & employment
# =============================================================================
# Maps column name -> "English description (Chinese gloss)".
# "work_factors" is one of the pipe-delimited multi-select columns that
# data.read_table() splits into lists; "employment_sector", "income_household",
# "job_sector" and "work_activity" are single-select columns it cleans of
# stray "|" characters.
WORK: dict[str, str] = {
    "employment_sector": "Employment sector (就业行业)",
    "job_role": "Job role / position (职位/角色)",
    "income_household": "Household income level (家庭收入水平)",
    "veteran_status_US": "US veteran status (美国退伍军人身份)",
    "productivity_absent": "Days absent from work — past 4 weeks (缺勤天数 过去4周)",
    "productivity_unproductive": "Unproductive work days — past 4 weeks (低效工作天数 过去4周)",
    "team_situation": "Team working arrangement (团队工作模式)",
    "job_features": "Job features description (工作特征描述)",
    "work_situation": "Work mode — remote / on-site / hybrid (工作模式 远程/现场/混合)",
    "organization_size": "Organization size (组织规模)",
    "job_duration": "Time in current role (在当前岗位时长)",
    "work_control_time": "Control over work schedule (对工作时间安排的控制)",
    "work_control_job": "Autonomy over how work is done (对工作方式的自主权)",
    "work_amount": "Workload pressure (工作量压力)",
    "work_purpose": "Sense of purpose & meaning at work (工作目标感/意义感)",
    "work_learning": "Learning opportunities at work (工作中学习机会)",
    "work_colleagues": "Relationship with colleagues (与同事关系)",
    "work_manager": "Relationship with manager/supervisor (与上司关系)",
    "work_informed": "Being kept informed at work (工作信息知情度)",
    "work_recognition": "Recognition at work (工作认可度)",
    "work_factors": "Work stress factors — multi-select (工作压力因素 多选)",
    "job_sector": "Industry sector (所属行业)",
    "work_activity": "Type of work activity (工作活动类型)",
}
|
|
145
|
+
|
|
146
|
+
# =============================================================================
# Lifestyle — diet, exercise, sleep, socialising
# =============================================================================
# Maps column name -> "English description (Chinese gloss)".
# "sleep_problem_type" is one of the pipe-delimited multi-select columns that
# data.read_table() splits into lists.
# NOTE(review): "plastic_hot_drink" is described as paper cups while its key
# says plastic — confirm the description against the questionnaire wording.
LIFESTYLE: dict[str, str] = {
    "cantril": "Cantril ladder — overall life evaluation 1-9 (Cantril 阶梯 总体生活评价 1-9)",
    "sleep_freq": "Frequency of getting enough sleep (充足睡眠频率)",
    "sleep_problem_type": "Sleep problem type — multi-select (睡眠问题类型 多选)",
    "exercise_freq": "Exercise frequency — ≥30 min/session (运动频率 ≥30分钟/次)",
    "UPF_freq": "Ultra-processed food consumption frequency (超加工食品摄入频率)",
    "fruit_veg_freq": "Fresh fruit & vegetable intake frequency (新鲜蔬果摄入频率)",
    "organic_fruit_veg_freq": "Organic fruit & vegetable consumption (有机蔬果食用频率)",
    "sugary_food_freq": "Sweet/sugary food or dessert frequency (甜食/含糖食物频率)",
    "meat_diet": "Meat consumption habits (肉类摄入习惯)",
    "fish_diet": "Fish/shellfish consumption habits (鱼类/贝类摄入习惯)",
    "plastic_food": "Food/drinks from plastic containers (塑料容器装食物/饮品频率)",
    "plastic_hot_food": "Hot food from plastic containers (塑料容器装热食频率)",
    "plastic_hot_drink": "Hot drinks in paper cups (纸杯热饮频率)",
    "social_freq": "In-person socializing frequency (线下社交频率)",
}
|
|
165
|
+
|
|
166
|
+
# =============================================================================
|
|
167
|
+
# Substance use, medical conditions & treatment
|
|
168
|
+
# =============================================================================
|
|
169
|
+
# Column name -> bilingual description for substance use, diagnoses and treatment.
SUBSTANCE_AND_MEDICAL = {
    "substance_use": "Substance use — multi-select (物质使用 多选)",
    # Medical conditions
    "medical_condition_presence": "Whether diagnosed with a medical condition (是否有确诊疾病)",
    "medical_condition_type": "Medical condition type — multi-select (疾病类型 多选)",
    # Treatment and help seeking
    "treatment_status": "Whether currently receiving treatment (是否正在接受治疗)",
    "help_seeking": "Whether sought help (是否寻求过帮助)",
    "treatment_type_new": "Treatment type — new version, multi-select (治疗类型 新版 多选)",
    "treatment_type": "Treatment type — old version, multi-select (治疗类型 旧版 多选)",
    # Perceived treatment effectiveness
    "therapy_efficacy": "Perceived effectiveness of psychological therapy (心理治疗有效性评价)",
    "medication_efficacy": "Perceived effectiveness of medication (药物治疗有效性评价)",
    "brain_stim_efficacy": "Perceived effectiveness of brain stimulation (脑刺激治疗效果)",
    "neurofeedback_efficacy": "Perceived effectiveness of neurofeedback (神经反馈治疗效果)",
    "mental_health_disorder": "Mental health disorder diagnosis (心理健康障碍诊断)",
}
|
|
183
|
+
|
|
184
|
+
# =============================================================================
|
|
185
|
+
# Trauma & adversity
|
|
186
|
+
# =============================================================================
|
|
187
|
+
# Column name -> bilingual description for trauma and adversity items.
TRAUMA = {
    "trauma_childhood": "Childhood trauma experiences — before age 18, multi-select (童年创伤经历 18岁前 多选)",
    "trauma_adulthood": "Adult trauma experiences — after age 18, multi-select (成年创伤经历 18岁后 多选)",
    "trauma_life_old": "Life trauma — archived old version (人生创伤经历 旧版 已归档)",
}
|
|
192
|
+
|
|
193
|
+
# =============================================================================
|
|
194
|
+
# Family, friendship & community
|
|
195
|
+
# =============================================================================
|
|
196
|
+
# Column name -> bilingual description for family, friendship and household items.
# Descriptions ending in "archived" are marked that way in the text itself.
FAMILY_AND_FRIENDS = {
    # Current household
    "family_situation": "Current family situation (当前家庭状况)",
    "children_num": "Number of children (子女数量)",
    "household_size": "Number of people sharing household (共同居住人数)",
    "siblings_num": "Number of siblings growing up — archived (成长中兄弟姐妹数 已归档)",
    # Friends
    "friends_num": "Number of close friends — archived (密友数量 已归档)",
    "friends_childhood": "Friends known since childhood — archived (童年至今朋友数量 已归档)",
    "friends_proximity": "Close friends live nearby — archived (密友是否住附近 已归档)",
    "friendship_type": "Mode of interaction with friends — archived (与朋友的互动方式 已归档)",
    "friends_help_out": "Whether friends would help out when in need (是否有朋友能帮忙)",
    "friends_confide_in": "Whether has friends to confide in — archived (是否有朋友可倾诉 已归档)",
    # Household growing up and adult family
    "household_nature": "Nature of household growing up — conflict/stable, archived (成长家庭冲突/稳定性 已归档)",
    "household_description": "Household growing up — warm/distant, archived (成长家庭温暖/疏离程度 已归档)",
    "parental_support": "Type of parental/caregiver support — archived (父母/照顾者支持类型 已归档)",
    "family_proximity": "Adult family living nearby — archived (成年家人是否住附近 已归档)",
    "family_relationships": "Quality of relationships with adult family (与成年家人关系质量)",
}
|
|
213
|
+
|
|
214
|
+
# =============================================================================
|
|
215
|
+
# Faith & religion
|
|
216
|
+
# =============================================================================
|
|
217
|
+
# Column name -> bilingual description for faith, religion and value-orientation items.
FAITH = {
    "spirituality_connection": "Spiritual/transcendent connection — archived (灵性/超越连接感 已归档)",
    "love_feelings": "Extent of loving feelings towards others (对他人的关爱程度)",
    "religious_identity": "Religious identity — archived (宗教身份认同 已归档)",
    "religious_practice": "Whether actively practices religion — archived (是否践行宗教活动 已归档)",
    "individual_collective": "Individualism vs. collectivism orientation — archived (个人主义 vs 集体主义倾向 已归档)",
}
|
|
224
|
+
|
|
225
|
+
# =============================================================================
|
|
226
|
+
# Technology — smartphones, social media, AI, VR, gaming, nature
|
|
227
|
+
# =============================================================================
|
|
228
|
+
# Column name -> bilingual description for technology-related items.
TECHNOLOGY = {
    # Smartphone & tablet
    "smartphone_own_old": "Owns a smartphone — old version, archived (是否拥有手机 旧版 已归档)",
    "smartphone_age_access": "Age first obtained a smartphone (首次获得手机年龄)",
    "smartphone_school_old": "Smartphone use at school — old version, archived (学校使用手机情况 旧版 已归档)",
    "smartphone_class_old": "Smartphone use in class — old version, archived (课堂上使用手机 旧版 已归档)",
    # NOTE(review): this key contains a space before "_age"; kept verbatim in case it
    # mirrors the raw column name — confirm against the dataset header.
    "smartphone_ownership _age": "Age of smartphone ownership (拥有手机年龄)",
    "smartphone_friends": "Number of friends with smartphones (手机朋友数量)",
    "smartphone_school_age": "Age school provided a smartphone (学校提供手机年龄)",
    "smartphone_class": "Smartphone use in class — new version (课堂使用手机 新版)",
    "smartphone_recess": "Smartphone use during recess (课间使用手机)",
    "tablet_ownership_age": "Age obtained a tablet (获得平板电脑年龄)",
    "smartphone_tablet_age": "Age obtained a smartphone/tablet (获得手机/平板年龄)",
    "laptop_school_age": "Age school provided a laptop (学校提供笔记本年龄)",
    "laptop_class": "Laptop use in class (课堂使用笔记本)",
    "internet_restrictions": "Internet access restrictions (上网限制)",

    # Social media
    "social_media_age": "Age started using social media (开始使用社交媒体年龄)",
    "social_media_freq": "Social media use frequency (社交媒体使用频率)",
    "sm_freq_new": "Social media use frequency — new version (社交媒体使用频率 新版)",
    "sm_impact": "Social media impact on life — multi-select (社交媒体对生活的影响 多选)",

    # AI
    "ai_freq": "AI tool usage frequency (AI 工具使用频率)",
    "ai_use_general": "AI general use cases — multi-select (AI 通用用途 多选)",
    "ai_use_social": "AI social/emotional use cases — multi-select (AI 社交/情感用途 多选)",
    "ai_impact_personal": "AI impact on personal life — multi-select (AI 对个人的影响 多选)",
    "ai_impact_work": "AI impact on work — multi-select (AI 对工作的影响 多选)",

    # VR, gaming & nature
    "vr_freq": "VR headset usage frequency (VR 头显使用频率)",
    "gaming_freq": "Video gaming frequency (电子游戏频率)",
    "time_nature": "Time spent in natural environments (在自然环境中时长)",
    "live_close_nature": "Whether lives close to nature (是否居住在自然附近)",
    "immersion_nature": "Most-frequented natural environment types — multi-select (最常接触的自然环境类型 多选)",
}
|
|
265
|
+
|
|
266
|
+
# =============================================================================
|
|
267
|
+
# Benchmarking scales — PHQ-9, GAD-7, life satisfaction
|
|
268
|
+
# =============================================================================
|
|
269
|
+
# Column name -> bilingual description for the benchmarking scales
# (PHQ-9, GAD-7 and overall life satisfaction).
BENCHMARKING = {
    # PHQ-9 (depression screening)
    "PHQ9_interest": "PHQ-9: Little interest or pleasure in doing things (做事缺乏兴趣或乐趣)",
    "PHQ9_depressed": "PHQ-9: Feeling down, depressed, or hopeless (感到沮丧、抑郁或绝望)",
    "PHQ9_sleep": "PHQ-9: Trouble falling/staying asleep or sleeping too much (入睡困难、易醒或嗜睡)",
    "PHQ9_energy": "PHQ-9: Feeling tired or having little energy (感到疲倦或缺乏精力)",
    # The two descriptions below were previously attached to each other's key;
    # key names and key order are unchanged, only the texts were put back in place.
    "PHQ9_failure": "PHQ-9: Feeling bad about yourself — failure, letting family down (自我否定/觉得自己失败)",
    "PHQ9_appetite": "PHQ-9: Poor appetite or overeating (食欲不振或暴饮暴食)",
    "PHQ9_concentration": "PHQ-9: Trouble concentrating on things (注意力难以集中)",
    "PHQ9_movement": "PHQ-9: Moving/speaking slowly or being fidgety/restless (动作迟缓或焦躁不安)",
    "PHQ9_self_harm": "PHQ-9: Thoughts of self-harm or being better off dead (自伤或自杀念头)",
    # GAD-7 (anxiety screening)
    "GAD7_nervous": "GAD-7: Feeling nervous, anxious, or on edge (感到紧张、焦虑)",
    # NOTE(review): "GAD7_worry" carries the "stop or control worrying" text while
    # "GAD7_self_control" carries "worrying too much" — confirm this pairing against
    # the dataset codebook; left as-is because the key names do not settle it.
    "GAD7_worry": "GAD-7: Not being able to stop or control worrying (无法停止或控制担忧)",
    "GAD7_self_control": "GAD-7: Worrying too much about different things (对很多事情过度担忧)",
    "GAD7_relax": "GAD-7: Trouble relaxing (难以放松)",
    "GAD7_restless": "GAD-7: Being so restless that it is hard to sit still (坐立不安)",
    "GAD7_irritable": "GAD-7: Becoming easily annoyed or irritable (容易烦躁或易怒)",
    "GAD7_afraid": "GAD-7: Feeling afraid as if something awful might happen (感到害怕)",
    "GAD7_impact": "GAD-7: How difficult these problems made daily life (上述问题对生活的影响程度)",
    # Life satisfaction
    "life_satisfaction": "Overall life satisfaction 1-9 (总体生活满意度 1-9)",
}
|
|
292
|
+
|
|
293
|
+
# =============================================================================
|
|
294
|
+
# Momentary assessments — archived
|
|
295
|
+
# =============================================================================
|
|
296
|
+
# Column name -> bilingual description for the momentary-assessment items
# (every description is marked "archived" in its own text).
MOMENTARY = {
    "time_day": "Time of day when assessment taken — archived (评估时间段 已归档)",
    "mood_current": "Current mood at assessment time — archived (当前情绪 已归档)",
    "alertness_current": "Current alertness level — archived (当前清醒程度 已归档)",
    # NOTE(review): "prevous" spelling kept verbatim — presumably the raw column name; confirm.
    "sleep_prevous_night": "Hours slept the previous night — archived (前一晚睡眠时长 已归档)",
    "time_last_meal": "Time since last meal — archived (距上次进食时间 已归档)",
    "physical_complaints": "Physical complaints at assessment time — archived (当前身体不适症状 已归档)",
    "pregnancy": "Whether currently pregnant — archived (是否怀孕 已归档)",
}
|
|
305
|
+
|
|
306
|
+
# =============================================================================
|
|
307
|
+
# COVID-19 impact — archived
|
|
308
|
+
# =============================================================================
|
|
309
|
+
# Column name -> bilingual description for the archived COVID-19 impact items.
COVID = {
    "covid_health": "COVID-19: health & social impact — archived (新冠疫情 健康与社交影响 已归档)",
    "covid_finance": "COVID-19: financial impact — archived (新冠疫情 财务影响 已归档)",
}
|
|
313
|
+
|
|
314
|
+
# =============================================================================
|
|
315
|
+
# Repeat respondent identifiers (anonymous, derived from email)
|
|
316
|
+
# =============================================================================
|
|
317
|
+
# Column name -> bilingual description for the repeat-respondent identifier columns.
# These keys contain spaces and capital letters, unlike the snake_case keys elsewhere.
DEDUP = {
    "Repeat identifier": "Repeat respondent identifier — anonymous, derived from email (重复受访者标识符, 由邮箱匿名化生成)",
    "Repeat identifier 2": "Repeat respondent identifier — backup (重复受访者标识符 备用)",
}
|
|
321
|
+
|
|
322
|
+
# =============================================================================
|
|
323
|
+
# Flat aggregated dict for direct column-name lookup
|
|
324
|
+
# =============================================================================
|
|
325
|
+
# Single flat lookup table built from every thematic group above.
# Groups are merged in the order listed; if two groups ever shared a key,
# the description from the later group would replace the earlier one.
COLUMN_DESCRIPTIONS: dict[str, str] = {}
for _group in (
    META,
    MHQ_SCORES,
    MHQ_ITEMS,
    DEMOGRAPHICS_CORE,
    WORK,
    LIFESTYLE,
    SUBSTANCE_AND_MEDICAL,
    TRAUMA,
    FAMILY_AND_FRIENDS,
    FAITH,
    TECHNOLOGY,
    BENCHMARKING,
    MOMENTARY,
    COVID,
    DEDUP,
):
    COLUMN_DESCRIPTIONS.update(_group)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def describe_column(name: str) -> str:
    """Look up the human-readable description of a column.

    Args:
        name: Column name to look up in ``COLUMN_DESCRIPTIONS``.

    Returns:
        The stored description, or ``name`` itself when the column has no entry.
    """
    try:
        return COLUMN_DESCRIPTIONS[name]
    except KeyError:
        # Unknown columns fall back to their own name rather than raising.
        return name
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
|
|
3
|
+
# 20 Problem items — clinical threshold ≥8
|
|
4
|
+
# Problem-item columns; identify_symptoms() flags a value >= 8 as a symptom.
# NOTE(review): "flashblacks" is kept verbatim — presumably the raw column name; confirm.
_PROBLEM_COLS = [
    "restless_hyperactive",  # Restlessness & Hyperactivity
    "fear_anxiety",          # Fear & Anxiety
    "infections",            # Susceptibility to Infections
    "aggression",            # Aggression Towards Others
    "avoidance",             # Avoidance & Withdrawal
    "obsessive_thoughts",    # Unwanted, Strange or Obsessive Thoughts
    "mood_swings",           # Mood Swings
    "detached_reality",      # Sense of being detached from reality
    "nightmares",            # Nightmares
    "addictions",            # Addictions
    "anger",                 # Anger & Irritability
    "suicidal_thoughts",     # Suicidal Thoughts or Intentions
    "pain",                  # Experience of Pain
    "guilt_blame",           # Guilt & Blame
    "hallucinations",        # Hallucinations
    "flashblacks",           # Traumatic Flashbacks
    "repetitive_actions",    # Repetitive or Compulsive Actions
    "sad_hopeless",          # Feelings of Sadness, Distress or Hopelessness
    "physical_health",       # Physical Health Issues
    "confusion",             # Confusion or Slowed Thinking
]
|
|
26
|
+
|
|
27
|
+
# 27 Spectrum items — clinical threshold ≤1
|
|
28
|
+
# Spectrum-item columns; identify_symptoms() flags a value <= 1 as a symptom.
_SPECTRUM_COLS = [
    "adapt_to_change",             # Adaptability to Change
    "self_worth_confidence",       # Self Worth & Confidence
    "creativity_problem_solving",  # Creativity & Problem Solving
    "drive_motivation",            # Drive & Motivation
    "stability_calmness",          # Stability & Calmness
    "sleep_quality",               # Sleep Quality
    "self_control_impulsivity",    # Self Control & Impulsivity
    "ability_learn",               # Ability to Learn
    "coordination",                # Coordination
    "relationships",               # Relationships with others
    "emotional_resilience",        # Emotional Resilience
    "planning_organization",       # Planning & Organisation
    "physical_intimacy",           # Physical Intimacy
    "speech_language",             # Speech & Language
    "memory",                      # Memory
    "social_cooperation",          # Social interactions & Cooperation
    "decision_risk",               # Decision-making & Risk-taking
    "curiosity_enthusiasm",        # Curiosity, Interest & Enthusiasm
    "energy",                      # Energy Level
    "emotional_control",           # Emotional Control
    "focus_concentration",         # Focus & Concentration
    "appetite_regulation",         # Appetite Regulation
    "empathy",                     # Empathy
    "sensory_sensitivity",         # Sensory Sensitivity
    "self_image",                  # Self-Image
    "outlook_optimism",            # Outlook & Optimism
    "selective_attention",         # Selective Attention
]
|
|
57
|
+
|
|
58
|
+
# DSM‑5 mapping rules
|
|
59
|
+
# Suffix of the boolean indicator columns created by identify_symptoms().
_SYM = "_symptom"

# Rule table consumed by mapping_to_DSM5(). Each rule may define:
#   "groups"          - list of groups; a group counts as present when any of its columns is True
#   "required"        - symptom columns that must all be True
#   "required_groups" - indices into "groups" that must each be present
#   "core_groups"     - indices into "groups" of which at least one must be present
#   "min_groups"      - minimum number of present groups overall
#   "trauma_required" - the row must additionally report a trauma
_DSM5_RULES: dict[str, dict] = {
    "DSM5_depression": {
        "core_groups": [0, 1],
        "min_groups": 5,
        "groups": [
            [f"drive_motivation{_SYM}", f"curiosity_enthusiasm{_SYM}"],
            [f"sad_hopeless{_SYM}", f"outlook_optimism{_SYM}"],
            [f"appetite_regulation{_SYM}"],
            [f"confusion{_SYM}"],
            [f"energy{_SYM}"],
            [f"self_worth_confidence{_SYM}", f"self_image{_SYM}", f"guilt_blame{_SYM}"],
            [f"focus_concentration{_SYM}", f"selective_attention{_SYM}", f"decision_risk{_SYM}"],
            [f"suicidal_thoughts{_SYM}"],
        ],
    },
    "DSM5_anxiety": {
        "required": [f"fear_anxiety{_SYM}"],
        "required_groups": [0],
        "min_groups": 3,
        "groups": [
            [f"stability_calmness{_SYM}", f"emotional_control{_SYM}"],
            [f"restless_hyperactive{_SYM}"],
            [f"energy{_SYM}"],
            [f"focus_concentration{_SYM}", f"selective_attention{_SYM}"],
            [f"anger{_SYM}"],
            [f"pain{_SYM}"],
            [f"sleep_quality{_SYM}"],
            [f"avoidance{_SYM}"],
        ],
    },
    "DSM5_bipolar": {
        "required": [f"mood_swings{_SYM}"],
        "core_groups": [0, 1],
        "min_groups": 5,
        "groups": [
            [f"drive_motivation{_SYM}", f"curiosity_enthusiasm{_SYM}"],
            [f"sad_hopeless{_SYM}", f"outlook_optimism{_SYM}"],
            [f"appetite_regulation{_SYM}"],
            [f"confusion{_SYM}"],
            [f"energy{_SYM}"],
            [f"self_worth_confidence{_SYM}", f"self_image{_SYM}", f"guilt_blame{_SYM}"],
            [f"focus_concentration{_SYM}", f"selective_attention{_SYM}", f"decision_risk{_SYM}"],
            [f"suicidal_thoughts{_SYM}"],
        ],
    },
    "DSM5_ptsd": {
        "trauma_required": True,
        "core_groups": [0],
        "required": [f"avoidance{_SYM}"],
        "min_groups": 2,
        "groups": [
            [f"flashblacks{_SYM}", f"nightmares{_SYM}", f"obsessive_thoughts{_SYM}"],
            [f"memory{_SYM}"],
            [f"self_worth_confidence{_SYM}", f"self_image{_SYM}", f"outlook_optimism{_SYM}"],
            [f"guilt_blame{_SYM}"],
            [f"sad_hopeless{_SYM}"],
            [f"curiosity_enthusiasm{_SYM}", f"drive_motivation{_SYM}"],
            [f"relationships{_SYM}"],
        ],
    },
    "DSM5_ocd": {
        "required": [f"obsessive_thoughts{_SYM}", f"repetitive_actions{_SYM}", f"fear_anxiety{_SYM}"],
        "min_groups": 1,
        "groups": [
            [f"stability_calmness{_SYM}"],
            [f"self_control_impulsivity{_SYM}"],
            [f"emotional_control{_SYM}"],
        ],
    },
    "DSM5_schizophrenia": {
        "required": [f"obsessive_thoughts{_SYM}", f"hallucinations{_SYM}"],
        "min_groups": 1,
        "groups": [
            [f"speech_language{_SYM}"],
            [f"repetitive_actions{_SYM}"],
            [
                f"drive_motivation{_SYM}",
                f"relationships{_SYM}",
                f"social_cooperation{_SYM}",
                f"curiosity_enthusiasm{_SYM}",
            ],
        ],
    },
    "DSM5_eating": {
        # Only required symptoms; no group-based criteria.
        "required": [f"appetite_regulation{_SYM}", f"fear_anxiety{_SYM}", f"self_image{_SYM}"],
        "groups": [],
    },
    "DSM5_addiction": {
        "required": [f"addictions{_SYM}"],
        "min_groups": 2,
        "groups": [
            [f"decision_risk{_SYM}"],
            [f"emotional_control{_SYM}"],
            [f"avoidance{_SYM}"],
            [f"relationships{_SYM}"],
            [f"self_control_impulsivity{_SYM}"],
        ],
    },
    "DSM5_adhd": {
        "min_groups": 4,
        "groups": [
            [f"focus_concentration{_SYM}"],
            [f"selective_attention{_SYM}"],
            [f"drive_motivation{_SYM}"],
            [f"planning_organization{_SYM}"],
            [f"memory{_SYM}"],
        ],
    },
    "DSM5_asd": {
        "min_groups": 3,
        "groups": [
            [f"social_cooperation{_SYM}"],
            [f"relationships{_SYM}"],
            [f"repetitive_actions{_SYM}"],
            [f"adapt_to_change{_SYM}"],
            [f"sensory_sensitivity{_SYM}"],
            [f"selective_attention{_SYM}", f"focus_concentration{_SYM}"],
        ],
    },
}
|
|
149
|
+
|
|
150
|
+
def _group_flags(df: pl.LazyFrame, groups: list[list[str]]) -> list[pl.Expr]:
    """Build one boolean expression per symptom group.

    Args:
        df: Not used by this function; the returned expressions are unevaluated.
        groups: Groups of symptom column names.

    Returns:
        A list with one expression per group, in the same order as ``groups``.
        Each expression is True when any column of its group is True; null
        values are treated as False.
    """
    flags: list[pl.Expr] = []
    for members in groups:
        # Replace nulls with False first so a missing answer cannot make the group null.
        member_exprs = [pl.col(name).fill_null(False) for name in members]
        flags.append(pl.any_horizontal(member_exprs))
    return flags
|
|
161
|
+
|
|
162
|
+
def identify_symptoms(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add per-item symptom indicators and a total symptom count.

    For every column ``c`` in ``_PROBLEM_COLS`` a column ``c_symptom`` is added
    that is True when ``c >= 8``; for every column in ``_SPECTRUM_COLS`` the
    indicator is True when ``c <= 1``. A ``symptom_count`` column then sums all
    indicator columns row-wise.

    Args:
        df: LazyFrame containing the columns named in ``_PROBLEM_COLS`` and ``_SPECTRUM_COLS``.

    Returns:
        The LazyFrame extended with the ``*_symptom`` columns and ``symptom_count``.
    """
    # Problem items: a high value is the symptom.
    problem_flags = [
        (pl.col(name) >= 8).alias(f"{name}_symptom") for name in _PROBLEM_COLS
    ]
    # Spectrum items: a low value is the symptom.
    spectrum_flags = [
        (pl.col(name) <= 1).alias(f"{name}_symptom") for name in _SPECTRUM_COLS
    ]
    flagged = df.with_columns(problem_flags + spectrum_flags)

    # The count reads the indicator columns, so it is added in a second step.
    indicator_cols = [
        pl.col(f"{name}_symptom") for name in _PROBLEM_COLS + _SPECTRUM_COLS
    ]
    return flagged.with_columns(
        pl.sum_horizontal(indicator_cols).alias("symptom_count")
    )
|
|
188
|
+
|
|
189
|
+
def mapping_to_DSM5(df: pl.LazyFrame) -> pl.LazyFrame:
    """Map the identified symptoms to DSM-5 categories.

    For every entry of ``_DSM5_RULES`` a boolean column named after the rule
    label (e.g. ``DSM5_depression``) is added. A rule holds for a row when all
    of the checks it configures pass:

    * ``required``        - every listed symptom column is True;
    * ``required_groups`` - every group at the listed indices is present;
    * ``core_groups``     - at least one group at the listed indices is present;
    * ``min_groups``      - at least that many groups are present overall;
    * ``trauma_required`` - the row reports a childhood or adulthood trauma.

    A group is present when any of its symptom columns is True. Null symptom
    values count as False.

    Args:
        df: LazyFrame holding the ``*_symptom`` columns referenced by
            ``_DSM5_RULES`` (as created by ``identify_symptoms``) plus the
            ``trauma_childhood`` and ``trauma_adulthood`` columns, which are
            read through the ``.list`` namespace.

    Returns:
        The LazyFrame with one additional boolean column per rule.
    """

    def _reported_trauma(column: str, none_answer: str) -> pl.Expr:
        """True when the answer list is non-empty and lacks the "none" option."""
        answers = pl.col(column)
        # Both comparisons are parenthesised on purpose: `&` binds tighter than
        # `>`, so the unparenthesised `len() > 0 & ~contains(...)` was parsed as
        # `len() > (0 & ~contains(...))` and never applied the exclusion.
        has_any = answers.list.len() > 0
        chose_none = answers.list.contains(none_answer)
        # A null answer list yields null here; treat it as "no trauma reported".
        return (has_any & ~chose_none).fill_null(False)

    # Built as an expression rather than a temporary column, so nothing has to
    # be added to and later dropped from the frame.
    has_trauma = _reported_trauma(
        "trauma_childhood",
        "I did not experience any of the above during my childhood",
    ) | _reported_trauma(
        "trauma_adulthood",
        "I did not experience any of the above",
    )

    rule_columns: list[pl.Expr] = []
    for label, rule in _DSM5_RULES.items():
        flags = _group_flags(df, rule["groups"])
        ok = pl.lit(True)

        # Required single columns (ALL must be True)
        for name in rule.get("required", []):
            ok = ok & pl.col(name).fill_null(False)

        # Required groups (each must be present)
        for index in rule.get("required_groups", []):
            ok = ok & flags[index]

        # Core group check (at least one of this set must be present)
        if "core_groups" in rule:
            ok = ok & pl.any_horizontal([flags[i] for i in rule["core_groups"]])

        # Minimum number of present groups overall
        if rule.get("min_groups"):
            ok = ok & (pl.sum_horizontal(flags) >= rule["min_groups"])

        # Special: trauma flag
        if rule.get("trauma_required"):
            ok = ok & has_trauma

        rule_columns.append(ok.alias(label))

    # Rules only read symptom/trauma columns, never each other's output,
    # so they can all be added in a single step.
    return df.with_columns(rule_columns)
|