m2datakit 0.1.97__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- m2datakit/__init__.py +16 -0
- m2datakit/core/__init__.py +13 -0
- m2datakit/core/codebook.py +108 -0
- m2datakit/core/config.py +86 -0
- m2datakit/core/export.py +68 -0
- m2datakit/core/helpers.py +490 -0
- m2datakit/core/importers.py +463 -0
- m2datakit/core/log.py +66 -0
- m2datakit/core/map.py +97 -0
- m2datakit/core/pipeline.py +475 -0
- m2datakit/core/plot.py +86 -0
- m2datakit/core/utils.py +71 -0
- m2datakit/core/validate.py +57 -0
- m2datakit/tasks/__init__.py +52 -0
- m2datakit/tasks/color_dots.py +271 -0
- m2datakit/tasks/color_match.py +183 -0
- m2datakit/tasks/color_shapes.py +198 -0
- m2datakit/tasks/digit_span.py +104 -0
- m2datakit/tasks/even_odd.py +231 -0
- m2datakit/tasks/go_no_go.py +105 -0
- m2datakit/tasks/go_no_go_fade.py +141 -0
- m2datakit/tasks/grid_memory.py +119 -0
- m2datakit/tasks/iat.py +317 -0
- m2datakit/tasks/jolo.py +237 -0
- m2datakit/tasks/motion.py +195 -0
- m2datakit/tasks/shopping_list.py +191 -0
- m2datakit/tasks/stroop.py +248 -0
- m2datakit/tasks/symbol_number_matching.py +38 -0
- m2datakit/tasks/symbol_search.py +70 -0
- m2datakit/tasks/trailmaking.py +98 -0
- m2datakit-0.1.97.dist-info/METADATA +866 -0
- m2datakit-0.1.97.dist-info/RECORD +35 -0
- m2datakit-0.1.97.dist-info/WHEEL +5 -0
- m2datakit-0.1.97.dist-info/licenses/LICENSE +674 -0
- m2datakit-0.1.97.dist-info/top_level.txt +1 -0
m2datakit/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# m2datakit/__init__.py
|
|
2
|
+
|
|
3
|
+
# Convenience imports available at package import time
|
|
4
|
+
import glob # noqa: F401
|
|
5
|
+
import json # noqa: F401
|
|
6
|
+
import os # noqa: F401
|
|
7
|
+
import re # noqa: F401
|
|
8
|
+
|
|
9
|
+
from dotenv import load_dotenv # noqa: F401
|
|
10
|
+
|
|
11
|
+
from . import core, tasks
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"core",
|
|
15
|
+
"tasks",
|
|
16
|
+
]
|
|
m2datakit/core/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from . import pipeline # Expose pipeline cleanly
|
|
2
|
+
from . import log
|
|
3
|
+
from .config import settings
|
|
4
|
+
from .helpers import summarize_common_metadata
|
|
5
|
+
from .utils import get_package_version
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"log",
|
|
9
|
+
"pipeline",
|
|
10
|
+
"settings",
|
|
11
|
+
"get_package_version",
|
|
12
|
+
"summarize_common_metadata",
|
|
13
|
+
]
|
|
m2datakit/core/codebook.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from docx import Document
|
|
3
|
+
from fpdf import FPDF
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# File extension written for each supported ``output_format`` value.
_CODEBOOK_EXTENSIONS = {'markdown': 'md', 'docx': 'docx', 'pdf': 'pdf'}


def _describe_variable(series):
    """Summarise one column with statistics chosen by its dtype.

    Numeric columns get mean/std/min/max; categorical, object and string
    columns get unique/top/freq; any other dtype gets one example value.
    Every summary also carries ``n`` (length) and ``missing`` (NaN count).
    """
    summary = {'n': len(series), 'missing': series.isna().sum()}
    dtype = series.dtype

    if pd.api.types.is_numeric_dtype(series):
        summary['mean'] = round(series.mean(), 2)
        summary['std'] = round(series.std(), 2)
        summary['min'] = series.min()
        summary['max'] = series.max()
    elif (
        # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype.
        isinstance(dtype, pd.CategoricalDtype)
        or dtype == object
        # Dedicated string dtypes are text too and get the same summary as object columns.
        or pd.api.types.is_string_dtype(dtype)
    ):
        # Compute each aggregate once; both are empty when every value is missing.
        modes = series.mode()
        counts = series.value_counts()
        summary['unique'] = series.nunique()
        summary['top'] = modes.iloc[0] if not modes.empty else ''
        summary['freq'] = counts.iloc[0] if not counts.empty else ''
    else:
        present = series.dropna()
        summary['example'] = str(present.iloc[0]) if not present.empty else ''

    return summary


def make_codebook(df, output_format='markdown', filename='codebook', custom_descriptions=None):
    """
    Generate a skimr-style codebook from a pandas DataFrame and export it.

    Each column becomes one entry with its name, dtype, a description and the
    statistics that apply to that dtype. Only the statistics computed for a
    column are listed for it.

    Parameters:
    - df: pandas DataFrame
    - output_format: 'markdown', 'docx', or 'pdf'
    - filename: base name for output file (the extension is appended:
      '.md' for markdown, '.docx', or '.pdf')
    - custom_descriptions: dict of {column_name: description} overrides;
      columns without a (non-empty) override get a description derived from
      the column name

    Returns:
    - None. The codebook is written to disk and the path is printed.

    Raises:
    - ValueError: if output_format is not one of the supported values.
    """
    if output_format not in ('markdown', 'docx', 'pdf'):
        raise ValueError("Format must be one of: 'markdown', 'docx', 'pdf'")

    # Keep the entries as plain dicts. Going through a DataFrame would union the
    # statistic names across columns, padding every entry with NaN placeholders
    # and turning integer counts into floats.
    entries = []
    for col in df.columns:
        desc = custom_descriptions.get(col) if custom_descriptions else None
        if not desc:
            # str() so non-string column labels (e.g. integers) are accepted.
            desc = str(col).replace('_', ' ').capitalize()
        entries.append({
            'variable': col,
            'type': str(df[col].dtype),
            'description': desc,
            'stats': _describe_variable(df[col]),
        })

    output_path = f"{filename}.{_CODEBOOK_EXTENSIONS[output_format]}"

    # MARKDOWN
    if output_format == 'markdown':
        # Explicit encoding: names and descriptions may contain non-ASCII text.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('# Codebook\n\n')
            for entry in entries:
                f.write(f"### {entry['variable']} ({entry['type']})\n")
                f.write(f"**Description**: {entry['description']}\n\n")
                for key, val in entry['stats'].items():
                    f.write(f"- {key}: {val}\n")
                f.write('\n')

    # DOCX
    elif output_format == 'docx':
        doc = Document()
        doc.add_heading('Codebook', level=1)
        for entry in entries:
            doc.add_heading(f"{entry['variable']} ({entry['type']})", level=2)
            doc.add_paragraph(f"Description: {entry['description']}")
            for key, val in entry['stats'].items():
                doc.add_paragraph(f"{key}: {val}")
        doc.save(output_path)

    # PDF
    else:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=10)
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.multi_cell(0, 10, txt="Codebook", align='L')
        for entry in entries:
            pdf.set_font("Arial", style='B', size=10)
            pdf.cell(0, 10, f"{entry['variable']} ({entry['type']})", ln=True)
            pdf.set_font("Arial", size=10)
            pdf.multi_cell(0, 10, f"Description: {entry['description']}")
            for key, val in entry['stats'].items():
                line = f"{key}: {val}"
                pdf.multi_cell(0, 10, line)
            pdf.ln(5)
        pdf.output(output_path)

    # Report the path that was actually written (markdown files end in '.md').
    print(f"Codebook saved to {output_path}")
|
|
107
|
+
|
|
108
|
+
|
m2datakit/core/config.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from importlib.metadata import version
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _installed_package_version() -> str:
    """Return the installed m2datakit version, or a placeholder when it is not installed."""
    # Imported locally because the module header only imports ``version``.
    from importlib.metadata import PackageNotFoundError

    try:
        return version("m2datakit")
    except PackageNotFoundError:
        # A Settings instance is created when this module is imported, so a
        # missing distribution (no installed metadata) must not make the whole
        # package unimportable.
        return "0.0.0+unknown"


class Settings(BaseModel):
    """Package-wide constants: version, provider-specific column lists and plot defaults."""

    # Resolved when a Settings instance is created, not when the class is defined.
    PACKAGE_VERSION: str = Field(default_factory=_installed_package_version)

    # ABSTRACT ALL IDS BY PROVIDER
    # NOTE(review): presumably the columns that identify duplicate rows for each
    # data provider - confirm against the importers that read these lists.
    DEDUP_IDS_METRICWIRE: List[str] = [
        "userId",
        "submissionSessionId",
        "activityId",
    ]
    DEDUP_IDS_UAS: List[str] = [
        "userId",
        "submissionSessionId",
        "activityId",
    ]

    DEDUP_IDS_MONGODB: List[str] = []
    DEDUP_IDS_QUALTRICS: List[str] = [
        "index",
        "ResponseId",
        "M2C2_ASSESSMENT_ORDER",
        "M2C2_AUTO_ADVANCE",
        "M2C2_LANGUAGE",
    ]

    # Commented-out sketch kept for reference; it is not executed.
    # def unnest_trial_level_data(df: pd.DataFrame, drop_duplicates=True, column_order: List[str] = None) -> pd.DataFrame:
    # column_order = column_order or ["participant_id", "session_id", "group", "wave", "activity_id", "study_id", "document_uuid"]
    # trial_df = trial_df.drop_duplicates(subset=["activity_uuid", "session_uuid", "trial_begin_iso8601_timestamp"])

    # Differs from STANDARD_GROUPING_FOR_AGGREGATION only in using "uuid"
    # where that list uses "session_uuid".
    STANDARD_GROUPING_FOR_AGGREGATION_LEGACY: List[str] = [
        "study_uid",
        "user_uid",
        "uuid",
        "activity_name",
    ]

    STANDARD_GROUPING_FOR_AGGREGATION: List[str] = [
        "study_uid",
        "user_uid",
        "session_uuid",
        "activity_name",
    ]

    STANDARD_GROUPING_FOR_AGGREGATION_QUALTRICS: List[str] = ["ResponseId"]

    STANDARD_GROUPING_FOR_AGGREGATION_METRICWIRE: List[str] = [
        "userId",
        "submissionSessionId",
        "activityId",
    ]
    STANDARD_GROUPING_FOR_AGGREGATION_UAS: List[str] = [
        "userId",
        "submissionSessionId",
        "activityId",
    ]

    @property
    def QUALTRICS_TRIAL_DATA_REGEX(self):
        """Compiled pattern for names like ``M2C2_ASSESSMENT_<n>_TRIAL_DATA_<m>``.

        Group 1 captures the ``M2C2_ASSESSMENT_<n>`` prefix and group 2 the
        trailing number.
        """
        return re.compile(r"(M2C2_ASSESSMENT_\d+)_TRIAL_DATA_(\d+)")

    @property
    def DEFAULT_FUNC_MAP_SCORING(self):
        """Return ``DEFAULT_FUNC_MAP_SCORING`` from the sibling ``map`` module."""
        # Imported on access rather than at module import time.
        # NOTE(review): presumably this avoids a circular import - confirm.
        from .map import DEFAULT_FUNC_MAP_SCORING

        return DEFAULT_FUNC_MAP_SCORING

    # DEFAULTS FOR UI
    DEFAULT_PLOT_COLOR: str = "steelblue"
    DEFAULT_PLOT_DPI: int = 150
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Shared module-level Settings instance, created once when this module is imported.
settings = Settings()
|
|
81
|
+
|
|
82
|
+
# === Logging Configuration (opt-in) ===
# All values below are read from the environment once, at import time.


def _env_flag(name):
    """Return True when environment variable *name* is 1/true/yes (any letter case)."""
    raw_value = os.getenv(name, "0")
    return raw_value.lower() in {"1", "true", "yes"}


# Both switches default to off when the variable is unset.
LOGGING_ENABLED = _env_flag("M2DATAKIT_ENABLE_LOGGING")
LOGFIRE_ENABLED = _env_flag("M2DATAKIT_ENABLE_LOGFIRE")

# Falls back to a "logs" folder under the current working directory.
LOG_DIR = Path(os.getenv("M2DATAKIT_LOG_DIR", str(Path.cwd() / "logs")))
LOG_FILE = LOG_DIR.joinpath("events.jsonl")
|
m2datakit/core/export.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
def export_dataframe(df, file_name, format=".csv", table_name="my_table", **kwargs):
    """
    Export a pandas DataFrame to a file whose type is selected by ``format``.

    Parameters:
        df (pd.DataFrame): The DataFrame to export.
        file_name (str): The file name (without extension) to export the DataFrame to;
            ``format`` is appended to it verbatim.
        format (str): One of '.csv', '.json', '.xlsx', '.parquet', '.html', '.pkl'
            or '.txt' (tab-separated text).
        table_name (str): Currently unused; accepted for backward compatibility.
            No '.sql' export is implemented, so '.sql' is rejected like any other
            unsupported format.
        **kwargs: Additional keyword arguments forwarded to the pandas writer.
            They take precedence over this function's defaults (``index=False``,
            ``orient="records"`` for JSON, a tab separator for '.txt').

    Returns:
        str: The full file name of the exported file.

    Raises:
        RuntimeError: If the format is unsupported or the pandas writer fails;
            the underlying exception is attached as ``__cause__``.
    """
    # format -> (DataFrame writer method, default keyword arguments)
    writers = {
        ".csv": ("to_csv", {"index": False}),
        ".json": ("to_json", {"orient": "records"}),
        ".xlsx": ("to_excel", {"index": False}),
        ".parquet": ("to_parquet", {"index": False}),
        ".html": ("to_html", {"index": False}),
        ".pkl": ("to_pickle", {}),
        ".txt": ("to_csv", {"index": False, "sep": "\t"}),
    }

    try:
        file_name_with_extension = f"{file_name}{format}"

        if format not in writers:
            raise ValueError(f"Unsupported file format: {format}")

        method_name, defaults = writers[format]
        # Merge so caller-supplied options override the defaults instead of
        # colliding with them ("got multiple values for keyword argument").
        options = {**defaults, **kwargs}
        getattr(df, method_name)(file_name_with_extension, **options)

        return file_name_with_extension

    except Exception as e:
        raise RuntimeError(f"Export failed: {e}") from e
|
|
40
|
+
|
|
41
|
+
import json
|
|
42
|
+
from datetime import date
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def export_jsonld_metadata(df, filename="dataset_metadata.json"):
    """Write a schema.org ``Dataset`` JSON-LD description of *df* to *filename*.

    The document carries fixed name, description and creator fields, today's
    date as ``dateCreated``, and one ``PropertyValue`` entry per column that
    records the column name, a generated description and the column dtype.

    Parameters:
        df (pd.DataFrame): DataFrame whose columns are described.
        filename (str): Path of the JSON file to write.

    Returns:
        None
    """
    created_on = date.today().isoformat()

    # One entry per column, in column order.
    variables = [
        {
            "@type": "PropertyValue",
            "name": column,
            "description": f"Auto-description for {column}",
            "value": str(df[column].dtype),
        }
        for column in df.columns
    ]

    document = {
        "@context": "https://schema.org/",
        "@type": "Dataset",
        "name": "M2C2Kit Dataset",
        "description": "This dataset contains M2C2kit data processed via the DataKit Python package.",
        "creator": {"@type": "Person", "name": "Nelson Roque", "affiliation": "M2C2"},
        "dateCreated": created_on,
        "variableMeasured": variables,
    }

    with open(filename, "w") as handle:
        json.dump(document, handle, indent=2)
|