m2datakit 0.1.97__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
m2datakit/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # m2datakit/__init__.py
2
+
3
+ # Convenience imports available at package import time
4
+ import glob # noqa: F401
5
+ import json # noqa: F401
6
+ import os # noqa: F401
7
+ import re # noqa: F401
8
+
9
+ from dotenv import load_dotenv # noqa: F401
10
+
11
+ from . import core, tasks
12
+
13
+ __all__ = [
14
+ "core",
15
+ "tasks",
16
+ ]
@@ -0,0 +1,13 @@
1
+ from . import pipeline # Expose pipeline cleanly
2
+ from . import log
3
+ from .config import settings
4
+ from .helpers import summarize_common_metadata
5
+ from .utils import get_package_version
6
+
7
+ __all__ = [
8
+ "log",
9
+ "pipeline",
10
+ "settings",
11
+ "get_package_version",
12
+ "summarize_common_metadata",
13
+ ]
@@ -0,0 +1,108 @@
1
+ import pandas as pd
2
+ from docx import Document
3
+ from fpdf import FPDF
4
+
5
+
6
def make_codebook(df, output_format='markdown', filename='codebook', custom_descriptions=None):
    """
    Generate a skimr-style codebook from a pandas DataFrame and export it.

    Each column is summarized with the statistics that fit its dtype:
    numeric columns get n/missing/mean/std/min/max, categorical and text
    columns get n/missing/unique/top/freq, and any other dtype gets
    n/missing plus one example value. Only the statistics that apply to a
    column are written for it.

    Parameters:
    - df: pandas DataFrame
    - output_format: 'markdown', 'docx', or 'pdf'
    - filename: base name for output file (the extension is appended)
    - custom_descriptions: dict of {column_name: description} overrides;
      columns without a non-empty entry get a description derived from the
      column name

    Raises:
    - ValueError: if output_format is not one of the supported formats
    """
    # Validate up front so an unsupported format fails before any work is
    # done, and so the reported path uses the real extension ('.md', not
    # '.markdown').
    extensions = {'markdown': 'md', 'docx': 'docx', 'pdf': 'pdf'}
    if output_format not in extensions:
        raise ValueError("Format must be one of: 'markdown', 'docx', 'pdf'")
    output_path = f"{filename}.{extensions[output_format]}"

    def describe_variable(series):
        """Return the ordered summary statistics for one column."""
        stats = {'n': len(series), 'missing': series.isna().sum()}
        dtype = series.dtype
        if pd.api.types.is_numeric_dtype(series):
            stats['mean'] = round(series.mean(), 2)
            stats['std'] = round(series.std(), 2)
            stats['min'] = series.min()
            stats['max'] = series.max()
        elif (
            # isinstance check replaces the deprecated is_categorical_dtype.
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            # Also covers pandas' dedicated string dtype.
            or pd.api.types.is_string_dtype(dtype)
        ):
            modes = series.mode()
            counts = series.value_counts()
            stats['unique'] = series.nunique()
            stats['top'] = modes.iloc[0] if not modes.empty else ''
            stats['freq'] = counts.iloc[0] if not counts.empty else ''
        else:
            present = series.dropna()
            stats['example'] = str(present.iloc[0]) if not present.empty else ''
        return stats

    # One entry per column. Keeping per-variable dicts (rather than a single
    # DataFrame) avoids NaN placeholders for statistics that do not apply to
    # a variable, and works for a DataFrame with no columns.
    entries = []
    for col in df.columns:
        desc = custom_descriptions.get(col) if custom_descriptions else None
        if not desc:
            # str() so non-string column labels (e.g. integers) are accepted.
            desc = str(col).replace('_', ' ').capitalize()
        entries.append({
            'variable': col,
            'type': str(df[col].dtype),
            'description': desc,
            'stats': describe_variable(df[col]),
        })

    # MARKDOWN
    if output_format == 'markdown':
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('# Codebook\n\n')
            for entry in entries:
                f.write(f"### {entry['variable']} ({entry['type']})\n")
                f.write(f"**Description**: {entry['description']}\n\n")
                for key, val in entry['stats'].items():
                    f.write(f"- {key}: {val}\n")
                f.write('\n')

    # DOCX
    elif output_format == 'docx':
        doc = Document()
        doc.add_heading('Codebook', level=1)
        for entry in entries:
            doc.add_heading(f"{entry['variable']} ({entry['type']})", level=2)
            doc.add_paragraph(f"Description: {entry['description']}")
            for key, val in entry['stats'].items():
                doc.add_paragraph(f"{key}: {val}")
        doc.save(output_path)

    # PDF
    else:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=10)
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.multi_cell(0, 10, txt="Codebook", align='L')
        for entry in entries:
            pdf.set_font("Arial", style='B', size=10)
            pdf.cell(0, 10, f"{entry['variable']} ({entry['type']})", ln=True)
            pdf.set_font("Arial", size=10)
            pdf.multi_cell(0, 10, f"Description: {entry['description']}")
            for key, val in entry['stats'].items():
                pdf.multi_cell(0, 10, f"{key}: {val}")
            pdf.ln(5)
        pdf.output(output_path)

    print(f"Codebook saved to {output_path}")
107
+
108
+
@@ -0,0 +1,86 @@
1
+ import os
2
+ import re
3
+ from importlib.metadata import version
4
+ from pathlib import Path
5
+ from typing import List
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
def _installed_package_version():
    """Return the installed m2datakit version, or "unknown" without metadata.

    Looking the version up unguarded makes importing this module fail with
    PackageNotFoundError whenever the distribution metadata is missing
    (for example when running from a source checkout that was never
    installed).
    """
    from importlib.metadata import PackageNotFoundError

    try:
        return version("m2datakit")
    except PackageNotFoundError:
        return "unknown"


class Settings(BaseModel):
    """Package-wide defaults: ID columns per data provider, grouping keys, and UI defaults."""

    # Resolved when the model is instantiated, not when the class is defined.
    PACKAGE_VERSION: str = Field(default_factory=_installed_package_version)

    # ABSTRACT ALL IDS BY PROVIDER
    # Presumably the columns used to detect duplicate records for each
    # provider - TODO confirm against the callers.
    DEDUP_IDS_METRICWIRE: List[str] = [
        "userId",
        "submissionSessionId",
        "activityId",
    ]
    DEDUP_IDS_UAS: List[str] = [
        "userId",
        "submissionSessionId",
        "activityId",
    ]

    DEDUP_IDS_MONGODB: List[str] = []
    DEDUP_IDS_QUALTRICS: List[str] = [
        "index",
        "ResponseId",
        "M2C2_ASSESSMENT_ORDER",
        "M2C2_AUTO_ADVANCE",
        "M2C2_LANGUAGE",
    ]

    # NOTE(review): leftover commented-out code kept verbatim; it does not
    # appear to belong to the settings model - confirm and remove.
    # def unnest_trial_level_data(df: pd.DataFrame, drop_duplicates=True, column_order: List[str] = None) -> pd.DataFrame:
    # column_order = column_order or ["participant_id", "session_id", "group", "wave", "activity_id", "study_id", "document_uuid"]
    # trial_df = trial_df.drop_duplicates(subset=["activity_uuid", "session_uuid", "trial_begin_iso8601_timestamp"])

    # Same as STANDARD_GROUPING_FOR_AGGREGATION except that it uses "uuid"
    # where the current list uses "session_uuid".
    STANDARD_GROUPING_FOR_AGGREGATION_LEGACY: List[str] = [
        "study_uid",
        "user_uid",
        "uuid",
        "activity_name",
    ]

    STANDARD_GROUPING_FOR_AGGREGATION: List[str] = [
        "study_uid",
        "user_uid",
        "session_uuid",
        "activity_name",
    ]

    STANDARD_GROUPING_FOR_AGGREGATION_QUALTRICS: List[str] = ["ResponseId"]

    STANDARD_GROUPING_FOR_AGGREGATION_METRICWIRE: List[str] = [
        "userId",
        "submissionSessionId",
        "activityId",
    ]
    STANDARD_GROUPING_FOR_AGGREGATION_UAS: List[str] = [
        "userId",
        "submissionSessionId",
        "activityId",
    ]

    @property
    def QUALTRICS_TRIAL_DATA_REGEX(self):
        """Compiled pattern with two groups: the ``M2C2_ASSESSMENT_<n>`` prefix and the trial-data number."""
        return re.compile(r"(M2C2_ASSESSMENT_\d+)_TRIAL_DATA_(\d+)")

    @property
    def DEFAULT_FUNC_MAP_SCORING(self):
        """Return ``DEFAULT_FUNC_MAP_SCORING`` from the sibling ``map`` module."""
        # Imported on access rather than at module import; presumably to
        # avoid a circular import - TODO confirm.
        from .map import DEFAULT_FUNC_MAP_SCORING

        return DEFAULT_FUNC_MAP_SCORING

    # DEFAULTS FOR UI
    DEFAULT_PLOT_COLOR: str = "steelblue"
    DEFAULT_PLOT_DPI: int = 150


# Shared module-level settings instance.
settings = Settings()
81
+
82
# === Logging Configuration (opt-in) ===
# Environment values (compared case-insensitively) that switch a flag on.
_TRUTHY_ENV_VALUES = {"1", "true", "yes"}


def _env_flag(variable):
    """Return True when the environment variable holds a truthy value; unset counts as "0"."""
    raw = os.getenv(variable, "0")
    return raw.lower() in _TRUTHY_ENV_VALUES


LOGGING_ENABLED = _env_flag("M2DATAKIT_ENABLE_LOGGING")
LOGFIRE_ENABLED = _env_flag("M2DATAKIT_ENABLE_LOGFIRE")

# Log directory: M2DATAKIT_LOG_DIR when set, otherwise "logs" under the
# working directory at import time.
_fallback_log_dir = Path.cwd() / "logs"
LOG_DIR = Path(os.getenv("M2DATAKIT_LOG_DIR", str(_fallback_log_dir)))
LOG_FILE = LOG_DIR.joinpath("events.jsonl")
@@ -0,0 +1,68 @@
1
def _sql_identifier(name):
    """Quote *name* as a standard SQL identifier, doubling embedded double quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def _sql_literal(value):
    """Render one cell value as a SQL literal (NULL, number, or quoted string)."""
    import pandas as pd  # local import keeps this helper self-contained

    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return "NULL"
    # Check booleans before numbers: is_number() is also true for bools.
    if pd.api.types.is_bool(value):
        return "1" if value else "0"
    if pd.api.types.is_number(value):
        return str(value)
    # Everything else is emitted as text with single quotes doubled.
    return "'" + str(value).replace("'", "''") + "'"


def export_dataframe(df, file_name, format=".csv", table_name="my_table", **kwargs):
    """
    Exports a Pandas DataFrame to a specified file format, including raw SQL `INSERT` statements.

    Parameters:
        df (pd.DataFrame): The DataFrame to export.
        file_name (str): The file name (without extension) to export the DataFrame to.
        format (str): The file format: '.csv', '.json', '.xlsx', '.parquet', '.html',
            '.pkl', '.txt' (tab-separated) or '.sql'. Matching is case-insensitive and
            the leading dot is optional.
        table_name (str): Table name for SQL `INSERT` statements (used only when format='.sql').
            It is written as a single double-quoted identifier.
        **kwargs: Additional keyword arguments for Pandas export functions. They override
            the defaults applied here (index=False, orient="records", sep="\\t") and are
            ignored for '.sql'.

    Returns:
        str: The full file name of the exported file.

    Raises:
        RuntimeError: If the format is unsupported or the underlying export fails; the
            original exception is chained as the cause.
    """
    try:
        # Normalize so "csv", "CSV" and ".csv" all select the same exporter.
        extension = str(format).lower()
        if not extension.startswith("."):
            extension = f".{extension}"
        file_name_with_extension = f"{file_name}{extension}"

        # Merge defaults with kwargs instead of passing both, so a caller
        # supplying index= (or sep=/orient=) no longer triggers a
        # duplicate-keyword TypeError.
        without_index = {"index": False, **kwargs}

        # Export logic for supported formats
        if extension == ".csv":
            df.to_csv(file_name_with_extension, **without_index)
        elif extension == ".json":
            df.to_json(file_name_with_extension, **{"orient": "records", **kwargs})
        elif extension == ".xlsx":
            df.to_excel(file_name_with_extension, **without_index)
        elif extension == ".parquet":
            df.to_parquet(file_name_with_extension, **without_index)
        elif extension == ".html":
            df.to_html(file_name_with_extension, **without_index)
        elif extension == ".pkl":
            df.to_pickle(file_name_with_extension, **kwargs)
        elif extension == ".txt":
            df.to_csv(file_name_with_extension, **{"sep": "\t", **without_index})
        elif extension == ".sql":
            # One INSERT statement per row; identifiers and values are
            # quoted/escaped rather than interpolated raw.
            column_list = ", ".join(_sql_identifier(col) for col in df.columns)
            prefix = f"INSERT INTO {_sql_identifier(table_name)} ({column_list}) VALUES "
            with open(file_name_with_extension, "w", encoding="utf-8") as handle:
                for row in df.itertuples(index=False, name=None):
                    values = ", ".join(_sql_literal(cell) for cell in row)
                    handle.write(f"{prefix}({values});\n")
        else:
            raise ValueError(f"Unsupported file format: {format}")

        return file_name_with_extension

    except Exception as e:
        # Every failure, including the unsupported-format ValueError above,
        # is surfaced as RuntimeError.
        raise RuntimeError(f"Export failed: {e}") from e
40
+
41
+ import json
42
+ from datetime import date
43
+
44
+
45
def export_jsonld_metadata(
    df,
    filename="dataset_metadata.json",
    name="M2C2Kit Dataset",
    description="This dataset contains M2C2kit data processed via the DataKit Python package.",
    creator=None,
):
    """
    Write schema.org ``Dataset`` JSON-LD metadata describing a DataFrame's columns.

    Each column becomes a ``PropertyValue`` entry holding its name, a
    placeholder description, and its dtype rendered as a string.

    Parameters:
        df (pd.DataFrame): The DataFrame whose columns are described.
        filename (str): Path of the JSON file to write.
        name (str): Dataset name stored in the metadata.
        description (str): Dataset description stored in the metadata.
        creator (dict | None): schema.org creator object; when None the
            package's default creator entry is used.

    Returns:
        str: The path the metadata was written to.
    """
    if creator is None:
        # Built per call so the default dict is never shared between calls.
        creator = {"@type": "Person", "name": "Nelson Roque", "affiliation": "M2C2"}

    metadata = {
        "@context": "https://schema.org/",
        "@type": "Dataset",
        "name": name,
        "description": description,
        "creator": creator,
        "dateCreated": date.today().isoformat(),
        "variableMeasured": [
            {
                "@type": "PropertyValue",
                "name": col,
                "description": f"Auto-description for {col}",
                "value": str(df[col].dtype),
            }
            for col in df.columns
        ],
    }

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    # Return the path, mirroring export_dataframe, so callers can chain on it.
    return filename