mobilizr-python 0.94__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ include mobilizr-python/data/*.csv
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: mobilizr-python
3
+ Version: 0.94
4
+ Summary: A library of custom datasets.
5
+ Home-page: https://github.com/emilio-dulay/my_datasets_lib
6
+ Author: Emilio Dulay
7
+ Author-email: emiliodulay19@g.ucla.edu
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Requires-Dist: pandas
11
+ Dynamic: author
12
+ Dynamic: author-email
13
+ Dynamic: classifier
14
+ Dynamic: home-page
15
+ Dynamic: requires-dist
16
+ Dynamic: summary
@@ -0,0 +1,30 @@
1
+ # UCLA IDS Data Library
2
+
3
+ A lightweight Python library for loading example datasets used in data science courses.
4
+
5
+ ---
6
+
7
+ ## Installation
8
+
9
+ Clone the repository and install locally:
10
+ git clone https://github.com/EmilioD19/UCLA-IDS-Data-Library.git
11
+ cd UCLA-IDS-Data-Library
12
+ pip install .
13
+
14
+ ## Usage
15
+ Import functions:
16
+ from my_datasets_lib import data, describe, list_datasets
17
+
18
+ ## Load a dataset
19
+ df = data("cdc")
20
+ print(df.head())
21
+
22
+ ## Describe a dataset
23
+ print(describe("cdc"))
24
+
25
+ ## List available datasets
26
+ print(list_datasets())
27
+
28
+ ## Error Handling
29
+ If a dataset does not exist, ValueError is raised:
30
+ data("nonexistent")
@@ -0,0 +1,45 @@
1
+ from . import datasets
2
+ import pandas as pd
3
+
4
def data(name):
    """Load a dataset by name.

    Args:
        name: Name of a dataset submodule exposed by ``datasets``
            (for example ``"cdc"``).

    Returns:
        Whatever the dataset module's ``load()`` returns.

    Raises:
        ValueError: If ``name`` is not a string, starts with an underscore,
            is not an attribute of ``datasets``, or the attribute has no
            ``load``.
    """
    # Private/dunder attributes such as "__all__" exist on the module but are
    # not datasets; without this guard they pass a plain hasattr() check and
    # then fail with a confusing AttributeError on ``.load``.
    is_public = isinstance(name, str) and not name.startswith("_")
    module = getattr(datasets, name, None) if is_public else None
    if module is None or not hasattr(module, "load"):
        raise ValueError(f"Dataset '{name}' not found in my_datasets_lib.datasets")
    return module.load()
10
+
11
+
12
def describe(name):
    """Get the description for a dataset by name.

    Args:
        name: Name of a dataset submodule exposed by ``datasets``.

    Returns:
        Whatever the dataset module's ``describe()`` returns.

    Raises:
        ValueError: If ``name`` is not a string, starts with an underscore,
            is not an attribute of ``datasets``, or the dataset module does
            not define ``describe``.
    """
    # Skip private/dunder attributes (e.g. "__all__"): they are not datasets.
    is_public = isinstance(name, str) and not name.startswith("_")
    module = getattr(datasets, name, None) if is_public else None
    if module is None:
        raise ValueError(f"Dataset '{name}' not found in my_datasets_lib.datasets")

    # Not every dataset module ships a describe(); report that explicitly
    # instead of leaking an AttributeError from the attribute lookup.
    describer = getattr(module, "describe", None)
    if describer is None:
        raise ValueError(f"Dataset '{name}' has no description available.")
    return describer()
18
+
19
def list_datasets():
    """Return the names in ``dir(datasets)`` that do not start with ``_``."""
    available = []
    for attr in dir(datasets):
        if attr.startswith("_"):
            continue
        available.append(attr)
    return available
22
+
23
def View(name):
    """Return the named dataset, requiring it to be a pandas DataFrame.

    Args:
        name: Attribute name looked up on ``datasets``.

    Returns:
        The object produced by the dataset module's ``load()``.

    Raises:
        ValueError: If ``datasets`` has no attribute called ``name``.
        TypeError: If the loaded object is not a ``pandas.DataFrame``.
    """
    if not hasattr(datasets, name):
        raise ValueError(f"Dataset '{name}' not found in my_datasets_lib.datasets")

    loaded = getattr(datasets, name).load()
    if not isinstance(loaded, pd.DataFrame):
        raise TypeError(f"Dataset '{name}' is not a pandas DataFrame.")
    return loaded
33
+
34
def timeuse_format(df):
    """Summarise time-use rows into one row of averages per ``user.id``.

    Args:
        df: DataFrame with a ``user.id`` column. Every column other than
            ``user.id``, ``timestamp`` and ``activities`` is converted with
            ``pd.to_numeric`` after ``"NOT_DISPLAYED"`` is replaced by 0.

    Returns:
        The per-user means of the numeric columns merged with a
        ``submissions`` column holding the number of rows per user.
    """
    excluded = ['user.id', 'timestamp', 'activities']

    # replace() returns a new frame, so the conversion below works on that
    # frame rather than on the one passed in.
    cleaned = df.replace("NOT_DISPLAYED", 0)
    value_columns = cleaned.columns.difference(excluded)
    cleaned[value_columns] = cleaned[value_columns].apply(pd.to_numeric)

    per_user = cleaned.groupby('user.id')
    counts = per_user.size().rename("submissions")
    means = per_user.mean(numeric_only=True)

    return means.merge(counts, on="user.id")
44
+
45
+
@@ -0,0 +1,7 @@
1
+ '''Makes datasets behave like bundles of submodules.'''
2
+
3
+ from . import cdc, food_ids, atu_dirty, atu_clean, timeuse_ids, timeuse_ids_clean
4
+
5
# Dataset submodules exported by this package.
__all__ = [
    "cdc",
    "food_ids",
    "atu_dirty",
    "atu_clean",
    "timeuse_ids",
    "timeuse_ids_clean",
]
7
+
@@ -0,0 +1,41 @@
1
+ import os
2
+ import pandas as pd
3
+
4
def load():
    """Load the atu_clean dataset.

    Reads ``data/atu_clean.csv``, resolved relative to the parent of this
    module's directory, and returns the result of ``pd.read_csv``.
    """
    package_root = os.path.join(os.path.dirname(__file__), '..')
    csv_file = os.path.join(package_root, 'data', 'atu_clean.csv')
    return pd.read_csv(csv_file)
9
+
10
def describe():
    """Return a description of the atu_clean dataset.

    Returns:
        str: Plain-text documentation for the dataset, with the leading and
        trailing whitespace of the literal stripped.
    """
    # The usage line shows the Python call (quoted dataset name); the
    # unquoted form would raise NameError.
    return '''
American Time Use Survey Data Sample - Clean
Description
A dataset containing a subset of variables from the American Time Use Survey. This dataset is a cleaned version of atu_dirty.

Usage
data("atu_clean")
Format
A data frame with 10,493 observations of 8 variables

Details
caseid. unique identifier of individual survey participant

age. the age of the respondent

sex. the sex of the respondent

fulltime_emp. the employment status of the respondent

phys_challenge. does the respondent have a physical difficulty

sleep. the length of time the person sleeps, in minutes

homework. How long the respondent spent on homework assignments, in minutes

socializing. the number of minutes the respondent spent socializing

Source
http://www.bls.gov/tus/
'''.strip()
@@ -0,0 +1,43 @@
1
+ import os
2
+ import pandas as pd
3
+
4
def load():
    """Load the atu_dirty dataset.

    Reads ``data/atu_dirty.csv``, resolved relative to the parent of this
    module's directory, and returns the result of ``pd.read_csv``.
    """
    package_root = os.path.join(os.path.dirname(__file__), '..')
    csv_file = os.path.join(package_root, 'data', 'atu_dirty.csv')
    return pd.read_csv(csv_file)
9
+
10
def describe():
    """Return a description of the atu_dirty dataset.

    Returns:
        str: Plain-text documentation for the dataset, with the leading and
        trailing whitespace of the literal stripped.
    """
    # The literal must start on the same line as ``return``: a bare
    # ``return`` followed by the string on the next line returns None and
    # leaves the string as dead code.
    return '''
American Time Use Survey Data Sample - Dirty
Description
A dataset containing a subset of variables from the American Time Use Survey. This dataset is "dirty", meaning it has elements which require formatting before use.

Usage
data("atu_dirty")
Format
A data frame with 10,493 observations of 8 variables

Details
caseid. unique identifier of individual survey participant

V1. the age of the respondent

V2. the gender of the respondent (1: Male, 2: Female)

V3. the employment status of the respondent

V4. does the respondent have a physical difficulty (1: Person did not report having a physical difficulty, 2: Person surveyed reported the have a physical difficulty)

V5. the length of time the person sleeps, in minutes

V6. How long the respondent spent on homework assignments, in minutes

V7. the number of minutes the respondent spent socializing

Source
http://www.bls.gov/tus/
'''.strip()
43
+
@@ -0,0 +1,61 @@
1
+ import os
2
+ import pandas as pd
3
+
4
def load():
    """Load the cdc dataset.

    Reads ``data/cdc.csv``, resolved relative to the parent of this
    module's directory, and returns the result of ``pd.read_csv``.
    """
    package_root = os.path.join(os.path.dirname(__file__), '..')
    csv_file = os.path.join(package_root, 'data', 'cdc.csv')
    return pd.read_csv(csv_file)
9
+
10
def describe():
    """Return a description of the CDC dataset.

    Returns:
        str: Markdown-formatted documentation for the dataset, with the
        leading and trailing whitespace of the literal stripped.
    """
    # The usage line shows the Python call (quoted dataset name); the
    # unquoted form would raise NameError.
    return """
CDC Youth Risk Behavior Survey Data

**Description**
A dataset containing responses from the 2021 CDC Youth Risk Behavior Survey.

**Usage**
`data("cdc")`

**Format**
A data frame with 17,232 observations of 32 variables.

**Details**
- **age** — age in years
- **sex** — sex assigned at birth
- **grade** — grade in school
- **height** — height of student in meters
- **weight** — weight of student in kilograms
- **seat_belt** — how often student wore a seatbelt in a motor vehicle driven by someone else
- **drive_text** — how often the student reported texting while driving in the past 30 days
- **hisp_latino** — whether or not student identifies as Hispanic or Latino
- **american_indian_or_alaska_native** — whether or not student identifies as American Indian or Alaska Native
- **asian** — whether or not student identifies as Asian
- **black_or_african_american** — whether or not student identifies as Black or African American
- **native_hawaiian_or_other_pacific_islander** — whether or not student identifies as Native Hawaiian or other Pacific Islander
- **white** — whether or not student identifies as White
- **bully_school** — did the student report being bullied at school
- **bully_electronic** — did the student report being bullied online
- **depressed** — student reported feeling depressed for 2 weeks in a row or more during the past 12 months
- **days_smoking** — number of days student reported smoking cigarettes during past 30 days
- **days_vaping** — number of days student reported vaping/smoking electronic cigarettes during past 30 days
- **sexuality** — how the student describes their sexual orientation
- **describe_weight** — student perception of their weight relative to what they believe it should be
- **drink_juice** — how often student consumed fruit juice over the previous 7 days
- **eat_fruit** — how often student ate fruit over the previous 7 days
- **eat_salad** — how often student ate salad over the previous 7 days
- **drink_soda** — how often student consumed soda over the previous 7 days
- **drink_milk** — how often student drank milk over the previous 7 days
- **eat_breakfast** — how often the student reported eating breakfast over the past 7 days
- **days_exercise_60** — how often student was active for at least 60 mins over the previous 7 days
- **screen_time** — average number of hours spent on a screen on a school day
- **number_teams** — number of sports teams played on during previous 12 months
- **hours_sleep** — reported hours of sleep on school nights
- **drink_sportdrink** — how often student consumed sports drinks over the past 7 days
- **drink_water** — how often student consumed water over the past 7 days

**Source**
http://www.cdc.gov/HealthyYouth/yrbs/index.htm
""".strip()
61
+
@@ -0,0 +1,8 @@
1
+ import os
2
+ import pandas as pd
3
+
4
def load():
    """Load the food dataset.

    Reads ``data/food_ids.csv``, resolved relative to the parent of this
    module's directory, and returns the result of ``pd.read_csv``.
    """
    package_root = os.path.join(os.path.dirname(__file__), '..')
    csv_file = os.path.join(package_root, 'data', 'food_ids.csv')
    return pd.read_csv(csv_file)
@@ -0,0 +1,8 @@
1
+ import os
2
+ import pandas as pd
3
+
4
def load():
    """Load the timeuse_ids dataset.

    Reads ``data/timeuse_ids.csv``, resolved relative to the parent of this
    module's directory, and returns the result of ``pd.read_csv``.
    """
    package_root = os.path.join(os.path.dirname(__file__), '..')
    csv_file = os.path.join(package_root, 'data', 'timeuse_ids.csv')
    return pd.read_csv(csv_file)
@@ -0,0 +1,8 @@
1
+ import os
2
+ import pandas as pd
3
+
4
def load():
    """Load the timeuse_ids_clean dataset.

    Reads ``data/timeuse_ids_clean.csv``, resolved relative to the parent of
    this module's directory, and returns the result of ``pd.read_csv``.
    """
    package_root = os.path.join(os.path.dirname(__file__), '..')
    csv_file = os.path.join(package_root, 'data', 'timeuse_ids_clean.csv')
    return pd.read_csv(csv_file)
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: mobilizr-python
3
+ Version: 0.94
4
+ Summary: A library of custom datasets.
5
+ Home-page: https://github.com/emilio-dulay/my_datasets_lib
6
+ Author: Emilio Dulay
7
+ Author-email: emiliodulay19@g.ucla.edu
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Requires-Dist: pandas
11
+ Dynamic: author
12
+ Dynamic: author-email
13
+ Dynamic: classifier
14
+ Dynamic: home-page
15
+ Dynamic: requires-dist
16
+ Dynamic: summary
@@ -0,0 +1,16 @@
1
+ MANIFEST.in
2
+ README.md
3
+ setup.py
4
+ mobilizr-python/__init__.py
5
+ mobilizr-python/datasets/__init__.py
6
+ mobilizr-python/datasets/atu_clean.py
7
+ mobilizr-python/datasets/atu_dirty.py
8
+ mobilizr-python/datasets/cdc.py
9
+ mobilizr-python/datasets/food_ids.py
10
+ mobilizr-python/datasets/timeuse_ids.py
11
+ mobilizr-python/datasets/timeuse_ids_clean.py
12
+ mobilizr_python.egg-info/PKG-INFO
13
+ mobilizr_python.egg-info/SOURCES.txt
14
+ mobilizr_python.egg-info/dependency_links.txt
15
+ mobilizr_python.egg-info/requires.txt
16
+ mobilizr_python.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ mobilizr-python
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,19 @@
1
+ from setuptools import setup, find_packages
2
+
3
setup(
    name='mobilizr-python',
    version='0.94',
    packages=find_packages(),
    include_package_data=True,
    # Declare the CSV files explicitly so they are bundled with the package
    # even when MANIFEST.in does not match the package directory. Patterns
    # are resolved relative to each discovered package.
    package_data={
        '': ['data/*.csv'],
    },
    install_requires=[
        'pandas',
    ],
    description='A library of custom datasets.',
    author='Emilio Dulay',
    author_email='emiliodulay19@g.ucla.edu',
    url="https://github.com/emilio-dulay/my_datasets_lib",
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
    ],
)