graphreduce-1.7.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
graphreduce/__main__.py ADDED
@@ -0,0 +1,15 @@
+ #!/usr/bin/env python
+
+ import sys
+
+ from .cli.entry_point import entrypoint_cli
+
+
+ def main():
+     if sys.version_info[:2] == (3, 8):  # placeholder for 3.8-specific handling
+         pass
+     entrypoint_cli()
+
+
+ if __name__ == '__main__':
+     main()
graphreduce/cli/auto_fe.py ADDED
@@ -0,0 +1,85 @@
+ #!/usr/bin/env python
+ import sqlite3
+ import json
+ import os
+ import typing
+ import datetime
+
+ import typer
+ from typer import Argument, Option
+ import pandas as pd
+
+ # examples for using SQL engines and dialects
+ from graphreduce.node import SQLNode, DynamicNode
+ from graphreduce.graph_reduce import GraphReduce
+ from graphreduce.enum import SQLOpType, ComputeLayerEnum, PeriodUnit
+ from graphreduce.models import sqlop
+ from graphreduce.context import method_requires
+
+
+ auto_fe_cli = typer.Typer(name="auto_fe", help="Perform automated feature engineering", no_args_is_help=True)
+
+
+ @auto_fe_cli.command("autofefs")
+ def autofe_filesystem(
+     data_path: str = Argument(help="Path to data"),
+     fmt: str = Argument(help="File format"),
+     prefixes: str = Argument(help="json dict of filenames with prefixes (e.g., `{'test.csv': 'test'}`)"),
+     date_keys: str = Argument(help="json dict of filenames with associated date key (e.g., `{'test.csv': 'ts'}`)"),
+     relationships: str = Argument(
+         help="json list of relationships (e.g., `[{'from_node': 'fname', 'from_key': 'cust_id', 'to_node': 'tname', 'to_key': 'id'}]`)"),
+     parent_node: str = Argument(
+         help="parent/root node to which to aggregate all of the data"
+     ),
+     cut_date: str = Argument(str(datetime.date.today())),
+     compute_layer: str = Argument("pandas"),
+     hops_front: int = Option(1, '-hf', '--hops-front', help='number of front hops to perform'),
+     hops_back: int = Option(3, '-hb', '--hops-back', help='number of back hops to perform'),
+     output_path: typing.Optional[str] = Option(None, '-op', '--output-path', help='output path for the data')
+ ):
+     """
+     Main automated feature engineering function.
+     """
+     prefixes = json.loads(prefixes)
+     date_keys = json.loads(date_keys)
+     relationships = json.loads(relationships)
+
+     if isinstance(cut_date, str):
+         cut_date = datetime.datetime.strptime(cut_date, '%Y-%m-%d')
+
+     nodes = {}
+     if fmt in ['csv', 'parquet', 'delta', 'iceberg']:
+         for f in os.listdir(data_path):
+             print(f"adding file {f}")
+             nodes[f] = DynamicNode(
+                 fpath=f"{data_path}/{f}",
+                 fmt=f.split('.')[-1],
+                 prefix=prefixes.get(f),
+                 compute_layer=getattr(ComputeLayerEnum, compute_layer),
+                 date_key=date_keys.get(f, None)
+             )
+     gr = GraphReduce(
+         name='autofe',
+         parent_node=nodes[parent_node],
+         fmt=fmt,
+         cut_date=cut_date,
+         compute_layer=getattr(ComputeLayerEnum, compute_layer),
+         auto_features=True,
+         auto_feature_hops_front=hops_front,
+         auto_feature_hops_back=hops_back
+     )
+     for rel in relationships:
+         gr.add_entity_edge(
+             parent_node=nodes[rel['to_node']],
+             parent_key=rel['to_key'],
+             relation_node=nodes[rel['from_node']],
+             relation_key=rel['from_key'],
+             reduce=rel.get('reduce', True)
+         )
+     gr.do_transformations()
+     if not output_path:
+         output_path = os.path.join(
+             os.path.expanduser("~"),
+             "graphreduce_outputs/test.csv"
+         )
+     getattr(gr.parent_node.df, f"to_{fmt}")(output_path)
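Taken together, `autofefs` turns every file in a directory into a `DynamicNode`, wires up the declared relationships, runs the automated feature engineering pass, and writes the parent node's dataframe out. A minimal sketch of driving the underlying function directly from Python, assuming the module ships as `graphreduce.cli.auto_fe` and using hypothetical `orders.csv`/`customers.csv` files:

from graphreduce.cli.auto_fe import autofe_filesystem  # assumed module path

# Hypothetical two-table dataset: orders.csv joins to customers.csv on cust_id -> id.
autofe_filesystem(
    data_path="/tmp/mydata",
    fmt="csv",
    prefixes='{"orders.csv": "ord", "customers.csv": "cust"}',
    date_keys='{"orders.csv": "ts"}',
    relationships='[{"from_node": "orders.csv", "from_key": "cust_id", "to_node": "customers.csv", "to_key": "id"}]',
    parent_node="customers.csv",
    cut_date="2024-01-01",
    compute_layer="pandas",
    hops_front=1,
    hops_back=3,
    output_path="/tmp/customer_features.csv",
)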
graphreduce/cli/entry_point.py ADDED
@@ -0,0 +1,26 @@
+ #!/usr/bin/env python
+
+ import typing
+
+ import typer
+
+ from .auto_fe import auto_fe_cli
+
+
+ entrypoint_cli_typer = typer.Typer(
+     no_args_is_help=True,
+     add_completion=False,
+     rich_markup_mode="markdown",
+     help="""
+     See examples at https://github.com/wesmadrigal/graphreduce
+     """
+ )
+
+ # Automated feature engineering
+ entrypoint_cli_typer.add_typer(auto_fe_cli, rich_help_panel="autofe")
+ entrypoint_cli = typer.main.get_command(entrypoint_cli_typer)
+ entrypoint_cli.list_commands(None)  # force eager registration of subcommands
+
+
+ if __name__ == '__main__':
+     entrypoint_cli()
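Because `typer.main.get_command` returns a plain Click command, the assembled CLI can be exercised with Click's test runner. A small sketch, assuming the module ships as `graphreduce.cli.entry_point`:

from typer.testing import CliRunner

from graphreduce.cli.entry_point import entrypoint_cli  # assumed module path

runner = CliRunner()
result = runner.invoke(entrypoint_cli, ["--help"])
print(result.output)  # shows the auto_fe command group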
graphreduce/common.py ADDED
@@ -0,0 +1,103 @@
+ #!/usr/bin/env python
+
+ import pytz
+ from datetime import datetime
+ import pandas as pd
+ import dask.dataframe as dd
+ from pyspark.sql import functions as F
+ import pyspark
+ from torch_frame import stype
+
+
+ stype_map = {
+     'numerical': [
+         'min',
+         'max',
+         'median',
+         'mean',
+         'sum',
+     ],
+     'categorical': [
+         'nunique',
+         'count',
+         'mode',
+     ],
+     'text_embedded': [
+         'length'
+     ],
+     'text_tokenized': [
+         'length'
+     ],
+     'multicategorical': [
+         'length'
+     ],
+     'sequence_numerical': [
+         'sum',
+         'min',
+         'max',
+         'median',
+     ],
+     'timestamp': [
+         'min',
+         'max',
+         'delta'
+     ],
+     'image_embedded': [],
+     'embedding': []
+ }
+
+
+ def clean_datetime_pandas(df: pd.DataFrame, col: str) -> pd.DataFrame:
+     df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)
+
+     # Count the number of rows before removing invalid dates.
+     total_before = len(df)
+
+     # Remove rows where the timestamp is NaT (indicating parsing failure).
+     df = df.dropna(subset=[col])
+
+     # Count the number of rows after removing invalid dates.
+     total_after = len(df)
+
+     # Calculate the percentage of rows removed.
+     percentage_removed = ((total_before - total_after) / total_before) * 100
+
+     # Print the percentage of rows removed.
+     print(
+         f"Percentage of rows removed due to invalid dates: "
+         f"{percentage_removed:.2f}%"
+     )
+     return df
+
+
+ def clean_datetime_dask(df: dd.DataFrame, col: str) -> dd.DataFrame:
+     df[col] = dd.to_datetime(df[col], errors="coerce", utc=True)
+     total_before = len(df)
+     df = df.dropna(subset=[col])
+     total_after = len(df)
+     percentage_removed = ((total_before - total_after) / total_before) * 100
+     print(f"Percentage of rows removed due to invalid dates: {percentage_removed:.2f}%")
+     return df
+
+
+ def clean_datetime_spark(df, col: str) -> pyspark.sql.DataFrame:
+     # Mirrors the pandas version: coerce to timestamp (unparseable
+     # values become null), then drop the rows that failed to parse.
+     df = df.withColumn(col, F.to_timestamp(F.col(col)))
+     return df.dropna(subset=[col])
+
+
+ def convert_to_utc(dt):
+     """Converts a datetime object to UTC.
+
+     Args:
+         dt: The datetime object to convert.
+
+     Returns:
+         The datetime object converted to UTC.
+     """
+     if dt.tzinfo is None:  # Naive datetime
+         # Assuming the original timezone is the local system time.
+         local_tz = pytz.timezone('US/Pacific')  # Replace with the actual timezone if known
+         dt = local_tz.localize(dt)
+     return dt.astimezone(pytz.UTC)
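For illustration, a short sketch of how these helpers compose, using only the names defined in graphreduce/common.py above and a small made-up dataframe:

import datetime
import pandas as pd

from graphreduce.common import stype_map, clean_datetime_pandas, convert_to_utc

# Aggregation functions suggested for a numerical column.
aggs = stype_map['numerical']  # ['min', 'max', 'median', 'mean', 'sum']

# Drop rows whose timestamps fail to parse, then aggregate what's left.
df = pd.DataFrame({'ts': ['2024-01-01', 'not-a-date'], 'amount': [1.0, 2.0]})
df = clean_datetime_pandas(df, 'ts')  # prints the share of rows removed
print(df['amount'].agg(aggs))

# Naive datetimes are assumed to be US/Pacific and converted to UTC.
print(convert_to_utc(datetime.datetime(2024, 1, 1, 12, 0)))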
graphreduce/context.py ADDED
@@ -0,0 +1,73 @@
+ #!/usr/bin/env python
+ """A module for helping with context
+ in GraphReduce compute graphs.
+ """
+
+ # standard library
+ import typing
+
+ # third party
+ import pandas as pd
+ import dask.dataframe as dd
+ import pyspark.sql
+ from structlog import get_logger
+
+ # internal
+ from graphreduce.node import GraphReduceNode
+
+
+ logger = get_logger('graphreduce.context')
+
+
+ def method_requires(
+     nodes: typing.List[GraphReduceNode] = [],
+     checkpoint: bool = False,
+ ) -> callable:
+     """
+     A decorator for ensuring the function
+     only runs when the calling node has
+     a merged edge to each of the required `nodes`.
+
+     Arguments
+         nodes: list of GraphReduceNode classes to require
+             for function execution
+
+         checkpoint: boolean of whether or not to checkpoint
+
+     Usage:
+         @method_requires(nodes=[CustomerNode, OrderNode])
+         def do_post_join_annotate(self):
+             self.df = self.df.withColumn(...)
+     """
+     def wrapit(func, nodes=nodes):
+         def newfunc(inst, *args, **kwargs):
+             for x in nodes:
+                 if x not in inst._merged:
+                     return None
+             res = func(inst, *args, **kwargs)
+             if hasattr(inst, '_storage_client') and checkpoint:
+                 if res is not None:
+                     if res.__class__ in [pd.DataFrame, dd.DataFrame, pyspark.sql.dataframe.DataFrame]:
+                         df = res
+                     else:
+                         df = inst.df
+
+                     fname = func.__name__
+                     if hasattr(inst, '_checkpoints') and fname in inst._checkpoints:
+                         return res
+                     else:
+                         name = f"{inst.__class__.__name__}_{fname}.{inst.fmt}"
+                         # checkpoint
+                         inst._storage_client.offload(
+                             df,
+                             name
+                         )
+                         path = inst._storage_client.get_path(name)
+                         # reload
+                         inst.df = inst._storage_client.load(path)
+
+                         # add function to the list of checkpoints.
+                         inst._checkpoints.append(fname)
+             return res
+         return newfunc
+     return wrapit
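A usage sketch of the decorator, with hypothetical OrderNode and CustomerNode classes standing in for real GraphReduceNode subclasses:

from graphreduce.node import GraphReduceNode
from graphreduce.context import method_requires


class OrderNode(GraphReduceNode):  # hypothetical node class
    pass


class CustomerNode(GraphReduceNode):  # hypothetical node class
    @method_requires(nodes=[OrderNode], checkpoint=True)
    def do_post_join_annotate(self):
        # Runs only once OrderNode appears in self._merged; with
        # checkpoint=True the result is offloaded and reloaded
        # through the node's storage client.
        self.df['ord_has_id'] = self.df['ord_id'].notnull()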
graphreduce/enum.py ADDED
@@ -0,0 +1,67 @@
+ #!/usr/bin/env python
+
+ import enum
+
+ class PeriodUnit(enum.Enum):
+     second = 'second'
+     minute = 'minute'
+     hour = 'hour'
+     day = 'day'
+     week = 'week'
+     month = 'month'
+     year = 'year'
+
+ class ComputeLayerEnum(enum.Enum):
+     # File-based compute layers.
+     pandas = 'pandas'
+     polars = 'polars'
+     dask = 'dask'
+     spark = 'spark'
+     ray = 'ray'
+     # SQL dialects.
+     athena = 'athena'
+     snowflake = 'snowflake'
+     redshift = 'redshift'
+     postgres = 'postgres'
+     mysql = 'mysql'
+     sqlite = 'sqlite'
+     databricks = 'databricks'
+     daft = 'daft'
+
+
+ class StorageFormatEnum(enum.Enum):
+     csv = 'csv'
+     parquet = 'parquet'
+     tsv = 'tsv'
+     delta = 'delta'
+
+ class ProviderEnum(enum.Enum):
+     local = 'local'
+     s3 = 's3'
+     blob = 'blob'
+     gcs = 'gcs'
+     hdfs = 'hdfs'
+
+
+ class SQLOpType(enum.Enum):
+     # All things related to selecting
+     # data get housed under this op type.
+     # Even case statements, if in the select
+     # part of the query, should go here.
+     select = 'select'
+     from_ = 'from'
+     # All aspects of where clauses use this.
+     where = 'where'
+     # The anatomy of a given aggregation
+     # typically consists of a select
+     # AND a group by statement.
+     # This op should be used only for the
+     # actual grouping portion; a separate
+     # `select` op type should be used
+     # for the columns and aggregation functions.
+     agg = 'group by'
+     aggfunc = 'aggfunc'
+     order = 'order by'
+     having = 'having'
+     window = 'window'
+     custom = 'custom'
+ custom = 'custom'