graphreduce 1.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphreduce/__init__.py +0 -0
- graphreduce/__main__.py +15 -0
- graphreduce/cli/__init__.py +0 -0
- graphreduce/cli/auto_fe.py +85 -0
- graphreduce/cli/entry_point.py +26 -0
- graphreduce/common.py +100 -0
- graphreduce/context.py +73 -0
- graphreduce/enum.py +67 -0
- graphreduce/graph_reduce.py +907 -0
- graphreduce/metadata.py +28 -0
- graphreduce/models/__init__.py +29 -0
- graphreduce/node.py +1710 -0
- graphreduce/storage.py +103 -0
- graphreduce-1.7.3.dist-info/METADATA +309 -0
- graphreduce-1.7.3.dist-info/RECORD +20 -0
- graphreduce-1.7.3.dist-info/WHEEL +5 -0
- graphreduce-1.7.3.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/test_datasets.py +125 -0
- tests/test_graph_reduce.py +547 -0
graphreduce/__init__.py
ADDED
File without changes
graphreduce/__main__.py
ADDED
File without changes

graphreduce/cli/auto_fe.py
ADDED
@@ -0,0 +1,85 @@
#!/usr/bin/env python
import sqlite3
import json
import os
import typing
import datetime

import typer
from typer import Argument, Option
import pandas as pd

# examples for using SQL engines and dialects
from graphreduce.node import SQLNode, DynamicNode
from graphreduce.graph_reduce import GraphReduce
from graphreduce.enum import SQLOpType, ComputeLayerEnum, PeriodUnit
from graphreduce.models import sqlop
from graphreduce.context import method_requires


auto_fe_cli = typer.Typer(name="auto_fe", help="Perform automated feature engineering", no_args_is_help=True)


@auto_fe_cli.command("autofefs")
def autofe_filesystem(
    data_path: str = Argument(help="Path to data"),
    fmt: str = Argument(help="File format"),
    prefixes: str = Argument(help="json dict of filenames with prefixes (e.g., `{'test.csv':'test'}`)"),
    date_keys: str = Argument(help="json dict of filenames with associated date key (e.g., `{'test.csv': 'ts'}`)"),
    relationships: str = Argument(
        help="json of relationships (e.g., `[{'from_node':'fname', 'from_key':'cust_id', 'to_node':'tname', 'to_key'}]`)"),
    parent_node: str = Argument(
        help="parent/root node to which to aggregate all of the data"
    ),
    cut_date: str = Argument(str(datetime.datetime.today())),
    compute_layer: str = Argument("pandas"),
    hops_front: int = Option(1, '-hf', '--hops-front', help='number of front hops to perform'),
    hops_back: int = Option(3, '-hb', '--hops-back', help='number of back hops to perform'),
    output_path: str = Option('-op', '--output-path', help='output path for the data')
):
    """
    Main automated feature engineering function.
    """
    prefixes = json.loads(prefixes)
    date_keys = json.loads(date_keys)
    relationships = json.loads(relationships)

    if isinstance(cut_date, str):
        cut_date = datetime.datetime.strptime(cut_date, '%Y-%m-%d')

    nodes = {}
    if fmt in ['csv', 'parquet', 'delta', 'iceberg']:
        for f in os.listdir(data_path):
            print(f"adding file {f}")
            nodes[f] = DynamicNode(
                fpath=f"{data_path}/{f}",
                fmt=f.split('.')[1],
                prefix=prefixes.get(f),
                compute_layer=getattr(ComputeLayerEnum, compute_layer),
                date_key=date_keys.get(f, None)
            )
    gr = GraphReduce(
        name='autofe',
        parent_node=nodes[parent_node],
        fmt=fmt,
        cut_date=cut_date,
        compute_layer=getattr(ComputeLayerEnum, compute_layer),
        auto_features=True,
        auto_feature_hops_front=hops_front,
        auto_feature_hops_back=hops_back
    )
    for rel in relationships:
        gr.add_entity_edge(
            parent_node=nodes[rel['to_node']],
            parent_key=rel['to_key'],
            relation_node=nodes[rel['from_node']],
            relation_key=rel['from_key'],
            reduce=rel.get('reduce', True)
        )
    gr.do_transformations()
    if not output_path:
        output_path = os.path.join(
            os.path.expanduser("~"),
            "graphreduce_outputs/test.csv"
        )
    getattr(gr.parent_node.df, f"to_{fmt}")(output_path)
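
For illustration only, a minimal sketch of how the JSON-string arguments described in the help text above might be built and passed to autofe_filesystem directly from Python. The file names, keys, and paths (cust.csv, orders.csv, /tmp/dat, etc.) are hypothetical and not part of the package:

import json
from graphreduce.cli.auto_fe import autofe_filesystem

# Hypothetical two-file dataset: cust.csv (parent) and orders.csv (child).
prefixes = json.dumps({'cust.csv': 'cust', 'orders.csv': 'ord'})
date_keys = json.dumps({'orders.csv': 'ts'})
# 'to_node' is the parent side of the edge, 'from_node' the child side,
# matching how add_entity_edge is called above.
relationships = json.dumps([
    {'from_node': 'orders.csv', 'from_key': 'customer_id',
     'to_node': 'cust.csv', 'to_key': 'id'}
])

# Calling the Typer command function directly (bypassing the CLI parser),
# so every parameter is passed explicitly.
autofe_filesystem(
    data_path='/tmp/dat',
    fmt='csv',
    prefixes=prefixes,
    date_keys=date_keys,
    relationships=relationships,
    parent_node='cust.csv',
    cut_date='2023-05-01',
    compute_layer='pandas',
    hops_front=1,
    hops_back=3,
    output_path='/tmp/dat_out.csv',
)
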
graphreduce/cli/entry_point.py
ADDED
@@ -0,0 +1,26 @@
#!/usr/bin/env python

import typing

import typer

from .auto_fe import auto_fe_cli


entrypoint_cli_typer = typer.Typer(
    no_args_is_help=True,
    add_completion=False,
    rich_markup_mode="markdown",
    help="""
    See examples at https://github.com/wesmadrigal/graphreduce
    """
)

# Automated feature engineering
entrypoint_cli_typer.add_typer(auto_fe_cli, rich_help_panel="autofe")
entrypoint_cli = typer.main.get_command(entrypoint_cli_typer)
entrypoint_cli.list_commands(None)


if __name__ == '__main__':
    entrypoint_cli()
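
As a usage sketch, the assembled Typer app can be exercised in-process with typer.testing.CliRunner; the sub-app name "auto_fe" and command "autofefs" come from the modules above:

from typer.testing import CliRunner
from graphreduce.cli.entry_point import entrypoint_cli_typer

runner = CliRunner()
# Show the help for the autofefs command registered under the auto_fe sub-app.
result = runner.invoke(entrypoint_cli_typer, ["auto_fe", "autofefs", "--help"])
print(result.output)
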
graphreduce/common.py
ADDED
@@ -0,0 +1,100 @@
#!/usr/bin/env python

import pytz
from datetime import datetime
import pandas as pd
import dask.dataframe as dd
from pyspark.sql import functions as F
import pyspark
from torch_frame import stype


stype_map = {
    'numerical': [
        'min',
        'max',
        'median',
        'mean',
        'sum',
    ],
    'categorical': [
        'nunique',
        'count',
        'mode',
    ],
    'text_embedded': [
        'length'
    ],
    'text_tokenized': [
        'length'
    ],
    'multicategorical': [
        'length'
    ],
    'sequence_numerical': [
        'sum',
        'min',
        'max',
        'median',
    ],
    'timestamp': [
        'min',
        'max',
        'delta'
    ],
    'image_embedded': [],
    'embedding': []
}


def clean_datetime_pandas(df: pd.DataFrame, col: str) -> pd.DataFrame:
    df[col] = pd.to_datetime(df[col], errors="coerce", utc=True)

    # Count the number of rows before removing invalid dates
    total_before = len(df)

    # Remove rows where timestamp is NaT (indicating parsing failure)
    df = df.dropna(subset=[col])

    # Count the number of rows after removing invalid dates
    total_after = len(df)

    # Calculate the percentage of rows removed
    percentage_removed = ((total_before - total_after) / total_before) * 100

    # Print the percentage of rows removed
    print(
        f"Percentage of rows removed due to invalid dates: "
        f"{percentage_removed:.2f}%"
    )
    return df


def clean_datetime_dask(df: dd.DataFrame, col: str) -> dd.DataFrame:
    df[col] = dd.to_datetime(df[col])
    total_before = len(df)
    df = df.dropna(subset=[col])
    total_after = len(df)
    percentage_removed = ((total_before - total_after) / total_before) * 100
    return df


def clean_datetime_spark(df, col: str) -> pyspark.sql.DataFrame:
    pass


def convert_to_utc(dt):
    """Converts a datetime object to UTC.

    Args:
        dt: The datetime object to convert.

    Returns:
        The datetime object converted to UTC.
    """
    if dt.tzinfo is None:  # Naive datetime
        # Assuming the original timezone is the local system time
        local_tz = pytz.timezone('US/Pacific')  # Replace with the actual timezone if known
        dt = local_tz.localize(dt)
    return dt.astimezone(pytz.UTC)
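
A brief usage sketch for the helpers above, assuming the module's heavier imports (dask, pyspark, torch_frame) are installed; the sample timestamps are made up:

import datetime
import pandas as pd
from graphreduce.common import clean_datetime_pandas, convert_to_utc

# One valid and one unparseable timestamp; the bad row is dropped and the
# percentage removed is printed by clean_datetime_pandas.
df = pd.DataFrame({'id': [1, 2], 'ts': ['2023-05-01 10:00:00', 'not-a-date']})
df = clean_datetime_pandas(df, 'ts')

# Naive datetimes are localized (US/Pacific is assumed by the helper) and
# converted to UTC.
print(convert_to_utc(datetime.datetime(2023, 5, 1, 10, 0, 0)))
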
graphreduce/context.py
ADDED
@@ -0,0 +1,73 @@
#!/usr/bin/env python
"""A module for helping with context
in GraphReduce compute graphs.
"""

# standard library
import typing

# third party
import pandas as pd
import dask.dataframe as dd
import pyspark
from structlog import get_logger

# internal
from graphreduce.node import GraphReduceNode


logger = get_logger('graphreduce.context')


def method_requires(
    nodes: typing.List[GraphReduceNode] = [],
    checkpoint: bool = False,
) -> callable:
    """
    A decorator for ensuring the function
    only runs when the calling Node has
    a merged edge to the required `nodes` list.

    Arguments
        nodes: list of GraphReduceNode classes to require
        for function execution

        checkpoint: boolean of whether or not to checkpoint

    Usage:
        @method_requires(nodes=[CustomerNode, OrderNode])
        def do_post_join_annotate(self):
            self.df = self.df.withColumn(...)
    """
    def wrapit(func, nodes=nodes):
        def newfunc(inst, *args, **kwargs):
            for x in nodes:
                if x not in inst._merged:
                    return None
            res = func(inst, *args, **kwargs)
            if hasattr(inst, '_storage_client') and checkpoint:
                if not isinstance(res, None.__class__):
                    if res.__class__ in [pd.DataFrame, dd.DataFrame, pyspark.sql.dataframe.DataFrame]:
                        df = res
                    else:
                        df = inst.df

                    fname = func.__name__
                    if hasattr(inst, '_checkpoints') and fname in inst._checkpoints:
                        return res
                    else:
                        name = f"{inst.__class__.__name__}_{fname}.{inst.fmt}"
                        # checkpoint
                        inst._storage_client.offload(
                            df,
                            name
                        )
                        path = inst._storage_client.get_path(name)
                        # reload
                        inst.df = inst._storage_client.load(path)

                        # add function to list of checkpoints.
                        inst._checkpoints.append(fname)
            return res
        return newfunc
    return wrapit
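
A minimal sketch of the decorator's gating behavior, using stand-in classes rather than real GraphReduceNode subclasses (CustomerNode, OrderNode, and the _merged list here are illustrative only; importing graphreduce.context also requires the package's dependencies):

from graphreduce.context import method_requires

class OrderNode:
    # stand-in for a node class that may or may not have been merged
    pass

class CustomerNode:
    def __init__(self, merged):
        # node classes already merged into this node
        self._merged = merged

    @method_requires(nodes=[OrderNode])
    def do_post_join_annotate(self):
        return "annotated"

# Runs only when OrderNode has been merged; otherwise the wrapped method
# returns None without executing.
print(CustomerNode(merged=[OrderNode]).do_post_join_annotate())  # "annotated"
print(CustomerNode(merged=[]).do_post_join_annotate())           # None
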
graphreduce/enum.py
ADDED
@@ -0,0 +1,67 @@
#!/usr/bin/env

import enum

class PeriodUnit(enum.Enum):
    second = 'second'
    minute = 'minute'
    hour = 'hour'
    day = 'day'
    week = 'week'
    month = 'month'
    year = 'year'

class ComputeLayerEnum(enum.Enum):
    # File-based compute layers.
    pandas = 'pandas'
    polars = 'polars'
    dask = 'dask'
    spark = 'spark'
    ray = 'ray'
    # SQL dialects.
    athena = 'athena'
    snowflake = 'snowflake'
    redshift = 'redshift'
    postgres = 'postgres'
    mysql = 'mysql'
    sqlite = 'sqlite'
    databricks = 'databricks'
    daft = 'daft'


class StorageFormatEnum(enum.Enum):
    csv = 'csv'
    parquet = 'parquet'
    tsv = 'tsv'
    delta = 'delta'

class ProviderEnum(enum.Enum):
    local = 'local'
    s3 = 's3'
    blog = 'blob'
    gcs = 'gcs'
    hdfs = 'hdfs'


class SQLOpType(enum.Enum):
    # All things related to selecting
    # data get housed under this op type.
    # Even case statements, if in the select
    # part of the query, should go here.
    select = 'select'
    from_ = 'from'
    # All aspects of where clauses use this.
    where = 'where'
    # The anatomy of a given aggregation
    # typically consists of a select
    # AND a group by statement.
    # This op should be used only for the
    # actual grouping portion and a separate
    # `select` op type should be used
    # for the columns and aggregation functions.
    agg = 'group by'
    aggfunc = 'aggfunc'
    order = 'order by'
    having = 'having'
    window = 'window'
    custom = 'custom'