kumoai 2.7.0.dev202508201830__cp312-cp312-win_amd64.whl → 2.12.0.dev202511111731__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kumoai/__init__.py +4 -2
- kumoai/_version.py +1 -1
- kumoai/client/client.py +10 -5
- kumoai/client/endpoints.py +1 -0
- kumoai/client/rfm.py +37 -8
- kumoai/connector/file_upload_connector.py +94 -85
- kumoai/connector/snowflake_connector.py +9 -0
- kumoai/connector/utils.py +1377 -209
- kumoai/experimental/rfm/__init__.py +5 -3
- kumoai/experimental/rfm/authenticate.py +8 -5
- kumoai/experimental/rfm/infer/timestamp.py +7 -4
- kumoai/experimental/rfm/local_graph.py +96 -82
- kumoai/experimental/rfm/local_graph_sampler.py +16 -8
- kumoai/experimental/rfm/local_graph_store.py +32 -10
- kumoai/experimental/rfm/local_pquery_driver.py +342 -46
- kumoai/experimental/rfm/local_table.py +142 -45
- kumoai/experimental/rfm/pquery/__init__.py +4 -4
- kumoai/experimental/rfm/pquery/{backend.py → executor.py} +28 -58
- kumoai/experimental/rfm/pquery/pandas_executor.py +532 -0
- kumoai/experimental/rfm/rfm.py +535 -125
- kumoai/experimental/rfm/utils.py +0 -3
- kumoai/jobs.py +27 -1
- kumoai/kumolib.cp312-win_amd64.pyd +0 -0
- kumoai/pquery/prediction_table.py +5 -3
- kumoai/pquery/training_table.py +5 -3
- kumoai/trainer/job.py +9 -30
- kumoai/trainer/trainer.py +19 -10
- kumoai/utils/__init__.py +2 -1
- kumoai/utils/progress_logger.py +96 -16
- {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/METADATA +4 -5
- {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/RECORD +34 -34
- kumoai/experimental/rfm/pquery/pandas_backend.py +0 -437
- {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/WHEEL +0 -0
- {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/licenses/LICENSE +0 -0
- {kumoai-2.7.0.dev202508201830.dist-info → kumoai-2.12.0.dev202511111731.dist-info}/top_level.txt +0 -0
|
@@ -12,7 +12,7 @@ CPU architecture: {platform.machine()}
|
|
|
12
12
|
glibc version: {platform.libc_ver()[1]}
|
|
13
13
|
|
|
14
14
|
✅ Supported Environments:
|
|
15
|
-
* Python versions: 3.
|
|
15
|
+
* Python versions: 3.10, 3.11, 3.12, 3.13
|
|
16
16
|
* Operating systems and CPU architectures:
|
|
17
17
|
* Linux (x86_64)
|
|
18
18
|
* macOS (arm64)
|
|
@@ -20,7 +20,7 @@ glibc version: {platform.libc_ver()[1]}
|
|
|
20
20
|
* glibc versions: >=2.28
|
|
21
21
|
|
|
22
22
|
❌ Unsupported Environments:
|
|
23
|
-
* Python versions: 3.8, 3.14
|
|
23
|
+
* Python versions: 3.8, 3.9, 3.14
|
|
24
24
|
* Operating systems and CPU architectures:
|
|
25
25
|
* Linux (arm64)
|
|
26
26
|
* macOS (x86_64)
|
|
@@ -36,7 +36,7 @@ import os
|
|
|
36
36
|
import kumoai
|
|
37
37
|
from .local_table import LocalTable
|
|
38
38
|
from .local_graph import LocalGraph
|
|
39
|
-
from .rfm import KumoRFM
|
|
39
|
+
from .rfm import ExplainConfig, Explanation, KumoRFM
|
|
40
40
|
from .authenticate import authenticate
|
|
41
41
|
|
|
42
42
|
|
|
@@ -60,6 +60,8 @@ __all__ = [
|
|
|
60
60
|
'LocalTable',
|
|
61
61
|
'LocalGraph',
|
|
62
62
|
'KumoRFM',
|
|
63
|
+
'ExplainConfig',
|
|
64
|
+
'Explanation',
|
|
63
65
|
'authenticate',
|
|
64
66
|
'init',
|
|
65
67
|
]
|
|
@@ -264,14 +264,17 @@ def _authenticate_local(api_url: str, redirect_port: int = 8765) -> None:
|
|
|
264
264
|
f"?callback_url={urllib.parse.quote(callback_url)}" +
|
|
265
265
|
f"&token_name={urllib.parse.quote(token_name)}")
|
|
266
266
|
|
|
267
|
-
print(
|
|
267
|
+
print(
|
|
268
|
+
"Opening browser page to automatically generate an API key...\n" +
|
|
269
|
+
"If the page does not open, manually create a new API key at " +
|
|
270
|
+
f"{api_url}/api-keys and set it using os.environ[\"KUMO_API_KEY\"] " +
|
|
271
|
+
"= \"YOUR_API_KEY\"")
|
|
272
|
+
|
|
268
273
|
webbrowser.open(login_url)
|
|
269
274
|
|
|
270
275
|
def get_user_input() -> None:
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
token_entered = getpass("API Key (type then press enter): ").strip()
|
|
276
|
+
token_entered = getpass(
|
|
277
|
+
"or paste the API key here and press enter: ").strip()
|
|
275
278
|
|
|
276
279
|
while (len(token_entered) == 0):
|
|
277
280
|
token_entered = getpass(
|
|
@@ -2,6 +2,7 @@ import re
|
|
|
2
2
|
import warnings
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
+
from dateutil.parser import UnknownTimezoneWarning
|
|
5
6
|
from kumoapi.typing import Dtype, Stype
|
|
6
7
|
|
|
7
8
|
|
|
@@ -20,9 +21,7 @@ def contains_timestamp(ser: pd.Series, column_name: str, dtype: Dtype) -> bool:
|
|
|
20
21
|
column_name,
|
|
21
22
|
re.IGNORECASE,
|
|
22
23
|
)
|
|
23
|
-
|
|
24
|
-
if match is not None:
|
|
25
|
-
return True
|
|
24
|
+
score = 0.3 if match is not None else 0.0
|
|
26
25
|
|
|
27
26
|
ser = ser.iloc[:100]
|
|
28
27
|
ser = ser.dropna()
|
|
@@ -34,5 +33,9 @@ def contains_timestamp(ser: pd.Series, column_name: str, dtype: Dtype) -> bool:
|
|
|
34
33
|
ser = ser.astype(str) # Avoid parsing numbers as unix timestamps.
|
|
35
34
|
|
|
36
35
|
with warnings.catch_warnings():
|
|
36
|
+
warnings.simplefilter('ignore', UnknownTimezoneWarning)
|
|
37
37
|
warnings.filterwarnings('ignore', message='Could not infer format')
|
|
38
|
-
|
|
38
|
+
mask = pd.to_datetime(ser, errors='coerce').notna()
|
|
39
|
+
score += int(mask.sum()) / len(mask)
|
|
40
|
+
|
|
41
|
+
return score >= 1.0
|
|
@@ -29,45 +29,46 @@ class LocalGraph:
|
|
|
29
29
|
|
|
30
30
|
.. code-block:: python
|
|
31
31
|
|
|
32
|
-
|
|
33
|
-
import
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
graph
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
graph
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
graph.
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
graph
|
|
32
|
+
>>> # doctest: +SKIP
|
|
33
|
+
>>> import pandas as pd
|
|
34
|
+
>>> import kumoai.experimental.rfm as rfm
|
|
35
|
+
|
|
36
|
+
>>> # Load data frames into memory:
|
|
37
|
+
>>> df1 = pd.DataFrame(...)
|
|
38
|
+
>>> df2 = pd.DataFrame(...)
|
|
39
|
+
>>> df3 = pd.DataFrame(...)
|
|
40
|
+
|
|
41
|
+
>>> # Define tables from data frames:
|
|
42
|
+
>>> table1 = rfm.LocalTable(name="table1", data=df1)
|
|
43
|
+
>>> table2 = rfm.LocalTable(name="table2", data=df2)
|
|
44
|
+
>>> table3 = rfm.LocalTable(name="table3", data=df3)
|
|
45
|
+
|
|
46
|
+
>>> # Create a graph from a dictionary of tables:
|
|
47
|
+
>>> graph = rfm.LocalGraph({
|
|
48
|
+
... "table1": table1,
|
|
49
|
+
... "table2": table2,
|
|
50
|
+
... "table3": table3,
|
|
51
|
+
... })
|
|
52
|
+
|
|
53
|
+
>>> # Infer table metadata:
|
|
54
|
+
>>> graph.infer_metadata()
|
|
55
|
+
|
|
56
|
+
>>> # Infer links/edges:
|
|
57
|
+
>>> graph.infer_links()
|
|
58
|
+
|
|
59
|
+
>>> # Inspect table metadata:
|
|
60
|
+
>>> for table in graph.tables.values():
|
|
61
|
+
... table.print_metadata()
|
|
62
|
+
|
|
63
|
+
>>> # Visualize graph (if graphviz is installed):
|
|
64
|
+
>>> graph.visualize()
|
|
65
|
+
|
|
66
|
+
>>> # Add/Remove edges between tables:
|
|
67
|
+
>>> graph.link(src_table="table1", fkey="id1", dst_table="table2")
|
|
68
|
+
>>> graph.unlink(src_table="table1", fkey="id1", dst_table="table2")
|
|
69
|
+
|
|
70
|
+
>>> # Validate graph:
|
|
71
|
+
>>> graph.validate()
|
|
71
72
|
"""
|
|
72
73
|
|
|
73
74
|
# Constructors ############################################################
|
|
@@ -104,27 +105,28 @@ class LocalGraph:
|
|
|
104
105
|
|
|
105
106
|
.. code-block:: python
|
|
106
107
|
|
|
107
|
-
|
|
108
|
-
import
|
|
108
|
+
>>> # doctest: +SKIP
|
|
109
|
+
>>> import pandas as pd
|
|
110
|
+
>>> import kumoai.experimental.rfm as rfm
|
|
109
111
|
|
|
110
|
-
# Load data frames into memory:
|
|
111
|
-
df1 = pd.DataFrame(...)
|
|
112
|
-
df2 = pd.DataFrame(...)
|
|
113
|
-
df3 = pd.DataFrame(...)
|
|
112
|
+
>>> # Load data frames into memory:
|
|
113
|
+
>>> df1 = pd.DataFrame(...)
|
|
114
|
+
>>> df2 = pd.DataFrame(...)
|
|
115
|
+
>>> df3 = pd.DataFrame(...)
|
|
114
116
|
|
|
115
|
-
# Create a graph from a dictionary of data frames:
|
|
116
|
-
graph = rfm.LocalGraph.from_data({
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
})
|
|
117
|
+
>>> # Create a graph from a dictionary of data frames:
|
|
118
|
+
>>> graph = rfm.LocalGraph.from_data({
|
|
119
|
+
... "table1": df1,
|
|
120
|
+
... "table2": df2,
|
|
121
|
+
... "table3": df3,
|
|
122
|
+
... })
|
|
121
123
|
|
|
122
|
-
# Inspect table metadata:
|
|
123
|
-
for table in graph.tables.values():
|
|
124
|
-
|
|
124
|
+
>>> # Inspect table metadata:
|
|
125
|
+
>>> for table in graph.tables.values():
|
|
126
|
+
... table.print_metadata()
|
|
125
127
|
|
|
126
|
-
# Visualize graph (if graphviz is installed):
|
|
127
|
-
graph.visualize()
|
|
128
|
+
>>> # Visualize graph (if graphviz is installed):
|
|
129
|
+
>>> graph.visualize()
|
|
128
130
|
|
|
129
131
|
Args:
|
|
130
132
|
df_dict: A dictionary of data frames, where the keys are the names
|
|
@@ -141,6 +143,7 @@ class LocalGraph:
|
|
|
141
143
|
graph.
|
|
142
144
|
|
|
143
145
|
Example:
|
|
146
|
+
>>> # doctest: +SKIP
|
|
144
147
|
>>> import kumoai.experimental.rfm as rfm
|
|
145
148
|
>>> df1 = pd.DataFrame(...)
|
|
146
149
|
>>> df2 = pd.DataFrame(...)
|
|
@@ -150,7 +153,7 @@ class LocalGraph:
|
|
|
150
153
|
... "table2": df2,
|
|
151
154
|
... "table3": df3,
|
|
152
155
|
... })
|
|
153
|
-
|
|
156
|
+
>>> graph.validate()
|
|
154
157
|
"""
|
|
155
158
|
tables = [LocalTable(df, name) for name, df in df_dict.items()]
|
|
156
159
|
|
|
@@ -197,12 +200,6 @@ class LocalGraph:
|
|
|
197
200
|
KeyError: If a table with the same name already exists in the
|
|
198
201
|
graph.
|
|
199
202
|
"""
|
|
200
|
-
if len(self.tables) >= 15:
|
|
201
|
-
raise ValueError("Cannot create a graph with more than 15 "
|
|
202
|
-
"tables. Please create a feature request at "
|
|
203
|
-
"'https://github.com/kumo-ai/kumo-rfm' if you "
|
|
204
|
-
"must go beyond this for your use-case.")
|
|
205
|
-
|
|
206
203
|
if table.name in self._tables:
|
|
207
204
|
raise KeyError(f"Cannot add table with name '{table.name}' to "
|
|
208
205
|
f"this graph; table names must be globally unique.")
|
|
@@ -237,16 +234,17 @@ class LocalGraph:
|
|
|
237
234
|
r"""Returns a :class:`pandas.DataFrame` object containing metadata
|
|
238
235
|
information about the tables in this graph.
|
|
239
236
|
|
|
240
|
-
The returned dataframe has columns ``name``, ``primary_key``,
|
|
241
|
-
``time_column``, which provide an aggregate
|
|
242
|
-
the tables of this graph.
|
|
237
|
+
The returned dataframe has columns ``name``, ``primary_key``,
|
|
238
|
+
``time_column``, and ``end_time_column``, which provide an aggregate
|
|
239
|
+
view of the properties of the tables of this graph.
|
|
243
240
|
|
|
244
241
|
Example:
|
|
242
|
+
>>> # doctest: +SKIP
|
|
245
243
|
>>> import kumoai.experimental.rfm as rfm
|
|
246
244
|
>>> graph = rfm.LocalGraph(tables=...).infer_metadata()
|
|
247
|
-
>>> graph.metadata
|
|
248
|
-
name
|
|
249
|
-
0 users
|
|
245
|
+
>>> graph.metadata # doctest: +SKIP
|
|
246
|
+
name primary_key time_column end_time_column
|
|
247
|
+
0 users user_id - -
|
|
250
248
|
"""
|
|
251
249
|
tables = list(self.tables.values())
|
|
252
250
|
|
|
@@ -257,6 +255,11 @@ class LocalGraph:
|
|
|
257
255
|
pd.Series(dtype=str, data=[t._primary_key or '-' for t in tables]),
|
|
258
256
|
'time_column':
|
|
259
257
|
pd.Series(dtype=str, data=[t._time_column or '-' for t in tables]),
|
|
258
|
+
'end_time_column':
|
|
259
|
+
pd.Series(
|
|
260
|
+
dtype=str,
|
|
261
|
+
data=[t._end_time_column or '-' for t in tables],
|
|
262
|
+
),
|
|
260
263
|
})
|
|
261
264
|
|
|
262
265
|
def print_metadata(self) -> None:
|
|
@@ -580,13 +583,17 @@ class LocalGraph:
|
|
|
580
583
|
# Check that the destination table defines a primary key:
|
|
581
584
|
if dst_key is None:
|
|
582
585
|
raise ValueError(f"Edge {edge} is invalid since table "
|
|
583
|
-
f"'{dst_table}' does not have a primary key"
|
|
586
|
+
f"'{dst_table}' does not have a primary key. "
|
|
587
|
+
f"Add either a primary key or remove the "
|
|
588
|
+
f"link before proceeding.")
|
|
584
589
|
|
|
585
590
|
# Ensure that foreign key is not a primary key:
|
|
586
591
|
src_pkey = self[src_table].primary_key
|
|
587
592
|
if src_pkey is not None and src_pkey.name == fkey:
|
|
588
593
|
raise ValueError(f"Cannot treat the primary key of table "
|
|
589
|
-
f"'{src_table}' as a foreign key"
|
|
594
|
+
f"'{src_table}' as a foreign key. Remove "
|
|
595
|
+
f"either the primary key or the link before "
|
|
596
|
+
f"before proceeding.")
|
|
590
597
|
|
|
591
598
|
# Check that fkey/pkey have valid and consistent data types:
|
|
592
599
|
assert src_key.dtype is not None
|
|
@@ -604,8 +611,8 @@ class LocalGraph:
|
|
|
604
611
|
raise ValueError(f"{edge} is invalid as foreign key "
|
|
605
612
|
f"'{fkey}' and primary key '{dst_key.name}' "
|
|
606
613
|
f"have incompatible data types (got "
|
|
607
|
-
f"fkey.dtype '{
|
|
608
|
-
f"pkey.dtype '{
|
|
614
|
+
f"fkey.dtype '{src_key.dtype}' and "
|
|
615
|
+
f"pkey.dtype '{dst_key.dtype}')")
|
|
609
616
|
|
|
610
617
|
return self
|
|
611
618
|
|
|
@@ -678,6 +685,11 @@ class LocalGraph:
|
|
|
678
685
|
]
|
|
679
686
|
if time_column := table.time_column:
|
|
680
687
|
keys += [f'{time_column.name}: Time ({time_column.dtype})']
|
|
688
|
+
if end_time_column := table.end_time_column:
|
|
689
|
+
keys += [
|
|
690
|
+
f'{end_time_column.name}: '
|
|
691
|
+
f'End Time ({end_time_column.dtype})'
|
|
692
|
+
]
|
|
681
693
|
key_repr = left_align(keys)
|
|
682
694
|
|
|
683
695
|
columns = []
|
|
@@ -685,9 +697,9 @@ class LocalGraph:
|
|
|
685
697
|
columns += [
|
|
686
698
|
f'{column.name}: {column.stype} ({column.dtype})'
|
|
687
699
|
for column in table.columns
|
|
688
|
-
if column.name not in fkeys_dict[table_name]
|
|
689
|
-
and column.name != table.
|
|
690
|
-
and column.name != table.
|
|
700
|
+
if column.name not in fkeys_dict[table_name] and
|
|
701
|
+
column.name != table._primary_key and column.name != table.
|
|
702
|
+
_time_column and column.name != table._end_time_column
|
|
691
703
|
]
|
|
692
704
|
column_repr = left_align(columns)
|
|
693
705
|
|
|
@@ -754,16 +766,18 @@ class LocalGraph:
|
|
|
754
766
|
def _to_api_graph_definition(self) -> GraphDefinition:
|
|
755
767
|
tables: Dict[str, TableDefinition] = {}
|
|
756
768
|
col_groups: List[ColumnKeyGroup] = []
|
|
757
|
-
for
|
|
758
|
-
tables[
|
|
769
|
+
for table_name, table in self.tables.items():
|
|
770
|
+
tables[table_name] = table._to_api_table_definition()
|
|
759
771
|
if table.primary_key is None:
|
|
760
772
|
continue
|
|
761
|
-
keys = [ColumnKey(
|
|
773
|
+
keys = [ColumnKey(table_name, table.primary_key.name)]
|
|
762
774
|
for edge in self.edges:
|
|
763
|
-
if edge.dst_table ==
|
|
775
|
+
if edge.dst_table == table_name:
|
|
764
776
|
keys.append(ColumnKey(edge.src_table, edge.fkey))
|
|
765
|
-
keys = sorted(
|
|
766
|
-
|
|
777
|
+
keys = sorted(
|
|
778
|
+
list(set(keys)),
|
|
779
|
+
key=lambda x: f'{x.table_name}.{x.col_name}',
|
|
780
|
+
)
|
|
767
781
|
if len(keys) > 1:
|
|
768
782
|
col_groups.append(ColumnKeyGroup(keys))
|
|
769
783
|
return GraphDefinition(tables, col_groups)
|
|
@@ -92,15 +92,23 @@ class LocalGraphSampler:
|
|
|
92
92
|
)
|
|
93
93
|
continue
|
|
94
94
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
df = df.iloc[
|
|
99
|
-
|
|
95
|
+
row: Optional[np.ndarray] = None
|
|
96
|
+
if table_name in self._graph_store.end_time_column_dict:
|
|
97
|
+
# Set end time to NaT for all values greater than anchor time:
|
|
98
|
+
df = df.iloc[node].reset_index(drop=True)
|
|
99
|
+
col_name = self._graph_store.end_time_column_dict[table_name]
|
|
100
|
+
ser = df[col_name]
|
|
101
|
+
value = ser.astype('datetime64[ns]').astype(int).to_numpy()
|
|
102
|
+
mask = value > time[batch]
|
|
103
|
+
df.loc[mask, col_name] = pd.NaT
|
|
100
104
|
else:
|
|
101
|
-
df
|
|
102
|
-
|
|
103
|
-
|
|
105
|
+
# Only store unique rows in `df` above a certain threshold:
|
|
106
|
+
unique_node, inverse = np.unique(node, return_inverse=True)
|
|
107
|
+
if len(node) > 1.05 * len(unique_node):
|
|
108
|
+
df = df.iloc[unique_node].reset_index(drop=True)
|
|
109
|
+
row = inverse
|
|
110
|
+
else:
|
|
111
|
+
df = df.iloc[node].reset_index(drop=True)
|
|
104
112
|
|
|
105
113
|
# Filter data frame to minimal set of columns:
|
|
106
114
|
df = df[columns]
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import warnings
|
|
2
|
-
from typing import Dict, List, Optional, Tuple
|
|
2
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
@@ -8,7 +8,7 @@ from kumoapi.typing import Stype
|
|
|
8
8
|
|
|
9
9
|
from kumoai.experimental.rfm import LocalGraph
|
|
10
10
|
from kumoai.experimental.rfm.utils import normalize_text
|
|
11
|
-
from kumoai.utils import ProgressLogger
|
|
11
|
+
from kumoai.utils import InteractiveProgressLogger, ProgressLogger
|
|
12
12
|
|
|
13
13
|
try:
|
|
14
14
|
import torch
|
|
@@ -22,10 +22,16 @@ class LocalGraphStore:
|
|
|
22
22
|
self,
|
|
23
23
|
graph: LocalGraph,
|
|
24
24
|
preprocess: bool = False,
|
|
25
|
-
verbose: bool = True,
|
|
25
|
+
verbose: Union[bool, ProgressLogger] = True,
|
|
26
26
|
) -> None:
|
|
27
27
|
|
|
28
|
-
|
|
28
|
+
if not isinstance(verbose, ProgressLogger):
|
|
29
|
+
verbose = InteractiveProgressLogger(
|
|
30
|
+
"Materializing graph",
|
|
31
|
+
verbose=verbose,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
with verbose as logger:
|
|
29
35
|
self.df_dict, self.mask_dict = self.sanitize(graph, preprocess)
|
|
30
36
|
self.stype_dict = self.get_stype_dict(graph)
|
|
31
37
|
logger.log("Sanitized input data")
|
|
@@ -39,6 +45,7 @@ class LocalGraphStore:
|
|
|
39
45
|
|
|
40
46
|
(
|
|
41
47
|
self.time_column_dict,
|
|
48
|
+
self.end_time_column_dict,
|
|
42
49
|
self.time_dict,
|
|
43
50
|
self.min_time,
|
|
44
51
|
self.max_time,
|
|
@@ -195,11 +202,15 @@ class LocalGraphStore:
|
|
|
195
202
|
pkey_map = pkey_map[self.mask_dict[table.name]]
|
|
196
203
|
|
|
197
204
|
if len(pkey_map) == 0:
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
205
|
+
error_msg = f"Found no valid rows in table '{table.name}'. "
|
|
206
|
+
if table.has_time_column():
|
|
207
|
+
error_msg += ("Please make sure that there exists valid "
|
|
208
|
+
"non-N/A primary key and time column pairs "
|
|
209
|
+
"in this table.")
|
|
210
|
+
else:
|
|
211
|
+
error_msg += ("Please make sure that there exists valid "
|
|
212
|
+
"non-N/A primary keys in this table.")
|
|
213
|
+
raise ValueError(error_msg)
|
|
203
214
|
|
|
204
215
|
pkey_map_dict[table.name] = pkey_map
|
|
205
216
|
|
|
@@ -209,16 +220,21 @@ class LocalGraphStore:
|
|
|
209
220
|
self,
|
|
210
221
|
graph: LocalGraph,
|
|
211
222
|
) -> Tuple[
|
|
223
|
+
Dict[str, str],
|
|
212
224
|
Dict[str, str],
|
|
213
225
|
Dict[str, np.ndarray],
|
|
214
226
|
pd.Timestamp,
|
|
215
227
|
pd.Timestamp,
|
|
216
228
|
]:
|
|
217
229
|
time_column_dict: Dict[str, str] = {}
|
|
230
|
+
end_time_column_dict: Dict[str, str] = {}
|
|
218
231
|
time_dict: Dict[str, np.ndarray] = {}
|
|
219
232
|
min_time = pd.Timestamp.max
|
|
220
233
|
max_time = pd.Timestamp.min
|
|
221
234
|
for table in graph.tables.values():
|
|
235
|
+
if table._end_time_column is not None:
|
|
236
|
+
end_time_column_dict[table.name] = table._end_time_column
|
|
237
|
+
|
|
222
238
|
if table._time_column is None:
|
|
223
239
|
continue
|
|
224
240
|
|
|
@@ -233,7 +249,13 @@ class LocalGraphStore:
|
|
|
233
249
|
min_time = min(min_time, time.min())
|
|
234
250
|
max_time = max(max_time, time.max())
|
|
235
251
|
|
|
236
|
-
return
|
|
252
|
+
return (
|
|
253
|
+
time_column_dict,
|
|
254
|
+
end_time_column_dict,
|
|
255
|
+
time_dict,
|
|
256
|
+
min_time,
|
|
257
|
+
max_time,
|
|
258
|
+
)
|
|
237
259
|
|
|
238
260
|
def get_csc(
|
|
239
261
|
self,
|