graflo 1.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of graflo might be problematic. Click here for more details.
- graflo/README.md +18 -0
- graflo/__init__.py +70 -0
- graflo/architecture/__init__.py +38 -0
- graflo/architecture/actor.py +1276 -0
- graflo/architecture/actor_util.py +450 -0
- graflo/architecture/edge.py +418 -0
- graflo/architecture/onto.py +376 -0
- graflo/architecture/onto_sql.py +54 -0
- graflo/architecture/resource.py +163 -0
- graflo/architecture/schema.py +135 -0
- graflo/architecture/transform.py +292 -0
- graflo/architecture/util.py +89 -0
- graflo/architecture/vertex.py +562 -0
- graflo/caster.py +736 -0
- graflo/cli/__init__.py +14 -0
- graflo/cli/ingest.py +203 -0
- graflo/cli/manage_dbs.py +197 -0
- graflo/cli/plot_schema.py +132 -0
- graflo/cli/xml2json.py +93 -0
- graflo/data_source/__init__.py +48 -0
- graflo/data_source/api.py +339 -0
- graflo/data_source/base.py +95 -0
- graflo/data_source/factory.py +304 -0
- graflo/data_source/file.py +148 -0
- graflo/data_source/memory.py +70 -0
- graflo/data_source/registry.py +82 -0
- graflo/data_source/sql.py +183 -0
- graflo/db/__init__.py +44 -0
- graflo/db/arango/__init__.py +22 -0
- graflo/db/arango/conn.py +1025 -0
- graflo/db/arango/query.py +180 -0
- graflo/db/arango/util.py +88 -0
- graflo/db/conn.py +377 -0
- graflo/db/connection/__init__.py +6 -0
- graflo/db/connection/config_mapping.py +18 -0
- graflo/db/connection/onto.py +717 -0
- graflo/db/connection/wsgi.py +29 -0
- graflo/db/manager.py +119 -0
- graflo/db/neo4j/__init__.py +16 -0
- graflo/db/neo4j/conn.py +639 -0
- graflo/db/postgres/__init__.py +37 -0
- graflo/db/postgres/conn.py +948 -0
- graflo/db/postgres/fuzzy_matcher.py +281 -0
- graflo/db/postgres/heuristics.py +133 -0
- graflo/db/postgres/inference_utils.py +428 -0
- graflo/db/postgres/resource_mapping.py +273 -0
- graflo/db/postgres/schema_inference.py +372 -0
- graflo/db/postgres/types.py +148 -0
- graflo/db/postgres/util.py +87 -0
- graflo/db/tigergraph/__init__.py +9 -0
- graflo/db/tigergraph/conn.py +2365 -0
- graflo/db/tigergraph/onto.py +26 -0
- graflo/db/util.py +49 -0
- graflo/filter/__init__.py +21 -0
- graflo/filter/onto.py +525 -0
- graflo/logging.conf +22 -0
- graflo/onto.py +312 -0
- graflo/plot/__init__.py +17 -0
- graflo/plot/plotter.py +616 -0
- graflo/util/__init__.py +23 -0
- graflo/util/chunker.py +807 -0
- graflo/util/merge.py +150 -0
- graflo/util/misc.py +37 -0
- graflo/util/onto.py +422 -0
- graflo/util/transform.py +454 -0
- graflo-1.3.7.dist-info/METADATA +243 -0
- graflo-1.3.7.dist-info/RECORD +70 -0
- graflo-1.3.7.dist-info/WHEEL +4 -0
- graflo-1.3.7.dist-info/entry_points.txt +5 -0
- graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
graflo/util/transform.py
ADDED
|
@@ -0,0 +1,454 @@
|
|
|
1
|
+
"""Data transformation utilities for graph operations.
|
|
2
|
+
|
|
3
|
+
This module provides utility functions for transforming and standardizing data
|
|
4
|
+
in various formats, particularly for graph database operations. It includes
|
|
5
|
+
functions for date parsing, string standardization, and data cleaning.
|
|
6
|
+
|
|
7
|
+
Key Functions:
|
|
8
|
+
- standardize: Standardize string keys and names
|
|
9
|
+
- parse_date_*: Various date parsing functions for different formats
|
|
10
|
+
- cast_ibes_analyst: Parse and standardize analyst names
|
|
11
|
+
- clear_first_level_nones: Clean dictionaries by removing None values
|
|
12
|
+
- parse_multi_item: Parse complex multi-item strings
|
|
13
|
+
- pick_unique_dict: Remove duplicate dictionaries
|
|
14
|
+
|
|
15
|
+
Example:
|
|
16
|
+
>>> name = standardize("John. Doe, Smith")
|
|
17
|
+
>>> date = parse_date_standard("2023-01-01")
|
|
18
|
+
>>> analyst = cast_ibes_analyst("ADKINS/NARRA")
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
import re
|
|
23
|
+
import time
|
|
24
|
+
from collections import defaultdict
|
|
25
|
+
from datetime import datetime
|
|
26
|
+
|
|
27
|
+
ORDINAL_SUFFIX = ["st", "nd", "rd", "th"]
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def standardize(k):
    """Standardizes a string key by removing periods and splitting.

    Periods are stripped first. If the result contains a ", " separator,
    it is split there and spaces are removed from the *second* part only;
    otherwise it is split on single spaces. Parts are re-joined with commas.

    Args:
        k (str): Input string to be standardized.

    Returns:
        str: Cleaned and standardized string.

    Example:
        >>> standardize("John. Doe, Smith")
        'John Doe,Smith'
        >>> standardize("John Doe Smith")
        'John,Doe,Smith'
    """
    k = k.translate(str.maketrans({".": ""}))
    # try to split by ", "
    k = k.split(", ")
    if len(k) < 2:
        # no comma separator: fall back to splitting on spaces
        k = k[0].split(" ")
    else:
        # only the part right after the first comma has its spaces collapsed
        k[1] = k[1].translate(str.maketrans({" ": ""}))
    return ",".join(k)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def parse_date_standard(input_str):
    """Parse a date string in YYYY-MM-DD format.

    Args:
        input_str (str): Date string in YYYY-MM-DD format.

    Returns:
        tuple: (year, month, day) as integers.

    Raises:
        ValueError: If the string does not match the expected format.

    Example:
        >>> parse_date_standard("2023-01-01")
        (2023, 1, 1)
    """
    parsed = datetime.strptime(input_str, "%Y-%m-%d")
    return parsed.year, parsed.month, parsed.day
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def parse_date_conf(input_str):
    """Parse a date string in compact YYYYMMDD format.

    Args:
        input_str (str): Date string in YYYYMMDD format.

    Returns:
        tuple: (year, month, day) as integers.

    Raises:
        ValueError: If the string does not match the expected format.

    Example:
        >>> parse_date_conf("20230101")
        (2023, 1, 1)
    """
    parsed = datetime.strptime(input_str, "%Y%m%d")
    return parsed.year, parsed.month, parsed.day
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def parse_date_ibes(date0, time0):
    """Converts IBES date and time to an ISO 8601 datetime string.

    Args:
        date0 (str/int): Date in YYYYMMDD format.
        time0 (str): Time in H:M:S or HH:MM:SS format; components are
            zero-padded to two digits so the output is valid ISO 8601.

    Returns:
        str: Datetime in ISO 8601 format (YYYY-MM-DDTHH:MM:SSZ).

    Example:
        >>> parse_date_ibes(20160126, "9:35:52")
        '2016-01-26T09:35:52Z'
    """
    date0 = str(date0)
    year, month, day = date0[:4], date0[4:6], date0[6:]
    # zero-pad each time component: IBES times may come as "9:35:52",
    # which is not valid ISO 8601 ("09:35:52" is)
    time_padded = ":".join(part.zfill(2) for part in str(time0).split(":"))
    return f"{year}-{month}-{day}T{time_padded}Z"
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def parse_date_yahoo(date0):
    """Convert a Yahoo Finance date to ISO 8601 format.

    The time component is fixed at noon.

    Args:
        date0 (str): Date in YYYY-MM-DD format.

    Returns:
        str: Datetime in ISO 8601 format with a 12:00:00 time.

    Example:
        >>> parse_date_yahoo("2023-01-01")
        '2023-01-01T12:00:00Z'
    """
    return "{}T12:00:00Z".format(date0)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def round_str(x, **kwargs):
    """Round a string-encoded number.

    Args:
        x (str): String representation of a number.
        **kwargs: Forwarded to :func:`round` (e.g. ``ndigits``).

    Returns:
        Rounded value (``float`` when ``ndigits`` is given, ``int``
        otherwise, per :func:`round` semantics).

    Example:
        >>> round_str("3.14159", ndigits=2)
        3.14
    """
    as_float = float(x)
    return round(as_float, **kwargs)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def parse_date_standard_to_epoch(input_str):
    """Convert a YYYY-MM-DD date string to a Unix epoch timestamp.

    NOTE: the conversion goes through ``time.mktime`` and therefore
    interprets the date in the *local* timezone — the resulting epoch
    value varies with the machine's TZ setting.

    Args:
        input_str (str): Date string in YYYY-MM-DD format.

    Returns:
        float: Unix epoch timestamp (local-time interpretation).

    Example:
        >>> parse_date_standard_to_epoch("2023-01-01")  # doctest: +SKIP
        1672531200.0
    """
    parsed = datetime.strptime(input_str, "%Y-%m-%d")
    return time.mktime(parsed.timetuple())
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def cast_ibes_analyst(s):
    """Splits and normalizes analyst name strings.

    Handles whitespace-separated forms like 'ARFSTROM J' as well as
    slash-separated forms like 'ADKINS/NARRA' or '/SMITH/JONES'.

    Args:
        s (str): Analyst name string.

    Returns:
        tuple: (last_name, first_initial); the initial is "" when no
        second component is present.

    Examples:
        >>> cast_ibes_analyst('ADKINS/NARRA')
        ('ADKINS', 'N')
        >>> cast_ibes_analyst('ARFSTROM J')
        ('ARFSTROM', 'J')
    """
    if " " in s or "\t" in s:
        # whitespace-separated: keep at most the first two tokens
        parts = s.split()[:2]
    else:
        # slash-separated; a leading "/" produces an empty first token,
        # so shift the window by one in that case
        pieces = s.split("/")
        parts = pieces[1:3] if s.startswith("/") else pieces[:2]
    if len(parts) < 2:
        return parts[0], ""
    return parts[0], parts[1][:1]
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def parse_date_reference(input_str):
    """Extract the year from a human-written date reference.

    Args:
        input_str (str): Date reference string, e.g. "1923, May 10".

    Returns:
        int: Year of the reference (the raw string is returned instead
        when ``_parse_date_reference`` cannot parse anything).

    Example:
        >>> parse_date_reference("1923, May 10")
        1923
    """
    parsed = _parse_date_reference(input_str)
    return parsed["year"]
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _parse_date_reference(input_str):
    """Parse complex, human-written date references.

    Handles formats such as:
    - "1923, May 10"
    - "1926, December 24th" (ordinal suffix is stripped)
    - "1921, Sept" (month is retried truncated to a 3-letter abbreviation)
    - "1923, July"
    - "1935-36" (only the leading 4-digit year is used)

    Args:
        input_str (str): Date string in one of the above forms.

    Returns:
        dict: Parsed date with key 'year' and optional 'month'/'day'.
            When nothing can be parsed, the original string is returned
            under 'year' (NOTE: a str, not an int, in that fallback).

    Example:
        >>> _parse_date_reference("1923, May 10")
        {'year': 1923, 'month': 5, 'day': 10}
    """
    # Only conversion errors are caught below (ValueError from strptime,
    # IndexError from malformed splits); a bare except would also swallow
    # KeyboardInterrupt/SystemExit.
    if "," in input_str:
        if len(input_str.split(" ")) == 3:
            # "YYYY, Month DDth" -> drop the ordinal suffix
            if input_str[-2:] in ORDINAL_SUFFIX:
                input_str = input_str[:-2]
            try:
                dt = datetime.strptime(input_str, "%Y, %B %d")
                return {"year": dt.year, "month": dt.month, "day": dt.day}
            except ValueError:
                # retry with the month truncated to its 3-letter abbreviation
                try:
                    aux = input_str.split(" ")
                    input_str = " ".join([aux[0]] + [aux[1][:3]] + [aux[2]])
                    dt = datetime.strptime(input_str, "%Y, %b %d")
                    return {"year": dt.year, "month": dt.month, "day": dt.day}
                except (ValueError, IndexError):
                    return {"year": input_str}
        else:
            try:
                dt = datetime.strptime(input_str, "%Y, %B")
                return {"year": dt.year, "month": dt.month}
            except ValueError:
                # same abbreviation retry for the year-month form
                try:
                    aux = input_str.split(" ")
                    input_str = " ".join([aux[0]] + [aux[1][:3]])
                    dt = datetime.strptime(input_str, "%Y, %b")
                    return {"year": dt.year, "month": dt.month}
                except (ValueError, IndexError):
                    return {"year": input_str}
    else:
        # no comma: assume the string starts with a 4-digit year ("1935-36")
        try:
            dt = datetime.strptime(input_str[:4], "%Y")
            return {"year": dt.year}
        except ValueError:
            return {"year": input_str}
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def try_int(x):
    """Attempt to convert a value to an integer.

    Args:
        x: Value to convert (string, float, etc.).

    Returns:
        int or original value: ``int(x)`` when conversion succeeds,
        the original value otherwise.

    Example:
        >>> try_int("123")
        123
        >>> try_int("abc")
        'abc'
    """
    # catch only conversion failures; a bare except would also swallow
    # KeyboardInterrupt/SystemExit
    try:
        return int(x)
    except (TypeError, ValueError):
        return x
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def clear_first_level_nones(docs, keys_keep_nones: list | None = None):
|
|
295
|
+
"""Removes None values from dictionaries, with optional key exceptions.
|
|
296
|
+
|
|
297
|
+
Args:
|
|
298
|
+
docs (list): List of dictionaries to clean.
|
|
299
|
+
keys_keep_nones (list, optional): Keys to keep even if their value is None.
|
|
300
|
+
|
|
301
|
+
Returns:
|
|
302
|
+
list: Cleaned list of dictionaries.
|
|
303
|
+
|
|
304
|
+
Example:
|
|
305
|
+
>>> docs = [{"a": 1, "b": None}, {"a": None, "b": 2}]
|
|
306
|
+
>>> clear_first_level_nones(docs, keys_keep_nones=["a"])
|
|
307
|
+
[{"a": 1}, {"a": None, "b": 2}]
|
|
308
|
+
"""
|
|
309
|
+
if keys_keep_nones is not None:
|
|
310
|
+
docs = [
|
|
311
|
+
{k: v for k, v in tdict.items() if v or k in keys_keep_nones}
|
|
312
|
+
for tdict in docs
|
|
313
|
+
]
|
|
314
|
+
return docs
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def parse_multi_item(s, mapper: dict, direct: list):
    """Parses complex multi-item strings into structured data.

    Two input shapes are supported:
    - quoted items: when the string contains an apostrophe, each
      double- or single-quoted substring is treated as one item;
    - bracketed items: otherwise only the FIRST ``[...]`` group is
      taken and split on whitespace, each whitespace token becoming
      an "item".

    Each item is split on "," and then each piece on ":". When every
    piece yields exactly a key:value pair, values are collected under
    ``mapper``-renamed keys plus the ``direct`` keys; otherwise the
    split pieces themselves are zipped positionally against ``direct``.

    Args:
        s (str): Input string to parse.
        mapper (dict): Mapping of input keys to output keys.
        direct (list): Keys to extract verbatim; entries may be plain
            strings or Field-like objects exposing ``.name``.

    Returns:
        defaultdict: Parsed items with lists as values; missing keys
        contribute ``None`` placeholders so columns stay aligned.

    NOTE(review): the quoted-item path is the reliable one; for the
    bracket form, whitespace tokens such as "name:" parse as
    ("name", "") pairs, so an example like
    '[name: John, age: 30]' does NOT produce
    {'full_name': ['John'], 'age': ['30']} — confirm intended input
    format against callers.
    """
    if "'" in s:
        # quoted form: every "..."-span and '...'-span is one item
        items_str = re.findall(r"\"(.*?)\"", s) + re.findall(r"\'(.*?)\'", s)
    else:
        # remove brackets
        # bracket form: only the first [...] group is used, split on whitespace
        items_str = re.findall(r"\[([^]]+)", s)[0].split()
    r: defaultdict[str, list] = defaultdict(list)
    for item in items_str:
        # "k1: v1, k2: v2" -> [["k1", " v1"], ["k2", " v2"]] (after strip/split)
        doc0 = [ss.strip().split(":") for ss in item.split(",")]
        if all([len(x) == 2 for x in doc0]):
            # well-formed key:value pairs -> dict lookup by name
            doc0_dict = dict(doc0)
            for n_init, n_final in mapper.items():
                try:
                    r[n_final] += [doc0_dict[n_init]]
                except KeyError:
                    # keep columns aligned when a mapped key is absent
                    r[n_final] += [None]

            for n_final in direct:
                # Use field.name for dictionary keys (JSON serialization requires strings)
                # Handle both Field objects and strings for backward compatibility
                key = n_final.name if hasattr(n_final, "name") else str(n_final)
                try:
                    r[key] += [doc0_dict[key]]
                except KeyError:
                    r[key] += [None]
        else:
            # malformed pairs -> fall back to positional assignment;
            # note each value here is the raw split list, not a scalar
            for key, value in zip(direct, doc0):
                # Use field.name for dictionary keys (JSON serialization requires strings)
                # Handle both Field objects and strings for backward compatibility
                key_str = key.name if hasattr(key, "name") else str(key)
                r[key_str] += [value]

    return r
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def pick_unique_dict(docs):
    """Removes duplicate dictionaries from a list.

    Each dictionary is recursively converted to a hashable key; the
    first occurrence of each key wins, insertion order is preserved,
    and the original (unconverted) objects are returned.

    Args:
        docs (list): List of dictionaries.

    Returns:
        list: Unique dictionaries in first-seen order.

    Example:
        >>> pick_unique_dict([{"a": 1}, {"a": 1}, {"b": 2}])
        [{'a': 1}, {'b': 2}]
    """
    from datetime import date, datetime, time
    from decimal import Decimal

    def _freeze(value):
        """Recursively convert *value* into a hashable representation."""
        if isinstance(value, dict):
            # sort items so key order does not affect the hash
            return tuple(sorted((k, _freeze(v)) for k, v in value.items()))
        if isinstance(value, (list, tuple)):
            return tuple(_freeze(v) for v in value)
        if isinstance(value, (datetime, date, time)):
            # ISO string, tagged to avoid colliding with plain strings
            return ("__datetime__", value.isoformat())
        if isinstance(value, Decimal):
            # string form preserves precision
            return ("__decimal__", str(value))
        if isinstance(value, set):
            # sorted tuple gives a canonical, hashable ordering
            return tuple(sorted(_freeze(v) for v in value))
        # primitives (int, float, str, bool, None) are already hashable
        return value

    # dict keeps insertion order; setdefault keeps the first occurrence
    unique = {}
    for doc in docs:
        unique.setdefault(_freeze(doc), doc)
    return list(unique.values())
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def split_keep_part(s: str, sep="/", keep=-1) -> str:
    """Split a string on *sep* and keep the requested part(s).

    Args:
        s (str): String to split.
        sep (str): Separator to split on.
        keep (int or list): Single index, or list of indices whose parts
            are re-joined with *sep*.

    Returns:
        str: The kept part, or the *sep*-joined kept parts.

    Example:
        >>> split_keep_part("a/b/c", keep=0)
        'a'
        >>> split_keep_part("a/b/c", keep=[0, 2])
        'a/c'
    """
    parts = s.split(sep)
    if isinstance(keep, list):
        return sep.join(parts[i] for i in keep)
    return parts[keep]
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graflo
|
|
3
|
+
Version: 1.3.7
|
|
4
|
+
Summary: A framework for transforming tabular (CSV, SQL) and hierarchical data (JSON, XML) into property graphs and ingesting them into graph databases (ArangoDB, Neo4j, TigerGraph). Features automatic PostgreSQL schema inference.
|
|
5
|
+
Author-email: Alexander Belikov <alexander@growgraph.dev>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: ~=3.10.0
|
|
8
|
+
Requires-Dist: click<9,>=8.2.0
|
|
9
|
+
Requires-Dist: dataclass-wizard>=0.34.0
|
|
10
|
+
Requires-Dist: ijson<4,>=3.2.3
|
|
11
|
+
Requires-Dist: neo4j<6,>=5.22.0
|
|
12
|
+
Requires-Dist: networkx~=3.3
|
|
13
|
+
Requires-Dist: pandas-stubs==2.3.0.250703
|
|
14
|
+
Requires-Dist: pandas<3,>=2.0.3
|
|
15
|
+
Requires-Dist: psycopg2-binary>=2.9.11
|
|
16
|
+
Requires-Dist: pydantic-settings>=2.12.0
|
|
17
|
+
Requires-Dist: pydantic>=2.12.5
|
|
18
|
+
Requires-Dist: python-arango<9,>=8.1.2
|
|
19
|
+
Requires-Dist: pytigergraph>=1.9.0
|
|
20
|
+
Requires-Dist: requests>=2.31.0
|
|
21
|
+
Requires-Dist: sqlalchemy>=2.0.0
|
|
22
|
+
Requires-Dist: strenum>=0.4.15
|
|
23
|
+
Requires-Dist: suthing>=0.5.0
|
|
24
|
+
Requires-Dist: urllib3>=2.0.0
|
|
25
|
+
Requires-Dist: xmltodict<0.15,>=0.14.2
|
|
26
|
+
Provides-Extra: plot
|
|
27
|
+
Requires-Dist: pygraphviz>=1.14; extra == 'plot'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# GraFlo <img src="https://raw.githubusercontent.com/growgraph/graflo/main/docs/assets/favicon.ico" alt="graflo logo" style="height: 32px; width:32px;"/>
|
|
31
|
+
|
|
32
|
+
A framework for transforming **tabular** (CSV, SQL) and **hierarchical** data (JSON, XML) into property graphs and ingesting them into graph databases (ArangoDB, Neo4j, **TigerGraph**).
|
|
33
|
+
|
|
34
|
+
> **⚠️ Package Renamed**: This package was formerly known as `graphcast`.
|
|
35
|
+
|
|
36
|
+

|
|
37
|
+
[](https://badge.fury.io/py/graflo)
|
|
38
|
+
[](https://pepy.tech/projects/graflo)
|
|
39
|
+
[](https://github.com/growgraph/graflo/blob/main/LICENSE)
|
|
40
|
+
[](https://github.com/growgraph/graflo/actions/workflows/pre-commit.yml)
|
|
41
|
+
[]( https://doi.org/10.5281/zenodo.15446131)
|
|
42
|
+
|
|
43
|
+
## Core Concepts
|
|
44
|
+
|
|
45
|
+
### Property Graphs
|
|
46
|
+
graflo works with property graphs, which consist of:
|
|
47
|
+
|
|
48
|
+
- **Vertices**: Nodes with properties and optional unique identifiers
|
|
49
|
+
- **Edges**: Relationships between vertices with their own properties
|
|
50
|
+
- **Properties**: Both vertices and edges may have properties
|
|
51
|
+
|
|
52
|
+
### Schema
|
|
53
|
+
The Schema defines how your data should be transformed into a graph and contains:
|
|
54
|
+
|
|
55
|
+
- **Vertex Definitions**: Specify vertex types, their properties, and unique identifiers
|
|
56
|
+
- Fields can be specified as strings (backward compatible) or typed `Field` objects with types (INT, FLOAT, STRING, DATETIME, BOOL)
|
|
57
|
+
- Type information enables better validation and database-specific optimizations
|
|
58
|
+
- **Edge Definitions**: Define relationships between vertices and their properties
|
|
59
|
+
- Weight fields support typed definitions for better type safety
|
|
60
|
+
- **Resource Mapping**: describe how data sources map to vertices and edges
|
|
61
|
+
- **Transforms**: Modify data during the casting process
|
|
62
|
+
- **Automatic Schema Inference**: Generate schemas automatically from PostgreSQL 3NF databases
|
|
63
|
+
|
|
64
|
+
### Resources
|
|
65
|
+
Resources are your data sources that can be:
|
|
66
|
+
|
|
67
|
+
- **Table-like**: CSV files, database tables
|
|
68
|
+
- **JSON-like**: JSON files, nested data structures
|
|
69
|
+
|
|
70
|
+
## Features
|
|
71
|
+
|
|
72
|
+
- **Graph Transformation Meta-language**: A powerful declarative language to describe how your data becomes a property graph:
|
|
73
|
+
- Define vertex and edge structures with typed fields
|
|
74
|
+
- Set compound indexes for vertices and edges
|
|
75
|
+
- Use blank vertices for complex relationships
|
|
76
|
+
- Specify edge constraints and properties with typed weight fields
|
|
77
|
+
- Apply advanced filtering and transformations
|
|
78
|
+
- **Typed Schema Definitions**: Enhanced type support throughout the schema system
|
|
79
|
+
- Vertex fields support types (INT, FLOAT, STRING, DATETIME, BOOL) for better validation
|
|
80
|
+
- Edge weight fields can specify types for improved type safety
|
|
81
|
+
- Backward compatible: fields without types default to None (suitable for databases like ArangoDB)
|
|
82
|
+
- **🚀 PostgreSQL Schema Inference**: **Automatically generate schemas from PostgreSQL 3NF databases** - No manual schema definition needed!
|
|
83
|
+
- Introspect PostgreSQL schemas to identify vertex-like and edge-like tables
|
|
84
|
+
- Automatically map PostgreSQL data types to graflo Field types (INT, FLOAT, STRING, DATETIME, BOOL)
|
|
85
|
+
- Infer vertex configurations from table structures with proper indexes
|
|
86
|
+
- Infer edge configurations from foreign key relationships
|
|
87
|
+
- Create Resource mappings from PostgreSQL tables automatically
|
|
88
|
+
- Direct database access - ingest data without exporting to files first
|
|
89
|
+
- **Parallel processing**: Use as many cores as you have
|
|
90
|
+
- **Database support**: Ingest into ArangoDB, Neo4j, and **TigerGraph** using the same API (database agnostic). Source data from PostgreSQL and other SQL databases.
|
|
91
|
+
- **Server-side filtering**: Efficient querying with server-side filtering support (TigerGraph REST++ API)
|
|
92
|
+
|
|
93
|
+
## Documentation
|
|
94
|
+
Full documentation is available at: [growgraph.github.io/graflo](https://growgraph.github.io/graflo)
|
|
95
|
+
|
|
96
|
+
## Installation
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pip install graflo
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Usage Examples
|
|
103
|
+
|
|
104
|
+
### Simple ingest
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from suthing import FileHandle
|
|
108
|
+
|
|
109
|
+
from graflo import Schema, Caster, Patterns
|
|
110
|
+
from graflo.db.connection.onto import ArangoConfig
|
|
111
|
+
|
|
112
|
+
schema = Schema.from_dict(FileHandle.load("schema.yaml"))
|
|
113
|
+
|
|
114
|
+
# Option 1: Load config from docker/arango/.env (recommended)
|
|
115
|
+
conn_conf = ArangoConfig.from_docker_env()
|
|
116
|
+
|
|
117
|
+
# Option 2: Load from environment variables
|
|
118
|
+
# Set: ARANGO_URI, ARANGO_USERNAME, ARANGO_PASSWORD, ARANGO_DATABASE
|
|
119
|
+
conn_conf = ArangoConfig.from_env()
|
|
120
|
+
|
|
121
|
+
# Option 3: Load with custom prefix (for multiple configs)
|
|
122
|
+
# Set: USER_ARANGO_URI, USER_ARANGO_USERNAME, USER_ARANGO_PASSWORD, USER_ARANGO_DATABASE
|
|
123
|
+
user_conn_conf = ArangoConfig.from_env(prefix="USER")
|
|
124
|
+
|
|
125
|
+
# Option 4: Create config directly
|
|
126
|
+
# conn_conf = ArangoConfig(
|
|
127
|
+
# uri="http://localhost:8535",
|
|
128
|
+
# username="root",
|
|
129
|
+
# password="123",
|
|
130
|
+
# database="mygraph", # For ArangoDB, 'database' maps to schema/graph
|
|
131
|
+
# )
|
|
132
|
+
# Note: If 'database' (or 'schema_name' for TigerGraph) is not set,
|
|
133
|
+
# Caster will automatically use Schema.general.name as fallback
|
|
134
|
+
|
|
135
|
+
from graflo.util.onto import FilePattern
|
|
136
|
+
import pathlib
|
|
137
|
+
|
|
138
|
+
# Create Patterns with file patterns
|
|
139
|
+
patterns = Patterns()
|
|
140
|
+
patterns.add_file_pattern(
|
|
141
|
+
"work",
|
|
142
|
+
    FilePattern(regex=r"\Sjson$", sub_path=pathlib.Path("./data"), resource_name="work")
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
# Or use resource_mapping for simpler initialization
|
|
146
|
+
# patterns = Patterns(
|
|
147
|
+
# _resource_mapping={
|
|
148
|
+
# "work": "./data/work.json",
|
|
149
|
+
# }
|
|
150
|
+
# )
|
|
151
|
+
|
|
152
|
+
schema.fetch_resource()
|
|
153
|
+
|
|
154
|
+
from graflo.caster import IngestionParams
|
|
155
|
+
|
|
156
|
+
caster = Caster(schema)
|
|
157
|
+
|
|
158
|
+
ingestion_params = IngestionParams(
|
|
159
|
+
clean_start=False, # Set to True to wipe existing database
|
|
160
|
+
# max_items=1000, # Optional: limit number of items to process
|
|
161
|
+
# batch_size=10000, # Optional: customize batch size
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
caster.ingest(
|
|
165
|
+
output_config=conn_conf, # Target database config
|
|
166
|
+
patterns=patterns, # Source data patterns
|
|
167
|
+
ingestion_params=ingestion_params,
|
|
168
|
+
)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### PostgreSQL Schema Inference
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from graflo.db.postgres import PostgresConnection
|
|
175
|
+
from graflo.db.postgres.heuristics import infer_schema_from_postgres
|
|
176
|
+
from graflo.db.connection.onto import PostgresConfig
|
|
177
|
+
from graflo import Caster
|
|
178
|
+
from graflo.onto import DBFlavor
|
|
179
|
+
|
|
180
|
+
# Connect to PostgreSQL
|
|
181
|
+
postgres_config = PostgresConfig.from_docker_env() # or PostgresConfig.from_env()
|
|
182
|
+
postgres_conn = PostgresConnection(postgres_config)
|
|
183
|
+
|
|
184
|
+
# Infer schema from PostgreSQL 3NF database
|
|
185
|
+
schema = infer_schema_from_postgres(
|
|
186
|
+
postgres_conn,
|
|
187
|
+
schema_name="public", # PostgreSQL schema name
|
|
188
|
+
db_flavor=DBFlavor.ARANGO # Target graph database flavor
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
# Close PostgreSQL connection
|
|
192
|
+
postgres_conn.close()
|
|
193
|
+
|
|
194
|
+
# Use the inferred schema with Caster
|
|
195
|
+
caster = Caster(schema)
|
|
196
|
+
# ... continue with ingestion
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Development
|
|
200
|
+
|
|
201
|
+
To install requirements
|
|
202
|
+
|
|
203
|
+
```shell
|
|
204
|
+
git clone git@github.com:growgraph/graflo.git && cd graflo
|
|
205
|
+
uv sync --dev
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### Tests
|
|
209
|
+
|
|
210
|
+
#### Test databases
|
|
211
|
+
Spin up Arango from [arango docker folder](./docker/arango) by
|
|
212
|
+
|
|
213
|
+
```shell
|
|
214
|
+
docker-compose --env-file .env up arango
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
Neo4j from [neo4j docker folder](./docker/neo4j) by
|
|
218
|
+
|
|
219
|
+
```shell
|
|
220
|
+
docker-compose --env-file .env up neo4j
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
and TigerGraph from [tigergraph docker folder](./docker/tigergraph) by
|
|
224
|
+
|
|
225
|
+
```shell
|
|
226
|
+
docker-compose --env-file .env up tigergraph
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
To run unit tests
|
|
230
|
+
|
|
231
|
+
```shell
|
|
232
|
+
pytest test
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Requirements
|
|
236
|
+
|
|
237
|
+
- Python 3.10+
|
|
238
|
+
- python-arango
|
|
239
|
+
- sqlalchemy>=2.0.0 (for PostgreSQL and SQL data sources)
|
|
240
|
+
|
|
241
|
+
## Contributing
|
|
242
|
+
|
|
243
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|