graflo-1.3.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of graflo might be problematic.
- graflo/README.md +18 -0
- graflo/__init__.py +70 -0
- graflo/architecture/__init__.py +38 -0
- graflo/architecture/actor.py +1276 -0
- graflo/architecture/actor_util.py +450 -0
- graflo/architecture/edge.py +418 -0
- graflo/architecture/onto.py +376 -0
- graflo/architecture/onto_sql.py +54 -0
- graflo/architecture/resource.py +163 -0
- graflo/architecture/schema.py +135 -0
- graflo/architecture/transform.py +292 -0
- graflo/architecture/util.py +89 -0
- graflo/architecture/vertex.py +562 -0
- graflo/caster.py +736 -0
- graflo/cli/__init__.py +14 -0
- graflo/cli/ingest.py +203 -0
- graflo/cli/manage_dbs.py +197 -0
- graflo/cli/plot_schema.py +132 -0
- graflo/cli/xml2json.py +93 -0
- graflo/data_source/__init__.py +48 -0
- graflo/data_source/api.py +339 -0
- graflo/data_source/base.py +95 -0
- graflo/data_source/factory.py +304 -0
- graflo/data_source/file.py +148 -0
- graflo/data_source/memory.py +70 -0
- graflo/data_source/registry.py +82 -0
- graflo/data_source/sql.py +183 -0
- graflo/db/__init__.py +44 -0
- graflo/db/arango/__init__.py +22 -0
- graflo/db/arango/conn.py +1025 -0
- graflo/db/arango/query.py +180 -0
- graflo/db/arango/util.py +88 -0
- graflo/db/conn.py +377 -0
- graflo/db/connection/__init__.py +6 -0
- graflo/db/connection/config_mapping.py +18 -0
- graflo/db/connection/onto.py +717 -0
- graflo/db/connection/wsgi.py +29 -0
- graflo/db/manager.py +119 -0
- graflo/db/neo4j/__init__.py +16 -0
- graflo/db/neo4j/conn.py +639 -0
- graflo/db/postgres/__init__.py +37 -0
- graflo/db/postgres/conn.py +948 -0
- graflo/db/postgres/fuzzy_matcher.py +281 -0
- graflo/db/postgres/heuristics.py +133 -0
- graflo/db/postgres/inference_utils.py +428 -0
- graflo/db/postgres/resource_mapping.py +273 -0
- graflo/db/postgres/schema_inference.py +372 -0
- graflo/db/postgres/types.py +148 -0
- graflo/db/postgres/util.py +87 -0
- graflo/db/tigergraph/__init__.py +9 -0
- graflo/db/tigergraph/conn.py +2365 -0
- graflo/db/tigergraph/onto.py +26 -0
- graflo/db/util.py +49 -0
- graflo/filter/__init__.py +21 -0
- graflo/filter/onto.py +525 -0
- graflo/logging.conf +22 -0
- graflo/onto.py +312 -0
- graflo/plot/__init__.py +17 -0
- graflo/plot/plotter.py +616 -0
- graflo/util/__init__.py +23 -0
- graflo/util/chunker.py +807 -0
- graflo/util/merge.py +150 -0
- graflo/util/misc.py +37 -0
- graflo/util/onto.py +422 -0
- graflo/util/transform.py +454 -0
- graflo-1.3.7.dist-info/METADATA +243 -0
- graflo-1.3.7.dist-info/RECORD +70 -0
- graflo-1.3.7.dist-info/WHEEL +4 -0
- graflo-1.3.7.dist-info/entry_points.txt +5 -0
- graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
graflo/db/postgres/inference_utils.py
@@ -0,0 +1,428 @@
+"""Inference utilities for PostgreSQL schema analysis.
+
+This module provides utility functions for inferring relationships and patterns
+from PostgreSQL table and column names using heuristics and fuzzy matching.
+"""
+
+from typing import Any
+
+from .fuzzy_matcher import FuzzyMatchCache, FuzzyMatcher
+
+
+def fuzzy_match_fragment(
+    fragment: str, vertex_names: list[str], threshold: float = 0.6
+) -> str | None:
+    """Fuzzy match a fragment to vertex names.
+
+    Backward-compatible wrapper function that uses the improved FuzzyMatcher.
+
+    Args:
+        fragment: Fragment to match
+        vertex_names: List of vertex table names to match against
+        threshold: Similarity threshold (0.0 to 1.0)
+
+    Returns:
+        Best matching vertex name or None if no match above threshold
+    """
+    matcher = FuzzyMatcher(vertex_names, threshold)
+    match, _ = matcher.match(fragment)
+    return match
+
+
+def detect_separator(text: str) -> str:
+    """Detect the most common separator character in a text.
+
+    Args:
+        text: Text to analyze
+
+    Returns:
+        Most common separator character, defaults to '_'
+    """
+    # Common separators
+    separators = ["_", "-", "."]
+    counts = {sep: text.count(sep) for sep in separators}
+
+    if max(counts.values()) > 0:
+        return max(counts, key=counts.get)
+    return "_"  # Default separator
+
+
+def split_by_separator(text: str, separator: str) -> list[str]:
+    """Split text by separator, handling multiple consecutive separators.
+
+    Args:
+        text: Text to split
+        separator: Separator character
+
+    Returns:
+        List of non-empty fragments
+    """
+    # Split and filter out empty strings
+    parts = [p for p in text.split(separator) if p]
+    return parts
+
+
+def infer_edge_vertices_from_table_name(
+    table_name: str,
+    pk_columns: list[str],
+    fk_columns: list[dict[str, Any]],
+    vertex_table_names: list[str] | None = None,
+    match_cache: FuzzyMatchCache | None = None,
+) -> tuple[str | None, str | None, str | None]:
+    """Infer source and target vertex names from table name and structure.
+
+    Uses fuzzy matching to identify vertex names in table name fragments and key names.
+    Handles patterns like:
+    - rel_cluster_containment_host -> cluster, host, containment
+    - rel_cluster_containment_cluster_2 -> cluster, cluster, containment (self-reference)
+    - user_follows_user -> user, user, follows (self-reference)
+    - product_category_mapping -> product, category, mapping
+
+    Args:
+        table_name: Name of the table
+        pk_columns: List of primary key column names
+        fk_columns: List of foreign key dictionaries with 'column' and 'references_table' keys
+        vertex_table_names: Optional list of known vertex table names for fuzzy matching
+        match_cache: Optional pre-computed fuzzy match cache for better performance
+
+    Returns:
+        Tuple of (source_table, target_table, relation_name) or (None, None, None) if cannot infer
+    """
+    if vertex_table_names is None:
+        vertex_table_names = []
+
+    # Use cache if provided, otherwise create a temporary one
+    if match_cache is None:
+        match_cache = FuzzyMatchCache(vertex_table_names)
+
+    # Step 1: Detect separator
+    separator = detect_separator(table_name)
+
+    # Step 2: Split table name by separator
+    table_fragments = split_by_separator(table_name, separator)
+
+    # Initialize relation_name - will be set if we identify a relation fragment
+    relation_name = None
+
+    # Step 3: Extract fragments from keys (preserve order for PK columns)
+    key_fragments_list = []  # Preserve order
+    key_fragments_set = set()  # For deduplication
+
+    # Extract fragments from PK columns in order
+    for pk_col in pk_columns:
+        pk_fragments = split_by_separator(pk_col, separator)
+        for frag in pk_fragments:
+            if frag not in key_fragments_set:
+                key_fragments_list.append(frag)
+                key_fragments_set.add(frag)
+
+    # Extract fragments from FK columns
+    for fk in fk_columns:
+        fk_col = fk.get("column", "")
+        fk_fragments = split_by_separator(fk_col, separator)
+        for frag in fk_fragments:
+            if frag not in key_fragments_set:
+                key_fragments_list.append(frag)
+                key_fragments_set.add(frag)
+
+    # Step 4: Match table name fragments to vertices
+    # Strategy: match source from left, target from right
+    # Stop when we have 2 matches OR target_index > source_index + 1
+    source_match_idx: int | None = None
+    target_match_idx: int | None = None
+    source_vertex: str | None = None
+    target_vertex: str | None = None
+    matched_vertices_set = set()  # For deduplication
+    matched_fragment_indices = {}  # Track which fragment indices matched which vertices
+
+    # Match source starting from the left
+    for i, fragment in enumerate(table_fragments):
+        matched = match_cache.get_match(fragment)
+        if matched and matched not in matched_vertices_set:
+            source_match_idx = i
+            source_vertex = matched
+            matched_vertices_set.add(matched)
+            matched_fragment_indices[i] = matched
+            break  # Found source, stop searching left
+
+    # Match target starting from the right
+    for i in range(
+        len(table_fragments) - 1,
+        source_match_idx if source_match_idx is not None else -1,
+        -1,
+    ):
+        fragment = table_fragments[i]
+        matched = match_cache.get_match(fragment)
+        if matched and matched not in matched_vertices_set:
+            target_match_idx = i
+            target_vertex = matched
+            matched_vertices_set.add(matched)
+            matched_fragment_indices[i] = matched
+            break  # Found target, stop searching right
+
+    # Match key fragments to fill in missing vertices (keys are primary source of truth)
+    matched_vertices = []
+    key_matched_vertices = []  # Track vertices matched from keys (higher priority)
+
+    if source_vertex:
+        matched_vertices.append(source_vertex)
+    if target_vertex and target_vertex != source_vertex:
+        matched_vertices.append(target_vertex)
+
+    for fragment in key_fragments_list:
+        matched = match_cache.get_match(fragment)
+        if matched:
+            if matched not in matched_vertices_set:
+                matched_vertices.append(matched)
+                matched_vertices_set.add(matched)
+            # Track key-matched vertices separately for priority
+            if matched not in key_matched_vertices:
+                key_matched_vertices.append(matched)
+
+    # Step 5: Use foreign keys to confirm or infer vertices
+    fk_vertex_names = []
+    if fk_columns:
+        for fk in fk_columns:
+            ref_table = fk.get("references_table")
+            if ref_table:
+                fk_vertex_names.append(ref_table)
+
+    # Step 6: Form hypothesis
+    source_table = None
+    target_table = None
+
+    # Priority 1: Use FK references if available (most reliable)
+    if len(fk_vertex_names) >= 2:
+        source_table = fk_vertex_names[0]
+        target_table = fk_vertex_names[1]
+    elif len(fk_vertex_names) == 1:
+        # Self-reference case
+        source_table = fk_vertex_names[0]
+        target_table = fk_vertex_names[0]
+
+    # Priority 2: Use matched vertices from fuzzy matching
+    # Prefer vertices from keys (primary source of truth) over table matches
+    if not source_table or not target_table:
+        # If we have vertices from keys, prefer those
+        if len(key_matched_vertices) >= 2:
+            source_table = key_matched_vertices[0]
+            target_table = key_matched_vertices[1]
+        elif len(key_matched_vertices) == 1:
+            # Use key vertex for source, try to find target from all matched vertices
+            source_table = key_matched_vertices[0]
+            if len(matched_vertices) >= 2:
+                # Find target that's not the source
+                for v in matched_vertices:
+                    if v != source_table:
+                        target_table = v
+                        break
+                if not target_table:
+                    target_table = source_table  # Self-reference
+            else:
+                target_table = source_table  # Self-reference
+        elif len(matched_vertices) >= 2:
+            source_table = matched_vertices[0]
+            target_table = matched_vertices[1]
+        elif len(matched_vertices) == 1:
+            # Self-reference case
+            source_table = matched_vertices[0]
+            target_table = matched_vertices[0]
+
+    # Priority 3: Fill in missing vertex from remaining options
+    if source_table and not target_table:
+        # Try to find target from remaining fragments or keys
+        if fk_vertex_names and len(fk_vertex_names) > 1:
+            # Use second FK if available
+            target_table = fk_vertex_names[1]
+        elif matched_vertices and len(matched_vertices) > 1:
+            target_table = matched_vertices[1]
+        elif fk_vertex_names:
+            # Self-reference case
+            target_table = fk_vertex_names[0]
+        elif matched_vertices:
+            target_table = matched_vertices[0]
+
+    if target_table and not source_table:
+        # Try to find source from remaining fragments or keys
+        if fk_vertex_names:
+            source_table = fk_vertex_names[0]
+        elif matched_vertices:
+            source_table = matched_vertices[0]
+
+    # Step 7: Identify relation from table fragments (after we know source/target)
+    # Relation is derived from table name fragments that are neither source nor target
+    # Patterns: bla_SOURCE_<relation>_TARGET, bla_SOURCE_TARGET_<relation>,
+    # bla_<relation>_SOURCE_TARGET, and other combinations
+    # We allow relation to appear anywhere except as source/target fragments
+    if relation_name is None and source_table and target_table:
+        source_lower = source_table.lower()
+        target_lower = target_table.lower()
+
+        # Use the matched fragment indices if available (more precise)
+        # Otherwise, find by matching fragment text
+        source_idx = source_match_idx
+        target_idx = target_match_idx
+
+        # If we don't have indices from matching, find them by text matching
+        if source_idx is None or target_idx is None:
+            for idx, fragment in enumerate(table_fragments):
+                fragment_lower = fragment.lower()
+                if source_idx is None and (
+                    fragment_lower == source_lower
+                    or source_lower in fragment_lower
+                    or fragment_lower in source_lower
+                ):
+                    source_idx = idx
+                if target_idx is None and (
+                    fragment_lower == target_lower
+                    or target_lower in fragment_lower
+                    or fragment_lower in target_lower
+                ):
+                    target_idx = idx
+
+        relation_candidates = []
+
+        # Collect all fragments that are not source or target
+        # Allow relation to appear anywhere: before, between, or after source/target
+        for idx, fragment in enumerate(table_fragments):
+            fragment_lower = fragment.lower()
+
+            # Skip if it's a source or target fragment
+            if (
+                fragment_lower == source_lower
+                or source_lower in fragment_lower
+                or fragment_lower in source_lower
+                or fragment_lower == target_lower
+                or target_lower in fragment_lower
+                or fragment_lower in target_lower
+            ):
+                continue
+
+            # Include all non-source/target fragments as relation candidates
+            relation_candidates.append((len(fragment), idx, fragment))
+
+        # Select candidate using scoring system:
+        # - Score = fragment_length + (position_index * 5) if fragment_length >= 3
+        # - Score = fragment_length if fragment_length < 3
+        # - Prefer candidates further to the right and longer
+        if relation_candidates:
+
+            def score_candidate(candidate: tuple[int, int, str]) -> int:
+                fragment_length, position_idx, _ = candidate
+                if fragment_length >= 3:
+                    # Position bonus: each position to the right counts as 5 extra characters
+                    return fragment_length + (position_idx * 5)
+                else:
+                    # Fragments below 3 symbols don't get position bonus
+                    return fragment_length
+
+            _, _, relation_name = max(relation_candidates, key=score_candidate)
+        elif len(table_fragments) >= 2:
+            # Fallback: if we have 2+ fragments and one doesn't match source/target, it might be the relation
+            for fragment in table_fragments:
+                fragment_lower = fragment.lower()
+                # Use if it doesn't match source or target
+                if (
+                    fragment_lower != source_lower
+                    and source_lower not in fragment_lower
+                    and fragment_lower not in source_lower
+                    and fragment_lower != target_lower
+                    and target_lower not in fragment_lower
+                    and fragment_lower not in target_lower
+                ):
+                    relation_name = fragment
+                    break
+
+    return (source_table, target_table, relation_name)
+
+
+def infer_vertex_from_column_name(
+    column_name: str,
+    vertex_table_names: list[str] | None = None,
+    match_cache: FuzzyMatchCache | None = None,
+) -> str | None:
+    """Infer vertex table name from a column name using robust pattern matching.
+
+    Uses the same logic as infer_edge_vertices_from_table_name but focused on
+    extracting vertex names from column names. Handles patterns like:
+    - user_id -> user
+    - product_id -> product
+    - customer_fk -> customer
+    - source_vertex -> source_vertex (if matches)
+
+    Args:
+        column_name: Name of the column
+        vertex_table_names: Optional list of known vertex table names for fuzzy matching
+        match_cache: Optional pre-computed fuzzy match cache for better performance
+
+    Returns:
+        Inferred vertex table name or None if cannot infer
+    """
+    if vertex_table_names is None:
+        vertex_table_names = []
+
+    # Use cache if provided, otherwise create a temporary one
+    if match_cache is None:
+        match_cache = FuzzyMatchCache(vertex_table_names)
+
+    if not column_name:
+        return None
+
+    # Step 1: Detect separator
+    separator = detect_separator(column_name)
+
+    # Step 2: Split column name by separator
+    fragments = split_by_separator(column_name, separator)
+
+    if not fragments:
+        return None
+
+    # Step 3: Try to match fragments to vertex names
+    # Common suffixes to remove: id, fk, key, pk, ref
+    common_suffixes = {"id", "fk", "key", "pk", "ref", "reference"}
+
+    # Try matching full column name first
+    matched = match_cache.get_match(column_name)
+    if matched:
+        return matched
+
+    # Try matching fragments (excluding common suffixes)
+    for fragment in fragments:
+        fragment_lower = fragment.lower()
+        # Skip common suffixes
+        if fragment_lower in common_suffixes:
+            continue
+
+        matched = match_cache.get_match(fragment)
+        if matched:
+            return matched
+
+    # Step 4: If no match found, try removing common suffixes and matching again
+    # Remove last fragment if it's a common suffix
+    if len(fragments) > 1:
+        last_fragment = fragments[-1].lower()
+        if last_fragment in common_suffixes:
+            # Try matching the remaining fragments
+            remaining = separator.join(fragments[:-1])
+            matched = match_cache.get_match(remaining)
+            if matched:
+                return matched
+
+    # Step 5: As last resort, try exact match against vertex names (case-insensitive)
+    column_lower = column_name.lower()
+    for vertex_name in vertex_table_names:
+        vertex_lower = vertex_name.lower()
+        # Check if column name contains vertex name or vice versa
+        if vertex_lower in column_lower:
+            # Remove common suffixes from column name and check if it matches
+            for suffix in common_suffixes:
+                if column_lower.endswith(f"_{suffix}") or column_lower.endswith(suffix):
+                    base = (
+                        column_lower[: -len(f"_{suffix}")]
+                        if column_lower.endswith(f"_{suffix}")
+                        else column_lower[: -len(suffix)]
+                    )
+                    if base == vertex_lower:
+                        return vertex_name
+
+    return None
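The helpers above can be exercised directly. The sketch below is illustrative only and not part of the wheel: the vertex list, table name, and key columns are hypothetical, and the expected decomposition is the one the docstring gives for rel_cluster_containment_host, assuming the fuzzy matcher resolves the fragments to the supplied vertex names.

# Illustrative sketch (not part of the package): driving the inference helpers above.
# The vertex names, table name, and key columns below are hypothetical examples.
from graflo.db.postgres.inference_utils import (
    detect_separator,
    infer_edge_vertices_from_table_name,
    split_by_separator,
)

vertices = ["cluster", "host", "user"]

name = "rel_cluster_containment_host"
sep = detect_separator(name)           # "_"
parts = split_by_separator(name, sep)  # ["rel", "cluster", "containment", "host"]

source, target, relation = infer_edge_vertices_from_table_name(
    table_name=name,
    pk_columns=["cluster_id", "host_id"],
    fk_columns=[
        {"column": "cluster_id", "references_table": "cluster"},
        {"column": "host_id", "references_table": "host"},
    ],
    vertex_table_names=vertices,
)
# Per the docstring, this pattern should decompose to ("cluster", "host", "containment").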
graflo/db/postgres/resource_mapping.py
@@ -0,0 +1,273 @@
+"""Resource mapping from PostgreSQL tables to graflo Resources.
+
+This module provides functionality to map PostgreSQL tables (both vertex and edge tables)
+to graflo Resource objects that can be used for data ingestion.
+"""
+
+import logging
+
+from graflo.architecture.edge import EdgeConfig
+from graflo.architecture.resource import Resource
+from graflo.architecture.vertex import VertexConfig
+
+from .conn import EdgeTableInfo, SchemaIntrospectionResult
+from .fuzzy_matcher import FuzzyMatchCache
+from .inference_utils import (
+    detect_separator,
+    split_by_separator,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class PostgresResourceMapper:
+    """Maps PostgreSQL tables to graflo Resources.
+
+    This class creates Resource objects that map PostgreSQL tables to graph vertices
+    and edges, enabling ingestion of relational data into graph databases.
+    """
+
+    def create_vertex_resource(self, table_name: str, vertex_name: str) -> Resource:
+        """Create a Resource for a vertex table.
+
+        Args:
+            table_name: Name of the PostgreSQL table
+            vertex_name: Name of the vertex type (typically same as table_name)
+
+        Returns:
+            Resource: Resource configured to ingest vertex data
+        """
+        # Create apply list with VertexActor
+        # The actor wrapper will interpret {"vertex": vertex_name} as VertexActor
+        apply = [{"vertex": vertex_name}]
+
+        resource = Resource(
+            resource_name=table_name,
+            apply=apply,
+        )
+
+        logger.debug(
+            f"Created vertex resource '{table_name}' for vertex '{vertex_name}'"
+        )
+
+        return resource
+
+    def create_edge_resource(
+        self,
+        edge_table_info: EdgeTableInfo,
+        vertex_config: VertexConfig,
+        match_cache: FuzzyMatchCache | None = None,
+    ) -> Resource:
+        """Create a Resource for an edge table.
+
+        Args:
+            edge_table_info: Edge table information from introspection
+            vertex_config: Vertex configuration for source/target validation
+            match_cache: Optional fuzzy match cache for better performance
+
+        Returns:
+            Resource: Resource configured to ingest edge data
+        """
+        table_name = edge_table_info.name
+        source_table = edge_table_info.source_table
+        target_table = edge_table_info.target_table
+        source_column = edge_table_info.source_column
+        target_column = edge_table_info.target_column
+        relation = edge_table_info.relation
+
+        # Verify source and target vertices exist
+        if source_table not in vertex_config.vertex_set:
+            raise ValueError(
+                f"Source vertex '{source_table}' for edge table '{table_name}' "
+                f"not found in vertex config"
+            )
+
+        if target_table not in vertex_config.vertex_set:
+            raise ValueError(
+                f"Target vertex '{target_table}' for edge table '{table_name}' "
+                f"not found in vertex config"
+            )
+
+        # Get primary key fields for source and target vertices
+        source_vertex_obj = vertex_config._vertices_map[source_table]
+        target_vertex_obj = vertex_config._vertices_map[target_table]
+
+        # Get the primary key field(s) from the first index (primary key)
+        source_pk_fields = (
+            source_vertex_obj.indexes[0].fields if source_vertex_obj.indexes else []
+        )
+        target_pk_fields = (
+            target_vertex_obj.indexes[0].fields if target_vertex_obj.indexes else []
+        )
+
+        # Use heuristics to infer PK field names from column names
+        # This handles cases like "bla_user" -> "user" vertex -> use "id" or matched field
+        vertex_names = list(vertex_config.vertex_set)
+        source_pk_field = self._infer_pk_field_from_column(
+            source_column, source_table, source_pk_fields, vertex_names, match_cache
+        )
+        target_pk_field = self._infer_pk_field_from_column(
+            target_column, target_table, target_pk_fields, vertex_names, match_cache
+        )
+
+        # Create apply list using source_vertex and target_vertex pattern
+        # This pattern explicitly specifies which vertex type each mapping targets,
+        # avoiding attribute collisions between different vertex types
+        apply = []
+
+        # First mapping: map source foreign key column to source vertex's primary key field
+        if source_column:
+            source_map_config = {
+                "target_vertex": source_table,
+                "map": {source_column: source_pk_field},
+            }
+            apply.append(source_map_config)
+
+        # Second mapping: map target foreign key column to target vertex's primary key field
+        if target_column:
+            target_map_config = {
+                "target_vertex": target_table,
+                "map": {target_column: target_pk_field},
+            }
+            apply.append(target_map_config)
+
+        resource = Resource(
+            resource_name=table_name,
+            apply=apply,
+        )
+
+        relation_info = f" with relation '{relation}'" if relation else ""
+        logger.debug(
+            f"Created edge resource '{table_name}' from {source_table} to {target_table}"
+            f"{relation_info} "
+            f"(source_col: {source_column} -> {source_pk_field}, "
+            f"target_col: {target_column} -> {target_pk_field})"
+        )
+
+        return resource
+
+    @staticmethod
+    def _infer_pk_field_from_column(
+        column_name: str,
+        vertex_name: str,
+        pk_fields: list[str],
+        vertex_names: list[str],
+        match_cache: FuzzyMatchCache | None = None,
+    ) -> str:
+        """Infer primary key field name from column name using heuristics.
+
+        Uses fuzzy matching to identify vertex name fragments in column names,
+        then matches to the appropriate PK field. Handles cases like:
+        - "user_id" -> "user" vertex -> use first PK field (e.g., "id")
+        - "bla_user" -> "user" vertex -> use first PK field
+        - "user_id_2" -> "user" vertex -> use first PK field
+        - "source_user_id" -> "user" vertex -> use first PK field
+        - "bla_user" and "bla_user_2" -> both map to "user" vertex PK field
+
+        The heuristic works by:
+        1. Splitting the column name into fragments
+        2. Fuzzy matching fragments to vertex names
+        3. If a fragment matches the target vertex_name, use the vertex's PK field
+        4. Otherwise, fall back to first PK field or "id"
+
+        Args:
+            column_name: Name of the column (e.g., "user_id", "bla_user", "bla_user_2")
+            vertex_name: Name of the target vertex (already known from edge table info)
+            pk_fields: List of primary key field names for the vertex
+            vertex_names: List of all vertex names for fuzzy matching
+            match_cache: Optional fuzzy match cache for better performance
+
+        Returns:
+            Primary key field name (defaults to first PK field or "id" if no match)
+        """
+        # Split column name into fragments
+        separator = detect_separator(column_name)
+        fragments = split_by_separator(column_name, separator)
+
+        # Try to find a fragment that matches the target vertex name
+        # This confirms that the column is indeed related to this vertex
+        for fragment in fragments:
+            # Fuzzy match fragment to vertex names
+            if match_cache:
+                matched_vertex = match_cache.get_match(fragment)
+            else:
+                # Fallback: create temporary matcher if cache not provided
+                from .fuzzy_matcher import FuzzyMatcher
+
+                matcher = FuzzyMatcher(vertex_names)
+                matched_vertex, _ = matcher.match(fragment)
+
+            # If we found a match to our target vertex, use its PK field
+            if matched_vertex == vertex_name:
+                if pk_fields:
+                    # Use the first PK field (most common case is single-column PK)
+                    return pk_fields[0]
+                else:
+                    # No PK fields available, use "id" as default
+                    return "id"
+
+        # No fragment matched the target vertex, but we still have vertex_name
+        # This might happen if the column name doesn't contain the vertex name fragment
+        # In this case, trust that vertex_name is correct and use its PK field
+        if pk_fields:
+            return pk_fields[0]
+
+        # Last resort: use "id" as default
+        # This is better than failing, but ideally pk_fields should always be available
+        logger.debug(
+            f"No PK fields found for vertex '{vertex_name}', using 'id' as default "
+            f"for column '{column_name}'"
+        )
+        return "id"
+
+    def map_tables_to_resources(
+        self,
+        introspection_result: SchemaIntrospectionResult,
+        vertex_config: VertexConfig,
+        edge_config: EdgeConfig,
+    ) -> list[Resource]:
+        """Map all PostgreSQL tables to Resources.
+
+        Creates Resources for both vertex and edge tables, enabling ingestion
+        of the entire database schema.
+
+        Args:
+            introspection_result: Result from PostgresConnection.introspect_schema()
+            vertex_config: Inferred vertex configuration
+            edge_config: Inferred edge configuration
+
+        Returns:
+            list[Resource]: List of Resources for all tables
+        """
+        resources = []
+
+        # Create fuzzy match cache once for all edge tables (significant performance improvement)
+        vertex_names = list(vertex_config.vertex_set)
+        match_cache = FuzzyMatchCache(vertex_names)
+
+        # Map vertex tables to resources
+        vertex_tables = introspection_result.vertex_tables
+        for table_info in vertex_tables:
+            table_name = table_info.name
+            vertex_name = table_name  # Use table name as vertex name
+            resource = self.create_vertex_resource(table_name, vertex_name)
+            resources.append(resource)
+
+        # Map edge tables to resources
+        edge_tables = introspection_result.edge_tables
+        for edge_table_info in edge_tables:
+            try:
+                resource = self.create_edge_resource(
+                    edge_table_info, vertex_config, match_cache
+                )
+                resources.append(resource)
+            except ValueError as e:
+                logger.warning(f"Skipping edge resource creation: {e}")
+                continue
+
+        logger.info(
+            f"Mapped {len(vertex_tables)} vertex tables and {len(edge_tables)} edge tables "
+            f"to {len(resources)} resources"
+        )
+
+        return resources
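A minimal sketch of how PostgresResourceMapper might be wired into an ingestion flow; it is not part of the wheel. The "users" table name and the build_resources helper are hypothetical, and obtaining the introspection result and vertex/edge configs (per the docstrings, from PostgresConnection.introspect_schema() and schema inference) is outside the code shown.

# Illustrative sketch (not part of the package): using PostgresResourceMapper.
from graflo.architecture.edge import EdgeConfig
from graflo.architecture.resource import Resource
from graflo.architecture.vertex import VertexConfig
from graflo.db.postgres.conn import SchemaIntrospectionResult
from graflo.db.postgres.resource_mapping import PostgresResourceMapper

mapper = PostgresResourceMapper()

# A vertex table maps 1:1 onto a vertex Resource ("users" is a hypothetical table).
users_resource = mapper.create_vertex_resource(table_name="users", vertex_name="users")


def build_resources(
    introspection: SchemaIntrospectionResult,
    vertex_config: VertexConfig,
    edge_config: EdgeConfig,
) -> list[Resource]:
    # Hypothetical helper: maps every vertex table and every mappable edge table.
    # Edge tables whose source/target vertices are missing from vertex_config are
    # skipped with a warning (see map_tables_to_resources above).
    return mapper.map_tables_to_resources(introspection, vertex_config, edge_config)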