graflo 1.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of graflo might be problematic. Click here for more details.

Files changed (70) hide show
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1276 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +418 -0
  7. graflo/architecture/onto.py +376 -0
  8. graflo/architecture/onto_sql.py +54 -0
  9. graflo/architecture/resource.py +163 -0
  10. graflo/architecture/schema.py +135 -0
  11. graflo/architecture/transform.py +292 -0
  12. graflo/architecture/util.py +89 -0
  13. graflo/architecture/vertex.py +562 -0
  14. graflo/caster.py +736 -0
  15. graflo/cli/__init__.py +14 -0
  16. graflo/cli/ingest.py +203 -0
  17. graflo/cli/manage_dbs.py +197 -0
  18. graflo/cli/plot_schema.py +132 -0
  19. graflo/cli/xml2json.py +93 -0
  20. graflo/data_source/__init__.py +48 -0
  21. graflo/data_source/api.py +339 -0
  22. graflo/data_source/base.py +95 -0
  23. graflo/data_source/factory.py +304 -0
  24. graflo/data_source/file.py +148 -0
  25. graflo/data_source/memory.py +70 -0
  26. graflo/data_source/registry.py +82 -0
  27. graflo/data_source/sql.py +183 -0
  28. graflo/db/__init__.py +44 -0
  29. graflo/db/arango/__init__.py +22 -0
  30. graflo/db/arango/conn.py +1025 -0
  31. graflo/db/arango/query.py +180 -0
  32. graflo/db/arango/util.py +88 -0
  33. graflo/db/conn.py +377 -0
  34. graflo/db/connection/__init__.py +6 -0
  35. graflo/db/connection/config_mapping.py +18 -0
  36. graflo/db/connection/onto.py +717 -0
  37. graflo/db/connection/wsgi.py +29 -0
  38. graflo/db/manager.py +119 -0
  39. graflo/db/neo4j/__init__.py +16 -0
  40. graflo/db/neo4j/conn.py +639 -0
  41. graflo/db/postgres/__init__.py +37 -0
  42. graflo/db/postgres/conn.py +948 -0
  43. graflo/db/postgres/fuzzy_matcher.py +281 -0
  44. graflo/db/postgres/heuristics.py +133 -0
  45. graflo/db/postgres/inference_utils.py +428 -0
  46. graflo/db/postgres/resource_mapping.py +273 -0
  47. graflo/db/postgres/schema_inference.py +372 -0
  48. graflo/db/postgres/types.py +148 -0
  49. graflo/db/postgres/util.py +87 -0
  50. graflo/db/tigergraph/__init__.py +9 -0
  51. graflo/db/tigergraph/conn.py +2365 -0
  52. graflo/db/tigergraph/onto.py +26 -0
  53. graflo/db/util.py +49 -0
  54. graflo/filter/__init__.py +21 -0
  55. graflo/filter/onto.py +525 -0
  56. graflo/logging.conf +22 -0
  57. graflo/onto.py +312 -0
  58. graflo/plot/__init__.py +17 -0
  59. graflo/plot/plotter.py +616 -0
  60. graflo/util/__init__.py +23 -0
  61. graflo/util/chunker.py +807 -0
  62. graflo/util/merge.py +150 -0
  63. graflo/util/misc.py +37 -0
  64. graflo/util/onto.py +422 -0
  65. graflo/util/transform.py +454 -0
  66. graflo-1.3.7.dist-info/METADATA +243 -0
  67. graflo-1.3.7.dist-info/RECORD +70 -0
  68. graflo-1.3.7.dist-info/WHEEL +4 -0
  69. graflo-1.3.7.dist-info/entry_points.txt +5 -0
  70. graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
@@ -0,0 +1,428 @@
1
+ """Inference utilities for PostgreSQL schema analysis.
2
+
3
+ This module provides utility functions for inferring relationships and patterns
4
+ from PostgreSQL table and column names using heuristics and fuzzy matching.
5
+ """
6
+
7
+ from typing import Any
8
+
9
+ from .fuzzy_matcher import FuzzyMatchCache, FuzzyMatcher
10
+
11
+
12
def fuzzy_match_fragment(
    fragment: str, vertex_names: list[str], threshold: float = 0.6
) -> str | None:
    """Return the vertex name that best matches ``fragment``, if any.

    Backward-compatible convenience wrapper: it builds a one-off
    ``FuzzyMatcher`` over ``vertex_names`` and keeps only the first element
    of the pair returned by ``FuzzyMatcher.match``.

    Args:
        fragment: Name fragment to look up
        vertex_names: Candidate vertex table names
        threshold: Similarity threshold passed to ``FuzzyMatcher`` (0.0 to 1.0)

    Returns:
        Best matching vertex name or None if no match above threshold
    """
    best_name, _unused = FuzzyMatcher(vertex_names, threshold).match(fragment)
    return best_name
30
+
31
+
32
def detect_separator(text: str) -> str:
    """Pick the separator character that occurs most often in ``text``.

    The candidates are ``_``, ``-`` and ``.``, examined in that order; on a
    tie the earlier candidate wins.

    Args:
        text: Text to analyze

    Returns:
        The most frequent candidate, or '_' when none of them occurs
    """
    winner, winner_count = "_", 0  # '_' doubles as the default
    for candidate in ("_", "-", "."):
        occurrences = text.count(candidate)
        # Strict comparison keeps the earliest candidate on ties.
        if occurrences > winner_count:
            winner, winner_count = candidate, occurrences
    return winner
48
+
49
+
50
def split_by_separator(text: str, separator: str) -> list[str]:
    """Break ``text`` on ``separator`` and drop the empty pieces.

    Leading, trailing and repeated separators therefore never produce
    empty fragments.

    Args:
        text: Text to split
        separator: Separator character

    Returns:
        List of non-empty fragments
    """
    # filter(None, ...) discards the empty strings left by adjacent separators.
    return list(filter(None, text.split(separator)))
63
+
64
+
65
def infer_edge_vertices_from_table_name(
    table_name: str,
    pk_columns: list[str],
    fk_columns: list[dict[str, Any]],
    vertex_table_names: list[str] | None = None,
    match_cache: FuzzyMatchCache | None = None,
) -> tuple[str | None, str | None, str | None]:
    """Infer source and target vertex names from table name and structure.

    Uses fuzzy matching to identify vertex names in table name fragments and key names.
    Handles patterns like:
    - rel_cluster_containment_host -> cluster, host, containment
    - rel_cluster_containment_cluster_2 -> cluster, cluster, containment (self-reference)
    - user_follows_user -> user, user, follows (self-reference)
    - product_category_mapping -> product, category, mapping

    Endpoint selection, in decreasing priority:
    1. Foreign-key ``references_table`` values (first two; a single one is
       treated as a self-reference).
    2. Vertices matched from key column fragments.
    3. Vertices matched from table name fragments.

    The relation name is the highest-scoring table name fragment that does
    not overlap (by substring, case-insensitively) with either endpoint name.

    Args:
        table_name: Name of the table
        pk_columns: List of primary key column names
        fk_columns: List of foreign key dictionaries with 'column' and 'references_table' keys
        vertex_table_names: Optional list of known vertex table names for fuzzy matching
        match_cache: Optional pre-computed fuzzy match cache for better performance

    Returns:
        Tuple of (source_table, target_table, relation_name) or (None, None, None) if cannot infer
    """
    if vertex_table_names is None:
        vertex_table_names = []

    # Use cache if provided, otherwise create a temporary one
    if match_cache is None:
        match_cache = FuzzyMatchCache(vertex_table_names)

    # Steps 1-2: the separator is detected on the table name only and then
    # reused for the key columns as well.
    separator = detect_separator(table_name)
    table_fragments = split_by_separator(table_name, separator)

    # Step 3: collect key fragments, PK columns first, keeping first-seen order.
    # `or ""` tolerates FK entries whose 'column' is missing or None.
    key_columns = [*pk_columns, *(fk.get("column") or "" for fk in fk_columns)]
    key_fragments = list(
        dict.fromkeys(
            fragment
            for column in key_columns
            for fragment in split_by_separator(column, separator)
        )
    )

    # Step 4: match table name fragments to vertices.
    # Source is searched from the left, target from the right; the target
    # search never goes back past the source fragment.
    matched_vertices: list[str] = []  # ordered: source, target, then key matches
    seen_vertices: set[str] = set()
    source_match_idx: int | None = None

    for idx, fragment in enumerate(table_fragments):
        matched = match_cache.get_match(fragment)
        if matched:
            source_match_idx = idx
            matched_vertices.append(matched)
            seen_vertices.add(matched)
            break  # Found source, stop searching left

    stop_idx = source_match_idx if source_match_idx is not None else -1
    for idx in range(len(table_fragments) - 1, stop_idx, -1):
        matched = match_cache.get_match(table_fragments[idx])
        # A fragment resolving to the source vertex again is not a new target;
        # self-references are handled when the endpoints are chosen below.
        if matched and matched not in seen_vertices:
            matched_vertices.append(matched)
            seen_vertices.add(matched)
            break  # Found target, stop searching right

    # Match key fragments; these are tracked separately because they take
    # priority over table name matches.
    key_matched_vertices: list[str] = []
    for fragment in key_fragments:
        matched = match_cache.get_match(fragment)
        if not matched:
            continue
        if matched not in seen_vertices:
            matched_vertices.append(matched)
            seen_vertices.add(matched)
        if matched not in key_matched_vertices:
            key_matched_vertices.append(matched)

    # Step 5: referenced tables of the foreign keys (most reliable signal)
    fk_vertex_names = [
        fk["references_table"] for fk in fk_columns if fk.get("references_table")
    ]

    # Step 6: choose endpoints. Every branch assigns both endpoints (or
    # neither), so no separate "fill in the missing one" pass is required.
    source_table: str | None = None
    target_table: str | None = None
    if len(fk_vertex_names) >= 2:
        source_table, target_table = fk_vertex_names[0], fk_vertex_names[1]
    elif len(fk_vertex_names) == 1:
        # Self-reference case
        source_table = target_table = fk_vertex_names[0]
    elif len(key_matched_vertices) >= 2:
        source_table, target_table = key_matched_vertices[0], key_matched_vertices[1]
    elif len(key_matched_vertices) == 1:
        # Key vertex is the source; target is the first other matched vertex,
        # falling back to a self-reference.
        source_table = key_matched_vertices[0]
        target_table = next(
            (vertex for vertex in matched_vertices if vertex != source_table),
            source_table,
        )
    elif len(matched_vertices) >= 2:
        source_table, target_table = matched_vertices[0], matched_vertices[1]
    elif len(matched_vertices) == 1:
        # Self-reference case
        source_table = target_table = matched_vertices[0]

    # Step 7: derive the relation from the fragments that belong to neither
    # endpoint. The relation may appear before, between, or after them.
    relation_name: str | None = None
    if source_table and target_table:
        endpoint_names = (source_table.lower(), target_table.lower())

        def _is_endpoint_fragment(fragment_lower: str) -> bool:
            # Substring overlap in either direction marks the fragment as
            # part of an endpoint name (equality is a special case of it).
            return any(
                name in fragment_lower or fragment_lower in name
                for name in endpoint_names
            )

        def _score(candidate: tuple[int, int, str]) -> int:
            fragment_length, position_idx, _ = candidate
            # Fragments of 3+ characters get a bonus of 5 per position to the
            # right; shorter ones are scored by length alone.
            if fragment_length >= 3:
                return fragment_length + position_idx * 5
            return fragment_length

        relation_candidates = [
            (len(fragment), idx, fragment)
            for idx, fragment in enumerate(table_fragments)
            if not _is_endpoint_fragment(fragment.lower())
        ]
        if relation_candidates:
            # max() keeps the first candidate on equal scores.
            _, _, relation_name = max(relation_candidates, key=_score)

    return (source_table, target_table, relation_name)
337
+
338
+
339
def infer_vertex_from_column_name(
    column_name: str,
    vertex_table_names: list[str] | None = None,
    match_cache: FuzzyMatchCache | None = None,
) -> str | None:
    """Guess which vertex table a column refers to, based on its name.

    Lookup order (first hit wins):
    1. The whole column name, via the fuzzy match cache.
    2. Each fragment of the name that is not a common key suffix.
    3. The name with a trailing common-suffix fragment removed.
    4. A case-insensitive comparison of the suffix-stripped name against
       each entry of ``vertex_table_names``.

    Typical inputs are ``user_id`` -> ``user``, ``product_id`` -> ``product``
    and ``customer_fk`` -> ``customer``.

    Args:
        column_name: Name of the column
        vertex_table_names: Optional list of known vertex table names for fuzzy matching
        match_cache: Optional pre-computed fuzzy match cache for better performance

    Returns:
        Inferred vertex table name or None if cannot infer
    """
    known_vertices = [] if vertex_table_names is None else vertex_table_names
    # Build a throwaway cache only when the caller did not supply one.
    cache = FuzzyMatchCache(known_vertices) if match_cache is None else match_cache

    if not column_name:
        return None

    separator = detect_separator(column_name)
    fragments = split_by_separator(column_name, separator)
    if not fragments:
        return None

    # Fragments that usually mark a key column rather than name a vertex.
    common_suffixes = {"id", "fk", "key", "pk", "ref", "reference"}

    # 1. Whole column name
    whole_match = cache.get_match(column_name)
    if whole_match:
        return whole_match

    # 2. Individual fragments, ignoring the key-marker ones
    for piece in fragments:
        if piece.lower() in common_suffixes:
            continue
        piece_match = cache.get_match(piece)
        if piece_match:
            return piece_match

    # 3. Everything before a trailing key-marker fragment
    if len(fragments) > 1 and fragments[-1].lower() in common_suffixes:
        stem_match = cache.get_match(separator.join(fragments[:-1]))
        if stem_match:
            return stem_match

    # 4. Last resort: strip a key suffix (with or without a leading '_')
    #    and compare the remainder to each vertex name, case-insensitively.
    column_lower = column_name.lower()
    for vertex_name in known_vertices:
        vertex_lower = vertex_name.lower()
        if vertex_lower not in column_lower:
            continue
        for suffix in common_suffixes:
            underscored = f"_{suffix}"
            if column_lower.endswith(underscored):
                base = column_lower[: -len(underscored)]
            elif column_lower.endswith(suffix):
                base = column_lower[: -len(suffix)]
            else:
                continue
            if base == vertex_lower:
                return vertex_name

    return None
@@ -0,0 +1,273 @@
1
+ """Resource mapping from PostgreSQL tables to graflo Resources.
2
+
3
+ This module provides functionality to map PostgreSQL tables (both vertex and edge tables)
4
+ to graflo Resource objects that can be used for data ingestion.
5
+ """
6
+
7
+ import logging
8
+
9
+ from graflo.architecture.edge import EdgeConfig
10
+ from graflo.architecture.resource import Resource
11
+ from graflo.architecture.vertex import VertexConfig
12
+
13
+ from .conn import EdgeTableInfo, SchemaIntrospectionResult
14
+ from .fuzzy_matcher import FuzzyMatchCache
15
+ from .inference_utils import (
16
+ detect_separator,
17
+ split_by_separator,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class PostgresResourceMapper:
    """Maps PostgreSQL tables to graflo Resources.

    This class creates Resource objects that map PostgreSQL tables to graph vertices
    and edges, enabling ingestion of relational data into graph databases.
    """

    def create_vertex_resource(self, table_name: str, vertex_name: str) -> Resource:
        """Create a Resource for a vertex table.

        Args:
            table_name: Name of the PostgreSQL table
            vertex_name: Name of the vertex type (typically same as table_name)

        Returns:
            Resource: Resource configured to ingest vertex data
        """
        # Create apply list with VertexActor
        # The actor wrapper will interpret {"vertex": vertex_name} as VertexActor
        apply = [{"vertex": vertex_name}]

        resource = Resource(
            resource_name=table_name,
            apply=apply,
        )

        logger.debug(
            f"Created vertex resource '{table_name}' for vertex '{vertex_name}'"
        )

        return resource

    def create_edge_resource(
        self,
        edge_table_info: EdgeTableInfo,
        vertex_config: VertexConfig,
        match_cache: FuzzyMatchCache | None = None,
    ) -> Resource:
        """Create a Resource for an edge table.

        The resource's apply list holds up to two mappings, one per endpoint:
        each maps the edge table's foreign-key column onto the primary-key
        field of the referenced vertex. A mapping is omitted when the
        corresponding column is empty.

        Args:
            edge_table_info: Edge table information from introspection
            vertex_config: Vertex configuration for source/target validation
            match_cache: Optional fuzzy match cache for better performance

        Returns:
            Resource: Resource configured to ingest edge data

        Raises:
            ValueError: If the source or target vertex is not in ``vertex_config``
        """
        table_name = edge_table_info.name
        source_table = edge_table_info.source_table
        target_table = edge_table_info.target_table
        source_column = edge_table_info.source_column
        target_column = edge_table_info.target_column
        relation = edge_table_info.relation

        # Verify source and target vertices exist
        if source_table not in vertex_config.vertex_set:
            raise ValueError(
                f"Source vertex '{source_table}' for edge table '{table_name}' "
                f"not found in vertex config"
            )

        if target_table not in vertex_config.vertex_set:
            raise ValueError(
                f"Target vertex '{target_table}' for edge table '{table_name}' "
                f"not found in vertex config"
            )

        # Get primary key fields for source and target vertices
        source_vertex_obj = vertex_config._vertices_map[source_table]
        target_vertex_obj = vertex_config._vertices_map[target_table]

        # Get the primary key field(s) from the first index (primary key)
        source_pk_fields = (
            source_vertex_obj.indexes[0].fields if source_vertex_obj.indexes else []
        )
        target_pk_fields = (
            target_vertex_obj.indexes[0].fields if target_vertex_obj.indexes else []
        )

        # Use heuristics to infer PK field names from column names
        # This handles cases like "bla_user" -> "user" vertex -> use "id" or matched field
        vertex_names = list(vertex_config.vertex_set)
        source_pk_field = self._infer_pk_field_from_column(
            source_column, source_table, source_pk_fields, vertex_names, match_cache
        )
        target_pk_field = self._infer_pk_field_from_column(
            target_column, target_table, target_pk_fields, vertex_names, match_cache
        )

        # Each mapping names, under "target_vertex", the vertex type it applies
        # to. Stating it explicitly avoids attribute collisions between
        # different vertex types.
        apply = []

        # First mapping: map source foreign key column to source vertex's primary key field
        if source_column:
            apply.append(
                {
                    "target_vertex": source_table,
                    "map": {source_column: source_pk_field},
                }
            )

        # Second mapping: map target foreign key column to target vertex's primary key field
        if target_column:
            apply.append(
                {
                    "target_vertex": target_table,
                    "map": {target_column: target_pk_field},
                }
            )

        resource = Resource(
            resource_name=table_name,
            apply=apply,
        )

        relation_info = f" with relation '{relation}'" if relation else ""
        logger.debug(
            f"Created edge resource '{table_name}' from {source_table} to {target_table}"
            f"{relation_info} "
            f"(source_col: {source_column} -> {source_pk_field}, "
            f"target_col: {target_column} -> {target_pk_field})"
        )

        return resource

    @staticmethod
    def _infer_pk_field_from_column(
        column_name: str,
        vertex_name: str,
        pk_fields: list[str],
        vertex_names: list[str],
        match_cache: FuzzyMatchCache | None = None,
    ) -> str:
        """Infer primary key field name from column name using heuristics.

        The column name is split into fragments and each fragment is fuzzy
        matched against the vertex names (e.g. "user_id", "bla_user",
        "user_id_2" or "source_user_id" for a "user" vertex).

        Note that the returned field does not depend on the outcome of the
        matching: it is always the first entry of ``pk_fields``, or "id" when
        ``pk_fields`` is empty. The matching only decides whether the
        "using 'id' as default" debug message is emitted.

        Args:
            column_name: Name of the column (e.g., "user_id", "bla_user", "bla_user_2").
                An empty or None value is treated as having no fragments.
            vertex_name: Name of the target vertex (already known from edge table info)
            pk_fields: List of primary key field names for the vertex
            vertex_names: List of all vertex names for fuzzy matching
            match_cache: Optional fuzzy match cache for better performance

        Returns:
            Primary key field name (defaults to first PK field or "id" if no match)
        """
        # The caller guards its mappings with `if source_column:` only after
        # calling this helper, so an absent column must not crash here.
        fragments: list[str] = []
        if column_name:
            separator = detect_separator(column_name)
            fragments = split_by_separator(column_name, separator)

        # Fallback matcher, built lazily and at most once per call.
        # NOTE(review): assumes FuzzyMatcher.match gives the same answer on a
        # reused instance as on a fresh one - confirm against fuzzy_matcher.py.
        matcher = None

        # Try to find a fragment that matches the target vertex name
        # This confirms that the column is indeed related to this vertex
        for fragment in fragments:
            if match_cache:
                matched_vertex = match_cache.get_match(fragment)
            else:
                if matcher is None:
                    from .fuzzy_matcher import FuzzyMatcher

                    matcher = FuzzyMatcher(vertex_names)
                matched_vertex, _ = matcher.match(fragment)

            # If we found a match to our target vertex, use its PK field
            if matched_vertex == vertex_name:
                # First PK field (most common case is single-column PK),
                # or "id" when no PK fields are available
                return pk_fields[0] if pk_fields else "id"

        # No fragment matched the target vertex, but we still have vertex_name
        # In this case, trust that vertex_name is correct and use its PK field
        if pk_fields:
            return pk_fields[0]

        # Last resort: use "id" as default
        # This is better than failing, but ideally pk_fields should always be available
        logger.debug(
            f"No PK fields found for vertex '{vertex_name}', using 'id' as default "
            f"for column '{column_name}'"
        )
        return "id"

    def map_tables_to_resources(
        self,
        introspection_result: SchemaIntrospectionResult,
        vertex_config: VertexConfig,
        edge_config: EdgeConfig,
    ) -> list[Resource]:
        """Map all PostgreSQL tables to Resources.

        Creates one Resource per vertex table and one per edge table. Edge
        tables whose Resource creation raises ``ValueError`` are skipped with
        a warning.

        Args:
            introspection_result: Result from PostgresConnection.introspect_schema()
            vertex_config: Inferred vertex configuration
            edge_config: Inferred edge configuration (not read by this method)

        Returns:
            list[Resource]: List of Resources for all tables
        """
        # Create fuzzy match cache once for all edge tables (significant performance improvement)
        vertex_names = list(vertex_config.vertex_set)
        match_cache = FuzzyMatchCache(vertex_names)

        # Map vertex tables to resources; the table name doubles as vertex name
        vertex_tables = introspection_result.vertex_tables
        resources = [
            self.create_vertex_resource(table_info.name, table_info.name)
            for table_info in vertex_tables
        ]

        # Map edge tables to resources
        edge_tables = introspection_result.edge_tables
        for edge_table_info in edge_tables:
            try:
                resource = self.create_edge_resource(
                    edge_table_info, vertex_config, match_cache
                )
            except ValueError as e:
                logger.warning(f"Skipping edge resource creation: {e}")
            else:
                resources.append(resource)

        logger.info(
            f"Mapped {len(vertex_tables)} vertex tables and {len(edge_tables)} edge tables "
            f"to {len(resources)} resources"
        )

        return resources