graflo-1.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of graflo might be problematic.

Files changed (70)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1276 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +418 -0
  7. graflo/architecture/onto.py +376 -0
  8. graflo/architecture/onto_sql.py +54 -0
  9. graflo/architecture/resource.py +163 -0
  10. graflo/architecture/schema.py +135 -0
  11. graflo/architecture/transform.py +292 -0
  12. graflo/architecture/util.py +89 -0
  13. graflo/architecture/vertex.py +562 -0
  14. graflo/caster.py +736 -0
  15. graflo/cli/__init__.py +14 -0
  16. graflo/cli/ingest.py +203 -0
  17. graflo/cli/manage_dbs.py +197 -0
  18. graflo/cli/plot_schema.py +132 -0
  19. graflo/cli/xml2json.py +93 -0
  20. graflo/data_source/__init__.py +48 -0
  21. graflo/data_source/api.py +339 -0
  22. graflo/data_source/base.py +95 -0
  23. graflo/data_source/factory.py +304 -0
  24. graflo/data_source/file.py +148 -0
  25. graflo/data_source/memory.py +70 -0
  26. graflo/data_source/registry.py +82 -0
  27. graflo/data_source/sql.py +183 -0
  28. graflo/db/__init__.py +44 -0
  29. graflo/db/arango/__init__.py +22 -0
  30. graflo/db/arango/conn.py +1025 -0
  31. graflo/db/arango/query.py +180 -0
  32. graflo/db/arango/util.py +88 -0
  33. graflo/db/conn.py +377 -0
  34. graflo/db/connection/__init__.py +6 -0
  35. graflo/db/connection/config_mapping.py +18 -0
  36. graflo/db/connection/onto.py +717 -0
  37. graflo/db/connection/wsgi.py +29 -0
  38. graflo/db/manager.py +119 -0
  39. graflo/db/neo4j/__init__.py +16 -0
  40. graflo/db/neo4j/conn.py +639 -0
  41. graflo/db/postgres/__init__.py +37 -0
  42. graflo/db/postgres/conn.py +948 -0
  43. graflo/db/postgres/fuzzy_matcher.py +281 -0
  44. graflo/db/postgres/heuristics.py +133 -0
  45. graflo/db/postgres/inference_utils.py +428 -0
  46. graflo/db/postgres/resource_mapping.py +273 -0
  47. graflo/db/postgres/schema_inference.py +372 -0
  48. graflo/db/postgres/types.py +148 -0
  49. graflo/db/postgres/util.py +87 -0
  50. graflo/db/tigergraph/__init__.py +9 -0
  51. graflo/db/tigergraph/conn.py +2365 -0
  52. graflo/db/tigergraph/onto.py +26 -0
  53. graflo/db/util.py +49 -0
  54. graflo/filter/__init__.py +21 -0
  55. graflo/filter/onto.py +525 -0
  56. graflo/logging.conf +22 -0
  57. graflo/onto.py +312 -0
  58. graflo/plot/__init__.py +17 -0
  59. graflo/plot/plotter.py +616 -0
  60. graflo/util/__init__.py +23 -0
  61. graflo/util/chunker.py +807 -0
  62. graflo/util/merge.py +150 -0
  63. graflo/util/misc.py +37 -0
  64. graflo/util/onto.py +422 -0
  65. graflo/util/transform.py +454 -0
  66. graflo-1.3.7.dist-info/METADATA +243 -0
  67. graflo-1.3.7.dist-info/RECORD +70 -0
  68. graflo-1.3.7.dist-info/WHEEL +4 -0
  69. graflo-1.3.7.dist-info/entry_points.txt +5 -0
  70. graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
@@ -0,0 +1,948 @@
+ """PostgreSQL connection implementation for schema introspection.
+
+ This module implements PostgreSQL connection and schema introspection functionality,
+ specifically designed to analyze 3NF schemas and identify vertex-like and edge-like tables.
+
+ Key Features:
+ - Connection management using psycopg2
+ - Schema introspection (tables, columns, constraints)
+ - Vertex/edge table detection heuristics
+ - Structured schema information extraction
+
+ Example:
+     >>> from graflo.db.postgres import PostgresConnection
+     >>> from graflo.db.connection.onto import PostgresConfig
+     >>> config = PostgresConfig.from_docker_env()
+     >>> conn = PostgresConnection(config)
+     >>> schema_info = conn.introspect_schema()
+     >>> print(schema_info.vertex_tables)
+     >>> conn.close()
+ """
+
+ import logging
+ from typing import Any
+
+ import psycopg2
+ from psycopg2.extras import RealDictCursor
+
+ from graflo.architecture.onto_sql import (
+     ColumnInfo,
+     ForeignKeyInfo,
+     VertexTableInfo,
+     EdgeTableInfo,
+     SchemaIntrospectionResult,
+ )
+ from graflo.db.connection.onto import PostgresConfig
+
+ from .inference_utils import (
+     FuzzyMatchCache,
+     infer_edge_vertices_from_table_name,
+     infer_vertex_from_column_name,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class PostgresConnection:
+     """PostgreSQL connection for schema introspection.
+
+     This class provides PostgreSQL-specific functionality for connecting to databases
+     and introspecting 3NF schemas to identify vertex-like and edge-like tables.
+
+     Attributes:
+         config: PostgreSQL connection configuration
+         conn: psycopg2 connection instance
+     """
+
+     def __init__(self, config: PostgresConfig):
+         """Initialize PostgreSQL connection.
+
+         Args:
+             config: PostgreSQL connection configuration containing URI and credentials
+         """
+         self.config = config
+
+         # Validate required config values
+         if config.uri is None:
+             raise ValueError("PostgreSQL connection requires a URI to be configured")
+         if config.database is None:
+             raise ValueError(
+                 "PostgreSQL connection requires a database name to be configured"
+             )
+
+         # Use config properties directly - all fallbacks are handled in PostgresConfig
+         host = config.hostname or "localhost"
+         port = int(config.port) if config.port else 5432
+         database = config.database
+         user = config.username or "postgres"
+         password = config.password
+
+         # Build connection parameters dict
+         conn_params = {
+             "host": host,
+             "port": port,
+             "database": database,
+             "user": user,
+         }
+
+         if password:
+             conn_params["password"] = password
+
+         try:
+             self.conn = psycopg2.connect(**conn_params)
+             logger.info(f"Successfully connected to PostgreSQL database '{database}'")
+         except Exception as e:
+             logger.error(f"Failed to connect to PostgreSQL: {e}", exc_info=True)
+             raise
+
+     def read(self, query: str, params: tuple | None = None) -> list[dict[str, Any]]:
+         """Execute a SELECT query and return results as a list of dictionaries.
+
+         Args:
+             query: SQL SELECT query to execute
+             params: Optional tuple of parameters for parameterized queries
+
+         Returns:
+             List of dictionaries, where each dictionary represents a row with column names as keys.
+             Decimal values are converted to float for compatibility with graph databases.
+         """
+         from decimal import Decimal
+
+         with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
+             if params:
+                 cursor.execute(query, params)
+             else:
+                 cursor.execute(query)
+
+             # Convert rows to dictionaries and convert Decimal to float
+             results = []
+             for row in cursor.fetchall():
+                 row_dict = dict(row)
+                 # Convert Decimal to float for JSON/graph database compatibility
+                 for key, value in row_dict.items():
+                     if isinstance(value, Decimal):
+                         row_dict[key] = float(value)
+                 results.append(row_dict)
+
+             return results
+
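A minimal usage sketch for read() with a parameterized query; the table and column names below are invented for illustration, and conn is a PostgresConnection built as in the module docstring:

    # Hypothetical table/columns; parameters are passed as a tuple, never interpolated.
    rows = conn.read("SELECT id, name, balance FROM accounts WHERE balance >= %s", (100,))
    # Each row is a dict keyed by column name; Decimal columns (e.g. balance) arrive as float.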
+     def __enter__(self):
+         """Enter the context manager.
+
+         Returns:
+             PostgresConnection: Self for use in 'with' statements
+         """
+         return self
+
+     def __exit__(self, exc_type, exc_value, exc_traceback):
+         """Exit the context manager.
+
+         Ensures the connection is properly closed when exiting the context.
+
+         Args:
+             exc_type: Exception type if an exception occurred
+             exc_value: Exception value if an exception occurred
+             exc_traceback: Exception traceback if an exception occurred
+         """
+         self.close()
+         return False  # Don't suppress exceptions
+
+     def close(self):
+         """Close the PostgreSQL connection."""
+         if hasattr(self, "conn") and self.conn:
+             try:
+                 self.conn.close()
+                 logger.debug("PostgreSQL connection closed")
+             except Exception as e:
+                 logger.warning(
+                     f"Error closing PostgreSQL connection: {e}", exc_info=True
+                 )
+
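Since __enter__/__exit__ are defined above, the connection can also be scoped with a with-statement, which guarantees close() runs even when an exception escapes the block. A sketch, assuming a valid PostgresConfig:

    with PostgresConnection(config) as conn:
        tables = conn.get_tables()
    # conn.close() has been called here, whether or not the body raised.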
+     def _check_information_schema_reliable(self, schema_name: str) -> bool:
+         """Check if information_schema is reliable for the given schema.
+
+         Args:
+             schema_name: Schema name to check
+
+         Returns:
+             True if information_schema appears reliable, False otherwise
+         """
+         try:
+             # Try to query information_schema.tables
+             query = """
+                 SELECT COUNT(*) as count
+                 FROM information_schema.tables
+                 WHERE table_schema = %s
+                 AND table_type = 'BASE TABLE'
+             """
+             with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                 cursor.execute(query, (schema_name,))
+                 result = cursor.fetchone()
+                 # If query succeeds, check if we can also query constraints
+                 pk_query = """
+                     SELECT COUNT(*) as count
+                     FROM information_schema.table_constraints tc
+                     JOIN information_schema.key_column_usage kcu
+                         ON tc.constraint_name = kcu.constraint_name
+                         AND tc.table_schema = kcu.table_schema
+                     WHERE tc.constraint_type = 'PRIMARY KEY'
+                     AND tc.table_schema = %s
+                 """
+                 cursor.execute(pk_query, (schema_name,))
+                 pk_result = cursor.fetchone()
+                 # If both queries work, information_schema seems reliable
+                 return result is not None and pk_result is not None
+         except Exception as e:
+             logger.debug(f"information_schema check failed: {e}")
+             return False
+
+     def _get_tables_pg_catalog(self, schema_name: str) -> list[dict[str, Any]]:
+         """Get all tables using pg_catalog (fallback method).
+
+         Args:
+             schema_name: Schema name to query
+
+         Returns:
+             List of table information dictionaries with keys: table_name, table_schema
+         """
+         query = """
+             SELECT
+                 c.relname as table_name,
+                 n.nspname as table_schema
+             FROM pg_catalog.pg_class c
+             JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
+             WHERE n.nspname = %s
+             AND c.relkind = 'r'
+             AND NOT c.relispartition
+             ORDER BY c.relname;
+         """
+
+         with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
+             cursor.execute(query, (schema_name,))
+             return [dict(row) for row in cursor.fetchall()]
+
+     def get_tables(self, schema_name: str | None = None) -> list[dict[str, Any]]:
+         """Get all tables in the specified schema.
+
+         Tries information_schema first, falls back to pg_catalog if needed.
+
+         Args:
+             schema_name: Schema name to query. If None, uses 'public' or config schema_name.
+
+         Returns:
+             List of table information dictionaries with keys: table_name, table_schema
+         """
+         if schema_name is None:
+             schema_name = self.config.schema_name or "public"
+
+         # Try information_schema first
+         try:
+             query = """
+                 SELECT table_name, table_schema
+                 FROM information_schema.tables
+                 WHERE table_schema = %s
+                 AND table_type = 'BASE TABLE'
+                 ORDER BY table_name;
+             """
+
+             with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                 cursor.execute(query, (schema_name,))
+                 results = [dict(row) for row in cursor.fetchall()]
+                 # If we got results, check if information_schema is reliable
+                 if results and self._check_information_schema_reliable(schema_name):
+                     return results
+                 # If no results or unreliable, fall back to pg_catalog
+                 logger.debug(
+                     f"information_schema returned no results or is unreliable, "
+                     f"falling back to pg_catalog for schema '{schema_name}'"
+                 )
+         except Exception as e:
+             logger.debug(
+                 f"information_schema query failed: {e}, falling back to pg_catalog"
+             )
+
+         # Fallback to pg_catalog
+         return self._get_tables_pg_catalog(schema_name)
+
+     def _get_table_columns_pg_catalog(
+         self, table_name: str, schema_name: str
+     ) -> list[dict[str, Any]]:
+         """Get columns using pg_catalog (fallback method).
+
+         Args:
+             table_name: Name of the table
+             schema_name: Schema name
+
+         Returns:
+             List of column information dictionaries with keys:
+             name, type, description, is_nullable, column_default
+         """
+         query = """
+             SELECT
+                 a.attname as name,
+                 pg_catalog.format_type(a.atttypid, a.atttypmod) as type,
+                 CASE WHEN a.attnotnull THEN 'NO' ELSE 'YES' END as is_nullable,
+                 pg_catalog.pg_get_expr(d.adbin, d.adrelid) as column_default,
+                 COALESCE(dsc.description, '') as description
+             FROM pg_catalog.pg_attribute a
+             JOIN pg_catalog.pg_class c ON c.oid = a.attrelid
+             JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
+             LEFT JOIN pg_catalog.pg_attrdef d ON d.adrelid = a.attrelid AND d.adnum = a.attnum
+             LEFT JOIN pg_catalog.pg_description dsc ON dsc.objoid = a.attrelid AND dsc.objsubid = a.attnum
+             WHERE n.nspname = %s
+             AND c.relname = %s
+             AND a.attnum > 0
+             AND NOT a.attisdropped
+             ORDER BY a.attnum;
+         """
+
+         with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
+             cursor.execute(query, (schema_name, table_name))
+             columns = []
+             for row in cursor.fetchall():
+                 col_dict = dict(row)
+                 # Normalize type format
+                 if col_dict["type"]:
+                     # Remove length info from type if present (e.g., "character varying(255)" -> "varchar")
+                     type_str = col_dict["type"]
+                     if "(" in type_str:
+                         base_type = type_str.split("(")[0]
+                         # Map common types
+                         type_mapping = {
+                             "character varying": "varchar",
+                             "character": "char",
+                             "double precision": "float8",
+                             "real": "float4",
+                             "integer": "int4",
+                             "bigint": "int8",
+                             "smallint": "int2",
+                         }
+                         col_dict["type"] = type_mapping.get(
+                             base_type.lower(), base_type.lower()
+                         )
+                     else:
+                         type_mapping = {
+                             "character varying": "varchar",
+                             "character": "char",
+                             "double precision": "float8",
+                             "real": "float4",
+                             "integer": "int4",
+                             "bigint": "int8",
+                             "smallint": "int2",
+                         }
+                         col_dict["type"] = type_mapping.get(
+                             type_str.lower(), type_str.lower()
+                         )
+                 columns.append(col_dict)
+             return columns
+
+     def get_table_columns(
+         self, table_name: str, schema_name: str | None = None
+     ) -> list[dict[str, Any]]:
+         """Get columns for a specific table with types and descriptions.
+
+         Tries information_schema first, falls back to pg_catalog if needed.
+
+         Args:
+             table_name: Name of the table
+             schema_name: Schema name. If None, uses 'public' or config schema_name.
+
+         Returns:
+             List of column information dictionaries with keys:
+             name, type, description, is_nullable, column_default
+         """
+         if schema_name is None:
+             schema_name = self.config.schema_name or "public"
+
+         # Try information_schema first
+         try:
+             query = """
+                 SELECT
+                     c.column_name as name,
+                     c.data_type as type,
+                     c.udt_name as udt_name,
+                     c.character_maximum_length,
+                     c.is_nullable,
+                     c.column_default,
+                     COALESCE(d.description, '') as description
+                 FROM information_schema.columns c
+                 LEFT JOIN pg_catalog.pg_statio_all_tables st
+                     ON st.schemaname = c.table_schema
+                     AND st.relname = c.table_name
+                 LEFT JOIN pg_catalog.pg_description d
+                     ON d.objoid = st.relid
+                     AND d.objsubid = c.ordinal_position
+                 WHERE c.table_schema = %s
+                 AND c.table_name = %s
+                 ORDER BY c.ordinal_position;
+             """
+
+             with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                 cursor.execute(query, (schema_name, table_name))
+                 columns = []
+                 for row in cursor.fetchall():
+                     col_dict = dict(row)
+                     # Format type with length if applicable
+                     if col_dict["character_maximum_length"]:
+                         col_dict["type"] = (
+                             f"{col_dict['type']}({col_dict['character_maximum_length']})"
+                         )
+                     # Use udt_name if it's more specific (e.g., varchar, int4)
+                     if (
+                         col_dict["udt_name"]
+                         and col_dict["udt_name"] != col_dict["type"]
+                     ):
+                         col_dict["type"] = col_dict["udt_name"]
+                     # Remove helper fields
+                     col_dict.pop("character_maximum_length", None)
+                     col_dict.pop("udt_name", None)
+                     columns.append(col_dict)
+
+                 # If we got results and information_schema is reliable, return them
+                 if columns and self._check_information_schema_reliable(schema_name):
+                     return columns
+                 # Otherwise fall back to pg_catalog
+                 logger.debug(
+                     f"information_schema returned no results or is unreliable, "
+                     f"falling back to pg_catalog for table '{schema_name}.{table_name}'"
+                 )
+         except Exception as e:
+             logger.debug(
+                 f"information_schema query failed: {e}, falling back to pg_catalog"
+             )
+
+         # Fallback to pg_catalog
+         return self._get_table_columns_pg_catalog(table_name, schema_name)
+
+     def _get_primary_keys_pg_catalog(
+         self, table_name: str, schema_name: str
+     ) -> list[str]:
+         """Get primary key columns using pg_catalog (fallback method).
+
+         Args:
+             table_name: Name of the table
+             schema_name: Schema name
+
+         Returns:
+             List of primary key column names
+         """
+         query = """
+             SELECT a.attname
+             FROM pg_catalog.pg_constraint con
+             JOIN pg_catalog.pg_class c ON c.oid = con.conrelid
+             JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
+             JOIN pg_catalog.pg_attribute a ON a.attrelid = con.conrelid AND a.attnum = ANY(con.conkey)
+             WHERE n.nspname = %s
+             AND c.relname = %s
+             AND con.contype = 'p'
+             ORDER BY array_position(con.conkey, a.attnum);
+         """
+
+         with self.conn.cursor() as cursor:
+             cursor.execute(query, (schema_name, table_name))
+             return [row[0] for row in cursor.fetchall()]
+
+     def get_primary_keys(
+         self, table_name: str, schema_name: str | None = None
+     ) -> list[str]:
+         """Get primary key columns for a table.
+
+         Tries information_schema first, falls back to pg_catalog if needed.
+
+         Args:
+             table_name: Name of the table
+             schema_name: Schema name. If None, uses 'public' or config schema_name.
+
+         Returns:
+             List of primary key column names
+         """
+         if schema_name is None:
+             schema_name = self.config.schema_name or "public"
+
+         # Try information_schema first
+         try:
+             query = """
+                 SELECT kcu.column_name
+                 FROM information_schema.table_constraints tc
+                 JOIN information_schema.key_column_usage kcu
+                     ON tc.constraint_name = kcu.constraint_name
+                     AND tc.table_schema = kcu.table_schema
+                 WHERE tc.constraint_type = 'PRIMARY KEY'
+                 AND tc.table_schema = %s
+                 AND tc.table_name = %s
+                 ORDER BY kcu.ordinal_position;
+             """
+
+             with self.conn.cursor() as cursor:
+                 cursor.execute(query, (schema_name, table_name))
+                 results = [row[0] for row in cursor.fetchall()]
+                 # If we got results and information_schema is reliable, return them
+                 if results and self._check_information_schema_reliable(schema_name):
+                     return results
+                 # Otherwise fall back to pg_catalog
+                 logger.debug(
+                     f"information_schema returned no results or is unreliable, "
+                     f"falling back to pg_catalog for primary keys of '{schema_name}.{table_name}'"
+                 )
+         except Exception as e:
+             logger.debug(
+                 f"information_schema query failed: {e}, falling back to pg_catalog"
+             )
+
+         # Fallback to pg_catalog
+         return self._get_primary_keys_pg_catalog(table_name, schema_name)
+
+     def _get_foreign_keys_pg_catalog(
+         self, table_name: str, schema_name: str
+     ) -> list[dict[str, Any]]:
+         """Get foreign key relationships using pg_catalog (fallback method).
+
+         Handles both single-column and multi-column foreign keys.
+         For multi-column foreign keys, returns one row per column.
+
+         Args:
+             table_name: Name of the table
+             schema_name: Schema name
+
+         Returns:
+             List of foreign key dictionaries with keys:
+             column, references_table, references_column, constraint_name
+         """
+         # Use generate_subscripts for better compatibility with older PostgreSQL versions
+         query = """
+             SELECT
+                 a.attname as column,
+                 ref_c.relname as references_table,
+                 ref_a.attname as references_column,
+                 con.conname as constraint_name
+             FROM pg_catalog.pg_constraint con
+             JOIN pg_catalog.pg_class c ON c.oid = con.conrelid
+             JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
+             JOIN pg_catalog.pg_class ref_c ON ref_c.oid = con.confrelid
+             JOIN generate_subscripts(con.conkey, 1) AS i ON true
+             JOIN pg_catalog.pg_attribute a ON a.attrelid = con.conrelid AND a.attnum = con.conkey[i]
+             JOIN pg_catalog.pg_attribute ref_a ON ref_a.attrelid = con.confrelid AND ref_a.attnum = con.confkey[i]
+             WHERE n.nspname = %s
+             AND c.relname = %s
+             AND con.contype = 'f'
+             ORDER BY con.conname, i;
+         """
+
+         with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
+             cursor.execute(query, (schema_name, table_name))
+             return [dict(row) for row in cursor.fetchall()]
+
+     def get_foreign_keys(
+         self, table_name: str, schema_name: str | None = None
+     ) -> list[dict[str, Any]]:
+         """Get foreign key relationships for a table.
+
+         Tries information_schema first, falls back to pg_catalog if needed.
+
+         Args:
+             table_name: Name of the table
+             schema_name: Schema name. If None, uses 'public' or config schema_name.
+
+         Returns:
+             List of foreign key dictionaries with keys:
+             column, references_table, references_column, constraint_name
+         """
+         if schema_name is None:
+             schema_name = self.config.schema_name or "public"
+
+         # Try information_schema first
+         try:
+             query = """
+                 SELECT
+                     kcu.column_name as column,
+                     ccu.table_name as references_table,
+                     ccu.column_name as references_column,
+                     tc.constraint_name
+                 FROM information_schema.table_constraints tc
+                 JOIN information_schema.key_column_usage kcu
+                     ON tc.constraint_name = kcu.constraint_name
+                     AND tc.table_schema = kcu.table_schema
+                 JOIN information_schema.constraint_column_usage ccu
+                     ON ccu.constraint_name = tc.constraint_name
+                     AND ccu.table_schema = tc.table_schema
+                 WHERE tc.constraint_type = 'FOREIGN KEY'
+                 AND tc.table_schema = %s
+                 AND tc.table_name = %s
+                 ORDER BY kcu.ordinal_position;
+             """
+
+             with self.conn.cursor(cursor_factory=RealDictCursor) as cursor:
+                 cursor.execute(query, (schema_name, table_name))
+                 results = [dict(row) for row in cursor.fetchall()]
+                 # If information_schema is reliable, return the results (unlike the
+                 # methods above, an empty list is valid here: a table may have no FKs)
+                 if self._check_information_schema_reliable(schema_name):
+                     return results
+                 # Otherwise fall back to pg_catalog
+                 logger.debug(
+                     f"information_schema is unreliable, "
+                     f"falling back to pg_catalog for foreign keys of '{schema_name}.{table_name}'"
+                 )
+         except Exception as e:
+             logger.debug(
+                 f"information_schema query failed: {e}, falling back to pg_catalog"
+             )
+
+         # Fallback to pg_catalog
+         return self._get_foreign_keys_pg_catalog(table_name, schema_name)
+
+     def _is_edge_like_table(
+         self, table_name: str, pk_columns: list[str], fk_columns: list[dict[str, Any]]
+     ) -> bool:
+         """Determine if a table is edge-like based on heuristics.
+
+         Heuristics:
+         1. Tables with 2 or more primary key columns are likely edge tables
+         2. Tables with exactly 2 foreign keys are likely edge tables
+         3. Tables with names starting with 'rel_' are likely edge tables
+         4. Tables whose primary key columns all match foreign key columns are likely edge tables
+
+         Args:
+             table_name: Name of the table
+             pk_columns: List of primary key column names
+             fk_columns: List of foreign key dictionaries
+
+         Returns:
+             True if the table appears to be edge-like, False otherwise
+         """
+         # Heuristic 1: Tables with 2 or more primary key columns are likely edge tables
+         if len(pk_columns) >= 2:
+             return True
+
+         # Heuristic 2: Tables with exactly 2 foreign keys are likely edge tables
+         if len(fk_columns) == 2:
+             return True
+
+         # Heuristic 3: Tables with names starting with 'rel_' are likely edge tables
+         if table_name.startswith("rel_"):
+             return True
+
+         # Heuristic 4: If all PK columns are FK columns and there are at least
+         # 2 FKs, it's likely an edge table
+         fk_column_names = {fk["column"] for fk in fk_columns}
+         pk_set = set(pk_columns)
+         if pk_set.issubset(fk_column_names) and len(fk_columns) >= 2:
+             return True
+
+         return False
+
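To make the heuristics concrete: a hypothetical many-to-many join table with a composite primary key built from two foreign keys trips heuristics 1, 2, and 4 at once. A sketch against the internal method above (all table and column names are invented):

    is_edge = conn._is_edge_like_table(
        "user_follows",
        pk_columns=["follower_id", "followee_id"],  # 2 PK columns -> heuristic 1
        fk_columns=[
            {"column": "follower_id", "references_table": "users"},
            {"column": "followee_id", "references_table": "users"},
        ],  # exactly 2 FKs, covering all PK columns -> heuristics 2 and 4
    )
    assert is_edge is True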
+     def detect_vertex_tables(
+         self, schema_name: str | None = None
+     ) -> list[VertexTableInfo]:
+         """Detect vertex-like tables in the schema.
+
+         Heuristic: Tables with a primary key and descriptive columns
+         (not just foreign keys). These represent entities.
+
+         Note: Tables identified as edge-like are excluded from vertex tables.
+
+         Args:
+             schema_name: Schema name. If None, uses 'public' or config schema_name.
+
+         Returns:
+             List of VertexTableInfo objects
+         """
+         if schema_name is None:
+             schema_name = self.config.schema_name or "public"
+
+         tables = self.get_tables(schema_name)
+         vertex_tables = []
+
+         for table_info in tables:
+             table_name = table_info["table_name"]
+             pk_columns = self.get_primary_keys(table_name, schema_name)
+             fk_columns = self.get_foreign_keys(table_name, schema_name)
+             all_columns = self.get_table_columns(table_name, schema_name)
+
+             # Vertex-like tables:
+             # 1. have a primary key
+             # 2. are not identified as edge-like
+             # 3. have descriptive columns beyond just foreign keys
+
+             if not pk_columns:
+                 continue  # Skip tables without primary keys
+
+             # Skip edge-like tables
+             if self._is_edge_like_table(table_name, pk_columns, fk_columns):
+                 continue
+
+             # Count non-FK, non-PK columns (descriptive columns)
+             fk_column_names = {fk["column"] for fk in fk_columns}
+             pk_column_names = set(pk_columns)
+             descriptive_columns = [
+                 col
+                 for col in all_columns
+                 if col["name"] not in fk_column_names
+                 and col["name"] not in pk_column_names
+             ]
+
+             # If table has descriptive columns, consider it vertex-like
+             if descriptive_columns:
+                 # Mark primary key columns and convert to ColumnInfo
+                 pk_set = set(pk_columns)
+                 column_infos = []
+                 for col in all_columns:
+                     column_infos.append(
+                         ColumnInfo(
+                             name=col["name"],
+                             type=col["type"],
+                             description=col.get("description", ""),
+                             is_nullable=col.get("is_nullable", "YES"),
+                             column_default=col.get("column_default"),
+                             is_pk=col["name"] in pk_set,
+                         )
+                     )
+
+                 # Convert foreign keys to ForeignKeyInfo
+                 fk_infos = []
+                 for fk in fk_columns:
+                     fk_infos.append(
+                         ForeignKeyInfo(
+                             column=fk["column"],
+                             references_table=fk["references_table"],
+                             references_column=fk.get("references_column"),
+                             constraint_name=fk.get("constraint_name"),
+                         )
+                     )
+
+                 vertex_tables.append(
+                     VertexTableInfo(
+                         name=table_name,
+                         schema_name=schema_name,
+                         columns=column_infos,
+                         primary_key=pk_columns,
+                         foreign_keys=fk_infos,
+                     )
+                 )
+
+         return vertex_tables
+
+     def detect_edge_tables(
+         self,
+         schema_name: str | None = None,
+         vertex_table_names: list[str] | None = None,
+     ) -> list[EdgeTableInfo]:
+         """Detect edge-like tables in the schema.
+
+         Heuristic: Tables with 2 or more primary key columns, or exactly 2 foreign keys,
+         or names starting with 'rel_'. These represent relationships between entities.
+
+         Args:
+             schema_name: Schema name. If None, uses 'public' or config schema_name.
+             vertex_table_names: Optional list of vertex table names for fuzzy matching.
+                 If None, will be inferred from detect_vertex_tables().
+
+         Returns:
+             List of EdgeTableInfo objects, each carrying source_table and target_table
+         """
+         if schema_name is None:
+             schema_name = self.config.schema_name or "public"
+
+         # Get vertex table names if not provided
+         if vertex_table_names is None:
+             vertex_tables = self.detect_vertex_tables(schema_name)
+             vertex_table_names = [vt.name for vt in vertex_tables]
+
+         # Create fuzzy match cache once for all tables (significant performance improvement)
+         match_cache = FuzzyMatchCache(vertex_table_names)
+
+         tables = self.get_tables(schema_name)
+         edge_tables = []
+
+         for table_info in tables:
+             table_name = table_info["table_name"]
+             pk_columns = self.get_primary_keys(table_name, schema_name)
+             fk_columns = self.get_foreign_keys(table_name, schema_name)
+
+             # Skip tables without primary keys
+             if not pk_columns:
+                 continue
+
+             # Check if table is edge-like
+             if not self._is_edge_like_table(table_name, pk_columns, fk_columns):
+                 continue
+
+             all_columns = self.get_table_columns(table_name, schema_name)
+
+             # Mark primary key columns and convert to ColumnInfo
+             pk_set = set(pk_columns)
+             column_infos = []
+             for col in all_columns:
+                 column_infos.append(
+                     ColumnInfo(
+                         name=col["name"],
+                         type=col["type"],
+                         description=col.get("description", ""),
+                         is_nullable=col.get("is_nullable", "YES"),
+                         column_default=col.get("column_default"),
+                         is_pk=col["name"] in pk_set,
+                     )
+                 )
+
+             # Convert foreign keys to ForeignKeyInfo
+             fk_infos = []
+             for fk in fk_columns:
+                 fk_infos.append(
+                     ForeignKeyInfo(
+                         column=fk["column"],
+                         references_table=fk["references_table"],
+                         references_column=fk.get("references_column"),
+                         constraint_name=fk.get("constraint_name"),
+                     )
+                 )
+
+             # Determine source and target tables
+             source_table = None
+             target_table = None
+             source_column = None
+             target_column = None
+             relation_name = None
+
+             # If we have exactly 2 foreign keys, use them directly
+             if len(fk_infos) == 2:
+                 source_fk = fk_infos[0]
+                 target_fk = fk_infos[1]
+                 source_table = source_fk.references_table
+                 target_table = target_fk.references_table
+                 source_column = source_fk.column
+                 target_column = target_fk.column
+                 # Still try to infer the relation name from the table name
+                 fk_dicts = [
+                     {
+                         "column": fk.column,
+                         "references_table": fk.references_table,
+                     }
+                     for fk in fk_infos
+                 ]
+                 _, _, relation_name = infer_edge_vertices_from_table_name(
+                     table_name, pk_columns, fk_dicts, vertex_table_names, match_cache
+                 )
+             # If we have 2 or more primary keys, try to infer from table name and structure
+             elif len(pk_columns) >= 2:
+                 # Convert fk_infos to dicts for infer_edge_vertices_from_table_name
+                 fk_dicts = [
+                     {
+                         "column": fk.column,
+                         "references_table": fk.references_table,
+                     }
+                     for fk in fk_infos
+                 ]
+
+                 # Try to infer from table name pattern
+                 inferred_source, inferred_target, relation_name = (
+                     infer_edge_vertices_from_table_name(
+                         table_name,
+                         pk_columns,
+                         fk_dicts,
+                         vertex_table_names,
+                         match_cache,
+                     )
+                 )
+
+                 if inferred_source and inferred_target:
+                     source_table = inferred_source
+                     target_table = inferred_target
+                     # Try to match PK columns to FK columns for source/target columns
+                     if fk_infos:
+                         # Use first FK for source, second for target if available
+                         if len(fk_infos) >= 2:
+                             source_column = fk_infos[0].column
+                             target_column = fk_infos[1].column
+                         elif len(fk_infos) == 1:
+                             # Self-reference case
+                             source_column = fk_infos[0].column
+                             target_column = fk_infos[0].column
+                     else:
+                         # Use PK columns as source/target columns
+                         source_column = pk_columns[0]
+                         target_column = (
+                             pk_columns[1] if len(pk_columns) > 1 else pk_columns[0]
+                         )
+                 elif fk_infos:
+                     # Fallback: use FK references if available
+                     if len(fk_infos) >= 2:
+                         source_table = fk_infos[0].references_table
+                         target_table = fk_infos[1].references_table
+                         source_column = fk_infos[0].column
+                         target_column = fk_infos[1].column
+                     elif len(fk_infos) == 1:
+                         source_table = fk_infos[0].references_table
+                         target_table = fk_infos[0].references_table
+                         source_column = fk_infos[0].column
+                         target_column = fk_infos[0].column
+                 else:
+                     # Last resort: use PK columns and infer table names from column names
+                     source_column = pk_columns[0]
+                     target_column = (
+                         pk_columns[1] if len(pk_columns) > 1 else pk_columns[0]
+                     )
+                     # Use robust inference logic to extract vertex names from column names
+                     source_table = infer_vertex_from_column_name(
+                         source_column, vertex_table_names, match_cache
+                     )
+                     target_table = infer_vertex_from_column_name(
+                         target_column, vertex_table_names, match_cache
+                     )
+
+             # Only add if we have source and target information
+             if source_table and target_table:
+                 edge_tables.append(
+                     EdgeTableInfo(
+                         name=table_name,
+                         schema_name=schema_name,
+                         columns=column_infos,
+                         primary_key=pk_columns,
+                         foreign_keys=fk_infos,
+                         source_table=source_table,
+                         target_table=target_table,
+                         source_column=source_column or pk_columns[0],
+                         target_column=target_column
+                         or (pk_columns[1] if len(pk_columns) > 1 else pk_columns[0]),
+                         relation=relation_name,
+                     )
+                 )
+             else:
+                 logger.warning(
+                     f"Could not determine source/target tables for edge-like table '{table_name}'. "
+                     f"Skipping."
+                 )
+
+         return edge_tables
+
+     def introspect_schema(
+         self, schema_name: str | None = None
+     ) -> SchemaIntrospectionResult:
+         """Introspect the database schema and return structured information.
+
+         This is the main method that analyzes the schema and returns information
+         about vertex-like and edge-like tables.
+
+         Args:
+             schema_name: Schema name. If None, uses 'public' or config schema_name.
+
+         Returns:
+             SchemaIntrospectionResult with vertex_tables, edge_tables, and schema_name
+         """
+         if schema_name is None:
+             schema_name = self.config.schema_name or "public"
+
+         logger.info(f"Introspecting PostgreSQL schema '{schema_name}'")
+
+         vertex_tables = self.detect_vertex_tables(schema_name)
+         edge_tables = self.detect_edge_tables(schema_name)
+
+         result = SchemaIntrospectionResult(
+             vertex_tables=vertex_tables,
+             edge_tables=edge_tables,
+             schema_name=schema_name,
+         )
+
+         logger.info(
+             f"Found {len(vertex_tables)} vertex-like tables and {len(edge_tables)} edge-like tables"
+         )
+
+         return result
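End-to-end, the flow from the module docstring can be combined with the context manager; the iteration below is a sketch that assumes only attributes the constructors above actually set (name, columns, is_pk, source_table, target_table):

    config = PostgresConfig.from_docker_env()
    with PostgresConnection(config) as conn:
        result = conn.introspect_schema()  # defaults to config schema_name or 'public'
        for vt in result.vertex_tables:
            pks = [c.name for c in vt.columns if c.is_pk]
            print(f"vertex table {vt.name}, primary key {pks}")
        for et in result.edge_tables:
            print(f"edge table {et.name}: {et.source_table} -> {et.target_table}")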