structured2graph 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. __init__.py +47 -0
  2. core/__init__.py +23 -0
  3. core/hygm/__init__.py +74 -0
  4. core/hygm/hygm.py +2351 -0
  5. core/hygm/models/__init__.py +82 -0
  6. core/hygm/models/graph_models.py +667 -0
  7. core/hygm/models/llm_models.py +229 -0
  8. core/hygm/models/operations.py +176 -0
  9. core/hygm/models/sources.py +68 -0
  10. core/hygm/models/user_operations.py +139 -0
  11. core/hygm/strategies/__init__.py +17 -0
  12. core/hygm/strategies/base.py +36 -0
  13. core/hygm/strategies/deterministic.py +262 -0
  14. core/hygm/strategies/llm.py +904 -0
  15. core/hygm/validation/__init__.py +38 -0
  16. core/hygm/validation/base.py +194 -0
  17. core/hygm/validation/graph_schema_validator.py +687 -0
  18. core/hygm/validation/memgraph_data_validator.py +991 -0
  19. core/migration_agent.py +1369 -0
  20. core/schema/spec.json +155 -0
  21. core/utils/meta_graph.py +108 -0
  22. database/__init__.py +36 -0
  23. database/adapters/__init__.py +11 -0
  24. database/adapters/memgraph.py +318 -0
  25. database/adapters/mysql.py +311 -0
  26. database/adapters/postgresql.py +335 -0
  27. database/analyzer.py +396 -0
  28. database/factory.py +219 -0
  29. database/models.py +209 -0
  30. main.py +518 -0
  31. query_generation/__init__.py +20 -0
  32. query_generation/cypher_generator.py +129 -0
  33. query_generation/schema_utilities.py +88 -0
  34. structured2graph-0.1.1.dist-info/METADATA +197 -0
  35. structured2graph-0.1.1.dist-info/RECORD +41 -0
  36. structured2graph-0.1.1.dist-info/WHEEL +4 -0
  37. structured2graph-0.1.1.dist-info/entry_points.txt +2 -0
  38. structured2graph-0.1.1.dist-info/licenses/LICENSE +21 -0
  39. utils/__init__.py +57 -0
  40. utils/config.py +235 -0
  41. utils/environment.py +404 -0
database/analyzer.py ADDED
@@ -0,0 +1,396 @@
1
+ """
2
+ Abstract analyzer interface for database systems.
3
+
4
+ This module defines the abstract base class that all database analyzers
5
+ must implement to ensure compatibility with the migration system.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import Dict, List, Any, Optional
10
+ from .models import (
11
+ DatabaseStructure,
12
+ TableInfo,
13
+ ColumnInfo,
14
+ ForeignKeyInfo,
15
+ RelationshipInfo,
16
+ TableType,
17
+ )
18
+
19
+
20
class DatabaseAnalyzer(ABC):
    """
    Abstract base class for database analyzers.

    All database-specific analyzers must implement this interface to ensure
    compatibility with HyGM and the migration system.
    """

    # Bookkeeping columns that carry no domain data.  They are ignored when
    # deciding whether a table is a pure join (many-to-many) table.  Kept as a
    # single class-level constant so join-table detection and relationship
    # extraction cannot drift apart.
    _METADATA_COLUMNS = frozenset(
        {"id", "created_at", "updated_at", "created_on", "updated_on", "timestamp"}
    )

    def __init__(self, connection_config: Dict[str, Any]):
        """
        Initialize the database analyzer.

        Args:
            connection_config: Database-specific connection configuration
        """
        self.connection_config = connection_config
        self.connection = None
        self.database_type = self._get_database_type()

    @abstractmethod
    def _get_database_type(self) -> str:
        """Return the type of database (e.g., 'mysql', 'postgresql')."""

    @abstractmethod
    def connect(self) -> bool:
        """
        Establish connection to the database.

        Returns:
            True if connection successful, False otherwise
        """

    @abstractmethod
    def disconnect(self) -> None:
        """Close the database connection."""

    @abstractmethod
    def get_tables(self) -> List[str]:
        """
        Get list of all tables in the database.

        Returns:
            List of table names
        """

    @abstractmethod
    def get_table_schema(self, table_name: str) -> List[ColumnInfo]:
        """
        Get schema information for a specific table.

        Args:
            table_name: Name of the table

        Returns:
            List of ColumnInfo objects describing the table schema
        """

    @abstractmethod
    def get_foreign_keys(self, table_name: str) -> List[ForeignKeyInfo]:
        """
        Get foreign key relationships for a table.

        Args:
            table_name: Name of the table

        Returns:
            List of ForeignKeyInfo objects
        """

    @abstractmethod
    def get_table_data(
        self, table_name: str, limit: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Get data from a specific table.

        Args:
            table_name: Name of the table
            limit: Maximum number of rows to return

        Returns:
            List of dictionaries representing rows
        """

    @abstractmethod
    def get_table_row_count(self, table_name: str) -> int:
        """
        Get the number of rows in a table.

        Args:
            table_name: Name of the table

        Returns:
            Number of rows in the table
        """

    @abstractmethod
    def is_view(self, table_name: str) -> bool:
        """
        Check if a table is actually a view.

        Args:
            table_name: Name of the table

        Returns:
            True if the table is a view, False otherwise
        """

    @abstractmethod
    def get_indexes(self, table_name: str) -> List[Dict[str, Any]]:
        """
        Get indexes for a specific table.

        Args:
            table_name: Name of the table

        Returns:
            List of dictionaries representing the table's indexes
        """

    def is_connected(self) -> bool:
        """
        Check if the database connection is active.

        Returns:
            True if connected, False otherwise
        """
        return self.connection is not None

    def get_connection_info(self) -> Dict[str, Any]:
        """
        Get connection information (excluding sensitive data like passwords).

        Returns:
            Dictionary with connection information
        """
        safe_config = self.connection_config.copy()
        if "password" in safe_config:
            safe_config["password"] = "***"
        return {
            "database_type": self.database_type,
            "config": safe_config,
            "connected": self.is_connected(),
        }

    def get_migration_config(self) -> Dict[str, str]:
        """
        Get connection config formatted for migration tools.

        Returns:
            Dictionary with string values suitable for migration tools
        """
        config = self.connection_config.copy()

        # Migration tools expect string values; a None password becomes "".
        migration_config = {}
        for key, value in config.items():
            if key == "password" and value is None:
                migration_config[key] = ""
            else:
                migration_config[key] = str(value)

        return migration_config

    def is_join_table(self, table_info: TableInfo) -> bool:
        """
        Determine if a table is a join table (many-to-many).

        This implementation is database-agnostic and can be overridden
        if database-specific logic is needed.

        A join table typically has at least 2 foreign keys and few (or no)
        columns beyond the foreign keys and common metadata columns.

        Args:
            table_info: TableInfo object

        Returns:
            True if the table is a join table, False otherwise
        """
        if len(table_info.foreign_keys) < 2:
            return False

        total_columns = len(table_info.columns)
        if total_columns == 0:
            # Defensive guard: degenerate schema info (FKs reported but no
            # columns) previously caused a ZeroDivisionError below.
            return False

        # Collect columns that are neither foreign keys nor metadata.
        fk_column_names = {fk.column_name for fk in table_info.foreign_keys}
        non_fk_columns = [
            col.name
            for col in table_info.columns
            if col.name not in fk_column_names
            and col.name.lower() not in self._METADATA_COLUMNS
        ]

        # Join table if most columns are foreign keys (ratio > 0.5), or every
        # column is either a foreign key or a metadata column.
        fk_ratio = len(table_info.foreign_keys) / total_columns
        return fk_ratio > 0.5 or not non_fk_columns

    def determine_table_type(self, table_info: TableInfo) -> TableType:
        """
        Determine the type of table.

        Args:
            table_info: TableInfo object

        Returns:
            TableType enum value
        """
        # Views take precedence over any structural classification.
        if self.is_view(table_info.name):
            return TableType.VIEW

        if self.is_join_table(table_info):
            return TableType.JOIN

        # Everything else is an entity table, with or without references.
        return TableType.ENTITY

    def get_database_structure(self) -> DatabaseStructure:
        """
        Get complete database structure including tables, schemas, and relationships.

        This method provides a standardized database structure that works
        with HyGM regardless of the underlying database system.

        Returns:
            DatabaseStructure object containing all database information
        """
        tables: Dict[str, TableInfo] = {}
        entity_tables: Dict[str, TableInfo] = {}
        join_tables: Dict[str, TableInfo] = {}
        view_tables: Dict[str, TableInfo] = {}
        relationships: List[RelationshipInfo] = []
        sample_data: Dict[str, List[Dict[str, Any]]] = {}
        table_counts: Dict[str, int] = {}

        # First pass: collect per-table schema information and classify tables.
        for table_name in self.get_tables():
            columns = self.get_table_schema(table_name)
            foreign_keys = self.get_foreign_keys(table_name)
            row_count = self.get_table_row_count(table_name)
            primary_keys = [col.name for col in columns if col.is_primary_key]

            # Index introspection is optional for adapters.
            try:
                table_indexes = self.get_indexes(table_name)
            except (NotImplementedError, AttributeError):
                table_indexes = []

            table_info = TableInfo(
                name=table_name,
                table_type=TableType.ENTITY,  # placeholder, refined below
                columns=columns,
                foreign_keys=foreign_keys,
                row_count=row_count,
                primary_keys=primary_keys,
                indexes=table_indexes,
            )
            table_info.table_type = self.determine_table_type(table_info)

            tables[table_name] = table_info
            table_counts[table_name] = row_count

            # Categorize by detected type.
            if table_info.table_type == TableType.VIEW:
                view_tables[table_name] = table_info
            elif table_info.table_type == TableType.JOIN:
                join_tables[table_name] = table_info
            else:
                entity_tables[table_name] = table_info

            # Sample data is best-effort (limited to 3 rows for performance);
            # unreadable tables/views simply yield an empty sample.
            try:
                sample_data[table_name] = self.get_table_data(table_name, limit=3)
            except Exception:
                sample_data[table_name] = []

        # Second pass: derive relationships from foreign keys.
        for table_name, table_info in tables.items():
            if table_info.table_type == TableType.JOIN:
                # Join tables become many-to-many relationships between the
                # first two referenced tables.
                fks = table_info.foreign_keys
                if len(fks) >= 2:
                    fk1, fk2 = fks[0], fks[1]

                    # Non-FK, non-metadata columns become relationship
                    # properties.
                    fk_columns = {fk.column_name for fk in fks}
                    additional_properties = [
                        col.name
                        for col in table_info.columns
                        if col.name not in fk_columns
                        and col.name.lower() not in self._METADATA_COLUMNS
                    ]

                    relationships.append(
                        RelationshipInfo(
                            relationship_type="many_to_many",
                            from_table=fk1.referenced_table,
                            from_column=fk1.referenced_column,
                            to_table=fk2.referenced_table,
                            to_column=fk2.referenced_column,
                            join_table=table_name,
                            join_from_column=fk1.column_name,
                            join_to_column=fk2.column_name,
                            additional_properties=additional_properties,
                        )
                    )
            else:
                # Regular foreign keys become one-to-many relationships.
                for fk in table_info.foreign_keys:
                    relationships.append(
                        RelationshipInfo(
                            relationship_type="one_to_many",
                            from_table=table_name,
                            from_column=fk.column_name,
                            to_table=fk.referenced_table,
                            to_column=fk.referenced_column,
                        )
                    )

        return DatabaseStructure(
            tables=tables,
            entity_tables=entity_tables,
            join_tables=join_tables,
            view_tables=view_tables,
            relationships=relationships,
            sample_data=sample_data,
            table_counts=table_counts,
            database_name=self.connection_config.get("database", "unknown"),
            database_type=self.database_type,
        )
database/factory.py ADDED
@@ -0,0 +1,219 @@
1
+ """
2
+ Database analyzer factory for creating database-specific analyzers.
3
+
4
+ This module provides a factory pattern for creating appropriate database
5
+ analyzers based on the database type or connection parameters.
6
+ """
7
+
8
+ from typing import Dict, Type
9
+ from .analyzer import DatabaseAnalyzer
10
+ from .adapters.mysql import MySQLAnalyzer
11
+ from .adapters.postgresql import PostgreSQLAnalyzer
12
+
13
+
14
class DatabaseAnalyzerFactory:
    """Factory for creating database-specific analyzers."""

    # Registry of available analyzers
    _analyzers: Dict[str, Type[DatabaseAnalyzer]] = {
        "mysql": MySQLAnalyzer,
        "postgresql": PostgreSQLAnalyzer,
        # Future database types can be added here:
        # "duckdb": DuckDBAnalyzer,
        # "oracle": OracleAnalyzer,
        # "sqlserver": SQLServerAnalyzer,
    }

    @classmethod
    def create_analyzer(
        cls, database_type: str, **connection_params
    ) -> DatabaseAnalyzer:
        """
        Create a database analyzer for the specified database type.

        Args:
            database_type: Type of database (mysql, postgresql, etc.)
            **connection_params: Database-specific connection parameters

        Returns:
            DatabaseAnalyzer instance

        Raises:
            ValueError: If database type is not supported
        """
        database_type = database_type.lower()

        if database_type not in cls._analyzers:
            supported_types = ", ".join(cls._analyzers.keys())
            raise ValueError(
                f"Unsupported database type: {database_type}. "
                f"Supported types: {supported_types}"
            )

        # Create analyzer with appropriate parameters based on database type
        if database_type == "mysql":
            return MySQLAnalyzer(
                host=connection_params.get("host", "localhost"),
                user=connection_params.get("user", "root"),
                password=connection_params.get("password", ""),
                database=connection_params.get("database") or "",
                port=connection_params.get("port", 3306),
            )
        if database_type == "postgresql":
            return PostgreSQLAnalyzer(
                host=connection_params.get("host", "localhost"),
                user=connection_params.get("user", "postgres"),
                password=connection_params.get("password", ""),
                database=connection_params.get("database") or "",
                port=connection_params.get("port", 5432),
                schema=connection_params.get("schema", "public"),
            )
        # elif database_type == "duckdb":
        #     return analyzer_class(
        #         database_path=connection_params.get("database_path"),
        #     )

        # Registered in _analyzers but no construction branch above.
        raise ValueError(f"No implementation for database type: {database_type}")

    @classmethod
    def create_from_uri(cls, database_uri: str) -> DatabaseAnalyzer:
        """
        Create a database analyzer from a database URI.

        Args:
            database_uri: Database connection URI
                          (e.g., mysql://user:pass@host/db)

        Returns:
            DatabaseAnalyzer instance

        Raises:
            ValueError: If URI format is invalid or database type unsupported
        """
        try:
            # Parse the URI
            if "://" not in database_uri:
                raise ValueError("Invalid URI format: missing protocol")

            protocol, rest = database_uri.split("://", 1)
            database_type = protocol.lower()

            # Parse connection parameters based on database type
            if database_type == "mysql":
                return cls._parse_mysql_uri(rest)
            if database_type == "postgresql":
                return cls._parse_postgresql_uri(rest)
            # elif database_type == "duckdb":
            #     return cls._parse_duckdb_uri(rest)
            raise ValueError(f"Unsupported database type in URI: {database_type}")

        except Exception as e:
            raise ValueError(f"Failed to parse database URI: {e}") from e

    @classmethod
    def _parse_authority(
        cls, uri_part: str, default_port: int, dialect: str
    ) -> Dict[str, Any]:
        """
        Parse the 'user[:password]@host[:port]/database' part shared by all
        supported URI dialects.

        Args:
            uri_part: URI remainder after the 'scheme://' prefix
            default_port: Port to use when the URI specifies none
            dialect: Human-readable dialect name used in error messages

        Returns:
            Dict with 'user', 'password', 'host', 'port', and 'database' keys
            (the database value may still carry a '?query' suffix)

        Raises:
            ValueError: If credentials or database name are missing, or the
                port is not an integer
        """
        if "@" not in uri_part:
            raise ValueError(f"Invalid {dialect} URI: missing credentials")

        # Split on the LAST '@' so passwords containing '@' parse correctly.
        credentials, host_db = uri_part.rsplit("@", 1)

        # Parse credentials
        if ":" in credentials:
            user, password = credentials.split(":", 1)
        else:
            user = credentials
            password = ""

        # Parse host, port, and database
        if "/" not in host_db:
            raise ValueError(f"Invalid {dialect} URI: missing database name")

        host_port, database = host_db.rsplit("/", 1)

        if ":" in host_port:
            host, port_str = host_port.split(":", 1)
            port = int(port_str)
        else:
            host = host_port
            port = default_port

        return {
            "user": user,
            "password": password,
            "host": host,
            "port": port,
            "database": database,
        }

    @classmethod
    def _parse_mysql_uri(cls, uri_part: str) -> MySQLAnalyzer:
        """Parse MySQL URI and create analyzer."""
        # Format: user:password@host:port/database
        parts = cls._parse_authority(uri_part, default_port=3306, dialect="MySQL")
        return MySQLAnalyzer(**parts)

    @classmethod
    def _parse_postgresql_uri(cls, uri_part: str) -> PostgreSQLAnalyzer:
        """Parse PostgreSQL URI and create analyzer."""
        # Format: user:password@host:port/database[?schema=name]
        parts = cls._parse_authority(uri_part, default_port=5432, dialect="PostgreSQL")

        # An optional '?schema=<name>' query selects a non-default schema.
        schema = "public"
        database = parts["database"]
        if "?" in database:
            database, query = database.split("?", 1)
            for fragment in query.split("&"):
                if fragment.startswith("schema="):
                    schema = fragment.split("=", 1)[1] or "public"
        parts["database"] = database

        return PostgreSQLAnalyzer(schema=schema, **parts)

    @classmethod
    def get_supported_databases(cls) -> list[str]:
        """
        Get list of supported database types.

        Returns:
            List of supported database type strings
        """
        return list(cls._analyzers.keys())

    @classmethod
    def register_analyzer(
        cls, database_type: str, analyzer_class: Type[DatabaseAnalyzer]
    ) -> None:
        """
        Register a new database analyzer.

        This allows for extending the factory with new database types
        without modifying the core factory code.

        Args:
            database_type: String identifier for the database type
            analyzer_class: DatabaseAnalyzer subclass for this database type
        """
        cls._analyzers[database_type.lower()] = analyzer_class