graflo 1.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of graflo might be problematic. Click here for more details.

Files changed (70):
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1276 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +418 -0
  7. graflo/architecture/onto.py +376 -0
  8. graflo/architecture/onto_sql.py +54 -0
  9. graflo/architecture/resource.py +163 -0
  10. graflo/architecture/schema.py +135 -0
  11. graflo/architecture/transform.py +292 -0
  12. graflo/architecture/util.py +89 -0
  13. graflo/architecture/vertex.py +562 -0
  14. graflo/caster.py +736 -0
  15. graflo/cli/__init__.py +14 -0
  16. graflo/cli/ingest.py +203 -0
  17. graflo/cli/manage_dbs.py +197 -0
  18. graflo/cli/plot_schema.py +132 -0
  19. graflo/cli/xml2json.py +93 -0
  20. graflo/data_source/__init__.py +48 -0
  21. graflo/data_source/api.py +339 -0
  22. graflo/data_source/base.py +95 -0
  23. graflo/data_source/factory.py +304 -0
  24. graflo/data_source/file.py +148 -0
  25. graflo/data_source/memory.py +70 -0
  26. graflo/data_source/registry.py +82 -0
  27. graflo/data_source/sql.py +183 -0
  28. graflo/db/__init__.py +44 -0
  29. graflo/db/arango/__init__.py +22 -0
  30. graflo/db/arango/conn.py +1025 -0
  31. graflo/db/arango/query.py +180 -0
  32. graflo/db/arango/util.py +88 -0
  33. graflo/db/conn.py +377 -0
  34. graflo/db/connection/__init__.py +6 -0
  35. graflo/db/connection/config_mapping.py +18 -0
  36. graflo/db/connection/onto.py +717 -0
  37. graflo/db/connection/wsgi.py +29 -0
  38. graflo/db/manager.py +119 -0
  39. graflo/db/neo4j/__init__.py +16 -0
  40. graflo/db/neo4j/conn.py +639 -0
  41. graflo/db/postgres/__init__.py +37 -0
  42. graflo/db/postgres/conn.py +948 -0
  43. graflo/db/postgres/fuzzy_matcher.py +281 -0
  44. graflo/db/postgres/heuristics.py +133 -0
  45. graflo/db/postgres/inference_utils.py +428 -0
  46. graflo/db/postgres/resource_mapping.py +273 -0
  47. graflo/db/postgres/schema_inference.py +372 -0
  48. graflo/db/postgres/types.py +148 -0
  49. graflo/db/postgres/util.py +87 -0
  50. graflo/db/tigergraph/__init__.py +9 -0
  51. graflo/db/tigergraph/conn.py +2365 -0
  52. graflo/db/tigergraph/onto.py +26 -0
  53. graflo/db/util.py +49 -0
  54. graflo/filter/__init__.py +21 -0
  55. graflo/filter/onto.py +525 -0
  56. graflo/logging.conf +22 -0
  57. graflo/onto.py +312 -0
  58. graflo/plot/__init__.py +17 -0
  59. graflo/plot/plotter.py +616 -0
  60. graflo/util/__init__.py +23 -0
  61. graflo/util/chunker.py +807 -0
  62. graflo/util/merge.py +150 -0
  63. graflo/util/misc.py +37 -0
  64. graflo/util/onto.py +422 -0
  65. graflo/util/transform.py +454 -0
  66. graflo-1.3.7.dist-info/METADATA +243 -0
  67. graflo-1.3.7.dist-info/RECORD +70 -0
  68. graflo-1.3.7.dist-info/WHEEL +4 -0
  69. graflo-1.3.7.dist-info/entry_points.txt +5 -0
  70. graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
@@ -0,0 +1,281 @@
1
+ """Fuzzy matching utilities for PostgreSQL schema analysis.
2
+
3
+ This module provides improved fuzzy matching strategies for identifying
4
+ vertex names from table and column fragments.
5
+ """
6
+
7
+ from difflib import SequenceMatcher
8
+
9
+
10
+ class FuzzyMatcher:
11
+ """Improved fuzzy matcher with multiple matching strategies.
12
+
13
+ Uses a combination of matching techniques:
14
+ 1. Exact matching (case-insensitive)
15
+ 2. Substring matching (with length-based scoring)
16
+ 3. Sequence similarity (difflib)
17
+ 4. Prefix/suffix matching
18
+ 5. Common pattern matching (handles id, fk, etc.)
19
+ """
20
+
21
+ def __init__(self, vertex_names: list[str], threshold: float = 0.6):
22
+ """Initialize the fuzzy matcher.
23
+
24
+ Args:
25
+ vertex_names: List of vertex table names to match against
26
+ threshold: Similarity threshold (0.0 to 1.0)
27
+ """
28
+ self.vertex_names = vertex_names
29
+ self.threshold = threshold
30
+ # Pre-compute lowercase versions for efficiency
31
+ self._vertex_lower_map = {vn.lower(): vn for vn in vertex_names}
32
+ self._vertex_lower_list = list(self._vertex_lower_map.keys())
33
+
34
+ def match(self, fragment: str) -> tuple[str | None, float]:
35
+ """Match a fragment against vertex names using multiple strategies.
36
+
37
+ Args:
38
+ fragment: Fragment to match
39
+
40
+ Returns:
41
+ Tuple of (best_match, score) or (None, 0.0) if no match above threshold
42
+ """
43
+ if not self.vertex_names or not fragment:
44
+ return (None, 0.0)
45
+
46
+ fragment_lower = fragment.lower()
47
+
48
+ # Strategy 1: Exact match (highest priority, returns immediately)
49
+ if fragment_lower in self._vertex_lower_map:
50
+ return (self._vertex_lower_map[fragment_lower], 1.0)
51
+
52
+ best_match = None
53
+ best_score = 0.0
54
+
55
+ # Strategy 2: Substring matching with length-based scoring
56
+ substring_score = self._substring_match(fragment_lower)
57
+ if substring_score[1] > best_score:
58
+ best_match, best_score = substring_score
59
+
60
+ # Strategy 3: Sequence similarity (difflib)
61
+ sequence_score = self._sequence_match(fragment_lower)
62
+ if sequence_score[1] > best_score:
63
+ best_match, best_score = sequence_score
64
+
65
+ # Strategy 4: Prefix/suffix matching
66
+ prefix_suffix_score = self._prefix_suffix_match(fragment_lower)
67
+ if prefix_suffix_score[1] > best_score:
68
+ best_match, best_score = prefix_suffix_score
69
+
70
+ # Strategy 5: Common pattern matching (handles id, fk, etc.)
71
+ pattern_score = self._pattern_match(fragment_lower)
72
+ if pattern_score[1] > best_score:
73
+ best_match, best_score = pattern_score
74
+
75
+ # Return match only if above threshold
76
+ if best_score >= self.threshold:
77
+ return (best_match, best_score)
78
+ return (None, 0.0)
79
+
80
+ def _substring_match(self, fragment_lower: str) -> tuple[str | None, float]:
81
+ """Match using substring containment with length-based scoring.
82
+
83
+ Args:
84
+ fragment_lower: Lowercase fragment to match
85
+
86
+ Returns:
87
+ Tuple of (best_match, score)
88
+ """
89
+ best_match = None
90
+ best_score = 0.0
91
+
92
+ for vertex_lower, vertex_name in self._vertex_lower_map.items():
93
+ # Check if fragment is contained in vertex or vice versa
94
+ if fragment_lower in vertex_lower:
95
+ # Fragment is substring of vertex (e.g., "user" in "users")
96
+ score = len(fragment_lower) / len(vertex_lower)
97
+ # Boost score if fragment is significant portion
98
+ if len(fragment_lower) >= 3: # At least 3 chars
99
+ score = min(score * 1.2, 0.95) # Cap at 0.95
100
+ if score > best_score:
101
+ best_score = score
102
+ best_match = vertex_name
103
+ elif vertex_lower in fragment_lower:
104
+ # Vertex is substring of fragment (e.g., "user" in "user_id")
105
+ score = len(vertex_lower) / len(fragment_lower)
106
+ # Boost score if vertex is significant portion
107
+ if len(vertex_lower) >= 3:
108
+ score = min(score * 1.2, 0.95)
109
+ if score > best_score:
110
+ best_score = score
111
+ best_match = vertex_name
112
+
113
+ return (best_match, best_score)
114
+
115
+ def _sequence_match(self, fragment_lower: str) -> tuple[str | None, float]:
116
+ """Match using sequence similarity (difflib).
117
+
118
+ Args:
119
+ fragment_lower: Lowercase fragment to match
120
+
121
+ Returns:
122
+ Tuple of (best_match, score)
123
+ """
124
+ best_match = None
125
+ best_score = 0.0
126
+
127
+ for vertex_lower, vertex_name in self._vertex_lower_map.items():
128
+ similarity = SequenceMatcher(None, fragment_lower, vertex_lower).ratio()
129
+ if similarity > best_score:
130
+ best_score = similarity
131
+ best_match = vertex_name
132
+
133
+ return (best_match, best_score)
134
+
135
+ def _prefix_suffix_match(self, fragment_lower: str) -> tuple[str | None, float]:
136
+ """Match using prefix or suffix patterns.
137
+
138
+ Args:
139
+ fragment_lower: Lowercase fragment to match
140
+
141
+ Returns:
142
+ Tuple of (best_match, score)
143
+ """
144
+ best_match = None
145
+ best_score = 0.0
146
+
147
+ for vertex_lower, vertex_name in self._vertex_lower_map.items():
148
+ # Check prefix match
149
+ if fragment_lower.startswith(vertex_lower):
150
+ score = len(vertex_lower) / len(fragment_lower)
151
+ if score > best_score:
152
+ best_score = score
153
+ best_match = vertex_name
154
+ # Check suffix match
155
+ elif fragment_lower.endswith(vertex_lower):
156
+ score = len(vertex_lower) / len(fragment_lower)
157
+ if score > best_score:
158
+ best_score = score
159
+ best_match = vertex_name
160
+ # Check if vertex starts with fragment
161
+ elif vertex_lower.startswith(fragment_lower):
162
+ score = len(fragment_lower) / len(vertex_lower)
163
+ if score > best_score:
164
+ best_score = score
165
+ best_match = vertex_name
166
+
167
+ return (best_match, best_score)
168
+
169
+ def _pattern_match(self, fragment_lower: str) -> tuple[str | None, float]:
170
+ """Match using common patterns (id, fk, etc.).
171
+
172
+ Args:
173
+ fragment_lower: Lowercase fragment to match
174
+
175
+ Returns:
176
+ Tuple of (best_match, score)
177
+ """
178
+ # Common suffixes/prefixes to remove
179
+ common_patterns = [
180
+ ("_id", ""),
181
+ ("_fk", ""),
182
+ ("_key", ""),
183
+ ("_pk", ""),
184
+ ("_ref", ""),
185
+ ("_reference", ""),
186
+ ("id_", ""),
187
+ ("fk_", ""),
188
+ ("key_", ""),
189
+ ("pk_", ""),
190
+ ("ref_", ""),
191
+ ("reference_", ""),
192
+ ]
193
+
194
+ best_match = None
195
+ best_score = 0.0
196
+
197
+ # Try removing common patterns and matching
198
+ for pattern, replacement in common_patterns:
199
+ if fragment_lower.endswith(pattern):
200
+ base = fragment_lower[: -len(pattern)]
201
+ if base in self._vertex_lower_map:
202
+ # High score for pattern-based matches
203
+ score = 0.9
204
+ if score > best_score:
205
+ best_score = score
206
+ best_match = self._vertex_lower_map[base]
207
+ elif fragment_lower.startswith(pattern):
208
+ base = fragment_lower[len(pattern) :]
209
+ if base in self._vertex_lower_map:
210
+ score = 0.9
211
+ if score > best_score:
212
+ best_score = score
213
+ best_match = self._vertex_lower_map[base]
214
+
215
+ return (best_match, best_score)
216
+
217
+
218
+ class FuzzyMatchCache:
219
+ """Cache for fuzzy matching fragments to vertex names.
220
+
221
+ Pre-computes fuzzy matches for all fragments to avoid redundant computations.
222
+ This significantly improves performance when processing multiple tables.
223
+ """
224
+
225
+ def __init__(self, vertex_names: list[str], threshold: float = 0.6):
226
+ """Initialize the fuzzy match cache.
227
+
228
+ Args:
229
+ vertex_names: List of vertex table names to match against
230
+ threshold: Similarity threshold (0.0 to 1.0)
231
+ """
232
+ self.vertex_names = vertex_names
233
+ self.threshold = threshold
234
+ self._matcher = FuzzyMatcher(vertex_names, threshold)
235
+ self._cache: dict[str, str | None] = {}
236
+ self._build_cache()
237
+
238
+ def _build_cache(self) -> None:
239
+ """Pre-compute fuzzy matches for common patterns."""
240
+ # Pre-compute exact matches (case-insensitive)
241
+ for vertex_name in self.vertex_names:
242
+ vertex_lower = vertex_name.lower()
243
+ self._cache[vertex_lower] = vertex_name
244
+ # Also cache common variations
245
+ for suffix in ["id", "fk", "key", "pk", "ref", "reference"]:
246
+ self._cache[f"{vertex_lower}_{suffix}"] = vertex_name
247
+ self._cache[f"{suffix}_{vertex_lower}"] = vertex_name
248
+
249
+ def get_match(self, fragment: str) -> str | None:
250
+ """Get cached fuzzy match for a fragment, computing if not cached.
251
+
252
+ Args:
253
+ fragment: Fragment to match
254
+
255
+ Returns:
256
+ Best matching vertex name or None if no match above threshold
257
+ """
258
+ fragment_lower = fragment.lower()
259
+
260
+ # Check cache first
261
+ if fragment_lower in self._cache:
262
+ return self._cache[fragment_lower]
263
+
264
+ # Compute match if not cached using improved matcher
265
+ match, _ = self._matcher.match(fragment)
266
+ self._cache[fragment_lower] = match
267
+ return match
268
+
269
+ def batch_match(self, fragments: list[str]) -> dict[str, str | None]:
270
+ """Match multiple fragments in batch, using cache when possible.
271
+
272
+ Args:
273
+ fragments: List of fragments to match
274
+
275
+ Returns:
276
+ Dictionary mapping fragments to their matched vertex names (or None)
277
+ """
278
+ results = {}
279
+ for fragment in fragments:
280
+ results[fragment] = self.get_match(fragment)
281
+ return results
@@ -0,0 +1,133 @@
1
+ import logging
2
+
3
+ from graflo.util.onto import Patterns, TablePattern
4
+ from graflo.db.postgres.conn import (
5
+ PostgresConnection,
6
+ )
7
+ from graflo.db.postgres.resource_mapping import PostgresResourceMapper
8
+ from graflo.db.postgres.schema_inference import PostgresSchemaInferencer
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def create_patterns_from_postgres(
    conn: PostgresConnection, schema_name: str | None = None
) -> Patterns:
    """Build a Patterns object describing every introspected PostgreSQL table.

    The connection config is stored under the key ``("default", schema)``, and
    each vertex table and each edge table gets a ``TablePattern`` plus a
    ``(config_key, schema, table)`` entry, both keyed by the table name.

    Args:
        conn: PostgresConnection instance
        schema_name: Schema name to introspect; when falsy, the schema name
            reported by the introspection result is used for the patterns

    Returns:
        Patterns: Patterns object with TablePattern instances for all tables
    """
    introspected = conn.introspect_schema(schema_name=schema_name)

    patterns = Patterns()
    target_schema = schema_name or introspected.schema_name

    # All tables are registered against one shared connection config.
    config_key = "default"
    patterns.postgres_configs[(config_key, target_schema)] = conn.config

    # Vertex tables first, then edge tables; both are registered the same way.
    for table_group in (introspected.vertex_tables, introspected.edge_tables):
        for table in table_group:
            name = table.name
            patterns.patterns[name] = TablePattern(
                table_name=name,
                schema_name=target_schema,
                resource_name=name,
            )
            patterns.postgres_table_configs[name] = (config_key, target_schema, name)

    return patterns
70
+
71
+
72
def create_resources_from_postgres(
    conn: PostgresConnection, schema, schema_name: str | None = None
):
    """Map the introspected PostgreSQL tables onto Resources for a given schema.

    Args:
        conn: PostgresConnection instance
        schema: Existing Schema object; its ``vertex_config`` and
            ``edge_config`` are handed to the resource mapper
        schema_name: Schema name to introspect

    Returns:
        list[Resource]: List of Resources for PostgreSQL tables
    """
    introspected = conn.introspect_schema(schema_name=schema_name)

    return PostgresResourceMapper().map_tables_to_resources(
        introspected, schema.vertex_config, schema.edge_config
    )
95
+
96
+
97
def infer_schema_from_postgres(
    conn: PostgresConnection, schema_name: str | None = None, db_flavor=None
):
    """Infer a graflo Schema, including resources, from a PostgreSQL database.

    Introspects the database, infers the schema from the result, maps the
    tables to resources, attaches them to the schema and re-runs
    ``__post_init__`` on it.

    Args:
        conn: PostgresConnection instance
        schema_name: Schema name to introspect (passed through unchanged;
            any fallback for ``None`` is resolved by the callee, not here)
        db_flavor: Target database flavor (defaults to ARANGO)

    Returns:
        Schema: Inferred schema with vertices, edges, and resources
    """
    from graflo.onto import DBFlavor

    target_flavor = DBFlavor.ARANGO if db_flavor is None else db_flavor

    introspected = conn.introspect_schema(schema_name=schema_name)

    # The inferencer also receives the connection (for type sampling).
    schema = PostgresSchemaInferencer(db_flavor=target_flavor, conn=conn).infer_schema(
        introspected, schema_name=schema_name
    )

    schema.resources = PostgresResourceMapper().map_tables_to_resources(
        introspected, schema.vertex_config, schema.edge_config
    )
    # Re-initialize to set up resource mappings
    schema.__post_init__()

    return schema