graflo 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1120 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +297 -0
  7. graflo/architecture/onto.py +374 -0
  8. graflo/architecture/resource.py +161 -0
  9. graflo/architecture/schema.py +136 -0
  10. graflo/architecture/transform.py +292 -0
  11. graflo/architecture/util.py +93 -0
  12. graflo/architecture/vertex.py +586 -0
  13. graflo/caster.py +655 -0
  14. graflo/cli/__init__.py +14 -0
  15. graflo/cli/ingest.py +194 -0
  16. graflo/cli/manage_dbs.py +197 -0
  17. graflo/cli/plot_schema.py +132 -0
  18. graflo/cli/xml2json.py +93 -0
  19. graflo/data_source/__init__.py +48 -0
  20. graflo/data_source/api.py +339 -0
  21. graflo/data_source/base.py +97 -0
  22. graflo/data_source/factory.py +298 -0
  23. graflo/data_source/file.py +133 -0
  24. graflo/data_source/memory.py +72 -0
  25. graflo/data_source/registry.py +82 -0
  26. graflo/data_source/sql.py +185 -0
  27. graflo/db/__init__.py +44 -0
  28. graflo/db/arango/__init__.py +22 -0
  29. graflo/db/arango/conn.py +1026 -0
  30. graflo/db/arango/query.py +180 -0
  31. graflo/db/arango/util.py +88 -0
  32. graflo/db/conn.py +377 -0
  33. graflo/db/connection/__init__.py +6 -0
  34. graflo/db/connection/config_mapping.py +18 -0
  35. graflo/db/connection/onto.py +688 -0
  36. graflo/db/connection/wsgi.py +29 -0
  37. graflo/db/manager.py +119 -0
  38. graflo/db/neo4j/__init__.py +16 -0
  39. graflo/db/neo4j/conn.py +639 -0
  40. graflo/db/postgres/__init__.py +156 -0
  41. graflo/db/postgres/conn.py +425 -0
  42. graflo/db/postgres/resource_mapping.py +139 -0
  43. graflo/db/postgres/schema_inference.py +245 -0
  44. graflo/db/postgres/types.py +148 -0
  45. graflo/db/tigergraph/__init__.py +9 -0
  46. graflo/db/tigergraph/conn.py +2212 -0
  47. graflo/db/util.py +49 -0
  48. graflo/filter/__init__.py +21 -0
  49. graflo/filter/onto.py +525 -0
  50. graflo/logging.conf +22 -0
  51. graflo/onto.py +190 -0
  52. graflo/plot/__init__.py +17 -0
  53. graflo/plot/plotter.py +556 -0
  54. graflo/util/__init__.py +23 -0
  55. graflo/util/chunker.py +751 -0
  56. graflo/util/merge.py +150 -0
  57. graflo/util/misc.py +37 -0
  58. graflo/util/onto.py +332 -0
  59. graflo/util/transform.py +448 -0
  60. graflo-1.3.3.dist-info/METADATA +190 -0
  61. graflo-1.3.3.dist-info/RECORD +64 -0
  62. graflo-1.3.3.dist-info/WHEEL +4 -0
  63. graflo-1.3.3.dist-info/entry_points.txt +5 -0
  64. graflo-1.3.3.dist-info/licenses/LICENSE +126 -0
graflo/util/merge.py ADDED
@@ -0,0 +1,150 @@
1
+ """Document merging utilities.
2
+
3
+ This module provides functions for merging documents based on common index keys,
4
+ preserving order and handling both dict and VertexRep objects.
5
+
6
+ Key Functions:
7
+ - merge_doc_basis: Merge documents based on common index keys, preserving order
8
+
9
+ """
10
+
11
+ from typing import cast, overload
12
+
13
+ from graflo.architecture.onto import VertexRep
14
+
15
+
16
@overload
def merge_doc_basis(
    docs: list[dict],
    index_keys: tuple[str, ...],
) -> list[dict]: ...


@overload
def merge_doc_basis(
    docs: list[VertexRep],
    index_keys: tuple[str, ...],
) -> list[VertexRep]: ...


def merge_doc_basis(
    docs: list[dict] | list[VertexRep],
    index_keys: tuple[str, ...],
) -> list[dict] | list[VertexRep]:
    """Merge documents based on common index keys, preserving order.

    Documents that share the same combination of index key/value pairs are
    folded into a single output document, placed where that combination first
    appeared. On conflicting fields the later document wins (``dict.update``).

    Documents carrying none of the index keys are merged into the output
    document of the closest preceding keyed document (even when that keyed
    document was itself folded into an earlier position). Keyless documents
    that come before the first keyed document are merged into that first keyed
    document. If no document has index keys, all documents are merged into a
    single document.

    For VertexRep objects, the merge is performed on the ``vertex`` attribute,
    and the ``ctx`` dicts of merged VertexReps are merged as well.

    The input documents are not modified: output documents are shallow copies
    (or freshly created containers) of the inputs.

    Args:
        docs: Homogeneous list of documents (all dict or all VertexRep) to merge
        index_keys: Tuple of key names to use for merging

    Returns:
        Merged documents in order of first occurrence (same type as input).
        An empty input list is returned as is.

    Raises:
        TypeError: If the value of an index key is unhashable, since the
            key/value pairs are used as a dictionary key.
    """
    if not docs:
        return docs

    # The list is homogeneous by contract, so the first element decides the mode.
    is_vertexrep = isinstance(docs[0], VertexRep)

    # Output documents, in order of first occurrence of their index tuple.
    merged_docs: list[dict | VertexRep] = []
    # Index tuple -> position of the corresponding document in merged_docs.
    index_to_position: dict[tuple, int] = {}
    # Keyless documents seen before any keyed document.
    pending_non_ids: list[dict | VertexRep] = []
    # Output slot of the most recently processed keyed document.
    last_position: int | None = None

    def payload(doc: dict | VertexRep) -> dict:
        """Return the dict that holds the document fields."""
        if is_vertexrep:
            return cast(VertexRep, doc).vertex
        return cast(dict, doc)

    def merge_doc(target: dict | VertexRep, source: dict | VertexRep) -> None:
        """Merge source into target (source wins on conflicts)."""
        payload(target).update(payload(source))
        if is_vertexrep:
            cast(VertexRep, target).ctx.update(cast(VertexRep, source).ctx)

    def copy_doc(doc: dict | VertexRep) -> dict | VertexRep:
        """Create a shallow copy of a document."""
        if is_vertexrep:
            rep = cast(VertexRep, doc)
            return VertexRep(vertex=rep.vertex.copy(), ctx=rep.ctx.copy())
        return cast(dict, doc).copy()

    for doc in docs:
        data = payload(doc)

        if not any(k in data for k in index_keys):
            if last_position is None:
                # No keyed document yet: hold on to it until one shows up.
                pending_non_ids.append(doc)
            else:
                # Attach to the closest preceding keyed document's output slot,
                # which is not necessarily the last element of merged_docs.
                merge_doc(merged_docs[last_position], doc)
            continue

        # Sorted so the tuple does not depend on the key order inside the doc.
        index_tuple = tuple(sorted((k, v) for k, v in data.items() if k in index_keys))
        position = index_to_position.get(index_tuple)

        if position is None:
            # First occurrence: work on a copy so the caller's doc is untouched.
            merged_docs.append(copy_doc(doc))
            position = len(merged_docs) - 1
            index_to_position[index_tuple] = position
            # Only non-empty for the very first keyed document.
            for pending in pending_non_ids:
                merge_doc(merged_docs[position], pending)
            pending_non_ids.clear()
        else:
            merge_doc(merged_docs[position], doc)

        last_position = position

    if pending_non_ids:
        # No document had index keys: collapse everything into one document.
        merged_doc: dict | VertexRep
        if is_vertexrep:
            merged_doc = VertexRep(vertex={}, ctx={})
        else:
            merged_doc = {}
        for pending in pending_non_ids:
            merge_doc(merged_doc, pending)
        merged_docs.append(merged_doc)

    # Type narrowing: return type matches input type due to homogeneous list requirement
    return cast(list[dict] | list[VertexRep], merged_docs)
graflo/util/misc.py ADDED
@@ -0,0 +1,37 @@
1
+ """Miscellaneous utility functions.
2
+
3
+ This module provides various utility functions for data manipulation and processing.
4
+
5
+ Key Functions:
6
+ - sorted_dicts: Recursively sort dictionaries and lists for consistent ordering
7
+ """
8
+
9
+
10
def sorted_dicts(d):
    """Recursively sort the lists nested in a data structure.

    The goal is a consistent ordering of list contents so that structures
    can be compared irrespective of the original element order.

    Rules:
        - dict: values that are lists, tuples or dicts are processed
          recursively; keys keep their insertion order.
        - list/tuple of dicts whose first element has only scalar values:
          returned as a list sorted by ``tuple(item.items())``.
        - list/tuple of scalars: returned as a sorted list; if the elements
          cannot be ordered (mixed types), it is returned unchanged.
        - anything else (empty sequences, sequences of nested containers,
          scalars): returned unchanged.

    Args:
        d: Data structure to sort (dict, list, tuple, or other)

    Returns:
        The data structure with consistently ordered lists. Sorted sequences
        are always returned as ``list``, even when the input was a tuple.

    Example:
        >>> data = {"b": 2, "a": 1, "c": [3, 1, 2]}
        >>> sorted_dicts(data)
        {'b': 2, 'a': 1, 'c': [1, 2, 3]}
    """
    containers = (list, tuple, dict)

    if isinstance(d, dict):
        return {
            k: sorted_dicts(v) if isinstance(v, containers) else v
            for k, v in d.items()
        }

    if not isinstance(d, (list, tuple)) or not d:
        return d

    if all(isinstance(item, dict) for item in d):
        # Only the first element is inspected to decide whether the dicts are flat.
        if not any(isinstance(v, containers) for v in d[0].values()):
            return sorted(d, key=lambda x: tuple(x.items()))
        return d

    if not any(isinstance(item, containers) for item in d):
        try:
            return sorted(d)
        except TypeError:
            # Elements are not mutually orderable (e.g. int vs str): leave as is.
            return d

    return d
graflo/util/onto.py ADDED
@@ -0,0 +1,332 @@
1
+ """Utility ontology classes for resource patterns and configurations.
2
+
3
+ This module provides data classes for managing resource patterns (files and database tables)
4
+ and configurations used throughout the system. These classes support resource discovery,
5
+ pattern matching, and configuration management.
6
+
7
+ Key Components:
8
+ - ResourcePattern: Abstract base class for resource patterns
9
+ - FilePattern: Configuration for file pattern matching
10
+ - TablePattern: Configuration for database table pattern matching
11
+ - Patterns: Collection of named resource patterns with connection management
12
+ """
13
+
14
+ import abc
15
+ import dataclasses
16
+ import pathlib
17
+ import re
18
+ from typing import TYPE_CHECKING, Any, Union
19
+
20
+ from graflo.onto import BaseDataclass
21
+
22
+ if TYPE_CHECKING:
23
+ from graflo.db.connection.onto import PostgresConfig
24
+ else:
25
+ # Import at runtime for type evaluation
26
+ try:
27
+ from graflo.db.connection.onto import PostgresConfig
28
+ except ImportError:
29
+ PostgresConfig = Any # type: ignore
30
+
31
+
32
@dataclasses.dataclass
class ResourcePattern(BaseDataclass, abc.ABC):
    """Common interface shared by every resource pattern.

    Concrete subclasses describe one kind of resource (the ones in this
    module cover files and database tables) and must implement both
    ``matches`` and ``get_resource_type``.

    Attributes:
        resource_name: Name of the resource this pattern matches
    """

    resource_name: str | None = None

    @abc.abstractmethod
    def matches(self, resource_identifier: str) -> bool:
        """Tell whether this pattern accepts the given identifier.

        Args:
            resource_identifier: Identifier to test (filename or table name)

        Returns:
            bool: True when the identifier is matched by the pattern
        """

    @abc.abstractmethod
    def get_resource_type(self) -> str:
        """Report which kind of resource the pattern describes.

        Returns:
            str: Resource type ("file" or "table")
        """
65
+
66
+
67
@dataclasses.dataclass
class FilePattern(ResourcePattern):
    """Pattern for matching files.

    Attributes:
        regex: Regular expression pattern for matching filenames
        sub_path: Path to search for matching files (default: "./")
    """

    # NOTE(review): presumably the tag BaseDataclass uses to identify this
    # pattern type during (de)serialization — confirm against BaseDataclass.
    class _(BaseDataclass.Meta):
        tag = "file"

    regex: str | None = None
    sub_path: None | pathlib.Path = dataclasses.field(
        default_factory=lambda: pathlib.Path("./")
    )

    def __post_init__(self):
        """Normalize ``sub_path`` to a ``pathlib.Path``.

        ``None`` is allowed by the field type; it is replaced by the default
        ``./`` instead of being passed to ``pathlib.Path``, which would raise
        ``TypeError``. Any other non-Path value (e.g. a string) is converted.
        """
        if self.sub_path is None:
            self.sub_path = pathlib.Path("./")
        elif not isinstance(self.sub_path, pathlib.Path):
            self.sub_path = pathlib.Path(self.sub_path)

    def matches(self, filename: str) -> bool:
        """Check if pattern matches a filename.

        The regex is applied with ``re.match``, i.e. anchored at the start of
        the filename only.

        Args:
            filename: Filename to match

        Returns:
            bool: True if pattern matches; always False when no regex is set
        """
        if self.regex is None:
            return False
        return re.match(self.regex, filename) is not None

    def get_resource_type(self) -> str:
        """Get resource type.

        Returns:
            str: Always "file"
        """
        return "file"
109
+
110
+
111
@dataclasses.dataclass
class TablePattern(ResourcePattern):
    """Pattern for matching database tables.

    Attributes:
        table_name: Exact table name, or a regex pattern (recognized by a
            leading "^" or a trailing "$")
        schema_name: Schema name (optional)
        database: Database name (optional)
    """

    # NOTE(review): presumably the tag BaseDataclass uses to identify this
    # pattern type during (de)serialization — confirm against BaseDataclass.
    class _(BaseDataclass.Meta):
        tag = "table"

    table_name: str = ""
    schema_name: str | None = None
    database: str | None = None

    def __post_init__(self):
        """Validate table pattern after initialization.

        Raises:
            ValueError: If ``table_name`` is empty
        """
        if not self.table_name:
            raise ValueError("table_name is required for TablePattern")

    def matches(self, table_identifier: str) -> bool:
        """Check if pattern matches a table name.

        When ``schema_name`` is set, the schema qualifier may be present on
        either side: a bare identifier is also tried as ``schema.identifier``
        against the pattern, and an identifier given as ``schema.table`` is
        also tried with the ``schema.`` prefix removed.

        Args:
            table_identifier: Table name to match (format: schema.table or just table)

        Returns:
            bool: True if pattern matches
        """
        # Guards against table_name being cleared after construction.
        if not self.table_name:
            return False

        if self.table_name.startswith("^") or self.table_name.endswith("$"):
            # Already a regex pattern
            pattern = re.compile(self.table_name)
        else:
            # Exact match pattern
            pattern = re.compile(f"^{re.escape(self.table_name)}$")

        if pattern.match(table_identifier):
            return True

        if self.schema_name:
            prefix = f"{self.schema_name}."

            # Pattern is schema-qualified, identifier is bare.
            if pattern.match(prefix + table_identifier):
                return True

            # Identifier is schema-qualified, pattern is bare.
            if table_identifier.startswith(prefix) and pattern.match(
                table_identifier[len(prefix):]
            ):
                return True

        return False

    def get_resource_type(self) -> str:
        """Get resource type.

        Returns:
            str: Always "table"
        """
        return "table"
168
+
169
+
170
@dataclasses.dataclass
class Patterns(BaseDataclass):
    """Named collection of resource patterns plus PostgreSQL connection lookup.

    Each pattern (file or table) is stored under a resource name. For table
    resources the class additionally remembers which connection configuration
    the table belongs to, so the matching config can be retrieved by resource
    name.

    Optional initialization inputs:
        - _resource_mapping: resource_name -> file path (str) or
          (config_key, table_name) tuple
        - _postgres_connections: config_key -> connection config object
          exposing a ``schema_name`` attribute
        - _postgres_tables: resource_name -> (config_key, schema_name, table_name)

    Attributes:
        patterns: Dictionary mapping resource names to ResourcePattern instances
        postgres_configs: Dictionary mapping (config_key, schema_name) to the
            connection config
        postgres_table_configs: Dictionary mapping resource_name to
            (config_key, schema_name, table_name)
    """

    patterns: dict[str, Union[FilePattern, TablePattern]] = dataclasses.field(
        default_factory=dict
    )
    postgres_configs: dict[tuple[str, str | None], Any] = dataclasses.field(
        default_factory=dict, metadata={"exclude": True}
    )
    postgres_table_configs: dict[str, tuple[str, str | None, str]] = dataclasses.field(
        default_factory=dict, metadata={"exclude": True}
    )
    # Initialization inputs, consumed by __post_init__.
    # Any is used for the connection configs to avoid type evaluation issues
    # with dataclass_wizard.
    _resource_mapping: dict[str, str | tuple[str, str]] | None = dataclasses.field(
        default=None, repr=False, compare=False, metadata={"exclude": True}
    )
    _postgres_connections: dict[str, Any] | None = dataclasses.field(
        default=None, repr=False, compare=False, metadata={"exclude": True}
    )
    _postgres_tables: dict[str, tuple[str, str | None, str]] | None = dataclasses.field(
        default=None, repr=False, compare=False, metadata={"exclude": True}
    )

    def _register_table(self, resource_name, config_key, schema_name, table_name):
        """Create a TablePattern and record which connection it belongs to."""
        self.patterns[resource_name] = TablePattern(
            table_name=table_name,
            schema_name=schema_name,
            resource_name=resource_name,
        )
        self.postgres_table_configs[resource_name] = (
            config_key,
            schema_name,
            table_name,
        )

    def __post_init__(self):
        """Populate the lookup tables from the initialization inputs."""
        connections = self._postgres_connections

        # Index every non-null connection config by (config_key, its schema).
        for key, conn_config in (connections or {}).items():
            if conn_config is not None:
                self.postgres_configs[(key, conn_config.schema_name)] = conn_config

        # Turn the generic resource mapping into file or table patterns.
        for name, spec in (self._resource_mapping or {}).items():
            if isinstance(spec, str):
                # A plain string is a file path: match its exact file name
                # inside its parent directory.
                location = pathlib.Path(spec)
                self.patterns[name] = FilePattern(
                    regex=f"^{re.escape(location.name)}$",
                    sub_path=location.parent,
                    resource_name=name,
                )
                continue

            if not (isinstance(spec, tuple) and len(spec) == 2):
                # Any other shape is ignored.
                continue

            # (config_key, table_name): the schema comes from the connection
            # config when one is registered under that key.
            key, table = spec
            conn_config = connections.get(key) if connections else None
            schema = conn_config.schema_name if conn_config else None
            self._register_table(name, key, schema, table)

        # Explicit table declarations carry their own schema name.
        for name, table_spec in (self._postgres_tables or {}).items():
            key, schema, table = table_spec
            self._register_table(name, key, schema, table)

    def add_file_pattern(self, name: str, file_pattern: FilePattern):
        """Register a file pattern under ``name``, replacing any existing entry.

        Args:
            name: Name of the pattern
            file_pattern: FilePattern instance
        """
        self.patterns[name] = file_pattern

    def add_table_pattern(self, name: str, table_pattern: TablePattern):
        """Register a table pattern under ``name``, replacing any existing entry.

        Args:
            name: Name of the pattern
            table_pattern: TablePattern instance
        """
        self.patterns[name] = table_pattern

    def get_postgres_config(self, resource_name: str) -> Any:
        """Look up the connection config associated with a table resource.

        Args:
            resource_name: Name of the resource

        Returns:
            The config stored under the resource's (config_key, schema_name),
            or None when the resource is not a registered table or no config
            is stored under that key
        """
        if resource_name not in self.postgres_table_configs:
            return None
        config_key, schema_name, _ = self.postgres_table_configs[resource_name]
        return self.postgres_configs.get((config_key, schema_name))

    def get_resource_type(self, resource_name: str) -> str | None:
        """Report the kind of resource registered under a name.

        Args:
            resource_name: Name of the resource

        Returns:
            "file", "table", or None if not found
        """
        if resource_name not in self.patterns:
            return None
        return self.patterns[resource_name].get_resource_type()

    def get_table_info(self, resource_name: str) -> tuple[str, str | None] | None:
        """Return the table name and schema of a table resource.

        Args:
            resource_name: Name of the resource

        Returns:
            Tuple of (table_name, schema_name) or None if not a table resource
        """
        if resource_name not in self.postgres_table_configs:
            return None
        _, schema_name, table_name = self.postgres_table_configs[resource_name]
        return (table_name, schema_name)