graflo-1.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1120 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +297 -0
  7. graflo/architecture/onto.py +374 -0
  8. graflo/architecture/resource.py +161 -0
  9. graflo/architecture/schema.py +136 -0
  10. graflo/architecture/transform.py +292 -0
  11. graflo/architecture/util.py +93 -0
  12. graflo/architecture/vertex.py +586 -0
  13. graflo/caster.py +655 -0
  14. graflo/cli/__init__.py +14 -0
  15. graflo/cli/ingest.py +194 -0
  16. graflo/cli/manage_dbs.py +197 -0
  17. graflo/cli/plot_schema.py +132 -0
  18. graflo/cli/xml2json.py +93 -0
  19. graflo/data_source/__init__.py +48 -0
  20. graflo/data_source/api.py +339 -0
  21. graflo/data_source/base.py +97 -0
  22. graflo/data_source/factory.py +298 -0
  23. graflo/data_source/file.py +133 -0
  24. graflo/data_source/memory.py +72 -0
  25. graflo/data_source/registry.py +82 -0
  26. graflo/data_source/sql.py +185 -0
  27. graflo/db/__init__.py +44 -0
  28. graflo/db/arango/__init__.py +22 -0
  29. graflo/db/arango/conn.py +1026 -0
  30. graflo/db/arango/query.py +180 -0
  31. graflo/db/arango/util.py +88 -0
  32. graflo/db/conn.py +377 -0
  33. graflo/db/connection/__init__.py +6 -0
  34. graflo/db/connection/config_mapping.py +18 -0
  35. graflo/db/connection/onto.py +688 -0
  36. graflo/db/connection/wsgi.py +29 -0
  37. graflo/db/manager.py +119 -0
  38. graflo/db/neo4j/__init__.py +16 -0
  39. graflo/db/neo4j/conn.py +639 -0
  40. graflo/db/postgres/__init__.py +156 -0
  41. graflo/db/postgres/conn.py +425 -0
  42. graflo/db/postgres/resource_mapping.py +139 -0
  43. graflo/db/postgres/schema_inference.py +245 -0
  44. graflo/db/postgres/types.py +148 -0
  45. graflo/db/tigergraph/__init__.py +9 -0
  46. graflo/db/tigergraph/conn.py +2212 -0
  47. graflo/db/util.py +49 -0
  48. graflo/filter/__init__.py +21 -0
  49. graflo/filter/onto.py +525 -0
  50. graflo/logging.conf +22 -0
  51. graflo/onto.py +190 -0
  52. graflo/plot/__init__.py +17 -0
  53. graflo/plot/plotter.py +556 -0
  54. graflo/util/__init__.py +23 -0
  55. graflo/util/chunker.py +751 -0
  56. graflo/util/merge.py +150 -0
  57. graflo/util/misc.py +37 -0
  58. graflo/util/onto.py +332 -0
  59. graflo/util/transform.py +448 -0
  60. graflo-1.3.3.dist-info/METADATA +190 -0
  61. graflo-1.3.3.dist-info/RECORD +64 -0
  62. graflo-1.3.3.dist-info/WHEEL +4 -0
  63. graflo-1.3.3.dist-info/entry_points.txt +5 -0
  64. graflo-1.3.3.dist-info/licenses/LICENSE +126 -0
graflo/caster.py ADDED
@@ -0,0 +1,655 @@
+ """Data casting and ingestion system for graph databases.
+
+ This module provides functionality for casting and ingesting data into graph databases.
+ It handles batch processing, file discovery, and database operations for both ArangoDB
+ and Neo4j.
+
+ Key Components:
+     - Caster: Main class for data casting and ingestion
+     - FilePattern: Pattern matching for file discovery
+     - Patterns: Collection of file patterns for different resources
+
+ Example:
+     >>> caster = Caster(schema=schema)
+     >>> caster.ingest(path="data/", conn_conf=db_config)
+ """
+
+ from __future__ import annotations
+
+ import logging
+ import multiprocessing as mp
+ import queue
+ import re
+ import sys
+ from concurrent.futures import ThreadPoolExecutor
+ from functools import partial
+ from pathlib import Path
+ from typing import Any, cast
+
+ import pandas as pd
+ from suthing import Timer
+
+ from graflo.architecture.onto import EncodingType, GraphContainer
+ from graflo.architecture.schema import Schema
+ from graflo.data_source import (
+     AbstractDataSource,
+     DataSourceFactory,
+     DataSourceRegistry,
+ )
+ from graflo.db import DBType, ConnectionManager, DBConfig
+ from graflo.util.chunker import ChunkerType
+ from graflo.util.onto import FilePattern, Patterns, TablePattern
+
+ logger = logging.getLogger(__name__)
+
+
+ class Caster:
+     """Main class for data casting and ingestion.
+
+     This class handles the process of casting data into graph structures and
+     ingesting them into the database. It supports batch processing, parallel
+     execution, and various data formats.
+
+     Attributes:
+         clean_start: Whether to clean the database before ingestion
+         n_cores: Number of CPU cores to use for parallel processing
+         max_items: Maximum number of items to process
+         batch_size: Size of batches for processing
+         n_threads: Number of threads for parallel processing
+         dry: Whether to perform a dry run (no database changes)
+         schema: Schema configuration for the graph
+     """
+
+     def __init__(self, schema: Schema, **kwargs):
+         """Initialize the caster with schema and configuration.
+
+         Args:
+             schema: Schema configuration for the graph
+             **kwargs: Additional configuration options:
+                 - clean_start: Whether to clean the database before ingestion
+                 - n_cores: Number of CPU cores to use
+                 - max_items: Maximum number of items to process
+                 - batch_size: Size of batches for processing
+                 - n_threads: Number of threads for parallel processing
+                 - dry: Whether to perform a dry run
+         """
+         self.clean_start: bool = False
+         self.n_cores = kwargs.pop("n_cores", 1)
+         self.max_items = kwargs.pop("max_items", None)
+         self.batch_size = kwargs.pop("batch_size", 10000)
+         self.n_threads = kwargs.pop("n_threads", 1)
+         self.dry = kwargs.pop("dry", False)
+         self.schema = schema
+
+     @staticmethod
+     def discover_files(
+         fpath: Path | str, pattern: FilePattern, limit_files=None
+     ) -> list[Path]:
+         """Discover files matching a pattern in a directory.
+
+         Args:
+             fpath: Path to search in
+             pattern: Pattern to match files against
+             limit_files: Optional limit on number of files to return
+
+         Returns:
+             list[Path]: List of matching file paths
+
+         Raises:
+             AssertionError: If pattern.sub_path is None
+         """
+         assert pattern.sub_path is not None
+         if isinstance(fpath, str):
+             fpath_pathlib = Path(fpath)
+         else:
+             fpath_pathlib = fpath
+
+         files = [
+             f
+             for f in (fpath_pathlib / pattern.sub_path).iterdir()
+             if f.is_file()
+             and (
+                 True
+                 if pattern.regex is None
+                 else re.search(pattern.regex, f.name) is not None
+             )
+         ]
+
+         if limit_files is not None:
+             files = files[:limit_files]
+
+         return files
+
+     def cast_normal_resource(
+         self, data, resource_name: str | None = None
+     ) -> GraphContainer:
+         """Cast data into a graph container using a resource.
+
+         Args:
+             data: Data to cast
+             resource_name: Optional name of the resource to use
+
+         Returns:
+             GraphContainer: Container with cast graph data
+         """
+         rr = self.schema.fetch_resource(resource_name)
+
+         with ThreadPoolExecutor(max_workers=self.n_threads) as executor:
+             docs = list(
+                 executor.map(
+                     lambda doc: rr(doc),
+                     data,
+                 )
+             )
+
+         graph = GraphContainer.from_docs_list(docs)
+         return graph
+
+     def process_batch(
+         self,
+         batch,
+         resource_name: str | None,
+         conn_conf: None | DBConfig = None,
+     ):
+         """Process a batch of data.
+
+         Args:
+             batch: Batch of data to process
+             resource_name: Optional name of the resource to use
+             conn_conf: Optional database connection configuration
+         """
+         gc = self.cast_normal_resource(batch, resource_name=resource_name)
+
+         if conn_conf is not None:
+             self.push_db(gc=gc, conn_conf=conn_conf, resource_name=resource_name)
+
+     def process_data_source(
+         self,
+         data_source: AbstractDataSource,
+         resource_name: str | None = None,
+         conn_conf: None | DBConfig = None,
+     ):
+         """Process a data source.
+
+         Args:
+             data_source: Data source to process
+             resource_name: Optional name of the resource (overrides data_source.resource_name)
+             conn_conf: Optional database connection configuration
+         """
+         # Use provided resource_name or fall back to data_source's resource_name
+         actual_resource_name = resource_name or data_source.resource_name
+
+         for batch in data_source.iter_batches(
+             batch_size=self.batch_size, limit=self.max_items
+         ):
+             self.process_batch(
+                 batch, resource_name=actual_resource_name, conn_conf=conn_conf
+             )
+
+     def process_resource(
+         self,
+         resource_instance: (
+             Path | str | list[dict] | list[list] | pd.DataFrame | dict[str, Any]
+         ),
+         resource_name: str | None,
+         conn_conf: None | DBConfig = None,
+         **kwargs,
+     ):
+         """Process a resource instance from configuration or direct data.
+
+         This method accepts either:
+         1. A configuration dictionary with 'source_type' and data source parameters
+         2. A file path (Path or str) - creates FileDataSource
+         3. In-memory data (list[dict], list[list], or pd.DataFrame) - creates InMemoryDataSource
+
+         Args:
+             resource_instance: Configuration dict, file path, or in-memory data.
+                 Configuration dict format:
+                 - {"source_type": "file", "path": "data.json"}
+                 - {"source_type": "api", "config": {"url": "https://..."}}
+                 - {"source_type": "sql", "config": {"connection_string": "...", "query": "..."}}
+                 - {"source_type": "in_memory", "data": [...]}
+             resource_name: Optional name of the resource
+             conn_conf: Optional database connection configuration
+             **kwargs: Additional arguments passed to data source creation
+                 (e.g., columns for list[list], encoding for files)
+         """
+         # Handle configuration dictionary
+         if isinstance(resource_instance, dict):
+             config = resource_instance.copy()
+             # Merge with kwargs (kwargs take precedence)
+             config.update(kwargs)
+             data_source = DataSourceFactory.create_data_source_from_config(config)
+         # Handle file paths
+         elif isinstance(resource_instance, (Path, str)):
+             # File path - create FileDataSource
+             # Extract only valid file data source parameters with proper typing
+             file_type: str | ChunkerType | None = cast(
+                 str | ChunkerType | None, kwargs.get("file_type", None)
+             )
+             encoding: EncodingType = cast(
+                 EncodingType, kwargs.get("encoding", EncodingType.UTF_8)
+             )
+             sep: str | None = cast(str | None, kwargs.get("sep", None))
+             data_source = DataSourceFactory.create_file_data_source(
+                 path=resource_instance,
+                 file_type=file_type,
+                 encoding=encoding,
+                 sep=sep,
+             )
+         # Handle in-memory data
+         else:
+             # In-memory data - create InMemoryDataSource
+             # Extract only valid in-memory data source parameters with proper typing
+             columns: list[str] | None = cast(
+                 list[str] | None, kwargs.get("columns", None)
+             )
+             data_source = DataSourceFactory.create_in_memory_data_source(
+                 data=resource_instance,
+                 columns=columns,
+             )
+
+         data_source.resource_name = resource_name
+
+         # Process using the data source
+         self.process_data_source(
+             data_source=data_source,
+             resource_name=resource_name,
+             conn_conf=conn_conf,
+         )
+
+     def push_db(
+         self,
+         gc: GraphContainer,
+         conn_conf: DBConfig,
+         resource_name: str | None,
+     ):
+         """Push graph container data to the database.
+
+         Args:
+             gc: Graph container with data to push
+             conn_conf: Database connection configuration
+             resource_name: Optional name of the resource
+         """
+         vc = self.schema.vertex_config
+         resource = self.schema.fetch_resource(resource_name)
+         with ConnectionManager(connection_config=conn_conf) as db_client:
+             for vcol, data in gc.vertices.items():
+                 # blank nodes: push and get back their keys {"_key": ...}
+                 if vcol in vc.blank_vertices:
+                     query0 = db_client.insert_return_batch(data, vc.vertex_dbname(vcol))
+                     cursor = db_client.execute(query0)
+                     gc.vertices[vcol] = [item for item in cursor]
+                 else:
+                     db_client.upsert_docs_batch(
+                         data,
+                         vc.vertex_dbname(vcol),
+                         vc.index(vcol),
+                         update_keys="doc",
+                         filter_uniques=True,
+                         dry=self.dry,
+                     )
+
+         # update edge misc with blank node edges
+         for vcol in vc.blank_vertices:
+             for edge_id, edge in self.schema.edge_config.edges_items():
+                 vfrom, vto, relation = edge_id
+                 if vcol == vfrom or vcol == vto:
+                     if edge_id not in gc.edges:
+                         gc.edges[edge_id] = []
+                     gc.edges[edge_id].extend(
+                         [
+                             (x, y, {})
+                             for x, y in zip(gc.vertices[vfrom], gc.vertices[vto])
+                         ]
+                     )
+
+         with ConnectionManager(connection_config=conn_conf) as db_client:
+             # currently works only on item level
+             for edge in resource.extra_weights:
+                 if edge.weights is None:
+                     continue
+                 for weight in edge.weights.vertices:
+                     if weight.name in vc.vertex_set:
+                         index_fields = vc.index(weight.name)
+
+                         if not self.dry and weight.name in gc.vertices:
+                             weights_per_item = db_client.fetch_present_documents(
+                                 class_name=vc.vertex_dbname(weight.name),
+                                 batch=gc.vertices[weight.name],
+                                 match_keys=index_fields.fields,
+                                 keep_keys=weight.fields,
+                             )
+
+                             for j, item in enumerate(gc.linear):
+                                 weights = weights_per_item[j]
+
+                                 for ee in item[edge.edge_id]:
+                                     weight_collection_attached = {
+                                         weight.cfield(k): v
+                                         for k, v in weights[0].items()
+                                     }
+                                     ee.update(weight_collection_attached)
+                     else:
+                         logger.error(f"{weight.name} not a valid vertex")
+
+         with ConnectionManager(connection_config=conn_conf) as db_client:
+             for edge_id, edge in self.schema.edge_config.edges_items():
+                 for ee in gc.loop_over_relations(edge_id):
+                     _, _, relation = ee
+                     if not self.dry:
+                         data = gc.edges[ee]
+                         db_client.insert_edges_batch(
+                             docs_edges=data,
+                             source_class=vc.vertex_dbname(edge.source),
+                             target_class=vc.vertex_dbname(edge.target),
+                             relation_name=relation,
+                             collection_name=edge.collection_name,
+                             match_keys_source=vc.index(edge.source).fields,
+                             match_keys_target=vc.index(edge.target).fields,
+                             filter_uniques=False,
+                             dry=self.dry,
+                         )
+
+     def process_with_queue(self, tasks: mp.Queue, **kwargs):
+         """Process tasks from a queue.
+
+         Args:
+             tasks: Queue of tasks to process
+             **kwargs: Additional keyword arguments
+         """
+         while True:
+             try:
+                 task = tasks.get_nowait()
+                 # Support both (Path, str) tuples and DataSource instances
+                 if isinstance(task, tuple) and len(task) == 2:
+                     filepath, resource_name = task
+                     self.process_resource(
+                         resource_instance=filepath,
+                         resource_name=resource_name,
+                         **kwargs,
+                     )
+                 elif isinstance(task, AbstractDataSource):
+                     self.process_data_source(data_source=task, **kwargs)
+             except queue.Empty:
+                 break
+
+     @staticmethod
+     def normalize_resource(
+         data: pd.DataFrame | list[list] | list[dict], columns: list[str] | None = None
+     ) -> list[dict]:
+         """Normalize resource data into a list of dictionaries.
+
+         Args:
+             data: Data to normalize (DataFrame, list of lists, or list of dicts)
+             columns: Optional column names for list data
+
+         Returns:
+             list[dict]: Normalized data as list of dictionaries
+
+         Raises:
+             ValueError: If columns is not provided for list data
+         """
+         if isinstance(data, pd.DataFrame):
+             columns = data.columns.tolist()
+             _data = data.values.tolist()
+         elif data and isinstance(data[0], list):
+             _data = cast(list[list], data)  # Tell mypy this is list[list]
+             if columns is None:
+                 raise ValueError("columns should be set")
+         else:
+             return cast(list[dict], data)  # Tell mypy this is list[dict]
+         rows_dressed = [{k: v for k, v in zip(columns, item)} for item in _data]
+         return rows_dressed
+
+     def ingest_data_sources(
+         self,
+         data_source_registry: DataSourceRegistry,
+         conn_conf: None | DBConfig = None,
+         **kwargs,
+     ):
+         """Ingest data from data sources in a registry.
+
+         Args:
+             data_source_registry: Registry containing data sources mapped to resources
+             conn_conf: Database connection configuration
+             **kwargs: Additional keyword arguments:
+                 - clean_start: Whether to clean the database before ingestion
+                 - n_cores: Number of CPU cores to use
+                 - max_items: Maximum number of items to process
+                 - batch_size: Size of batches for processing
+                 - dry: Whether to perform a dry run
+                 - init_only: Whether to only initialize the database
+         """
+         conn_conf = cast(DBConfig, kwargs.get("conn_conf", conn_conf))
+         self.clean_start = kwargs.pop("clean_start", self.clean_start)
+         self.n_cores = kwargs.pop("n_cores", self.n_cores)
+         self.max_items = kwargs.pop("max_items", self.max_items)
+         self.batch_size = kwargs.pop("batch_size", self.batch_size)
+         self.dry = kwargs.pop("dry", self.dry)
+         init_only = kwargs.pop("init_only", False)
+
+         if conn_conf is None:
+             raise ValueError("conn_conf is required for ingest_data_sources")
+
+         # If effective_schema is not set, use schema.general.name as fallback
+         if conn_conf.can_be_target() and conn_conf.effective_schema is None:
+             schema_name = self.schema.general.name
+             # Map to the appropriate field based on DB type
+             if conn_conf.connection_type == DBType.TIGERGRAPH:
+                 # TigerGraph uses 'schema_name' field
+                 conn_conf.schema_name = schema_name
+             else:
+                 # ArangoDB, Neo4j use 'database' field (which maps to effective_schema)
+                 conn_conf.database = schema_name
+
+         # init_db() now handles database/schema creation automatically
+         # It checks if the database exists and creates it if needed
+         # Uses schema.general.name if database is not set in config
+         with ConnectionManager(connection_config=conn_conf) as db_client:
+             db_client.init_db(self.schema, self.clean_start)
+
+         if init_only:
+             logger.info("ingest execution bound to init")
+             sys.exit(0)
+
+         # Collect all data sources
+         tasks: list[AbstractDataSource] = []
+         for resource_name in self.schema._resources.keys():
+             data_sources = data_source_registry.get_data_sources(resource_name)
+             if data_sources:
+                 logger.info(
+                     f"For resource name {resource_name} {len(data_sources)} data sources were found"
+                 )
+                 tasks.extend(data_sources)
+
+         with Timer() as klepsidra:
+             if self.n_cores > 1:
+                 queue_tasks: mp.Queue = mp.Queue()
+                 for item in tasks:
+                     queue_tasks.put(item)
+
+                 func = partial(
+                     self.process_with_queue,
+                     conn_conf=conn_conf,
+                     **kwargs,
+                 )
+                 assert mp.get_start_method() == "fork", (
+                     "Requires 'forking' operating system"
+                 )
+
+                 processes = []
+
+                 for w in range(self.n_cores):
+                     p = mp.Process(target=func, args=(queue_tasks,), kwargs=kwargs)
+                     processes.append(p)
+                     p.start()
+                 for p in processes:
+                     p.join()
+             else:
+                 for data_source in tasks:
+                     self.process_data_source(
+                         data_source=data_source, conn_conf=conn_conf
+                     )
+         logger.info(f"Processing took {klepsidra.elapsed:.1f} sec")
+
+     def ingest(
+         self,
+         output_config: DBConfig,
+         patterns: "Patterns | None" = None,
+         **kwargs,
+     ):
+         """Ingest data into the graph database.
+
+         This is the main ingestion method that takes:
+         - Schema: Graph structure (already set in Caster)
+         - OutputConfig: Target graph database configuration
+         - Patterns: Mapping of resources to physical data sources
+
+         Args:
+             output_config: Target database connection configuration (for writing graph)
+             patterns: Patterns instance mapping resources to data sources
+                 If None, will try to use legacy 'patterns' kwarg
+             **kwargs: Additional keyword arguments:
+                 - clean_start: Whether to clean the database before ingestion
+                 - n_cores: Number of CPU cores to use
+                 - max_items: Maximum number of items to process
+                 - batch_size: Size of batches for processing
+                 - dry: Whether to perform a dry run
+                 - init_only: Whether to only initialize the database
+                 - limit_files: Optional limit on number of files to process
+                 - conn_conf: Legacy parameter (use output_config instead)
+         """
+         # Backward compatibility: support legacy conn_conf parameter
+         if "conn_conf" in kwargs:
+             output_config = kwargs.pop("conn_conf")
+
+         # Backward compatibility: support legacy patterns parameter
+         if patterns is None:
+             patterns = kwargs.pop("patterns", Patterns())
+
+         # Create DataSourceRegistry from patterns
+         registry = DataSourceRegistry()
+
+         for r in self.schema.resources:
+             resource_name = r.name
+             resource_type = patterns.get_resource_type(resource_name)
+
+             if resource_type == "file":
+                 # Handle file pattern
+                 pattern = patterns.patterns[resource_name]
+                 if not isinstance(pattern, FilePattern):
+                     logger.warning(
+                         f"Pattern for resource '{resource_name}' is not a FilePattern, skipping"
+                     )
+                     continue
+
+                 # Use sub_path from FilePattern (path is now part of the pattern)
+                 if pattern.sub_path is None:
+                     logger.warning(
+                         f"FilePattern for resource '{resource_name}' has no sub_path, skipping"
+                     )
+                     continue
+                 path_obj = pattern.sub_path.expanduser()
+                 limit_files = kwargs.get("limit_files", None)
+
+                 files = Caster.discover_files(
+                     path_obj, limit_files=limit_files, pattern=pattern
+                 )
+                 logger.info(
+                     f"For resource name {resource_name} {len(files)} files were found"
+                 )
+
+                 # Create FileDataSource for each file
+                 for file_path in files:
+                     file_source = DataSourceFactory.create_file_data_source(
+                         path=file_path
+                     )
+                     registry.register(file_source, resource_name=resource_name)
+
+             elif resource_type == "table":
+                 # Handle PostgreSQL table
+                 pattern = patterns.patterns[resource_name]
+                 if not isinstance(pattern, TablePattern):
+                     logger.warning(
+                         f"Pattern for resource '{resource_name}' is not a TablePattern, skipping"
+                     )
+                     continue
+
+                 postgres_config = patterns.get_postgres_config(resource_name)
+                 if postgres_config is None:
+                     logger.warning(
+                         f"PostgreSQL table '{resource_name}' has no connection config, skipping"
+                     )
+                     continue
+
+                 # Get table info
+                 table_info = patterns.get_table_info(resource_name)
+                 if table_info is None:
+                     logger.warning(
+                         f"Could not get table info for resource '{resource_name}', skipping"
+                     )
+                     continue
+
+                 table_name, schema_name = table_info
+                 effective_schema = (
+                     schema_name or postgres_config.schema_name or "public"
+                 )
+
+                 # Create SQLDataSource for PostgreSQL table
+                 try:
+                     query = f'SELECT * FROM "{effective_schema}"."{table_name}"'
+
+                     from graflo.data_source.sql import SQLConfig, SQLDataSource
+                     from urllib.parse import urlparse
+
+                     parsed = urlparse(postgres_config.uri or "")
+                     host = parsed.hostname or "localhost"
+                     port = parsed.port or 5432
+                     database = (
+                         postgres_config.database
+                         or parsed.path.lstrip("/")
+                         or "postgres"
+                     )
+                     user = postgres_config.username or parsed.username or "postgres"
+                     password = postgres_config.password or parsed.password or ""
+
+                     # Build PostgreSQL connection string
+                     if password:
+                         connection_string = (
+                             f"postgresql://{user}:{password}@{host}:{port}/{database}"
+                         )
+                     else:
+                         connection_string = (
+                             f"postgresql://{user}@{host}:{port}/{database}"
+                         )
+
+                     # Create SQLDataSource
+                     sql_config = SQLConfig(
+                         connection_string=connection_string,
+                         query=query,
+                         pagination=True,
+                         page_size=1000,
+                     )
+                     sql_source = SQLDataSource(config=sql_config)
+                     registry.register(sql_source, resource_name=resource_name)
+
+                     logger.info(
+                         f"Created SQLDataSource for table '{effective_schema}.{table_name}' "
+                         f"mapped to resource '{resource_name}'"
+                     )
+                 except Exception as e:
+                     logger.error(
+                         f"Failed to create data source for PostgreSQL table '{resource_name}': {e}",
+                         exc_info=True,
+                     )
+                     continue
+
+             else:
+                 logger.warning(
+                     f"No pattern configuration found for resource '{resource_name}', skipping"
+                 )
+
+         # Use the new ingest_data_sources method with output_config
+         kwargs["conn_conf"] = output_config
+         self.ingest_data_sources(registry, **kwargs)
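The two entry points shown above cover the usual usage styles: process_resource for a single resource handed in directly, and ingest for a full run driven by Patterns. A minimal sketch of both calls, assuming a Schema instance (schema) and a target DBConfig (db_conf) have already been built elsewhere, and using hypothetical resource names and file paths; note that ingest takes output_config and patterns, while conn_conf= is still accepted only as a legacy kwarg:

from pathlib import Path

from graflo.caster import Caster
from graflo.util.onto import Patterns

# `schema` and `db_conf` are assumed to exist; building them is outside caster.py.
caster = Caster(schema=schema, batch_size=5000, n_threads=2)

# Single resource, fed directly: a file path, in-memory rows, or a config dict.
caster.process_resource(
    resource_instance=Path("data/users.json"),  # hypothetical file
    resource_name="users",                      # hypothetical resource name
    conn_conf=db_conf,
)

# Full run: Patterns maps schema resources to physical data locations; how
# individual patterns are registered depends on the Patterns API, which is
# not part of this file.
patterns = Patterns()
caster.ingest(output_config=db_conf, patterns=patterns, clean_start=True)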
graflo/cli/__init__.py ADDED
@@ -0,0 +1,14 @@
+ """Command-line interface for graflo.
+
+ This module provides command-line tools for working with graflo, including
+ utilities for data ingestion, schema management, and graph operations.
+
+ Key Components:
+     - Command-line tools for data processing
+     - Schema management utilities
+     - Graph database operations
+
+ Example:
+     >>> uv run ingest --config config.json --data data.json
+     >>> uv run plot_schema --config schema.yaml --output figs
+ """