graflo-1.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (70)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1276 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +418 -0
  7. graflo/architecture/onto.py +376 -0
  8. graflo/architecture/onto_sql.py +54 -0
  9. graflo/architecture/resource.py +163 -0
  10. graflo/architecture/schema.py +135 -0
  11. graflo/architecture/transform.py +292 -0
  12. graflo/architecture/util.py +89 -0
  13. graflo/architecture/vertex.py +562 -0
  14. graflo/caster.py +736 -0
  15. graflo/cli/__init__.py +14 -0
  16. graflo/cli/ingest.py +203 -0
  17. graflo/cli/manage_dbs.py +197 -0
  18. graflo/cli/plot_schema.py +132 -0
  19. graflo/cli/xml2json.py +93 -0
  20. graflo/data_source/__init__.py +48 -0
  21. graflo/data_source/api.py +339 -0
  22. graflo/data_source/base.py +95 -0
  23. graflo/data_source/factory.py +304 -0
  24. graflo/data_source/file.py +148 -0
  25. graflo/data_source/memory.py +70 -0
  26. graflo/data_source/registry.py +82 -0
  27. graflo/data_source/sql.py +183 -0
  28. graflo/db/__init__.py +44 -0
  29. graflo/db/arango/__init__.py +22 -0
  30. graflo/db/arango/conn.py +1025 -0
  31. graflo/db/arango/query.py +180 -0
  32. graflo/db/arango/util.py +88 -0
  33. graflo/db/conn.py +377 -0
  34. graflo/db/connection/__init__.py +6 -0
  35. graflo/db/connection/config_mapping.py +18 -0
  36. graflo/db/connection/onto.py +717 -0
  37. graflo/db/connection/wsgi.py +29 -0
  38. graflo/db/manager.py +119 -0
  39. graflo/db/neo4j/__init__.py +16 -0
  40. graflo/db/neo4j/conn.py +639 -0
  41. graflo/db/postgres/__init__.py +37 -0
  42. graflo/db/postgres/conn.py +948 -0
  43. graflo/db/postgres/fuzzy_matcher.py +281 -0
  44. graflo/db/postgres/heuristics.py +133 -0
  45. graflo/db/postgres/inference_utils.py +428 -0
  46. graflo/db/postgres/resource_mapping.py +273 -0
  47. graflo/db/postgres/schema_inference.py +372 -0
  48. graflo/db/postgres/types.py +148 -0
  49. graflo/db/postgres/util.py +87 -0
  50. graflo/db/tigergraph/__init__.py +9 -0
  51. graflo/db/tigergraph/conn.py +2365 -0
  52. graflo/db/tigergraph/onto.py +26 -0
  53. graflo/db/util.py +49 -0
  54. graflo/filter/__init__.py +21 -0
  55. graflo/filter/onto.py +525 -0
  56. graflo/logging.conf +22 -0
  57. graflo/onto.py +312 -0
  58. graflo/plot/__init__.py +17 -0
  59. graflo/plot/plotter.py +616 -0
  60. graflo/util/__init__.py +23 -0
  61. graflo/util/chunker.py +807 -0
  62. graflo/util/merge.py +150 -0
  63. graflo/util/misc.py +37 -0
  64. graflo/util/onto.py +422 -0
  65. graflo/util/transform.py +454 -0
  66. graflo-1.3.7.dist-info/METADATA +243 -0
  67. graflo-1.3.7.dist-info/RECORD +70 -0
  68. graflo-1.3.7.dist-info/WHEEL +4 -0
  69. graflo-1.3.7.dist-info/entry_points.txt +5 -0
  70. graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
graflo/caster.py ADDED
@@ -0,0 +1,736 @@
"""Data casting and ingestion system for graph databases.

This module provides functionality for casting and ingesting data into graph databases.
It handles batch processing, file discovery, and database operations for ArangoDB,
Neo4j, and TigerGraph.

Key Components:
    - Caster: Main class for data casting and ingestion
    - FilePattern: Pattern matching for file discovery
    - Patterns: Collection of file patterns for different resources

Example:
    >>> caster = Caster(schema=schema)
    >>> caster.ingest(output_config=db_config, patterns=patterns)
"""

import logging
import multiprocessing as mp
import queue
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path
from typing import Any, cast

import pandas as pd
from pydantic import BaseModel
from suthing import Timer

from graflo.architecture.onto import EncodingType, GraphContainer
from graflo.architecture.schema import Schema
from graflo.data_source import (
    AbstractDataSource,
    DataSourceFactory,
    DataSourceRegistry,
)
from graflo.data_source.sql import SQLConfig, SQLDataSource
from graflo.db import DBType, ConnectionManager, DBConfig
from graflo.onto import DBFlavor
from graflo.util.chunker import ChunkerType
from graflo.util.onto import FilePattern, Patterns, ResourceType, TablePattern

logger = logging.getLogger(__name__)


class IngestionParams(BaseModel):
    """Parameters for controlling the ingestion process.

    Attributes:
        clean_start: Whether to clean the database before ingestion
        n_cores: Number of CPU cores/threads to use for parallel processing
        max_items: Maximum number of items to process per resource (applies to all data sources)
        batch_size: Size of batches for processing
        dry: Whether to perform a dry run (no database changes)
        init_only: Whether to only initialize the database without ingestion
        limit_files: Optional limit on number of files to process
    """

    clean_start: bool = False
    n_cores: int = 1
    max_items: int | None = None
    batch_size: int = 10000
    dry: bool = False
    init_only: bool = False
    limit_files: int | None = None


class Caster:
    """Main class for data casting and ingestion.

    This class handles the process of casting data into graph structures and
    ingesting them into the database. It supports batch processing, parallel
    execution, and various data formats.

    Attributes:
        schema: Schema configuration for the graph
        ingestion_params: IngestionParams instance controlling ingestion behavior
    """

    def __init__(
        self,
        schema: Schema,
        ingestion_params: IngestionParams | None = None,
        **kwargs,
    ):
        """Initialize the caster with schema and configuration.

        Args:
            schema: Schema configuration for the graph
            ingestion_params: IngestionParams instance with ingestion configuration.
                If None, creates IngestionParams from kwargs or uses defaults
            **kwargs: Additional configuration options (for backward compatibility):
                - clean_start: Whether to clean the database before ingestion
                - n_cores: Number of CPU cores/threads to use for parallel processing
                - max_items: Maximum number of items to process
                - batch_size: Size of batches for processing
                - dry: Whether to perform a dry run
        """
        if ingestion_params is None:
            # Create IngestionParams from kwargs or use defaults
            ingestion_params = IngestionParams(**kwargs)
        self.ingestion_params = ingestion_params
        self.schema = schema

    @staticmethod
    def discover_files(
        fpath: Path | str, pattern: FilePattern, limit_files=None
    ) -> list[Path]:
        """Discover files matching a pattern in a directory.

        Args:
            fpath: Path to search in (should be the directory containing files)
            pattern: Pattern to match files against
            limit_files: Optional limit on number of files to return

        Returns:
            list[Path]: List of matching file paths

        Raises:
            AssertionError: If pattern.sub_path is None
        """
        assert pattern.sub_path is not None
        if isinstance(fpath, str):
            fpath_pathlib = Path(fpath)
        else:
            fpath_pathlib = fpath

        # fpath is already the directory to search (pattern.sub_path from caller)
        # so we use it directly, not combined with pattern.sub_path again
        files = [
            f
            for f in fpath_pathlib.iterdir()
            if f.is_file()
            and (
                True
                if pattern.regex is None
                else re.search(pattern.regex, f.name) is not None
            )
        ]

        if limit_files is not None:
            files = files[:limit_files]

        return files

    def cast_normal_resource(
        self, data, resource_name: str | None = None
    ) -> GraphContainer:
        """Cast data into a graph container using a resource.

        Args:
            data: Data to cast
            resource_name: Optional name of the resource to use

        Returns:
            GraphContainer: Container with cast graph data
        """
        rr = self.schema.fetch_resource(resource_name)

        with ThreadPoolExecutor(max_workers=self.ingestion_params.n_cores) as executor:
            docs = list(
                executor.map(
                    lambda doc: rr(doc),
                    data,
                )
            )

        graph = GraphContainer.from_docs_list(docs)
        return graph

    def process_batch(
        self,
        batch,
        resource_name: str | None,
        conn_conf: None | DBConfig = None,
    ):
        """Process a batch of data.

        Args:
            batch: Batch of data to process
            resource_name: Optional name of the resource to use
            conn_conf: Optional database connection configuration
        """
        gc = self.cast_normal_resource(batch, resource_name=resource_name)

        if conn_conf is not None:
            self.push_db(gc=gc, conn_conf=conn_conf, resource_name=resource_name)

    def process_data_source(
        self,
        data_source: AbstractDataSource,
        resource_name: str | None = None,
        conn_conf: None | DBConfig = None,
    ):
        """Process a data source.

        Args:
            data_source: Data source to process
            resource_name: Optional name of the resource (overrides data_source.resource_name)
            conn_conf: Optional database connection configuration
        """
        # Use provided resource_name or fall back to data_source's resource_name
        actual_resource_name = resource_name or data_source.resource_name

        # Use pattern-specific limit if available, otherwise use global max_items
        limit = getattr(data_source, "_pattern_limit", None)
        if limit is None:
            limit = self.ingestion_params.max_items

        for batch in data_source.iter_batches(
            batch_size=self.ingestion_params.batch_size, limit=limit
        ):
            self.process_batch(
                batch, resource_name=actual_resource_name, conn_conf=conn_conf
            )

    def process_resource(
        self,
        resource_instance: (
            Path | str | list[dict] | list[list] | pd.DataFrame | dict[str, Any]
        ),
        resource_name: str | None,
        conn_conf: None | DBConfig = None,
        **kwargs,
    ):
        """Process a resource instance from configuration or direct data.

        This method accepts either:
        1. A configuration dictionary with 'source_type' and data source parameters
        2. A file path (Path or str) - creates FileDataSource
        3. In-memory data (list[dict], list[list], or pd.DataFrame) - creates InMemoryDataSource

        Args:
            resource_instance: Configuration dict, file path, or in-memory data.
                Configuration dict format:
                - {"source_type": "file", "path": "data.json"}
                - {"source_type": "api", "config": {"url": "https://..."}}
                - {"source_type": "sql", "config": {"connection_string": "...", "query": "..."}}
                - {"source_type": "in_memory", "data": [...]}
            resource_name: Optional name of the resource
            conn_conf: Optional database connection configuration
            **kwargs: Additional arguments passed to data source creation
                (e.g., columns for list[list], encoding for files)
        """
        # Handle configuration dictionary
        if isinstance(resource_instance, dict):
            config = resource_instance.copy()
            # Merge with kwargs (kwargs take precedence)
            config.update(kwargs)
            data_source = DataSourceFactory.create_data_source_from_config(config)
        # Handle file paths
        elif isinstance(resource_instance, (Path, str)):
            # File path - create FileDataSource
            # Extract only valid file data source parameters with proper typing
            file_type: str | ChunkerType | None = cast(
                str | ChunkerType | None, kwargs.get("file_type", None)
            )
            encoding: EncodingType = cast(
                EncodingType, kwargs.get("encoding", EncodingType.UTF_8)
            )
            sep: str | None = cast(str | None, kwargs.get("sep", None))
            data_source = DataSourceFactory.create_file_data_source(
                path=resource_instance,
                file_type=file_type,
                encoding=encoding,
                sep=sep,
            )
        # Handle in-memory data
        else:
            # In-memory data - create InMemoryDataSource
            # Extract only valid in-memory data source parameters with proper typing
            columns: list[str] | None = cast(
                list[str] | None, kwargs.get("columns", None)
            )
            data_source = DataSourceFactory.create_in_memory_data_source(
                data=resource_instance,
                columns=columns,
            )

        data_source.resource_name = resource_name

        # Process using the data source
        self.process_data_source(
            data_source=data_source,
            resource_name=resource_name,
            conn_conf=conn_conf,
        )

    def push_db(
        self,
        gc: GraphContainer,
        conn_conf: DBConfig,
        resource_name: str | None,
    ):
        """Push graph container data to the database.

        Args:
            gc: Graph container with data to push
            conn_conf: Database connection configuration
            resource_name: Optional name of the resource
        """
        vc = self.schema.vertex_config
        resource = self.schema.fetch_resource(resource_name)
        with ConnectionManager(connection_config=conn_conf) as db_client:
            for vcol, data in gc.vertices.items():
                # blank nodes: push and get back their keys {"_key": ...}
                if vcol in vc.blank_vertices:
                    query0 = db_client.insert_return_batch(data, vc.vertex_dbname(vcol))
                    cursor = db_client.execute(query0)
                    gc.vertices[vcol] = [item for item in cursor]
                else:
                    db_client.upsert_docs_batch(
                        data,
                        vc.vertex_dbname(vcol),
                        vc.index(vcol),
                        update_keys="doc",
                        filter_uniques=True,
                        dry=self.ingestion_params.dry,
                    )

        # update edge misc with blank node edges
        for vcol in vc.blank_vertices:
            for edge_id, edge in self.schema.edge_config.edges_items():
                vfrom, vto, relation = edge_id
                if vcol == vfrom or vcol == vto:
                    if edge_id not in gc.edges:
                        gc.edges[edge_id] = []
                    gc.edges[edge_id].extend(
                        [
                            (x, y, {})
                            for x, y in zip(gc.vertices[vfrom], gc.vertices[vto])
                        ]
                    )

        with ConnectionManager(connection_config=conn_conf) as db_client:
            # currently works only on item level
            for edge in resource.extra_weights:
                if edge.weights is None:
                    continue
                for weight in edge.weights.vertices:
                    if weight.name in vc.vertex_set:
                        index_fields = vc.index(weight.name)

                        if not self.ingestion_params.dry and weight.name in gc.vertices:
                            weights_per_item = db_client.fetch_present_documents(
                                class_name=vc.vertex_dbname(weight.name),
                                batch=gc.vertices[weight.name],
                                match_keys=index_fields.fields,
                                keep_keys=weight.fields,
                            )

                            for j, item in enumerate(gc.linear):
                                weights = weights_per_item[j]

                                for ee in item[edge.edge_id]:
                                    weight_collection_attached = {
                                        weight.cfield(k): v
                                        for k, v in weights[0].items()
                                    }
                                    ee.update(weight_collection_attached)
                    else:
                        logger.error(f"{weight.name} not a valid vertex")

        with ConnectionManager(connection_config=conn_conf) as db_client:
            for edge_id, edge in self.schema.edge_config.edges_items():
                for ee in gc.loop_over_relations(edge_id):
                    _, _, relation = ee
                    if not self.ingestion_params.dry:
                        data = gc.edges[ee]
                        db_client.insert_edges_batch(
                            docs_edges=data,
                            source_class=vc.vertex_dbname(edge.source),
                            target_class=vc.vertex_dbname(edge.target),
                            relation_name=relation,
                            collection_name=edge.database_name,
                            match_keys_source=vc.index(edge.source).fields,
                            match_keys_target=vc.index(edge.target).fields,
                            filter_uniques=False,
                            dry=self.ingestion_params.dry,
                        )

    def process_with_queue(self, tasks: mp.Queue, conn_conf: DBConfig | None = None):
        """Process tasks from a queue.

        Args:
            tasks: Queue of tasks to process
            conn_conf: Optional database connection configuration
        """
        while True:
            try:
                task = tasks.get_nowait()
                # Support both (Path, str) tuples and DataSource instances
                if isinstance(task, tuple) and len(task) == 2:
                    filepath, resource_name = task
                    self.process_resource(
                        resource_instance=filepath,
                        resource_name=resource_name,
                        conn_conf=conn_conf,
                    )
                elif isinstance(task, AbstractDataSource):
                    self.process_data_source(data_source=task, conn_conf=conn_conf)
            except queue.Empty:
                break

    @staticmethod
    def normalize_resource(
        data: pd.DataFrame | list[list] | list[dict], columns: list[str] | None = None
    ) -> list[dict]:
        """Normalize resource data into a list of dictionaries.

        Args:
            data: Data to normalize (DataFrame, list of lists, or list of dicts)
            columns: Optional column names for list data

        Returns:
            list[dict]: Normalized data as list of dictionaries

        Raises:
            ValueError: If columns is not provided for list data
        """
        if isinstance(data, pd.DataFrame):
            columns = data.columns.tolist()
            _data = data.values.tolist()
        elif data and isinstance(data[0], list):
            _data = cast(list[list], data)  # Tell mypy this is list[list]
            if columns is None:
                raise ValueError("columns should be set")
        else:
            return cast(list[dict], data)  # Tell mypy this is list[dict]
        rows_dressed = [{k: v for k, v in zip(columns, item)} for item in _data]
        return rows_dressed

    def ingest_data_sources(
        self,
        data_source_registry: DataSourceRegistry,
        conn_conf: DBConfig,
        ingestion_params: IngestionParams | None = None,
    ):
        """Ingest data from data sources in a registry.

        Args:
            data_source_registry: Registry containing data sources mapped to resources
            conn_conf: Database connection configuration
            ingestion_params: IngestionParams instance with ingestion configuration.
                If None, uses default IngestionParams()
        """
        if ingestion_params is None:
            ingestion_params = IngestionParams()

        # Update ingestion params (may override defaults set in __init__)
        self.ingestion_params = ingestion_params
        init_only = ingestion_params.init_only

        # If effective_schema is not set, use schema.general.name as fallback
        if conn_conf.can_be_target() and conn_conf.effective_schema is None:
            schema_name = self.schema.general.name
            # Map to the appropriate field based on DB type
            if conn_conf.connection_type == DBType.TIGERGRAPH:
                # TigerGraph uses 'schema_name' field
                conn_conf.schema_name = schema_name
            else:
                # ArangoDB, Neo4j use 'database' field (which maps to effective_schema)
                conn_conf.database = schema_name

        # init_db() now handles database/schema creation automatically
        # It checks if the database exists and creates it if needed
        # Uses schema.general.name if database is not set in config
        with ConnectionManager(connection_config=conn_conf) as db_client:
            db_client.init_db(self.schema, self.ingestion_params.clean_start)

        if init_only:
            logger.info("ingest execution bound to init")
            sys.exit(0)

        # Collect all data sources
        tasks: list[AbstractDataSource] = []
        for resource_name in self.schema._resources.keys():
            data_sources = data_source_registry.get_data_sources(resource_name)
            if data_sources:
                logger.info(
                    f"For resource name {resource_name} {len(data_sources)} data sources were found"
                )
                tasks.extend(data_sources)

        with Timer() as klepsidra:
            if self.ingestion_params.n_cores > 1:
                queue_tasks: mp.Queue = mp.Queue()
                for item in tasks:
                    queue_tasks.put(item)

                func = partial(
                    self.process_with_queue,
                    conn_conf=conn_conf,
                )
                assert mp.get_start_method() == "fork", (
                    "Requires 'forking' operating system"
                )

                processes = []

                for w in range(self.ingestion_params.n_cores):
                    p = mp.Process(target=func, args=(queue_tasks,))
                    processes.append(p)
                    p.start()
                for p in processes:
                    p.join()
            else:
                for data_source in tasks:
                    self.process_data_source(
                        data_source=data_source, conn_conf=conn_conf
                    )
        logger.info(f"Processing took {klepsidra.elapsed:.1f} sec")

    @staticmethod
    def _get_db_flavor_from_config(output_config: DBConfig) -> DBFlavor:
        """Convert DBConfig connection type to DBFlavor.

        Args:
            output_config: Database configuration

        Returns:
            DBFlavor enum value corresponding to the database type
        """
        db_type = output_config.connection_type
        if db_type == DBType.ARANGO:
            return DBFlavor.ARANGO
        elif db_type == DBType.NEO4J:
            return DBFlavor.NEO4J
        elif db_type == DBType.TIGERGRAPH:
            return DBFlavor.TIGERGRAPH
        else:
            # Default to ARANGO for unknown types
            return DBFlavor.ARANGO

    def _register_file_sources(
        self,
        registry: DataSourceRegistry,
        resource_name: str,
        pattern: FilePattern,
        ingestion_params: IngestionParams,
    ) -> None:
        """Register file data sources for a resource.

        Args:
            registry: Data source registry to add sources to
            resource_name: Name of the resource
            pattern: File pattern configuration
            ingestion_params: Ingestion parameters
        """
        if pattern.sub_path is None:
            logger.warning(
                f"FilePattern for resource '{resource_name}' has no sub_path, skipping"
            )
            return

        path_obj = pattern.sub_path.expanduser()
        files = Caster.discover_files(
            path_obj, limit_files=ingestion_params.limit_files, pattern=pattern
        )
        logger.info(f"For resource name {resource_name} {len(files)} files were found")

        for file_path in files:
            file_source = DataSourceFactory.create_file_data_source(path=file_path)
            registry.register(file_source, resource_name=resource_name)

    def _register_sql_table_sources(
        self,
        registry: DataSourceRegistry,
        resource_name: str,
        pattern: TablePattern,
        patterns: "Patterns",
        ingestion_params: IngestionParams,
    ) -> None:
        """Register SQL table data sources for a resource.

        Uses SQLDataSource with batch processing (cursors) instead of loading
        all data into memory. This is efficient for large tables.

        Args:
            registry: Data source registry to add sources to
            resource_name: Name of the resource
            pattern: Table pattern configuration
            patterns: Patterns instance for accessing configs
            ingestion_params: Ingestion parameters
        """
        postgres_config = patterns.get_postgres_config(resource_name)
        if postgres_config is None:
            logger.warning(
                f"PostgreSQL table '{resource_name}' has no connection config, skipping"
            )
            return

        table_info = patterns.get_table_info(resource_name)
        if table_info is None:
            logger.warning(
                f"Could not get table info for resource '{resource_name}', skipping"
            )
            return

        table_name, schema_name = table_info
        effective_schema = schema_name or postgres_config.schema_name or "public"

        try:
            # Build base query
            query = f'SELECT * FROM "{effective_schema}"."{table_name}"'
            where_clause = pattern.build_where_clause()
            if where_clause:
                query += f" WHERE {where_clause}"

            # Get SQLAlchemy connection string from PostgresConfig
            connection_string = postgres_config.to_sqlalchemy_connection_string()

            # Create SQLDataSource with pagination for efficient batch processing
            # Note: max_items limit is handled by SQLDataSource.iter_batches() limit parameter
            sql_config = SQLConfig(
                connection_string=connection_string,
                query=query,
                pagination=True,
                page_size=ingestion_params.batch_size,  # Use batch_size for page size
            )
            sql_source = SQLDataSource(config=sql_config)

            # Register the SQL data source (it will be processed in batches)
            registry.register(sql_source, resource_name=resource_name)

            logger.info(
                f"Created SQL data source for table '{effective_schema}.{table_name}' "
                f"mapped to resource '{resource_name}' (will process in batches of {ingestion_params.batch_size})"
            )
        except Exception as e:
            logger.error(
                f"Failed to create data source for PostgreSQL table '{resource_name}': {e}",
                exc_info=True,
            )

    def _build_registry_from_patterns(
        self,
        patterns: "Patterns",
        ingestion_params: IngestionParams,
    ) -> DataSourceRegistry:
        """Build data source registry from patterns.

        Args:
            patterns: Patterns instance mapping resources to data sources
            ingestion_params: Ingestion parameters

        Returns:
            DataSourceRegistry with registered data sources
        """
        registry = DataSourceRegistry()

        for resource in self.schema.resources:
            resource_name = resource.name
            resource_type = patterns.get_resource_type(resource_name)

            if resource_type is None:
                logger.warning(
                    f"No resource type found for resource '{resource_name}', skipping"
                )
                continue

            pattern = patterns.patterns.get(resource_name)
            if pattern is None:
                logger.warning(
                    f"No pattern found for resource '{resource_name}', skipping"
                )
                continue

            if resource_type == ResourceType.FILE:
                if not isinstance(pattern, FilePattern):
                    logger.warning(
                        f"Pattern for resource '{resource_name}' is not a FilePattern, skipping"
                    )
                    continue
                self._register_file_sources(
                    registry, resource_name, pattern, ingestion_params
                )

            elif resource_type == ResourceType.SQL_TABLE:
                if not isinstance(pattern, TablePattern):
                    logger.warning(
                        f"Pattern for resource '{resource_name}' is not a TablePattern, skipping"
                    )
                    continue
                self._register_sql_table_sources(
                    registry, resource_name, pattern, patterns, ingestion_params
                )

            else:
                logger.warning(
                    f"Unsupported resource type '{resource_type}' for resource '{resource_name}', skipping"
                )

        return registry

    def ingest(
        self,
        output_config: DBConfig,
        patterns: "Patterns | None" = None,
        ingestion_params: IngestionParams | None = None,
    ):
        """Ingest data into the graph database.

        This is the main ingestion method that takes:
        - Schema: Graph structure (already set in Caster)
        - OutputConfig: Target graph database configuration
        - Patterns: Mapping of resources to physical data sources
        - IngestionParams: Parameters controlling the ingestion process

        Args:
            output_config: Target database connection configuration (for writing graph)
            patterns: Patterns instance mapping resources to data sources.
                If None, defaults to empty Patterns()
            ingestion_params: IngestionParams instance with ingestion configuration.
                If None, uses default IngestionParams()
        """
        # Normalize parameters
        patterns = patterns or Patterns()
        ingestion_params = ingestion_params or IngestionParams()

        # Initialize vertex config with correct field types based on database type
        db_flavor = self._get_db_flavor_from_config(output_config)
        self.schema.vertex_config.finish_init(db_flavor)
        # Initialize edge config after vertex config is fully initialized
        self.schema.edge_config.finish_init(self.schema.vertex_config)

        # Build registry from patterns
        registry = self._build_registry_from_patterns(patterns, ingestion_params)

        # Ingest data sources
        self.ingest_data_sources(
            data_source_registry=registry,
            conn_conf=output_config,
            ingestion_params=ingestion_params,
        )
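
For orientation, a minimal usage sketch of the public entry point defined in this file follows. It is illustrative only and not part of the wheel: the Caster, IngestionParams, and ingest() calls mirror the listing above, while schema, patterns, and conn_conf are placeholders assumed to be built with other graflo modules whose constructors are not shown here.

    from graflo.caster import Caster, IngestionParams

    # Assumed to exist already (constructed elsewhere, not shown in this file):
    # schema    - graflo.architecture.schema.Schema describing vertices, edges, resources
    # patterns  - graflo.util.onto.Patterns mapping resource names to files / SQL tables
    # conn_conf - graflo.db.DBConfig for the target graph database

    params = IngestionParams(
        clean_start=True,  # re-initialize the target database before loading
        batch_size=5000,   # rows cast and pushed per batch
        n_cores=4,         # >1 switches to multiprocessing (requires the "fork" start method)
    )

    caster = Caster(schema=schema)
    caster.ingest(
        output_config=conn_conf,
        patterns=patterns,
        ingestion_params=params,
    )

With n_cores > 1, ingest_data_sources() distributes the registered data sources to worker processes through a multiprocessing queue; otherwise they are processed sequentially in the calling process.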