graflo-1.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graflo/README.md +18 -0
- graflo/__init__.py +70 -0
- graflo/architecture/__init__.py +38 -0
- graflo/architecture/actor.py +1120 -0
- graflo/architecture/actor_util.py +450 -0
- graflo/architecture/edge.py +297 -0
- graflo/architecture/onto.py +374 -0
- graflo/architecture/resource.py +161 -0
- graflo/architecture/schema.py +136 -0
- graflo/architecture/transform.py +292 -0
- graflo/architecture/util.py +93 -0
- graflo/architecture/vertex.py +586 -0
- graflo/caster.py +655 -0
- graflo/cli/__init__.py +14 -0
- graflo/cli/ingest.py +194 -0
- graflo/cli/manage_dbs.py +197 -0
- graflo/cli/plot_schema.py +132 -0
- graflo/cli/xml2json.py +93 -0
- graflo/data_source/__init__.py +48 -0
- graflo/data_source/api.py +339 -0
- graflo/data_source/base.py +97 -0
- graflo/data_source/factory.py +298 -0
- graflo/data_source/file.py +133 -0
- graflo/data_source/memory.py +72 -0
- graflo/data_source/registry.py +82 -0
- graflo/data_source/sql.py +185 -0
- graflo/db/__init__.py +44 -0
- graflo/db/arango/__init__.py +22 -0
- graflo/db/arango/conn.py +1026 -0
- graflo/db/arango/query.py +180 -0
- graflo/db/arango/util.py +88 -0
- graflo/db/conn.py +377 -0
- graflo/db/connection/__init__.py +6 -0
- graflo/db/connection/config_mapping.py +18 -0
- graflo/db/connection/onto.py +688 -0
- graflo/db/connection/wsgi.py +29 -0
- graflo/db/manager.py +119 -0
- graflo/db/neo4j/__init__.py +16 -0
- graflo/db/neo4j/conn.py +639 -0
- graflo/db/postgres/__init__.py +156 -0
- graflo/db/postgres/conn.py +425 -0
- graflo/db/postgres/resource_mapping.py +139 -0
- graflo/db/postgres/schema_inference.py +245 -0
- graflo/db/postgres/types.py +148 -0
- graflo/db/tigergraph/__init__.py +9 -0
- graflo/db/tigergraph/conn.py +2212 -0
- graflo/db/util.py +49 -0
- graflo/filter/__init__.py +21 -0
- graflo/filter/onto.py +525 -0
- graflo/logging.conf +22 -0
- graflo/onto.py +190 -0
- graflo/plot/__init__.py +17 -0
- graflo/plot/plotter.py +556 -0
- graflo/util/__init__.py +23 -0
- graflo/util/chunker.py +751 -0
- graflo/util/merge.py +150 -0
- graflo/util/misc.py +37 -0
- graflo/util/onto.py +332 -0
- graflo/util/transform.py +448 -0
- graflo-1.3.3.dist-info/METADATA +190 -0
- graflo-1.3.3.dist-info/RECORD +64 -0
- graflo-1.3.3.dist-info/WHEEL +4 -0
- graflo-1.3.3.dist-info/entry_points.txt +5 -0
- graflo-1.3.3.dist-info/licenses/LICENSE +126 -0
graflo/caster.py
ADDED
@@ -0,0 +1,655 @@
"""Data casting and ingestion system for graph databases.

This module provides functionality for casting and ingesting data into graph databases.
It handles batch processing, file discovery, and database operations for both ArangoDB
and Neo4j.

Key Components:
    - Caster: Main class for data casting and ingestion
    - FilePattern: Pattern matching for file discovery
    - Patterns: Collection of file patterns for different resources

Example:
    >>> caster = Caster(schema=schema)
    >>> caster.ingest(path="data/", conn_conf=db_config)
"""

from __future__ import annotations

import logging
import multiprocessing as mp
import queue
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path
from typing import Any, cast

import pandas as pd
from suthing import Timer

from graflo.architecture.onto import EncodingType, GraphContainer
from graflo.architecture.schema import Schema
from graflo.data_source import (
    AbstractDataSource,
    DataSourceFactory,
    DataSourceRegistry,
)
from graflo.db import DBType, ConnectionManager, DBConfig
from graflo.util.chunker import ChunkerType
from graflo.util.onto import FilePattern, Patterns, TablePattern

logger = logging.getLogger(__name__)

class Caster:
    """Main class for data casting and ingestion.

    This class handles the process of casting data into graph structures and
    ingesting them into the database. It supports batch processing, parallel
    execution, and various data formats.

    Attributes:
        clean_start: Whether to clean the database before ingestion
        n_cores: Number of CPU cores to use for parallel processing
        max_items: Maximum number of items to process
        batch_size: Size of batches for processing
        n_threads: Number of threads for parallel processing
        dry: Whether to perform a dry run (no database changes)
        schema: Schema configuration for the graph
    """

    def __init__(self, schema: Schema, **kwargs):
        """Initialize the caster with schema and configuration.

        Args:
            schema: Schema configuration for the graph
            **kwargs: Additional configuration options:
                - clean_start: Whether to clean the database before ingestion
                - n_cores: Number of CPU cores to use
                - max_items: Maximum number of items to process
                - batch_size: Size of batches for processing
                - n_threads: Number of threads for parallel processing
                - dry: Whether to perform a dry run
        """
        self.clean_start: bool = False
        self.n_cores = kwargs.pop("n_cores", 1)
        self.max_items = kwargs.pop("max_items", None)
        self.batch_size = kwargs.pop("batch_size", 10000)
        self.n_threads = kwargs.pop("n_threads", 1)
        self.dry = kwargs.pop("dry", False)
        self.schema = schema

    @staticmethod
    def discover_files(
        fpath: Path | str, pattern: FilePattern, limit_files=None
    ) -> list[Path]:
        """Discover files matching a pattern in a directory.

        Args:
            fpath: Path to search in
            pattern: Pattern to match files against
            limit_files: Optional limit on number of files to return

        Returns:
            list[Path]: List of matching file paths

        Raises:
            AssertionError: If pattern.sub_path is None
        """
        assert pattern.sub_path is not None
        if isinstance(fpath, str):
            fpath_pathlib = Path(fpath)
        else:
            fpath_pathlib = fpath

        files = [
            f
            for f in (fpath_pathlib / pattern.sub_path).iterdir()
            if f.is_file()
            and (
                True
                if pattern.regex is None
                else re.search(pattern.regex, f.name) is not None
            )
        ]

        if limit_files is not None:
            files = files[:limit_files]

        return files

    def cast_normal_resource(
        self, data, resource_name: str | None = None
    ) -> GraphContainer:
        """Cast data into a graph container using a resource.

        Args:
            data: Data to cast
            resource_name: Optional name of the resource to use

        Returns:
            GraphContainer: Container with cast graph data
        """
        rr = self.schema.fetch_resource(resource_name)

        with ThreadPoolExecutor(max_workers=self.n_threads) as executor:
            docs = list(
                executor.map(
                    lambda doc: rr(doc),
                    data,
                )
            )

        graph = GraphContainer.from_docs_list(docs)
        return graph

    def process_batch(
        self,
        batch,
        resource_name: str | None,
        conn_conf: None | DBConfig = None,
    ):
        """Process a batch of data.

        Args:
            batch: Batch of data to process
            resource_name: Optional name of the resource to use
            conn_conf: Optional database connection configuration
        """
        gc = self.cast_normal_resource(batch, resource_name=resource_name)

        if conn_conf is not None:
            self.push_db(gc=gc, conn_conf=conn_conf, resource_name=resource_name)

    def process_data_source(
        self,
        data_source: AbstractDataSource,
        resource_name: str | None = None,
        conn_conf: None | DBConfig = None,
    ):
        """Process a data source.

        Args:
            data_source: Data source to process
            resource_name: Optional name of the resource (overrides data_source.resource_name)
            conn_conf: Optional database connection configuration
        """
        # Use provided resource_name or fall back to data_source's resource_name
        actual_resource_name = resource_name or data_source.resource_name

        for batch in data_source.iter_batches(
            batch_size=self.batch_size, limit=self.max_items
        ):
            self.process_batch(
                batch, resource_name=actual_resource_name, conn_conf=conn_conf
            )

    def process_resource(
        self,
        resource_instance: (
            Path | str | list[dict] | list[list] | pd.DataFrame | dict[str, Any]
        ),
        resource_name: str | None,
        conn_conf: None | DBConfig = None,
        **kwargs,
    ):
        """Process a resource instance from configuration or direct data.

        This method accepts either:
        1. A configuration dictionary with 'source_type' and data source parameters
        2. A file path (Path or str) - creates FileDataSource
        3. In-memory data (list[dict], list[list], or pd.DataFrame) - creates InMemoryDataSource

        Args:
            resource_instance: Configuration dict, file path, or in-memory data.
                Configuration dict format:
                - {"source_type": "file", "path": "data.json"}
                - {"source_type": "api", "config": {"url": "https://..."}}
                - {"source_type": "sql", "config": {"connection_string": "...", "query": "..."}}
                - {"source_type": "in_memory", "data": [...]}
            resource_name: Optional name of the resource
            conn_conf: Optional database connection configuration
            **kwargs: Additional arguments passed to data source creation
                (e.g., columns for list[list], encoding for files)
        """
        # Handle configuration dictionary
        if isinstance(resource_instance, dict):
            config = resource_instance.copy()
            # Merge with kwargs (kwargs take precedence)
            config.update(kwargs)
            data_source = DataSourceFactory.create_data_source_from_config(config)
        # Handle file paths
        elif isinstance(resource_instance, (Path, str)):
            # File path - create FileDataSource
            # Extract only valid file data source parameters with proper typing
            file_type: str | ChunkerType | None = cast(
                str | ChunkerType | None, kwargs.get("file_type", None)
            )
            encoding: EncodingType = cast(
                EncodingType, kwargs.get("encoding", EncodingType.UTF_8)
            )
            sep: str | None = cast(str | None, kwargs.get("sep", None))
            data_source = DataSourceFactory.create_file_data_source(
                path=resource_instance,
                file_type=file_type,
                encoding=encoding,
                sep=sep,
            )
        # Handle in-memory data
        else:
            # In-memory data - create InMemoryDataSource
            # Extract only valid in-memory data source parameters with proper typing
            columns: list[str] | None = cast(
                list[str] | None, kwargs.get("columns", None)
            )
            data_source = DataSourceFactory.create_in_memory_data_source(
                data=resource_instance,
                columns=columns,
            )

        data_source.resource_name = resource_name

        # Process using the data source
        self.process_data_source(
            data_source=data_source,
            resource_name=resource_name,
            conn_conf=conn_conf,
        )

    def push_db(
        self,
        gc: GraphContainer,
        conn_conf: DBConfig,
        resource_name: str | None,
    ):
        """Push graph container data to the database.

        Args:
            gc: Graph container with data to push
            conn_conf: Database connection configuration
            resource_name: Optional name of the resource
        """
        vc = self.schema.vertex_config
        resource = self.schema.fetch_resource(resource_name)
        with ConnectionManager(connection_config=conn_conf) as db_client:
            for vcol, data in gc.vertices.items():
                # blank nodes: push and get back their keys {"_key": ...}
                if vcol in vc.blank_vertices:
                    query0 = db_client.insert_return_batch(data, vc.vertex_dbname(vcol))
                    cursor = db_client.execute(query0)
                    gc.vertices[vcol] = [item for item in cursor]
                else:
                    db_client.upsert_docs_batch(
                        data,
                        vc.vertex_dbname(vcol),
                        vc.index(vcol),
                        update_keys="doc",
                        filter_uniques=True,
                        dry=self.dry,
                    )

        # update edge misc with blank node edges
        for vcol in vc.blank_vertices:
            for edge_id, edge in self.schema.edge_config.edges_items():
                vfrom, vto, relation = edge_id
                if vcol == vfrom or vcol == vto:
                    if edge_id not in gc.edges:
                        gc.edges[edge_id] = []
                    gc.edges[edge_id].extend(
                        [
                            (x, y, {})
                            for x, y in zip(gc.vertices[vfrom], gc.vertices[vto])
                        ]
                    )

        with ConnectionManager(connection_config=conn_conf) as db_client:
            # currently works only on item level
            for edge in resource.extra_weights:
                if edge.weights is None:
                    continue
                for weight in edge.weights.vertices:
                    if weight.name in vc.vertex_set:
                        index_fields = vc.index(weight.name)

                        if not self.dry and weight.name in gc.vertices:
                            weights_per_item = db_client.fetch_present_documents(
                                class_name=vc.vertex_dbname(weight.name),
                                batch=gc.vertices[weight.name],
                                match_keys=index_fields.fields,
                                keep_keys=weight.fields,
                            )

                            for j, item in enumerate(gc.linear):
                                weights = weights_per_item[j]

                                for ee in item[edge.edge_id]:
                                    weight_collection_attached = {
                                        weight.cfield(k): v
                                        for k, v in weights[0].items()
                                    }
                                    ee.update(weight_collection_attached)
                    else:
                        logger.error(f"{weight.name} not a valid vertex")

        with ConnectionManager(connection_config=conn_conf) as db_client:
            for edge_id, edge in self.schema.edge_config.edges_items():
                for ee in gc.loop_over_relations(edge_id):
                    _, _, relation = ee
                    if not self.dry:
                        data = gc.edges[ee]
                        db_client.insert_edges_batch(
                            docs_edges=data,
                            source_class=vc.vertex_dbname(edge.source),
                            target_class=vc.vertex_dbname(edge.target),
                            relation_name=relation,
                            collection_name=edge.collection_name,
                            match_keys_source=vc.index(edge.source).fields,
                            match_keys_target=vc.index(edge.target).fields,
                            filter_uniques=False,
                            dry=self.dry,
                        )

    def process_with_queue(self, tasks: mp.Queue, **kwargs):
        """Process tasks from a queue.

        Args:
            tasks: Queue of tasks to process
            **kwargs: Additional keyword arguments
        """
        while True:
            try:
                task = tasks.get_nowait()
                # Support both (Path, str) tuples and DataSource instances
                if isinstance(task, tuple) and len(task) == 2:
                    filepath, resource_name = task
                    self.process_resource(
                        resource_instance=filepath,
                        resource_name=resource_name,
                        **kwargs,
                    )
                elif isinstance(task, AbstractDataSource):
                    self.process_data_source(data_source=task, **kwargs)
            except queue.Empty:
                break

    @staticmethod
    def normalize_resource(
        data: pd.DataFrame | list[list] | list[dict], columns: list[str] | None = None
    ) -> list[dict]:
        """Normalize resource data into a list of dictionaries.

        Args:
            data: Data to normalize (DataFrame, list of lists, or list of dicts)
            columns: Optional column names for list data

        Returns:
            list[dict]: Normalized data as list of dictionaries

        Raises:
            ValueError: If columns is not provided for list data
        """
        if isinstance(data, pd.DataFrame):
            columns = data.columns.tolist()
            _data = data.values.tolist()
        elif data and isinstance(data[0], list):
            _data = cast(list[list], data)  # Tell mypy this is list[list]
            if columns is None:
                raise ValueError("columns should be set")
        else:
            return cast(list[dict], data)  # Tell mypy this is list[dict]
        rows_dressed = [{k: v for k, v in zip(columns, item)} for item in _data]
        return rows_dressed

    def ingest_data_sources(
        self,
        data_source_registry: DataSourceRegistry,
        conn_conf: None | DBConfig = None,
        **kwargs,
    ):
        """Ingest data from data sources in a registry.

        Args:
            data_source_registry: Registry containing data sources mapped to resources
            conn_conf: Database connection configuration
            **kwargs: Additional keyword arguments:
                - clean_start: Whether to clean the database before ingestion
                - n_cores: Number of CPU cores to use
                - max_items: Maximum number of items to process
                - batch_size: Size of batches for processing
                - dry: Whether to perform a dry run
                - init_only: Whether to only initialize the database
        """
        conn_conf = cast(DBConfig, kwargs.get("conn_conf", conn_conf))
        self.clean_start = kwargs.pop("clean_start", self.clean_start)
        self.n_cores = kwargs.pop("n_cores", self.n_cores)
        self.max_items = kwargs.pop("max_items", self.max_items)
        self.batch_size = kwargs.pop("batch_size", self.batch_size)
        self.dry = kwargs.pop("dry", self.dry)
        init_only = kwargs.pop("init_only", False)

        if conn_conf is None:
            raise ValueError("conn_conf is required for ingest_data_sources")

        # If effective_schema is not set, use schema.general.name as fallback
        if conn_conf.can_be_target() and conn_conf.effective_schema is None:
            schema_name = self.schema.general.name
            # Map to the appropriate field based on DB type
            if conn_conf.connection_type == DBType.TIGERGRAPH:
                # TigerGraph uses 'schema_name' field
                conn_conf.schema_name = schema_name
            else:
                # ArangoDB, Neo4j use 'database' field (which maps to effective_schema)
                conn_conf.database = schema_name

        # init_db() now handles database/schema creation automatically
        # It checks if the database exists and creates it if needed
        # Uses schema.general.name if database is not set in config
        with ConnectionManager(connection_config=conn_conf) as db_client:
            db_client.init_db(self.schema, self.clean_start)

        if init_only:
            logger.info("ingest execution bound to init")
            sys.exit(0)

        # Collect all data sources
        tasks: list[AbstractDataSource] = []
        for resource_name in self.schema._resources.keys():
            data_sources = data_source_registry.get_data_sources(resource_name)
            if data_sources:
                logger.info(
                    f"For resource name {resource_name} {len(data_sources)} data sources were found"
                )
                tasks.extend(data_sources)

        with Timer() as klepsidra:
            if self.n_cores > 1:
                queue_tasks: mp.Queue = mp.Queue()
                for item in tasks:
                    queue_tasks.put(item)

                func = partial(
                    self.process_with_queue,
                    conn_conf=conn_conf,
                    **kwargs,
                )
                assert mp.get_start_method() == "fork", (
                    "Requires 'forking' operating system"
                )

                processes = []

                for w in range(self.n_cores):
                    p = mp.Process(target=func, args=(queue_tasks,), kwargs=kwargs)
                    processes.append(p)
                    p.start()
                for p in processes:
                    p.join()
            else:
                for data_source in tasks:
                    self.process_data_source(
                        data_source=data_source, conn_conf=conn_conf
                    )
        logger.info(f"Processing took {klepsidra.elapsed:.1f} sec")

    def ingest(
        self,
        output_config: DBConfig,
        patterns: "Patterns | None" = None,
        **kwargs,
    ):
        """Ingest data into the graph database.

        This is the main ingestion method that takes:
        - Schema: Graph structure (already set in Caster)
        - OutputConfig: Target graph database configuration
        - Patterns: Mapping of resources to physical data sources

        Args:
            output_config: Target database connection configuration (for writing graph)
            patterns: Patterns instance mapping resources to data sources
                If None, will try to use legacy 'patterns' kwarg
            **kwargs: Additional keyword arguments:
                - clean_start: Whether to clean the database before ingestion
                - n_cores: Number of CPU cores to use
                - max_items: Maximum number of items to process
                - batch_size: Size of batches for processing
                - dry: Whether to perform a dry run
                - init_only: Whether to only initialize the database
                - limit_files: Optional limit on number of files to process
                - conn_conf: Legacy parameter (use output_config instead)
        """
        # Backward compatibility: support legacy conn_conf parameter
        if "conn_conf" in kwargs:
            output_config = kwargs.pop("conn_conf")

        # Backward compatibility: support legacy patterns parameter
        if patterns is None:
            patterns = kwargs.pop("patterns", Patterns())

        # Create DataSourceRegistry from patterns
        registry = DataSourceRegistry()

        for r in self.schema.resources:
            resource_name = r.name
            resource_type = patterns.get_resource_type(resource_name)

            if resource_type == "file":
                # Handle file pattern
                pattern = patterns.patterns[resource_name]
                if not isinstance(pattern, FilePattern):
                    logger.warning(
                        f"Pattern for resource '{resource_name}' is not a FilePattern, skipping"
                    )
                    continue

                # Use sub_path from FilePattern (path is now part of the pattern)
                if pattern.sub_path is None:
                    logger.warning(
                        f"FilePattern for resource '{resource_name}' has no sub_path, skipping"
                    )
                    continue
                path_obj = pattern.sub_path.expanduser()
                limit_files = kwargs.get("limit_files", None)

                files = Caster.discover_files(
                    path_obj, limit_files=limit_files, pattern=pattern
                )
                logger.info(
                    f"For resource name {resource_name} {len(files)} files were found"
                )

                # Create FileDataSource for each file
                for file_path in files:
                    file_source = DataSourceFactory.create_file_data_source(
                        path=file_path
                    )
                    registry.register(file_source, resource_name=resource_name)

            elif resource_type == "table":
                # Handle PostgreSQL table
                pattern = patterns.patterns[resource_name]
                if not isinstance(pattern, TablePattern):
                    logger.warning(
                        f"Pattern for resource '{resource_name}' is not a TablePattern, skipping"
                    )
                    continue

                postgres_config = patterns.get_postgres_config(resource_name)
                if postgres_config is None:
                    logger.warning(
                        f"PostgreSQL table '{resource_name}' has no connection config, skipping"
                    )
                    continue

                # Get table info
                table_info = patterns.get_table_info(resource_name)
                if table_info is None:
                    logger.warning(
                        f"Could not get table info for resource '{resource_name}', skipping"
                    )
                    continue

                table_name, schema_name = table_info
                effective_schema = (
                    schema_name or postgres_config.schema_name or "public"
                )

                # Create SQLDataSource for PostgreSQL table
                try:
                    query = f'SELECT * FROM "{effective_schema}"."{table_name}"'

                    from graflo.data_source.sql import SQLConfig, SQLDataSource
                    from urllib.parse import urlparse

                    parsed = urlparse(postgres_config.uri or "")
                    host = parsed.hostname or "localhost"
                    port = parsed.port or 5432
                    database = (
                        postgres_config.database
                        or parsed.path.lstrip("/")
                        or "postgres"
                    )
                    user = postgres_config.username or parsed.username or "postgres"
                    password = postgres_config.password or parsed.password or ""

                    # Build PostgreSQL connection string
                    if password:
                        connection_string = (
                            f"postgresql://{user}:{password}@{host}:{port}/{database}"
                        )
                    else:
                        connection_string = (
                            f"postgresql://{user}@{host}:{port}/{database}"
                        )

                    # Create SQLDataSource
                    sql_config = SQLConfig(
                        connection_string=connection_string,
                        query=query,
                        pagination=True,
                        page_size=1000,
                    )
                    sql_source = SQLDataSource(config=sql_config)
                    registry.register(sql_source, resource_name=resource_name)

                    logger.info(
                        f"Created SQLDataSource for table '{effective_schema}.{table_name}' "
                        f"mapped to resource '{resource_name}'"
                    )
                except Exception as e:
                    logger.error(
                        f"Failed to create data source for PostgreSQL table '{resource_name}': {e}",
                        exc_info=True,
                    )
                    continue

            else:
                logger.warning(
                    f"No pattern configuration found for resource '{resource_name}', skipping"
                )

        # Use the new ingest_data_sources method with output_config
        kwargs["conn_conf"] = output_config
        self.ingest_data_sources(registry, **kwargs)
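Of the methods above, the staticmethod Caster.normalize_resource is the one that can be exercised without a Schema or a database connection. The short snippet below is an illustrative sketch (not part of the package) that runs its three accepted input shapes, mirroring the branches in the method body: a list of lists with explicit column names, a pandas DataFrame whose columns are taken from the frame itself, and a list of dicts that passes through unchanged.

import pandas as pd

from graflo.caster import Caster

# list-of-lists input requires explicit column names
rows = Caster.normalize_resource([[1, "alice"], [2, "bob"]], columns=["id", "name"])

# DataFrame input takes its column names from the frame
rows_from_df = Caster.normalize_resource(pd.DataFrame({"id": [3], "name": ["carol"]}))

# list-of-dicts input is returned unchanged
rows_from_dicts = Caster.normalize_resource([{"id": 4, "name": "dana"}])

print(rows, rows_from_df, rows_from_dicts)

The full ingestion path, Caster(schema=...).ingest(output_config, patterns=...), additionally needs a Schema and a DBConfig, whose definitions live elsewhere in this wheel (graflo/architecture/schema.py and graflo/db/).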
graflo/cli/__init__.py
ADDED
@@ -0,0 +1,14 @@
"""Command-line interface for graflo.

This module provides command-line tools for working with graflo, including
utilities for data ingestion, schema management, and graph operations.

Key Components:
    - Command-line tools for data processing
    - Schema management utilities
    - Graph database operations

Example:
    >>> uv run ingest --config config.json --data data.json
    >>> uv run plot_schema --config schema.yaml --output figs
"""
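The ingest and plot_schema commands shown in the docstring are registered as console-script entry points in graflo-1.3.3.dist-info/entry_points.txt. Below is a minimal, illustrative sketch (standard library only, not part of the package) for listing which console scripts an installed graflo wheel exposes; it assumes only that those scripts resolve into the graflo.cli package.

from importlib.metadata import entry_points

# Enumerate console scripts and keep those implemented under graflo.cli
for ep in entry_points(group="console_scripts"):
    if ep.value.startswith("graflo.cli"):
        print(f"{ep.name} -> {ep.value}")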