graflo 1.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of graflo might be problematic.
- graflo/README.md +18 -0
- graflo/__init__.py +70 -0
- graflo/architecture/__init__.py +38 -0
- graflo/architecture/actor.py +1276 -0
- graflo/architecture/actor_util.py +450 -0
- graflo/architecture/edge.py +418 -0
- graflo/architecture/onto.py +376 -0
- graflo/architecture/onto_sql.py +54 -0
- graflo/architecture/resource.py +163 -0
- graflo/architecture/schema.py +135 -0
- graflo/architecture/transform.py +292 -0
- graflo/architecture/util.py +89 -0
- graflo/architecture/vertex.py +562 -0
- graflo/caster.py +736 -0
- graflo/cli/__init__.py +14 -0
- graflo/cli/ingest.py +203 -0
- graflo/cli/manage_dbs.py +197 -0
- graflo/cli/plot_schema.py +132 -0
- graflo/cli/xml2json.py +93 -0
- graflo/data_source/__init__.py +48 -0
- graflo/data_source/api.py +339 -0
- graflo/data_source/base.py +95 -0
- graflo/data_source/factory.py +304 -0
- graflo/data_source/file.py +148 -0
- graflo/data_source/memory.py +70 -0
- graflo/data_source/registry.py +82 -0
- graflo/data_source/sql.py +183 -0
- graflo/db/__init__.py +44 -0
- graflo/db/arango/__init__.py +22 -0
- graflo/db/arango/conn.py +1025 -0
- graflo/db/arango/query.py +180 -0
- graflo/db/arango/util.py +88 -0
- graflo/db/conn.py +377 -0
- graflo/db/connection/__init__.py +6 -0
- graflo/db/connection/config_mapping.py +18 -0
- graflo/db/connection/onto.py +717 -0
- graflo/db/connection/wsgi.py +29 -0
- graflo/db/manager.py +119 -0
- graflo/db/neo4j/__init__.py +16 -0
- graflo/db/neo4j/conn.py +639 -0
- graflo/db/postgres/__init__.py +37 -0
- graflo/db/postgres/conn.py +948 -0
- graflo/db/postgres/fuzzy_matcher.py +281 -0
- graflo/db/postgres/heuristics.py +133 -0
- graflo/db/postgres/inference_utils.py +428 -0
- graflo/db/postgres/resource_mapping.py +273 -0
- graflo/db/postgres/schema_inference.py +372 -0
- graflo/db/postgres/types.py +148 -0
- graflo/db/postgres/util.py +87 -0
- graflo/db/tigergraph/__init__.py +9 -0
- graflo/db/tigergraph/conn.py +2365 -0
- graflo/db/tigergraph/onto.py +26 -0
- graflo/db/util.py +49 -0
- graflo/filter/__init__.py +21 -0
- graflo/filter/onto.py +525 -0
- graflo/logging.conf +22 -0
- graflo/onto.py +312 -0
- graflo/plot/__init__.py +17 -0
- graflo/plot/plotter.py +616 -0
- graflo/util/__init__.py +23 -0
- graflo/util/chunker.py +807 -0
- graflo/util/merge.py +150 -0
- graflo/util/misc.py +37 -0
- graflo/util/onto.py +422 -0
- graflo/util/transform.py +454 -0
- graflo-1.3.7.dist-info/METADATA +243 -0
- graflo-1.3.7.dist-info/RECORD +70 -0
- graflo-1.3.7.dist-info/WHEEL +4 -0
- graflo-1.3.7.dist-info/entry_points.txt +5 -0
- graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
graflo/caster.py
ADDED
@@ -0,0 +1,736 @@
"""Data casting and ingestion system for graph databases.

This module provides functionality for casting and ingesting data into graph databases.
It handles batch processing, file discovery, and database operations for ArangoDB,
Neo4j, and TigerGraph.

Key Components:
    - Caster: Main class for data casting and ingestion
    - FilePattern: Pattern matching for file discovery
    - Patterns: Collection of file patterns for different resources

Example:
    >>> caster = Caster(schema=schema)
    >>> caster.ingest(output_config=db_config, patterns=patterns)
"""

import logging
import multiprocessing as mp
import queue
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path
from typing import Any, cast

import pandas as pd
from pydantic import BaseModel
from suthing import Timer

from graflo.architecture.onto import EncodingType, GraphContainer
from graflo.architecture.schema import Schema
from graflo.data_source import (
    AbstractDataSource,
    DataSourceFactory,
    DataSourceRegistry,
)
from graflo.data_source.sql import SQLConfig, SQLDataSource
from graflo.db import DBType, ConnectionManager, DBConfig
from graflo.onto import DBFlavor
from graflo.util.chunker import ChunkerType
from graflo.util.onto import FilePattern, Patterns, ResourceType, TablePattern

logger = logging.getLogger(__name__)


class IngestionParams(BaseModel):
    """Parameters for controlling the ingestion process.

    Attributes:
        clean_start: Whether to clean the database before ingestion
        n_cores: Number of CPU cores/threads to use for parallel processing
        max_items: Maximum number of items to process per resource (applies to all data sources)
        batch_size: Size of batches for processing
        dry: Whether to perform a dry run (no database changes)
        init_only: Whether to only initialize the database without ingestion
        limit_files: Optional limit on number of files to process
    """

    clean_start: bool = False
    n_cores: int = 1
    max_items: int | None = None
    batch_size: int = 10000
    dry: bool = False
    init_only: bool = False
    limit_files: int | None = None

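# Illustrative sketch (not part of the released module): constructing IngestionParams
# with non-default values. Field names and defaults are those of the model above; the
# values themselves are arbitrary.
#
#   >>> params = IngestionParams(clean_start=True, n_cores=4, batch_size=5000, dry=True)
#   >>> params.max_items is None   # no per-resource cap by default
#   True
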
class Caster:
    """Main class for data casting and ingestion.

    This class handles the process of casting data into graph structures and
    ingesting them into the database. It supports batch processing, parallel
    execution, and various data formats.

    Attributes:
        schema: Schema configuration for the graph
        ingestion_params: IngestionParams instance controlling ingestion behavior
    """

    def __init__(
        self,
        schema: Schema,
        ingestion_params: IngestionParams | None = None,
        **kwargs,
    ):
        """Initialize the caster with schema and configuration.

        Args:
            schema: Schema configuration for the graph
            ingestion_params: IngestionParams instance with ingestion configuration.
                If None, creates IngestionParams from kwargs or uses defaults
            **kwargs: Additional configuration options (for backward compatibility):
                - clean_start: Whether to clean the database before ingestion
                - n_cores: Number of CPU cores/threads to use for parallel processing
                - max_items: Maximum number of items to process
                - batch_size: Size of batches for processing
                - dry: Whether to perform a dry run
        """
        if ingestion_params is None:
            # Create IngestionParams from kwargs or use defaults
            ingestion_params = IngestionParams(**kwargs)
        self.ingestion_params = ingestion_params
        self.schema = schema

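    # Illustrative sketch (not part of the released module): given an already built
    # Schema instance `schema`, keyword arguments are forwarded to IngestionParams
    # as documented in __init__ above.
    #
    #   >>> caster = Caster(schema, n_cores=2, batch_size=1000)
    #   >>> caster.ingestion_params.batch_size
    #   1000
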
    @staticmethod
    def discover_files(
        fpath: Path | str, pattern: FilePattern, limit_files=None
    ) -> list[Path]:
        """Discover files matching a pattern in a directory.

        Args:
            fpath: Path to search in (should be the directory containing files)
            pattern: Pattern to match files against
            limit_files: Optional limit on number of files to return

        Returns:
            list[Path]: List of matching file paths

        Raises:
            AssertionError: If pattern.sub_path is None
        """
        assert pattern.sub_path is not None
        if isinstance(fpath, str):
            fpath_pathlib = Path(fpath)
        else:
            fpath_pathlib = fpath

        # fpath is already the directory to search (pattern.sub_path from caller)
        # so we use it directly, not combined with pattern.sub_path again
        files = [
            f
            for f in fpath_pathlib.iterdir()
            if f.is_file()
            and (
                True
                if pattern.regex is None
                else re.search(pattern.regex, f.name) is not None
            )
        ]

        if limit_files is not None:
            files = files[:limit_files]

        return files

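    # Illustrative sketch (not part of the released module): discovering CSV files in
    # a directory. The FilePattern keyword arguments are assumptions inferred from the
    # attributes read above (sub_path, regex).
    #
    #   >>> pattern = FilePattern(sub_path=Path("data/"), regex=r"\.csv$")  # assumed kwargs
    #   >>> Caster.discover_files("data/", pattern, limit_files=10)
    #   [PosixPath('data/a.csv'), PosixPath('data/b.csv')]
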
    def cast_normal_resource(
        self, data, resource_name: str | None = None
    ) -> GraphContainer:
        """Cast data into a graph container using a resource.

        Args:
            data: Data to cast
            resource_name: Optional name of the resource to use

        Returns:
            GraphContainer: Container with cast graph data
        """
        rr = self.schema.fetch_resource(resource_name)

        with ThreadPoolExecutor(max_workers=self.ingestion_params.n_cores) as executor:
            docs = list(
                executor.map(
                    lambda doc: rr(doc),
                    data,
                )
            )

        graph = GraphContainer.from_docs_list(docs)
        return graph

    def process_batch(
        self,
        batch,
        resource_name: str | None,
        conn_conf: None | DBConfig = None,
    ):
        """Process a batch of data.

        Args:
            batch: Batch of data to process
            resource_name: Optional name of the resource to use
            conn_conf: Optional database connection configuration
        """
        gc = self.cast_normal_resource(batch, resource_name=resource_name)

        if conn_conf is not None:
            self.push_db(gc=gc, conn_conf=conn_conf, resource_name=resource_name)

    def process_data_source(
        self,
        data_source: AbstractDataSource,
        resource_name: str | None = None,
        conn_conf: None | DBConfig = None,
    ):
        """Process a data source.

        Args:
            data_source: Data source to process
            resource_name: Optional name of the resource (overrides data_source.resource_name)
            conn_conf: Optional database connection configuration
        """
        # Use provided resource_name or fall back to data_source's resource_name
        actual_resource_name = resource_name or data_source.resource_name

        # Use pattern-specific limit if available, otherwise use global max_items
        limit = getattr(data_source, "_pattern_limit", None)
        if limit is None:
            limit = self.ingestion_params.max_items

        for batch in data_source.iter_batches(
            batch_size=self.ingestion_params.batch_size, limit=limit
        ):
            self.process_batch(
                batch, resource_name=actual_resource_name, conn_conf=conn_conf
            )

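    # Illustrative sketch (not part of the released module): process_data_source only
    # reads `resource_name` and calls `iter_batches(batch_size=..., limit=...)` on the
    # source, so a duck-typed object with that surface behaves like the shipped
    # AbstractDataSource implementations (file, SQL, API, in-memory). The class and the
    # resource name "person" are hypothetical.
    #
    #   >>> class TinySource:
    #   ...     resource_name = None
    #   ...     def iter_batches(self, batch_size, limit=None):
    #   ...         rows = [{"id": i} for i in range(25)]
    #   ...         rows = rows if limit is None else rows[:limit]
    #   ...         for i in range(0, len(rows), batch_size):
    #   ...             yield rows[i : i + batch_size]
    #   >>> caster.process_data_source(TinySource(), resource_name="person")
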
    def process_resource(
        self,
        resource_instance: (
            Path | str | list[dict] | list[list] | pd.DataFrame | dict[str, Any]
        ),
        resource_name: str | None,
        conn_conf: None | DBConfig = None,
        **kwargs,
    ):
        """Process a resource instance from configuration or direct data.

        This method accepts either:
        1. A configuration dictionary with 'source_type' and data source parameters
        2. A file path (Path or str) - creates FileDataSource
        3. In-memory data (list[dict], list[list], or pd.DataFrame) - creates InMemoryDataSource

        Args:
            resource_instance: Configuration dict, file path, or in-memory data.
                Configuration dict format:
                - {"source_type": "file", "path": "data.json"}
                - {"source_type": "api", "config": {"url": "https://..."}}
                - {"source_type": "sql", "config": {"connection_string": "...", "query": "..."}}
                - {"source_type": "in_memory", "data": [...]}
            resource_name: Optional name of the resource
            conn_conf: Optional database connection configuration
            **kwargs: Additional arguments passed to data source creation
                (e.g., columns for list[list], encoding for files)
        """
        # Handle configuration dictionary
        if isinstance(resource_instance, dict):
            config = resource_instance.copy()
            # Merge with kwargs (kwargs take precedence)
            config.update(kwargs)
            data_source = DataSourceFactory.create_data_source_from_config(config)
        # Handle file paths
        elif isinstance(resource_instance, (Path, str)):
            # File path - create FileDataSource
            # Extract only valid file data source parameters with proper typing
            file_type: str | ChunkerType | None = cast(
                str | ChunkerType | None, kwargs.get("file_type", None)
            )
            encoding: EncodingType = cast(
                EncodingType, kwargs.get("encoding", EncodingType.UTF_8)
            )
            sep: str | None = cast(str | None, kwargs.get("sep", None))
            data_source = DataSourceFactory.create_file_data_source(
                path=resource_instance,
                file_type=file_type,
                encoding=encoding,
                sep=sep,
            )
        # Handle in-memory data
        else:
            # In-memory data - create InMemoryDataSource
            # Extract only valid in-memory data source parameters with proper typing
            columns: list[str] | None = cast(
                list[str] | None, kwargs.get("columns", None)
            )
            data_source = DataSourceFactory.create_in_memory_data_source(
                data=resource_instance,
                columns=columns,
            )

        data_source.resource_name = resource_name

        # Process using the data source
        self.process_data_source(
            data_source=data_source,
            resource_name=resource_name,
            conn_conf=conn_conf,
        )

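    # Illustrative sketch (not part of the released module): the three invocation forms
    # accepted by process_resource, mirroring the docstring above. Paths, the connection
    # string, and the resource name "person" are placeholders.
    #
    #   >>> caster.process_resource("data/people.json", resource_name="person")
    #   >>> caster.process_resource(
    #   ...     {
    #   ...         "source_type": "sql",
    #   ...         "config": {"connection_string": "postgresql://...", "query": "SELECT ..."},
    #   ...     },
    #   ...     resource_name="person",
    #   ... )
    #   >>> caster.process_resource(
    #   ...     [["alice", 34], ["bob", 27]],
    #   ...     resource_name="person",
    #   ...     columns=["name", "age"],
    #   ... )
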
    def push_db(
        self,
        gc: GraphContainer,
        conn_conf: DBConfig,
        resource_name: str | None,
    ):
        """Push graph container data to the database.

        Args:
            gc: Graph container with data to push
            conn_conf: Database connection configuration
            resource_name: Optional name of the resource
        """
        vc = self.schema.vertex_config
        resource = self.schema.fetch_resource(resource_name)
        with ConnectionManager(connection_config=conn_conf) as db_client:
            for vcol, data in gc.vertices.items():
                # blank nodes: push and get back their keys {"_key": ...}
                if vcol in vc.blank_vertices:
                    query0 = db_client.insert_return_batch(data, vc.vertex_dbname(vcol))
                    cursor = db_client.execute(query0)
                    gc.vertices[vcol] = [item for item in cursor]
                else:
                    db_client.upsert_docs_batch(
                        data,
                        vc.vertex_dbname(vcol),
                        vc.index(vcol),
                        update_keys="doc",
                        filter_uniques=True,
                        dry=self.ingestion_params.dry,
                    )

        # update edge misc with blank node edges
        for vcol in vc.blank_vertices:
            for edge_id, edge in self.schema.edge_config.edges_items():
                vfrom, vto, relation = edge_id
                if vcol == vfrom or vcol == vto:
                    if edge_id not in gc.edges:
                        gc.edges[edge_id] = []
                    gc.edges[edge_id].extend(
                        [
                            (x, y, {})
                            for x, y in zip(gc.vertices[vfrom], gc.vertices[vto])
                        ]
                    )

        with ConnectionManager(connection_config=conn_conf) as db_client:
            # currently works only on item level
            for edge in resource.extra_weights:
                if edge.weights is None:
                    continue
                for weight in edge.weights.vertices:
                    if weight.name in vc.vertex_set:
                        index_fields = vc.index(weight.name)

                        if not self.ingestion_params.dry and weight.name in gc.vertices:
                            weights_per_item = db_client.fetch_present_documents(
                                class_name=vc.vertex_dbname(weight.name),
                                batch=gc.vertices[weight.name],
                                match_keys=index_fields.fields,
                                keep_keys=weight.fields,
                            )

                            for j, item in enumerate(gc.linear):
                                weights = weights_per_item[j]

                                for ee in item[edge.edge_id]:
                                    weight_collection_attached = {
                                        weight.cfield(k): v
                                        for k, v in weights[0].items()
                                    }
                                    ee.update(weight_collection_attached)
                    else:
                        logger.error(f"{weight.name} not a valid vertex")

        with ConnectionManager(connection_config=conn_conf) as db_client:
            for edge_id, edge in self.schema.edge_config.edges_items():
                for ee in gc.loop_over_relations(edge_id):
                    _, _, relation = ee
                    if not self.ingestion_params.dry:
                        data = gc.edges[ee]
                        db_client.insert_edges_batch(
                            docs_edges=data,
                            source_class=vc.vertex_dbname(edge.source),
                            target_class=vc.vertex_dbname(edge.target),
                            relation_name=relation,
                            collection_name=edge.database_name,
                            match_keys_source=vc.index(edge.source).fields,
                            match_keys_target=vc.index(edge.target).fields,
                            filter_uniques=False,
                            dry=self.ingestion_params.dry,
                        )

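    # Note (editorial, not part of the released module): the entries accumulated in
    # gc.edges above are (source_doc, target_doc, attributes) triples, e.g.
    # ({"_key": "123"}, {"name": "alice"}, {}); insert_edges_batch then resolves the
    # endpoints against the vertex index fields given by match_keys_source/target.
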
    def process_with_queue(self, tasks: mp.Queue, conn_conf: DBConfig | None = None):
        """Process tasks from a queue.

        Args:
            tasks: Queue of tasks to process
            conn_conf: Optional database connection configuration
        """
        while True:
            try:
                task = tasks.get_nowait()
                # Support both (Path, str) tuples and DataSource instances
                if isinstance(task, tuple) and len(task) == 2:
                    filepath, resource_name = task
                    self.process_resource(
                        resource_instance=filepath,
                        resource_name=resource_name,
                        conn_conf=conn_conf,
                    )
                elif isinstance(task, AbstractDataSource):
                    self.process_data_source(data_source=task, conn_conf=conn_conf)
            except queue.Empty:
                break

    @staticmethod
    def normalize_resource(
        data: pd.DataFrame | list[list] | list[dict], columns: list[str] | None = None
    ) -> list[dict]:
        """Normalize resource data into a list of dictionaries.

        Args:
            data: Data to normalize (DataFrame, list of lists, or list of dicts)
            columns: Optional column names for list data

        Returns:
            list[dict]: Normalized data as list of dictionaries

        Raises:
            ValueError: If columns is not provided for list data
        """
        if isinstance(data, pd.DataFrame):
            columns = data.columns.tolist()
            _data = data.values.tolist()
        elif data and isinstance(data[0], list):
            _data = cast(list[list], data)  # Tell mypy this is list[list]
            if columns is None:
                raise ValueError("columns should be set")
        else:
            return cast(list[dict], data)  # Tell mypy this is list[dict]
        rows_dressed = [{k: v for k, v in zip(columns, item)} for item in _data]
        return rows_dressed

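    # Illustrative sketch (not part of the released module): what normalize_resource
    # returns for the two list-shaped inputs; a pd.DataFrame is handled like the
    # list-of-lists case, with column names taken from the frame itself.
    #
    #   >>> Caster.normalize_resource([["alice", 34], ["bob", 27]], columns=["name", "age"])
    #   [{'name': 'alice', 'age': 34}, {'name': 'bob', 'age': 27}]
    #   >>> Caster.normalize_resource([{"name": "alice", "age": 34}])  # passed through as-is
    #   [{'name': 'alice', 'age': 34}]
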
    def ingest_data_sources(
        self,
        data_source_registry: DataSourceRegistry,
        conn_conf: DBConfig,
        ingestion_params: IngestionParams | None = None,
    ):
        """Ingest data from data sources in a registry.

        Args:
            data_source_registry: Registry containing data sources mapped to resources
            conn_conf: Database connection configuration
            ingestion_params: IngestionParams instance with ingestion configuration.
                If None, uses default IngestionParams()
        """
        if ingestion_params is None:
            ingestion_params = IngestionParams()

        # Update ingestion params (may override defaults set in __init__)
        self.ingestion_params = ingestion_params
        init_only = ingestion_params.init_only

        # If effective_schema is not set, use schema.general.name as fallback
        if conn_conf.can_be_target() and conn_conf.effective_schema is None:
            schema_name = self.schema.general.name
            # Map to the appropriate field based on DB type
            if conn_conf.connection_type == DBType.TIGERGRAPH:
                # TigerGraph uses 'schema_name' field
                conn_conf.schema_name = schema_name
            else:
                # ArangoDB, Neo4j use 'database' field (which maps to effective_schema)
                conn_conf.database = schema_name

        # init_db() now handles database/schema creation automatically
        # It checks if the database exists and creates it if needed
        # Uses schema.general.name if database is not set in config
        with ConnectionManager(connection_config=conn_conf) as db_client:
            db_client.init_db(self.schema, self.ingestion_params.clean_start)

        if init_only:
            logger.info("ingest execution bound to init")
            sys.exit(0)

        # Collect all data sources
        tasks: list[AbstractDataSource] = []
        for resource_name in self.schema._resources.keys():
            data_sources = data_source_registry.get_data_sources(resource_name)
            if data_sources:
                logger.info(
                    f"For resource name {resource_name} {len(data_sources)} data sources were found"
                )
                tasks.extend(data_sources)

        with Timer() as klepsidra:
            if self.ingestion_params.n_cores > 1:
                queue_tasks: mp.Queue = mp.Queue()
                for item in tasks:
                    queue_tasks.put(item)

                func = partial(
                    self.process_with_queue,
                    conn_conf=conn_conf,
                )
                assert mp.get_start_method() == "fork", (
                    "Requires 'forking' operating system"
                )

                processes = []

                for w in range(self.ingestion_params.n_cores):
                    p = mp.Process(target=func, args=(queue_tasks,))
                    processes.append(p)
                    p.start()
                for p in processes:
                    p.join()
            else:
                for data_source in tasks:
                    self.process_data_source(
                        data_source=data_source, conn_conf=conn_conf
                    )
        logger.info(f"Processing took {klepsidra.elapsed:.1f} sec")

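    # Illustrative sketch (not part of the released module): driving ingest_data_sources
    # directly with a hand-built registry. `db_config` stands for a DBConfig for the
    # target graph database; the path and resource name "person" are placeholders.
    #
    #   >>> registry = DataSourceRegistry()
    #   >>> registry.register(
    #   ...     DataSourceFactory.create_file_data_source(path=Path("data/people.json")),
    #   ...     resource_name="person",
    #   ... )
    #   >>> caster.ingest_data_sources(
    #   ...     data_source_registry=registry,
    #   ...     conn_conf=db_config,
    #   ...     ingestion_params=IngestionParams(clean_start=True, batch_size=2000),
    #   ... )
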
    @staticmethod
    def _get_db_flavor_from_config(output_config: DBConfig) -> DBFlavor:
        """Convert DBConfig connection type to DBFlavor.

        Args:
            output_config: Database configuration

        Returns:
            DBFlavor enum value corresponding to the database type
        """
        db_type = output_config.connection_type
        if db_type == DBType.ARANGO:
            return DBFlavor.ARANGO
        elif db_type == DBType.NEO4J:
            return DBFlavor.NEO4J
        elif db_type == DBType.TIGERGRAPH:
            return DBFlavor.TIGERGRAPH
        else:
            # Default to ARANGO for unknown types
            return DBFlavor.ARANGO

    def _register_file_sources(
        self,
        registry: DataSourceRegistry,
        resource_name: str,
        pattern: FilePattern,
        ingestion_params: IngestionParams,
    ) -> None:
        """Register file data sources for a resource.

        Args:
            registry: Data source registry to add sources to
            resource_name: Name of the resource
            pattern: File pattern configuration
            ingestion_params: Ingestion parameters
        """
        if pattern.sub_path is None:
            logger.warning(
                f"FilePattern for resource '{resource_name}' has no sub_path, skipping"
            )
            return

        path_obj = pattern.sub_path.expanduser()
        files = Caster.discover_files(
            path_obj, limit_files=ingestion_params.limit_files, pattern=pattern
        )
        logger.info(f"For resource name {resource_name} {len(files)} files were found")

        for file_path in files:
            file_source = DataSourceFactory.create_file_data_source(path=file_path)
            registry.register(file_source, resource_name=resource_name)

    def _register_sql_table_sources(
        self,
        registry: DataSourceRegistry,
        resource_name: str,
        pattern: TablePattern,
        patterns: "Patterns",
        ingestion_params: IngestionParams,
    ) -> None:
        """Register SQL table data sources for a resource.

        Uses SQLDataSource with batch processing (cursors) instead of loading
        all data into memory. This is efficient for large tables.

        Args:
            registry: Data source registry to add sources to
            resource_name: Name of the resource
            pattern: Table pattern configuration
            patterns: Patterns instance for accessing configs
            ingestion_params: Ingestion parameters
        """
        postgres_config = patterns.get_postgres_config(resource_name)
        if postgres_config is None:
            logger.warning(
                f"PostgreSQL table '{resource_name}' has no connection config, skipping"
            )
            return

        table_info = patterns.get_table_info(resource_name)
        if table_info is None:
            logger.warning(
                f"Could not get table info for resource '{resource_name}', skipping"
            )
            return

        table_name, schema_name = table_info
        effective_schema = schema_name or postgres_config.schema_name or "public"

        try:
            # Build base query
            query = f'SELECT * FROM "{effective_schema}"."{table_name}"'
            where_clause = pattern.build_where_clause()
            if where_clause:
                query += f" WHERE {where_clause}"

            # Get SQLAlchemy connection string from PostgresConfig
            connection_string = postgres_config.to_sqlalchemy_connection_string()

            # Create SQLDataSource with pagination for efficient batch processing
            # Note: max_items limit is handled by SQLDataSource.iter_batches() limit parameter
            sql_config = SQLConfig(
                connection_string=connection_string,
                query=query,
                pagination=True,
                page_size=ingestion_params.batch_size,  # Use batch_size for page size
            )
            sql_source = SQLDataSource(config=sql_config)

            # Register the SQL data source (it will be processed in batches)
            registry.register(sql_source, resource_name=resource_name)

            logger.info(
                f"Created SQL data source for table '{effective_schema}.{table_name}' "
                f"mapped to resource '{resource_name}' (will process in batches of {ingestion_params.batch_size})"
            )
        except Exception as e:
            logger.error(
                f"Failed to create data source for PostgreSQL table '{resource_name}': {e}",
                exc_info=True,
            )

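    # Illustrative sketch (not part of the released module): registering a SQL-backed
    # source by hand, mirroring what _register_sql_table_sources builds above. The
    # connection string, query, and resource name are placeholders.
    #
    #   >>> sql_source = SQLDataSource(
    #   ...     config=SQLConfig(
    #   ...         connection_string="postgresql+psycopg2://user:pass@host/db",
    #   ...         query='SELECT * FROM "public"."people"',
    #   ...         pagination=True,
    #   ...         page_size=10000,
    #   ...     )
    #   ... )
    #   >>> registry.register(sql_source, resource_name="person")
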
    def _build_registry_from_patterns(
        self,
        patterns: "Patterns",
        ingestion_params: IngestionParams,
    ) -> DataSourceRegistry:
        """Build data source registry from patterns.

        Args:
            patterns: Patterns instance mapping resources to data sources
            ingestion_params: Ingestion parameters

        Returns:
            DataSourceRegistry with registered data sources
        """
        registry = DataSourceRegistry()

        for resource in self.schema.resources:
            resource_name = resource.name
            resource_type = patterns.get_resource_type(resource_name)

            if resource_type is None:
                logger.warning(
                    f"No resource type found for resource '{resource_name}', skipping"
                )
                continue

            pattern = patterns.patterns.get(resource_name)
            if pattern is None:
                logger.warning(
                    f"No pattern found for resource '{resource_name}', skipping"
                )
                continue

            if resource_type == ResourceType.FILE:
                if not isinstance(pattern, FilePattern):
                    logger.warning(
                        f"Pattern for resource '{resource_name}' is not a FilePattern, skipping"
                    )
                    continue
                self._register_file_sources(
                    registry, resource_name, pattern, ingestion_params
                )

            elif resource_type == ResourceType.SQL_TABLE:
                if not isinstance(pattern, TablePattern):
                    logger.warning(
                        f"Pattern for resource '{resource_name}' is not a TablePattern, skipping"
                    )
                    continue
                self._register_sql_table_sources(
                    registry, resource_name, pattern, patterns, ingestion_params
                )

            else:
                logger.warning(
                    f"Unsupported resource type '{resource_type}' for resource '{resource_name}', skipping"
                )

        return registry

    def ingest(
        self,
        output_config: DBConfig,
        patterns: "Patterns | None" = None,
        ingestion_params: IngestionParams | None = None,
    ):
        """Ingest data into the graph database.

        This is the main ingestion method that takes:
        - Schema: Graph structure (already set in Caster)
        - OutputConfig: Target graph database configuration
        - Patterns: Mapping of resources to physical data sources
        - IngestionParams: Parameters controlling the ingestion process

        Args:
            output_config: Target database connection configuration (for writing graph)
            patterns: Patterns instance mapping resources to data sources.
                If None, defaults to empty Patterns()
            ingestion_params: IngestionParams instance with ingestion configuration.
                If None, uses default IngestionParams()
        """
        # Normalize parameters
        patterns = patterns or Patterns()
        ingestion_params = ingestion_params or IngestionParams()

        # Initialize vertex config with correct field types based on database type
        db_flavor = self._get_db_flavor_from_config(output_config)
        self.schema.vertex_config.finish_init(db_flavor)
        # Initialize edge config after vertex config is fully initialized
        self.schema.edge_config.finish_init(self.schema.vertex_config)

        # Build registry from patterns
        registry = self._build_registry_from_patterns(patterns, ingestion_params)

        # Ingest data sources
        self.ingest_data_sources(
            data_source_registry=registry,
            conn_conf=output_config,
            ingestion_params=ingestion_params,
        )
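
The ingest entry point above ties the module together: it resolves the DB flavor, finalizes the vertex and edge configs, builds a data source registry from the supplied Patterns, and hands it to ingest_data_sources. A minimal end-to-end sketch, assuming a graflo Schema (`schema`) and a target DBConfig (`db_config`) are already available; the Patterns and FilePattern keyword arguments are assumptions inferred from how they are read in this module:

    from pathlib import Path

    from graflo.caster import Caster, IngestionParams
    from graflo.util.onto import FilePattern, Patterns

    caster = Caster(schema=schema)
    patterns = Patterns(  # assumed constructor signature
        patterns={"person": FilePattern(sub_path=Path("data/"), regex=r"\.json$")}
    )
    caster.ingest(
        output_config=db_config,  # ArangoDB, Neo4j, or TigerGraph target
        patterns=patterns,
        ingestion_params=IngestionParams(clean_start=True, n_cores=2),
    )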