graflo 1.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of graflo might be problematic.

Files changed (45)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +39 -0
  3. graflo/architecture/__init__.py +37 -0
  4. graflo/architecture/actor.py +974 -0
  5. graflo/architecture/actor_util.py +425 -0
  6. graflo/architecture/edge.py +295 -0
  7. graflo/architecture/onto.py +374 -0
  8. graflo/architecture/resource.py +161 -0
  9. graflo/architecture/schema.py +136 -0
  10. graflo/architecture/transform.py +292 -0
  11. graflo/architecture/util.py +93 -0
  12. graflo/architecture/vertex.py +277 -0
  13. graflo/caster.py +409 -0
  14. graflo/cli/__init__.py +14 -0
  15. graflo/cli/ingest.py +144 -0
  16. graflo/cli/manage_dbs.py +193 -0
  17. graflo/cli/plot_schema.py +132 -0
  18. graflo/cli/xml2json.py +93 -0
  19. graflo/db/__init__.py +32 -0
  20. graflo/db/arango/__init__.py +16 -0
  21. graflo/db/arango/conn.py +734 -0
  22. graflo/db/arango/query.py +180 -0
  23. graflo/db/arango/util.py +88 -0
  24. graflo/db/connection.py +304 -0
  25. graflo/db/manager.py +104 -0
  26. graflo/db/neo4j/__init__.py +16 -0
  27. graflo/db/neo4j/conn.py +432 -0
  28. graflo/db/util.py +49 -0
  29. graflo/filter/__init__.py +21 -0
  30. graflo/filter/onto.py +400 -0
  31. graflo/logging.conf +22 -0
  32. graflo/onto.py +186 -0
  33. graflo/plot/__init__.py +17 -0
  34. graflo/plot/plotter.py +556 -0
  35. graflo/util/__init__.py +23 -0
  36. graflo/util/chunker.py +739 -0
  37. graflo/util/merge.py +148 -0
  38. graflo/util/misc.py +37 -0
  39. graflo/util/onto.py +63 -0
  40. graflo/util/transform.py +406 -0
  41. graflo-1.1.0.dist-info/METADATA +157 -0
  42. graflo-1.1.0.dist-info/RECORD +45 -0
  43. graflo-1.1.0.dist-info/WHEEL +4 -0
  44. graflo-1.1.0.dist-info/entry_points.txt +5 -0
  45. graflo-1.1.0.dist-info/licenses/LICENSE +126 -0
graflo/architecture/vertex.py ADDED
@@ -0,0 +1,277 @@
+ """Vertex configuration and management for graph databases.
+
+ This module provides classes and utilities for managing vertices in graph databases.
+ It handles vertex configuration, field management, indexing, and filtering operations.
+ The module supports both ArangoDB and Neo4j through the DBFlavor enum.
+
+ Key Components:
+     - Vertex: Represents a vertex with its fields and indexes
+     - VertexConfig: Manages collections of vertices and their configurations
+
+ Example:
+     >>> vertex = Vertex(name="user", fields=["id", "name"])
+     >>> config = VertexConfig(vertices=[vertex])
+     >>> fields = config.fields("user", with_aux=True)
+ """
+
+ import dataclasses
+ import logging
+ from typing import Optional
+
+ from graflo.architecture.onto import Index
+ from graflo.filter.onto import Expression
+ from graflo.onto import BaseDataclass, DBFlavor
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclasses.dataclass
+ class Vertex(BaseDataclass):
+     """Represents a vertex in the graph database.
+
+     A vertex is a fundamental unit in the graph that can have fields, indexes,
+     and filters.
+
+     Attributes:
+         name: Name of the vertex
+         fields: List of field names
+         fields_aux: List of auxiliary field names for weight passing
+         indexes: List of indexes for the vertex
+         filters: List of filter expressions
+         dbname: Optional database name (defaults to vertex name)
+     """
+
+     name: str
+     fields: list[str]
+     fields_aux: list[str] = dataclasses.field(
+         default_factory=list
+     )  # temporary field necessary to pass weights to edges
+     indexes: list[Index] = dataclasses.field(default_factory=list)
+     filters: list[Expression] = dataclasses.field(default_factory=list)
+     dbname: Optional[str] = None
+
+     @property
+     def fields_all(self):
+         """Get all fields, including auxiliary fields.
+
+         Returns:
+             list[str]: Combined list of regular and auxiliary fields
+         """
+         return self.fields + self.fields_aux
+
+     def __post_init__(self):
+         """Initialize the vertex after dataclass initialization.
+
+         Sets the database name if not provided and updates fields based on indexes.
+         """
+         if self.dbname is None:
+             self.dbname = self.name
+         union_fields = set(self.fields)
+         if not self.indexes:
+             self.indexes = [Index(fields=self.fields)]
+         for ei in self.indexes:
+             union_fields |= set(ei.fields)
+         self.fields = list(union_fields)
+
+     def update_aux_fields(self, fields_aux: list):
+         """Update auxiliary fields.
+
+         Args:
+             fields_aux: List of new auxiliary fields to add
+
+         Returns:
+             Vertex: Self, for method chaining
+         """
+         self.fields_aux = list(set(self.fields_aux) | set(fields_aux))
+         return self
+
+
+ @dataclasses.dataclass
+ class VertexConfig(BaseDataclass):
+     """Configuration for managing collections of vertices.
+
+     This class manages a collection of vertices, providing methods for accessing
+     and manipulating vertex configurations.
+
+     Attributes:
+         vertices: List of vertex configurations
+         blank_vertices: List of blank vertex names
+         force_types: Dictionary mapping vertex names to type lists
+         db_flavor: Database flavor (ARANGO or NEO4J)
+     """
+
+     vertices: list[Vertex]
+     blank_vertices: list[str] = dataclasses.field(default_factory=list)
+     force_types: dict[str, list] = dataclasses.field(default_factory=dict)
+     db_flavor: DBFlavor = DBFlavor.ARANGO
+
+     def __post_init__(self):
+         """Initialize the vertex configuration.
+
+         Creates internal mappings and validates blank vertices.
+
+         Raises:
+             ValueError: If blank vertices are not defined in the configuration
+         """
+         self._vertices_map: dict[str, Vertex] = {
+             item.name: item for item in self.vertices
+         }
+
+         # TODO replace by types
+         # vertex_collection_name -> [numeric fields]
+         self._vcollection_numeric_fields_map = {}
+
+         if set(self.blank_vertices) - set(self.vertex_set):
+             raise ValueError(
+                 f"Blank collections {self.blank_vertices} are not defined"
+                 " as vertex collections"
+             )
+
+     @property
+     def vertex_set(self):
+         """Get set of vertex names.
+
+         Returns:
+             set[str]: Set of vertex names
+         """
+         return set(self._vertices_map.keys())
+
+     @property
+     def vertex_list(self):
+         """Get list of vertex configurations.
+
+         Returns:
+             list[Vertex]: List of vertex configurations
+         """
+         return list(self._vertices_map.values())
+
+     def vertex_dbname(self, vertex_name):
+         """Get database name for a vertex.
+
+         Args:
+             vertex_name: Name of the vertex
+
+         Returns:
+             str: Database name for the vertex
+
+         Raises:
+             KeyError: If vertex is not found
+         """
+         try:
+             value = self._vertices_map[vertex_name].dbname
+         except KeyError as e:
+             logger.error(
+                 "Available vertex collections:"
+                 f" {self._vertices_map.keys()}; vertex collection"
+                 f" requested: {vertex_name}"
+             )
+             raise e
+         return value
+
+     def index(self, vertex_name) -> Index:
+         """Get primary index for a vertex.
+
+         Args:
+             vertex_name: Name of the vertex
+
+         Returns:
+             Index: Primary index for the vertex
+         """
+         return self._vertices_map[vertex_name].indexes[0]
+
+     def indexes(self, vertex_name) -> list[Index]:
+         """Get all indexes for a vertex.
+
+         Args:
+             vertex_name: Name of the vertex
+
+         Returns:
+             list[Index]: List of indexes for the vertex
+         """
+         return self._vertices_map[vertex_name].indexes
+
+     def fields(self, vertex_name: str, with_aux=False):
+         """Get fields for a vertex.
+
+         Args:
+             vertex_name: Name of the vertex
+             with_aux: Whether to include auxiliary fields
+
+         Returns:
+             list[str]: List of fields
+         """
+         if with_aux:
+             return self._vertices_map[vertex_name].fields_all
+         else:
+             return self._vertices_map[vertex_name].fields
+
+     def numeric_fields_list(self, vertex_name):
+         """Get list of numeric fields for a vertex.
+
+         Args:
+             vertex_name: Name of the vertex
+
+         Returns:
+             tuple: Tuple of numeric field names
+
+         Raises:
+             ValueError: If vertex is not defined in config
+         """
+         if vertex_name in self.vertex_set:
+             if vertex_name in self._vcollection_numeric_fields_map:
+                 return self._vcollection_numeric_fields_map[vertex_name]
+             else:
+                 return ()
+         else:
+             raise ValueError(
+                 "Accessing vertex collection numeric fields: vertex"
+                 f" collection {vertex_name} was not defined in config"
+             )
+
+     def filters(self, vertex_name) -> list[Expression]:
+         """Get filter expressions for a vertex.
+
+         Args:
+             vertex_name: Name of the vertex
+
+         Returns:
+             list[Expression]: List of filter expressions
+         """
+         if vertex_name in self._vertices_map:
+             return self._vertices_map[vertex_name].filters
+         else:
+             return []
+
+     def update_vertex(self, v: Vertex):
+         """Update vertex configuration.
+
+         Args:
+             v: Vertex configuration to update
+         """
+         self._vertices_map[v.name] = v
+
+     def __getitem__(self, key: str):
+         """Get vertex configuration by name.
+
+         Args:
+             key: Vertex name
+
+         Returns:
+             Vertex: Vertex configuration
+
+         Raises:
+             KeyError: If vertex is not found
+         """
+         if key in self._vertices_map:
+             return self._vertices_map[key]
+         else:
+             raise KeyError(f"Vertex {key} absent")
+
+     def __setitem__(self, key: str, value: Vertex):
+         """Set vertex configuration by name.
+
+         Args:
+             key: Vertex name
+             value: Vertex configuration
+         """
+         self._vertices_map[key] = value
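
Taken together, the two dataclasses above behave like a small registry: Vertex reconciles its fields with its indexes in __post_init__, and VertexConfig layers dict-style access on top. A minimal sketch of that flow, assuming only that Index(fields=[...]) works as it is used in this module:

    from graflo.architecture.onto import Index
    from graflo.architecture.vertex import Vertex, VertexConfig

    # An explicit index: __post_init__ merges the index fields into `fields`
    # and defaults dbname to the vertex name.
    user = Vertex(name="user", fields=["name"], indexes=[Index(fields=["id"])])
    config = VertexConfig(vertices=[user])

    config.vertex_dbname("user")   # "user"
    config.index("user").fields    # ["id"], the primary (first) index
    sorted(config.fields("user"))  # ["id", "name"] after the field union
    config["user"]                 # dict-style lookup; KeyError("Vertex ... absent") otherwise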
graflo/caster.py ADDED
@@ -0,0 +1,409 @@
+ """Data casting and ingestion system for graph databases.
+
+ This module provides functionality for casting and ingesting data into graph databases.
+ It handles batch processing, file discovery, and database operations for both ArangoDB
+ and Neo4j.
+
+ Key Components:
+     - Caster: Main class for data casting and ingestion
+     - FilePattern: Pattern matching for file discovery
+     - Patterns: Collection of file patterns for different resources
+
+ Example:
+     >>> caster = Caster(schema=schema)
+     >>> caster.ingest_files(path="data/", conn_conf=db_config)
+ """
+
+ import logging
+ import multiprocessing as mp
+ import queue
+ import re
+ import sys
+ from concurrent.futures import ThreadPoolExecutor
+ from functools import partial
+ from pathlib import Path
+ from typing import cast
+
+ import pandas as pd
+ from suthing import ConnectionKind, DBConnectionConfig, Timer
+
+ from graflo.architecture.onto import GraphContainer
+ from graflo.architecture.schema import Schema
+ from graflo.db import ConnectionManager
+ from graflo.util.chunker import ChunkerFactory
+ from graflo.util.onto import FilePattern, Patterns
+
+ logger = logging.getLogger(__name__)
+
+
+ class Caster:
+     """Main class for data casting and ingestion.
+
+     This class handles the process of casting data into graph structures and
+     ingesting them into the database. It supports batch processing, parallel
+     execution, and various data formats.
+
+     Attributes:
+         clean_start: Whether to clean the database before ingestion
+         n_cores: Number of CPU cores to use for parallel processing
+         max_items: Maximum number of items to process
+         batch_size: Size of batches for processing
+         n_threads: Number of threads for parallel processing
+         dry: Whether to perform a dry run (no database changes)
+         schema: Schema configuration for the graph
+     """
+
+     def __init__(self, schema: Schema, **kwargs):
+         """Initialize the caster with schema and configuration.
+
+         Args:
+             schema: Schema configuration for the graph
+             **kwargs: Additional configuration options:
+                 - clean_start: Whether to clean the database before ingestion
+                 - n_cores: Number of CPU cores to use
+                 - max_items: Maximum number of items to process
+                 - batch_size: Size of batches for processing
+                 - n_threads: Number of threads for parallel processing
+                 - dry: Whether to perform a dry run
+         """
+         self.clean_start: bool = False
+         self.n_cores = kwargs.pop("n_cores", 1)
+         self.max_items = kwargs.pop("max_items", None)
+         self.batch_size = kwargs.pop("batch_size", 10000)
+         self.n_threads = kwargs.pop("n_threads", 1)
+         self.dry = kwargs.pop("dry", False)
+         self.schema = schema
+
+     @staticmethod
+     def discover_files(
+         fpath: Path | str, pattern: FilePattern, limit_files=None
+     ) -> list[Path]:
+         """Discover files matching a pattern in a directory.
+
+         Args:
+             fpath: Path to search in
+             pattern: Pattern to match files against
+             limit_files: Optional limit on the number of files to return
+
+         Returns:
+             list[Path]: List of matching file paths
+
+         Raises:
+             AssertionError: If pattern.sub_path is None
+         """
+         assert pattern.sub_path is not None
+         if isinstance(fpath, str):
+             fpath_pathlib = Path(fpath)
+         else:
+             fpath_pathlib = fpath
+
+         files = [
+             f
+             for f in (fpath_pathlib / pattern.sub_path).iterdir()
+             if f.is_file()
+             and (
+                 True
+                 if pattern.regex is None
+                 else re.search(pattern.regex, f.name) is not None
+             )
+         ]
+
+         if limit_files is not None:
+             files = files[:limit_files]
+
+         return files
+
+     def cast_normal_resource(
+         self, data, resource_name: str | None = None
+     ) -> GraphContainer:
+         """Cast data into a graph container using a resource.
+
+         Args:
+             data: Data to cast
+             resource_name: Optional name of the resource to use
+
+         Returns:
+             GraphContainer: Container with cast graph data
+         """
+         rr = self.schema.fetch_resource(resource_name)
+
+         with ThreadPoolExecutor(max_workers=self.n_threads) as executor:
+             docs = list(
+                 executor.map(
+                     lambda doc: rr(doc),
+                     data,
+                 )
+             )
+
+         graph = GraphContainer.from_docs_list(docs)
+         return graph
+
+     def process_batch(
+         self,
+         batch,
+         resource_name: str | None,
+         conn_conf: None | DBConnectionConfig = None,
+     ):
+         """Process a batch of data.
+
+         Args:
+             batch: Batch of data to process
+             resource_name: Optional name of the resource to use
+             conn_conf: Optional database connection configuration
+         """
+         gc = self.cast_normal_resource(batch, resource_name=resource_name)
+
+         if conn_conf is not None:
+             self.push_db(gc, conn_conf, resource_name=resource_name)
+
+     def process_resource(
+         self,
+         resource_instance: Path,
+         resource_name: str | None,
+         conn_conf: None | DBConnectionConfig = None,
+         **kwargs,
+     ):
+         """Process a resource instance.
+
+         Args:
+             resource_instance: Path to the resource file
+             resource_name: Optional name of the resource
+             conn_conf: Optional database connection configuration
+             **kwargs: Additional options forwarded to ChunkerFactory.create_chunker
+         """
+         chunker = ChunkerFactory.create_chunker(
+             resource=resource_instance,
+             batch_size=self.batch_size,
+             limit=self.max_items,
+             **kwargs,
+         )
+         for batch in chunker:
+             self.process_batch(batch, resource_name=resource_name, conn_conf=conn_conf)
+
+     def push_db(
+         self,
+         gc: GraphContainer,
+         conn_conf: DBConnectionConfig,
+         resource_name: str | None,
+     ):
+         """Push graph container data to the database.
+
+         Args:
+             gc: Graph container with data to push
+             conn_conf: Database connection configuration
+             resource_name: Optional name of the resource
+         """
+         vc = self.schema.vertex_config
+         resource = self.schema.fetch_resource(resource_name)
+         with ConnectionManager(connection_config=conn_conf) as db_client:
+             for vcol, data in gc.vertices.items():
+                 # blank nodes: push and get back their keys {"_key": ...}
+                 if vcol in vc.blank_vertices:
+                     query0 = db_client.insert_return_batch(data, vc.vertex_dbname(vcol))
+                     cursor = db_client.execute(query0)
+                     gc.vertices[vcol] = [item for item in cursor]
+                 else:
+                     db_client.upsert_docs_batch(
+                         data,
+                         vc.vertex_dbname(vcol),
+                         vc.index(vcol),
+                         update_keys="doc",
+                         filter_uniques=True,
+                         dry=self.dry,
+                     )
+
+         # update edge misc with blank node edges
+         for vcol in vc.blank_vertices:
+             for edge_id, edge in self.schema.edge_config.edges_items():
+                 vfrom, vto, relation = edge_id
+                 if vcol == vfrom or vcol == vto:
+                     if edge_id not in gc.edges:
+                         gc.edges[edge_id] = []
+                     gc.edges[edge_id].extend(
+                         [
+                             (x, y, {})
+                             for x, y in zip(gc.vertices[vfrom], gc.vertices[vto])
+                         ]
+                     )
+
+         with ConnectionManager(connection_config=conn_conf) as db_client:
+             # currently works only on item level
+             for edge in resource.extra_weights:
+                 if edge.weights is None:
+                     continue
+                 for weight in edge.weights.vertices:
+                     if weight.name in vc.vertex_set:
+                         index_fields = vc.index(weight.name)
+
+                         if not self.dry and weight.name in gc.vertices:
+                             weights_per_item = db_client.fetch_present_documents(
+                                 class_name=vc.vertex_dbname(weight.name),
+                                 batch=gc.vertices[weight.name],
+                                 match_keys=index_fields.fields,
+                                 keep_keys=weight.fields,
+                             )
+
+                             for j, item in enumerate(gc.linear):
+                                 weights = weights_per_item[j]
+
+                                 for ee in item[edge.edge_id]:
+                                     weight_collection_attached = {
+                                         weight.cfield(k): v
+                                         for k, v in weights[0].items()
+                                     }
+                                     ee.update(weight_collection_attached)
+                     else:
+                         logger.error(f"{weight.name} not a valid vertex")
+
+         with ConnectionManager(connection_config=conn_conf) as db_client:
+             for edge_id, edge in self.schema.edge_config.edges_items():
+                 for ee in gc.loop_over_relations(edge_id):
+                     _, _, relation = ee
+                     if not self.dry:
+                         data = gc.edges[ee]
+                         db_client.insert_edges_batch(
+                             docs_edges=data,
+                             source_class=vc.vertex_dbname(edge.source),
+                             target_class=vc.vertex_dbname(edge.target),
+                             relation_name=relation,
+                             collection_name=edge.collection_name,
+                             match_keys_source=vc.index(edge.source).fields,
+                             match_keys_target=vc.index(edge.target).fields,
+                             filter_uniques=False,
+                             dry=self.dry,
+                         )
+
+     def process_with_queue(self, tasks: mp.Queue, **kwargs):
+         """Process tasks from a queue.
+
+         Args:
+             tasks: Queue of tasks to process
+             **kwargs: Additional keyword arguments
+         """
+         while True:
+             try:
+                 task = tasks.get_nowait()
+                 filepath, resource_name = task
+             except queue.Empty:
+                 break
+             else:
+                 self.process_resource(
+                     resource_instance=filepath, resource_name=resource_name, **kwargs
+                 )
+
+     @staticmethod
+     def normalize_resource(
+         data: pd.DataFrame | list[list] | list[dict], columns: list[str] | None = None
+     ) -> list[dict]:
+         """Normalize resource data into a list of dictionaries.
+
+         Args:
+             data: Data to normalize (DataFrame, list of lists, or list of dicts)
+             columns: Optional column names for list data
+
+         Returns:
+             list[dict]: Normalized data as a list of dictionaries
+
+         Raises:
+             ValueError: If columns is not provided for list data
+         """
+         if isinstance(data, pd.DataFrame):
+             columns = data.columns.tolist()
+             _data = data.values.tolist()
+         elif data and isinstance(data[0], list):
+             _data = cast(list[list], data)  # tell mypy this is list[list]
+             if columns is None:
+                 raise ValueError("columns should be set")
+         else:
+             return cast(list[dict], data)  # tell mypy this is list[dict]
+         rows_dressed = [{k: v for k, v in zip(columns, item)} for item in _data]
+         return rows_dressed
+
+     def ingest_files(self, path: Path | str, **kwargs):
+         """Ingest files from a directory.
+
+         Args:
+             path: Path to directory containing files
+             **kwargs: Additional keyword arguments:
+                 - conn_conf: Database connection configuration
+                 - clean_start: Whether to clean the database before ingestion
+                 - n_cores: Number of CPU cores to use
+                 - max_items: Maximum number of items to process
+                 - batch_size: Size of batches for processing
+                 - dry: Whether to perform a dry run
+                 - init_only: Whether to only initialize the database
+                 - limit_files: Optional limit on number of files to process
+                 - patterns: Optional file patterns to match
+         """
+
+         path = Path(path).expanduser()
+         conn_conf: DBConnectionConfig = kwargs.get("conn_conf")
+         self.clean_start = kwargs.pop("clean_start", self.clean_start)
+         self.n_cores = kwargs.pop("n_cores", self.n_cores)
+         self.max_items = kwargs.pop("max_items", self.max_items)
+         self.batch_size = kwargs.pop("batch_size", self.batch_size)
+         self.dry = kwargs.pop("dry", self.dry)
+         init_only = kwargs.pop("init_only", False)
+         limit_files = kwargs.pop("limit_files", None)
+         patterns = kwargs.pop("patterns", Patterns())
+
+         if (
+             conn_conf.connection_type == ConnectionKind.ARANGO
+             and conn_conf.database == "_system"
+         ):
+             db_name = self.schema.general.name
+             try:
+                 with ConnectionManager(connection_config=conn_conf) as db_client:
+                     db_client.create_database(db_name)
+             except Exception as exc:
+                 logger.error(exc)
+
+             conn_conf.database = db_name
+
+         with ConnectionManager(connection_config=conn_conf) as db_client:
+             db_client.init_db(self.schema, self.clean_start)
+
+         if init_only:
+             logger.info("ingest execution bound to init")
+             sys.exit(0)
+
+         tasks: list[tuple[Path, str]] = []
+         for r in self.schema.resources:
+             pattern = (
+                 FilePattern(regex=r.name)
+                 if r.name not in patterns.patterns
+                 else patterns.patterns[r.name]
+             )
+             files = Caster.discover_files(
+                 path, limit_files=limit_files, pattern=pattern
+             )
+             logger.info(f"For resource {r.name}, {len(files)} files were found")
+             tasks += [(f, r.name) for f in files]
+
+         with Timer() as klepsidra:
+             if self.n_cores > 1:
+                 queue_tasks: mp.Queue = mp.Queue()
+                 for item in tasks:
+                     queue_tasks.put(item)
+
+                 func = partial(
+                     self.process_with_queue,
+                     **kwargs,
+                 )
+                 assert mp.get_start_method() == "fork", (
+                     "requires the 'fork' multiprocessing start method"
+                 )
+
+                 processes = []
+
+                 for w in range(self.n_cores):
+                     p = mp.Process(target=func, args=(queue_tasks,), kwargs=kwargs)
+                     processes.append(p)
+                     p.start()
+                 for p in processes:
+                     p.join()
+             else:
+                 for f, resource_name in tasks:
+                     self.process_resource(
+                         resource_instance=f, resource_name=resource_name, **kwargs
+                     )
+         logger.info(f"Processing took {klepsidra.elapsed:.1f} sec")
graflo/cli/__init__.py ADDED
@@ -0,0 +1,14 @@
+ """Command-line interface for graflo.
+
+ This module provides command-line tools for working with graflo, including
+ utilities for data ingestion, schema management, and graph operations.
+
+ Key Components:
+     - Command-line tools for data processing
+     - Schema management utilities
+     - Graph database operations
+
+ Example:
+     >>> uv run ingest --config config.json --data data.json
+     >>> uv run plot_schema --config schema.yaml --output figs
+ """