graflo 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1120 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +297 -0
  7. graflo/architecture/onto.py +374 -0
  8. graflo/architecture/resource.py +161 -0
  9. graflo/architecture/schema.py +136 -0
  10. graflo/architecture/transform.py +292 -0
  11. graflo/architecture/util.py +93 -0
  12. graflo/architecture/vertex.py +586 -0
  13. graflo/caster.py +655 -0
  14. graflo/cli/__init__.py +14 -0
  15. graflo/cli/ingest.py +194 -0
  16. graflo/cli/manage_dbs.py +197 -0
  17. graflo/cli/plot_schema.py +132 -0
  18. graflo/cli/xml2json.py +93 -0
  19. graflo/data_source/__init__.py +48 -0
  20. graflo/data_source/api.py +339 -0
  21. graflo/data_source/base.py +97 -0
  22. graflo/data_source/factory.py +298 -0
  23. graflo/data_source/file.py +133 -0
  24. graflo/data_source/memory.py +72 -0
  25. graflo/data_source/registry.py +82 -0
  26. graflo/data_source/sql.py +185 -0
  27. graflo/db/__init__.py +44 -0
  28. graflo/db/arango/__init__.py +22 -0
  29. graflo/db/arango/conn.py +1026 -0
  30. graflo/db/arango/query.py +180 -0
  31. graflo/db/arango/util.py +88 -0
  32. graflo/db/conn.py +377 -0
  33. graflo/db/connection/__init__.py +6 -0
  34. graflo/db/connection/config_mapping.py +18 -0
  35. graflo/db/connection/onto.py +688 -0
  36. graflo/db/connection/wsgi.py +29 -0
  37. graflo/db/manager.py +119 -0
  38. graflo/db/neo4j/__init__.py +16 -0
  39. graflo/db/neo4j/conn.py +639 -0
  40. graflo/db/postgres/__init__.py +156 -0
  41. graflo/db/postgres/conn.py +425 -0
  42. graflo/db/postgres/resource_mapping.py +139 -0
  43. graflo/db/postgres/schema_inference.py +245 -0
  44. graflo/db/postgres/types.py +148 -0
  45. graflo/db/tigergraph/__init__.py +9 -0
  46. graflo/db/tigergraph/conn.py +2212 -0
  47. graflo/db/util.py +49 -0
  48. graflo/filter/__init__.py +21 -0
  49. graflo/filter/onto.py +525 -0
  50. graflo/logging.conf +22 -0
  51. graflo/onto.py +190 -0
  52. graflo/plot/__init__.py +17 -0
  53. graflo/plot/plotter.py +556 -0
  54. graflo/util/__init__.py +23 -0
  55. graflo/util/chunker.py +751 -0
  56. graflo/util/merge.py +150 -0
  57. graflo/util/misc.py +37 -0
  58. graflo/util/onto.py +332 -0
  59. graflo/util/transform.py +448 -0
  60. graflo-1.3.3.dist-info/METADATA +190 -0
  61. graflo-1.3.3.dist-info/RECORD +64 -0
  62. graflo-1.3.3.dist-info/WHEEL +4 -0
  63. graflo-1.3.3.dist-info/entry_points.txt +5 -0
  64. graflo-1.3.3.dist-info/licenses/LICENSE +126 -0
@@ -0,0 +1,1026 @@
"""ArangoDB connection implementation for graph database operations.

This module implements the Connection interface for ArangoDB, providing
specific functionality for graph operations in ArangoDB. It handles:
- Graph and collection management
- Document and edge operations
- Index creation and management
- AQL query execution
- Batch operations with upsert support

Key Features:
- Graph-based document organization
- Edge collection management
- Persistent, hash, skiplist, and fulltext indices
- Batch document and edge operations
- AQL query generation and execution

Example:
    >>> conn = ArangoConnection(config)
    >>> conn.init_db(schema, clean_start=True)
    >>> conn.upsert_docs_batch(docs, "users", match_keys=["email"])
"""

import json
import logging
from typing import Optional

from arango import ArangoClient

from graflo.architecture.edge import Edge
from graflo.architecture.onto import (
    Index,
    IndexType,
)
from graflo.architecture.schema import Schema
from graflo.architecture.vertex import VertexConfig
from graflo.db.arango.query import fetch_fields_query
from graflo.db.arango.util import render_filters
from graflo.db.conn import Connection
from graflo.db.util import get_data_from_cursor
from graflo.filter.onto import Clause
from graflo.onto import AggregationType, DBFlavor
from graflo.util.transform import pick_unique_dict

from ..connection.onto import ArangoConfig

logger = logging.getLogger(__name__)


def _json_serializer(obj):
    """JSON serializer for objects not serializable by default json code.

    Handles datetime, date, time, and other non-serializable types.
    Decimal should already be converted to float at the data source level.

    Args:
        obj: Object to serialize

    Returns:
        JSON-serializable representation
    """
    from datetime import date, datetime, time

    if isinstance(obj, (datetime, date, time)):
        return obj.isoformat()
    # Decimal should be converted to float at source (SQLDataSource)
    # But handle it here as a fallback
    from decimal import Decimal

    if isinstance(obj, Decimal):
        return float(obj)
    raise TypeError(f"Type {type(obj)} not serializable")
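
# Illustrative sketch (editor's example, not part of the package): how
# _json_serializer plugs into json.dumps as the `default` fallback, mirroring
# the batch serialization in upsert_docs_batch below.
#
#     import json
#     from datetime import datetime
#     from decimal import Decimal
#
#     docs = [{"created": datetime(2024, 1, 1), "price": Decimal("9.99")}]
#     payload = json.dumps(docs, default=_json_serializer)
#     # -> '[{"created": "2024-01-01T00:00:00", "price": 9.99}]'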


class ArangoConnection(Connection):
    """ArangoDB-specific implementation of the Connection interface.

    This class provides ArangoDB-specific implementations for all database
    operations, including graph management, document operations, and query
    execution. It uses the ArangoDB Python driver for all operations.

    Attributes:
        conn: ArangoDB database connection instance
    """

    def __init__(self, config: ArangoConfig):
        """Initialize ArangoDB connection.

        Args:
            config: ArangoDB connection configuration containing URL, credentials,
                and database name
        """
        super().__init__()
        # Store config for later use
        self.config = config
        # Validate required config values
        if config.url is None:
            raise ValueError("ArangoDB connection requires a URL to be configured")
        if config.database is None:
            raise ValueError(
                "ArangoDB connection requires a database name to be configured"
            )

        # ArangoDB accepts empty string for password if None
        password = config.password if config.password is not None else ""
        # ArangoDB has default username "root" if None
        username = config.username if config.username is not None else "root"

        # Store client for system operations
        self.client = ArangoClient(
            hosts=config.url, request_timeout=config.request_timeout
        )
        # Connect to the configured database for regular operations
        self.conn = self.client.db(
            config.database,
            username=username,
            password=password,
        )
        # Store credentials for system operations
        self._username = username
        self._password = password

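    # Illustrative sketch (editor's example, not part of the package): a
    # minimal construction, assuming ArangoConfig (from
    # graflo.db.connection.onto) accepts keyword arguments matching the
    # fields read above (url, database, username, password, request_timeout).
    #
    #     config = ArangoConfig(
    #         url="http://localhost:8529",
    #         database="mydb",
    #         username="root",
    #         password="secret",
    #     )
    #     conn = ArangoConnection(config)
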
    def create_database(self, name: str):
        """Create a new ArangoDB database.

        Database creation/deletion operations must be performed from the _system database.

        Args:
            name: Name of the database to create
        """
        try:
            # Connect to _system database for system operations
            system_db = self.client.db(
                "_system", username=self._username, password=self._password
            )
            if not system_db.has_database(name):
                try:
                    system_db.create_database(name)
                    logger.info(f"Successfully created ArangoDB database '{name}'")
                except Exception as create_error:
                    logger.error(
                        f"Failed to create ArangoDB database '{name}': {create_error}",
                        exc_info=True,
                    )
                    raise
            else:
                logger.debug(f"ArangoDB database '{name}' already exists")
        except Exception as e:
            logger.error(
                f"Error creating ArangoDB database '{name}': {e}",
                exc_info=True,
            )
            raise

    def delete_database(self, name: str):
        """Delete an ArangoDB database.

        Database creation/deletion operations must be performed from the _system database.

        Args:
            name: Name of the database to delete
        """
        try:
            # Connect to _system database for system operations
            system_db = self.client.db(
                "_system", username=self._username, password=self._password
            )
            if system_db.has_database(name):
                try:
                    system_db.delete_database(name)
                    logger.info(f"Successfully deleted ArangoDB database '{name}'")
                except Exception as delete_error:
                    logger.error(
                        f"Failed to delete ArangoDB database '{name}': {delete_error}",
                        exc_info=True,
                    )
                    raise
            else:
                logger.debug(
                    f"ArangoDB database '{name}' does not exist, skipping deletion"
                )
        except Exception as e:
            logger.error(
                f"Error deleting ArangoDB database '{name}': {e}",
                exc_info=True,
            )
            raise

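    # Illustrative sketch (editor's example, not part of the package): both
    # operations are idempotent, so creating an existing database or deleting
    # a missing one logs and returns without raising.
    #
    #     conn.create_database("analytics")   # no-op if it already exists
    #     conn.delete_database("analytics")   # no-op if it does not exist
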
    def execute(self, query, **kwargs):
        """Execute an AQL query.

        Args:
            query: AQL query string to execute
            **kwargs: Additional options forwarded to the AQL executor
                (e.g. bind_vars)

        Returns:
            Cursor: ArangoDB cursor for the query results
        """
        cursor = self.conn.aql.execute(query, **kwargs)
        return cursor

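    # Illustrative sketch (editor's example, not part of the package):
    # running raw AQL and draining the cursor; bind_vars is a python-arango
    # executor option forwarded via **kwargs.
    #
    #     cursor = conn.execute(
    #         "FOR d IN users FILTER d.age >= @age RETURN d",
    #         bind_vars={"age": 21},
    #     )
    #     rows = list(cursor)
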
    def close(self):
        """Close the ArangoDB connection.

        No-op: the python-arango database handle does not expose an explicit
        close; HTTP sessions are managed by the client.
        """
        pass

    def init_db(self, schema: Schema, clean_start: bool):
        """Initialize ArangoDB with the given schema.

        Checks if the database exists and creates it if it doesn't.
        Uses schema.general.name if database is not set in config.

        Args:
            schema: Schema containing graph structure definitions
            clean_start: If True, delete all existing collections before initialization
        """
        # Determine database name: use config.database if set, otherwise use schema.general.name
        db_name = self.config.database
        if not db_name:
            db_name = schema.general.name
            # Update config for subsequent operations
            self.config.database = db_name

        # Check if database exists and create it if it doesn't
        try:
            system_db = self.client.db(
                "_system", username=self._username, password=self._password
            )
            if not system_db.has_database(db_name):
                logger.info(f"Database '{db_name}' does not exist, creating it...")
                try:
                    system_db.create_database(db_name)
                    logger.info(f"Successfully created database '{db_name}'")
                except Exception as create_error:
                    logger.error(
                        f"Failed to create database '{db_name}': {create_error}",
                        exc_info=True,
                    )
                    raise

            # Reconnect to the target database (newly created or existing)
            if getattr(self, "_db_connected", None) != db_name:
                try:
                    self.conn = self.client.db(
                        db_name, username=self._username, password=self._password
                    )
                    self._db_connected = db_name
                    logger.debug(f"Connected to database '{db_name}'")
                except Exception as conn_error:
                    logger.error(
                        f"Failed to connect to database '{db_name}': {conn_error}",
                        exc_info=True,
                    )
                    raise
        except Exception as e:
            logger.error(
                f"Error during database initialization for '{db_name}': {e}",
                exc_info=True,
            )
            raise

        try:
            if clean_start:
                try:
                    self.delete_graph_structure([], [], delete_all=True)
                    logger.debug(f"Cleaned database '{db_name}' for fresh start")
                except Exception as clean_error:
                    logger.warning(
                        f"Error during clean_start for database '{db_name}': {clean_error}",
                        exc_info=True,
                    )
                    # Continue - may be first run or already clean

            try:
                self.define_schema(schema)
                logger.debug(f"Defined schema for database '{db_name}'")
            except Exception as schema_error:
                logger.error(
                    f"Failed to define schema for database '{db_name}': {schema_error}",
                    exc_info=True,
                )
                raise

            try:
                self.define_indexes(schema)
                logger.debug(f"Defined indexes for database '{db_name}'")
            except Exception as index_error:
                logger.error(
                    f"Failed to define indexes for database '{db_name}': {index_error}",
                    exc_info=True,
                )
                raise
        except Exception as e:
            logger.error(
                f"Error during database schema initialization for '{db_name}': {e}",
                exc_info=True,
            )
            raise

    def define_schema(self, schema: Schema):
        """Define ArangoDB collections based on schema.

        Args:
            schema: Schema containing collection definitions
        """
        self.define_vertex_collections(schema)
        self.define_edge_collections(schema.edge_config.edges_list(include_aux=True))

    def define_vertex_collections(self, schema: Schema):
        """Define vertex collections in ArangoDB.

        Creates vertex collections for both connected and disconnected vertices,
        organizing them into appropriate graphs.

        Args:
            schema: Schema containing vertex definitions
        """
        vertex_config = schema.vertex_config
        disconnected_vertex_collections = (
            set(vertex_config.vertex_set) - schema.edge_config.vertices
        )
        for item in schema.edge_config.edges_list():
            u, v = item.source, item.target
            gname = item.graph_name
            if not gname:
                logger.warning(
                    f"Edge {item.source} -> {item.target} has no graph_name, skipping"
                )
                continue
            logger.info(f"Ensuring graph '{gname}' for edge {u} -> {v}")
            if self.conn.has_graph(gname):
                g = self.conn.graph(gname)
            else:
                g = self.conn.create_graph(gname)  # type: ignore

            _ = self.create_collection(
                vertex_config.vertex_dbname(u), vertex_config.index(u), g
            )

            _ = self.create_collection(
                vertex_config.vertex_dbname(v), vertex_config.index(v), g
            )
        for w in disconnected_vertex_collections:
            _ = self.create_collection(
                vertex_config.vertex_dbname(w), vertex_config.index(w), None
            )

    def define_edge_collections(self, edges: list[Edge]):
        """Define edge collections in ArangoDB.

        Creates edge collections and their definitions in the appropriate graphs.

        Args:
            edges: List of edge configurations to create
        """
        for item in edges:
            gname = item.graph_name
            if not gname:
                logger.warning("Edge has no graph_name, skipping")
                continue
            if self.conn.has_graph(gname):
                g = self.conn.graph(gname)
            else:
                g = self.conn.create_graph(gname)  # type: ignore
            collection_name = item.collection_name
            if not collection_name:
                logger.warning("Edge has no collection_name, skipping")
                continue
            if not g.has_edge_definition(collection_name):
                _ = g.create_edge_definition(
                    edge_collection=collection_name,
                    from_vertex_collections=[item._source_collection],
                    to_vertex_collections=[item._target_collection],
                )

    def _add_index(self, general_collection, index: Index):
        """Add an index to an ArangoDB collection.

        Supports persistent, hash, skiplist, and fulltext indices.

        Args:
            general_collection: ArangoDB collection to add index to
            index: Index configuration to create

        Returns:
            IndexHandle: Handle to the created index
        """
        data = index.db_form(DBFlavor.ARANGO)
        # Persistent and hash indices share the generic add_index payload;
        # a single if/elif chain keeps a created handle from being reset.
        if index.type in (IndexType.PERSISTENT, IndexType.HASH):
            ih = general_collection.add_index(data)
        elif index.type == IndexType.SKIPLIST:
            ih = general_collection.add_skiplist_index(
                fields=index.fields, unique=index.unique
            )
        elif index.type == IndexType.FULLTEXT:
            ih = general_collection.add_index(
                data={"fields": index.fields, "type": "fulltext"}
            )
        else:
            ih = None
        return ih

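    # Illustrative sketch (editor's example, not part of the package): the
    # persistent/hash path ends up in python-arango's Collection.add_index
    # with a payload shaped roughly like:
    #
    #     collection.add_index(
    #         {"type": "persistent", "fields": ["email"], "unique": True}
    #     )
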
    def define_vertex_indices(self, vertex_config: VertexConfig):
        """Define indices for vertex collections.

        Creates indices for each vertex collection based on the configuration.

        Args:
            vertex_config: Vertex configuration containing index definitions
        """
        for c in vertex_config.vertex_set:
            general_collection = self.conn.collection(vertex_config.vertex_dbname(c))
            ixs = general_collection.indexes()
            field_combinations = [tuple(ix["fields"]) for ix in ixs]
            for index_obj in vertex_config.indexes(c):
                if tuple(index_obj.fields) not in field_combinations:
                    self._add_index(general_collection, index_obj)

    def define_edge_indices(self, edges: list[Edge]):
        """Define indices for edge collections.

        Creates indices for each edge collection based on the configuration.

        Args:
            edges: List of edge configurations containing index definitions
        """
        for edge in edges:
            collection_name = edge.collection_name
            if not collection_name:
                logger.warning("Edge has no collection_name, skipping index creation")
                continue
            general_collection = self.conn.collection(collection_name)
            for index_obj in edge.indexes:
                self._add_index(general_collection, index_obj)

    def fetch_indexes(self, db_class_name: Optional[str] = None):
        """Fetch all indices from the database.

        Args:
            db_class_name: Optional collection name to fetch indices for

        Returns:
            dict: Mapping of collection names to their indices
        """
        if db_class_name is None:
            classes = self.conn.collections()
        elif self.conn.has_collection(db_class_name):
            # Normalize to the same {"name": ...} shape that collections() returns
            classes = [{"name": db_class_name}]
        else:
            classes = []

        r = {}
        for cname in classes:
            assert isinstance(cname["name"], str)
            c = self.conn.collection(cname["name"])
            r[cname["name"]] = c.indexes()
        return r

    def create_collection(self, db_class_name, index: None | Index = None, g=None):
        """Create a new ArangoDB collection.

        Args:
            db_class_name: Name of the collection to create
            index: Optional index to create on the collection
            g: Optional graph to create the collection in

        Returns:
            IndexHandle: Handle to the created index if one was created
        """
        if not self.conn.has_collection(db_class_name):
            if g is not None:
                _ = g.create_vertex_collection(db_class_name)
            else:
                self.conn.create_collection(db_class_name)
        general_collection = self.conn.collection(db_class_name)
        if index is not None and index.fields != ["_key"]:
            ih = self._add_index(general_collection, index)
            return ih
        else:
            return None

    def delete_graph_structure(self, vertex_types=(), graph_names=(), delete_all=False):
        """Delete graph structure (collections and graphs) from ArangoDB.

        In ArangoDB:
        - Collections: Container for vertices (vertex collections) and edges (edge collections)
        - Graphs: Named graphs that connect vertex and edge collections

        Args:
            vertex_types: Collection names to delete (vertex or edge collections)
            graph_names: Graph names to delete
            delete_all: If True, delete all non-system collections and graphs
        """
        cnames = vertex_types
        gnames = graph_names
        logger.info("collections (non system):")
        logger.info([c for c in self.conn.collections() if c["name"][0] != "_"])

        if delete_all:
            cnames = [c["name"] for c in self.conn.collections() if c["name"][0] != "_"]
            gnames = [g["name"] for g in self.conn.graphs()]

        for gn in gnames:
            if self.conn.has_graph(gn):
                self.conn.delete_graph(gn)

        logger.info("graphs (after delete operation):")
        logger.info(self.conn.graphs())

        for cn in cnames:
            if self.conn.has_collection(cn):
                self.conn.delete_collection(cn)

        logger.info("collections (after delete operation):")
        logger.info([c for c in self.conn.collections() if c["name"][0] != "_"])

        logger.info("graphs:")
        logger.info(self.conn.graphs())

    def get_collections(self):
        """Get all collections in the database.

        Returns:
            list: List of collection information dictionaries
        """
        return self.conn.collections()

    def upsert_docs_batch(
        self,
        docs,
        class_name,
        match_keys: list[str] | None = None,
        **kwargs,
    ):
        """Upsert a batch of documents using AQL.

        Performs an upsert operation on a batch of documents, using the specified
        match keys to determine whether to update existing documents or insert new ones.

        Args:
            docs: List of documents to upsert
            class_name: Collection name to upsert into
            match_keys: Keys to match for upsert operation
            **kwargs: Additional options:
                - dry: If True, don't execute the query
                - update_keys: Keys to update on match
                - filter_uniques: If True, filter duplicate documents
        """
        dry = kwargs.pop("dry", False)
        update_keys = kwargs.pop("update_keys", None)
        filter_uniques = kwargs.pop("filter_uniques", True)

        if isinstance(docs, list):
            if filter_uniques:
                docs = pick_unique_dict(docs)
            docs = json.dumps(docs, default=_json_serializer)
        if match_keys is None:
            upsert_clause = ""
            update_clause = ""
        else:
            upsert_clause = ", ".join([f'"{k}": doc.{k}' for k in match_keys])
            upsert_clause = f"UPSERT {{{upsert_clause}}}"

            if isinstance(update_keys, list):
                update_clause = ", ".join([f'"{k}": doc.{k}' for k in update_keys])
                update_clause = f"{{{update_clause}}}"
            elif update_keys == "doc":
                update_clause = "doc"
            else:
                update_clause = "{}"
            update_clause = f"UPDATE {update_clause}"

        options = "OPTIONS {exclusive: true, ignoreErrors: true}"

        q_update = f"""FOR doc in {docs}
        {upsert_clause}
        INSERT doc
        {update_clause}
        IN {class_name} {options}"""
        if not dry:
            self.execute(q_update)

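    # Illustrative sketch (editor's example, not part of the package): for
    #
    #     conn.upsert_docs_batch(
    #         [{"email": "a@b.co", "name": "Ada"}], "users",
    #         match_keys=["email"], update_keys=["name"],
    #     )
    #
    # the generated AQL is, roughly:
    #
    #     FOR doc in [{"email": "a@b.co", "name": "Ada"}]
    #         UPSERT {"email": doc.email}
    #         INSERT doc
    #         UPDATE {"name": doc.name}
    #         IN users OPTIONS {exclusive: true, ignoreErrors: true}
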
    def insert_edges_batch(
        self,
        docs_edges,
        source_class,
        target_class,
        relation_name=None,
        collection_name=None,
        match_keys_source=("_key",),
        match_keys_target=("_key",),
        filter_uniques=True,
        uniq_weight_fields=None,
        uniq_weight_collections=None,
        upsert_option=False,
        head=None,
        **kwargs,
    ):
        """Insert a batch of edges using AQL.

        Creates edges between source and target vertices, with support for
        weight fields and unique constraints.

        Args:
            docs_edges: List of edge triples [source_doc, target_doc, edge_attrs];
                the generated query addresses them as edge[0], edge[1], edge[2]
            source_class: Source vertex collection name
            target_class: Target vertex collection name
            relation_name: Optional relation name for the edges
            collection_name: Edge collection name
            match_keys_source: Keys to match source vertices
            match_keys_target: Keys to match target vertices
            filter_uniques: If True, filter duplicate edges
            uniq_weight_fields: Fields to consider for uniqueness
            uniq_weight_collections: Collections to consider for uniqueness
            upsert_option: If True, use upsert instead of insert
            head: Optional limit on number of edges to insert
            **kwargs: Additional options:
                - dry: If True, don't execute the query
        """
        dry = kwargs.pop("dry", False)

        if isinstance(docs_edges, list):
            if docs_edges:
                logger.debug(f" docs_edges[0] = {docs_edges[0]}")
            if head is not None:
                docs_edges = docs_edges[:head]
            if filter_uniques:
                docs_edges = pick_unique_dict(docs_edges)
            docs_edges_str = json.dumps(docs_edges)
        else:
            return ""

        if match_keys_source[0] == "_key":
            result_from = f'CONCAT("{source_class}/", edge[0]._key)'
            source_filter = ""
        else:
            result_from = "sources[0]._id"
            filter_source = " && ".join(
                [f"v.{k} == edge[0].{k}" for k in match_keys_source]
            )
            source_filter = (
                f"LET sources = (FOR v IN {source_class} FILTER"
                f" {filter_source} LIMIT 1 RETURN v)"
            )

        if match_keys_target[0] == "_key":
            result_to = f'CONCAT("{target_class}/", edge[1]._key)'
            target_filter = ""
        else:
            result_to = "targets[0]._id"
            filter_target = " && ".join(
                [f"v.{k} == edge[1].{k}" for k in match_keys_target]
            )
            target_filter = (
                f"LET targets = (FOR v IN {target_class} FILTER"
                f" {filter_target} LIMIT 1 RETURN v)"
            )

        doc_definition = f"MERGE({{_from : {result_from}, _to : {result_to}}}, edge[2])"

        logger.debug(f" source_filter = {source_filter}")
        logger.debug(f" target_filter = {target_filter}")
        logger.debug(f" doc = {doc_definition}")

        if upsert_option:
            ups_from = result_from if source_filter else "doc._from"
            ups_to = result_to if target_filter else "doc._to"

            weight_fs = []
            if uniq_weight_fields is not None:
                weight_fs += uniq_weight_fields
            if uniq_weight_collections is not None:
                weight_fs += uniq_weight_collections
            if relation_name is not None:
                weight_fs += ["relation"]

            if weight_fs:
                weights_clause = ", " + ", ".join(
                    [f"'{x}' : edge.{x}" for x in weight_fs]
                )
            else:
                weights_clause = ""

            upsert = f"{{'_from': {ups_from}, '_to': {ups_to}" + weights_clause + "}"
            logger.debug(f" upsert clause: {upsert}")
            clauses = f"UPSERT {upsert} INSERT doc UPDATE {{}}"
            options = "OPTIONS {exclusive: true}"
        else:
            if relation_name is None:
                doc_clause = "doc"
            else:
                doc_clause = f"MERGE(doc, {{'relation': '{relation_name}' }})"
            clauses = f"INSERT {doc_clause}"
            options = "OPTIONS {exclusive: true, ignoreErrors: true}"

        q_update = f"""
        FOR edge in {docs_edges_str} {source_filter} {target_filter}
            LET doc = {doc_definition}
            {clauses}
            in {collection_name} {options}"""
        if not dry:
            self.execute(q_update)

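    # Illustrative sketch (editor's example, not part of the package):
    # matching endpoints by a non-_key field. Each element is the
    # [source_doc, target_doc, edge_attrs] triple destructured above.
    #
    #     conn.insert_edges_batch(
    #         [[{"email": "a@b.co"}, {"email": "c@d.co"}, {"weight": 1}]],
    #         source_class="users",
    #         target_class="users",
    #         collection_name="follows",
    #         match_keys_source=("email",),
    #         match_keys_target=("email",),
    #     )
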
    def insert_return_batch(self, docs, class_name):
        """Build an AQL statement that inserts documents and returns their keys.

        The query is returned as a string and is not executed here.

        Args:
            docs: Documents to insert
            class_name: Collection to insert into

        Returns:
            str: AQL query string for the operation
        """
        docs = json.dumps(docs)
        query0 = f"""FOR doc in {docs}
        INSERT doc
        INTO {class_name}
        LET inserted = NEW
        RETURN {{_key: inserted._key}}
        """
        return query0

    def fetch_present_documents(
        self,
        batch,
        class_name,
        match_keys,
        keep_keys,
        flatten=False,
        filters: None | Clause | list | dict = None,
    ) -> list | dict:
        """Fetch documents that exist in the database.

        Args:
            batch: Batch of documents to check
            class_name: Collection to check in
            match_keys: Keys to match documents
            keep_keys: Keys to keep in result
            flatten: If True, flatten the result into a list
            filters: Additional query filters

        Returns:
            Union[list, dict]: Documents that exist in the database, either as a
                flat list or a dictionary mapping batch indices to documents
        """
        q0 = fetch_fields_query(
            collection_name=class_name,
            docs=batch,
            match_keys=match_keys,
            keep_keys=keep_keys,
            filters=filters,
        )
        # Each cursor item has the shape {"__i": i, "_group": [doc]}
        cursor = self.execute(q0)

        if flatten:
            rdata = []
            for item in get_data_from_cursor(cursor):
                group = item.pop("_group", [])
                rdata += [sub_item for sub_item in group]
            return rdata
        else:
            rdata_dict = {}
            for item in get_data_from_cursor(cursor):
                __i = item.pop("__i")
                group = item.pop("_group")
                rdata_dict[__i] = group
            return rdata_dict

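    # Illustrative sketch (editor's example, not part of the package):
    # flatten=False keys the hits by their position in the input batch,
    # which is what keep_absent_documents relies on below.
    #
    #     present = conn.fetch_present_documents(
    #         batch=[{"email": "a@b.co"}, {"email": "new@b.co"}],
    #         class_name="users",
    #         match_keys=["email"],
    #         keep_keys=["_key"],
    #     )
    #     # e.g. {0: [{"_key": "123"}]} -> only the first doc already exists
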
    def fetch_docs(
        self,
        class_name,
        filters: None | Clause | list | dict = None,
        limit: int | None = None,
        return_keys: list | None = None,
        unset_keys: list | None = None,
        **kwargs,
    ):
        """Fetch documents from a collection.

        Args:
            class_name: Collection to fetch from
            filters: Query filters
            limit: Maximum number of documents to return
            return_keys: Keys to return
            unset_keys: Keys to unset

        Returns:
            list: Fetched documents
        """
        filter_clause = render_filters(filters, doc_name="d")

        if return_keys is None:
            if unset_keys is None:
                return_clause = "d"
            else:
                tmp_clause = ", ".join([f'"{item}"' for item in unset_keys])
                return_clause = f"UNSET(d, {tmp_clause})"
        else:
            if unset_keys is None:
                tmp_clause = ", ".join([f'"{item}"' for item in return_keys])
                return_clause = f"KEEP(d, {tmp_clause})"
            else:
                raise ValueError("both return_keys and unset_keys are set")

        if limit is not None and isinstance(limit, int):
            limit_clause = f"LIMIT {limit}"
        else:
            limit_clause = ""

        q = (
            f"FOR d in {class_name}"
            f" {filter_clause}"
            f" {limit_clause}"
            f" RETURN {return_clause}"
        )
        cursor = self.execute(q)
        return get_data_from_cursor(cursor)

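    # Illustrative sketch (editor's example, not part of the package):
    # return_keys and unset_keys are mutually exclusive projections (KEEP
    # vs UNSET in the generated AQL).
    #
    #     users = conn.fetch_docs("users", limit=10, return_keys=["email"])
    #     # -> FOR d in users  LIMIT 10 RETURN KEEP(d, "email")
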
    # TODO test
    def fetch_edges(
        self,
        from_type: str,
        from_id: str,
        edge_type: str | None = None,
        to_type: str | None = None,
        to_id: str | None = None,
        filters: list | dict | Clause | None = None,
        limit: int | None = None,
        return_keys: list | None = None,
        unset_keys: list | None = None,
        **kwargs,
    ):
        """Fetch edges from ArangoDB using AQL.

        Args:
            from_type: Source vertex collection name
            from_id: Source vertex ID (can be _key or _id)
            edge_type: Edge collection name (required for ArangoDB)
            to_type: Optional target vertex collection name to filter by
            to_id: Optional target vertex ID to filter by
            filters: Additional query filters
            limit: Maximum number of edges to return
            return_keys: Keys to return (projection)
            unset_keys: Keys to exclude (projection)
            **kwargs: Additional parameters

        Returns:
            list: List of fetched edges
        """
        # Convert from_id to _id format if needed
        if not from_id.startswith(from_type):
            # Assume it's a _key, convert to _id
            from_vertex_id = f"{from_type}/{from_id}"
        else:
            from_vertex_id = from_id

        # Build AQL query to fetch edges
        if edge_type:
            edge_collection = edge_type
        else:
            # Without an edge_type every edge collection would have to be
            # searched, so require it instead
            raise ValueError("edge_type is required for ArangoDB edge fetching")

        filter_clause = render_filters(filters, doc_name="e")
        filter_parts = []

        if to_type:
            filter_parts.append(f"e._to LIKE '{to_type}/%'")
        if to_id and to_type:
            if not to_id.startswith(to_type):
                to_vertex_id = f"{to_type}/{to_id}"
            else:
                to_vertex_id = to_id
            filter_parts.append(f"e._to == '{to_vertex_id}'")

        additional_filters = " && ".join(filter_parts)
        if filter_clause and additional_filters:
            filter_clause = f"{filter_clause} && {additional_filters}"
        elif additional_filters:
            filter_clause = additional_filters

        query = f"""
        FOR e IN {edge_collection}
            FILTER e._from == '{from_vertex_id}'
            {f"FILTER {filter_clause}" if filter_clause else ""}
            {f"LIMIT {limit}" if limit else ""}
            RETURN e
        """

        cursor = self.execute(query)
        result = list(get_data_from_cursor(cursor))

        # Apply projection
        if return_keys is not None:
            result = [
                {k: doc.get(k) for k in return_keys if k in doc} for doc in result
            ]
        elif unset_keys is not None:
            result = [
                {k: v for k, v in doc.items() if k not in unset_keys} for doc in result
            ]

        return result

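    # Illustrative sketch (editor's example, not part of the package): edges
    # out of one vertex, restricted to a target collection.
    #
    #     follows = conn.fetch_edges(
    #         from_type="users",
    #         from_id="123",          # a _key; expanded to 'users/123'
    #         edge_type="follows",
    #         to_type="users",
    #         limit=50,
    #     )
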
    def aggregate(
        self,
        class_name,
        aggregation_function: AggregationType,
        discriminant: str | None = None,
        aggregated_field: str | None = None,
        filters: None | Clause | list | dict = None,
    ):
        """Perform aggregation on a collection.

        Args:
            class_name: Collection to aggregate
            aggregation_function: Type of aggregation to perform
            discriminant: Field to group by
            aggregated_field: Field to aggregate
            filters: Query filters

        Returns:
            list: Aggregation results
        """
        filter_clause = render_filters(filters, doc_name="doc")

        if (
            aggregated_field is not None
            and aggregation_function != AggregationType.COUNT
        ):
            group_unit = f"g[*].doc.{aggregated_field}"
        else:
            group_unit = "g"

        if discriminant is not None:
            collect_clause = f"COLLECT value = doc['{discriminant}'] INTO g"
            return_clause = f"""{{ '{discriminant}' : value, '_value': {aggregation_function}({group_unit})}}"""
        else:
            if (
                aggregated_field is None
                and aggregation_function == AggregationType.COUNT
            ):
                collect_clause = (
                    f"COLLECT AGGREGATE value = {aggregation_function} (doc)"
                )
            else:
                collect_clause = (
                    "COLLECT AGGREGATE value ="
                    f" {aggregation_function}(doc['{aggregated_field}'])"
                )
            return_clause = """{ '_value' : value }"""

        q = f"""FOR doc IN {class_name}
        {filter_clause}
        {collect_clause}
        RETURN {return_clause}"""

        cursor = self.execute(q)
        data = get_data_from_cursor(cursor)
        return data

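    # Illustrative sketch (editor's example, not part of the package):
    # counting documents per country, assuming AggregationType.COUNT renders
    # as the AQL function name COUNT. Generated AQL, roughly:
    #
    #     FOR doc IN users
    #         COLLECT value = doc['country'] INTO g
    #         RETURN { 'country' : value, '_value': COUNT(g)}
    #
    #     rows = conn.aggregate("users", AggregationType.COUNT, discriminant="country")
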
    def keep_absent_documents(
        self,
        batch,
        class_name,
        match_keys,
        keep_keys,
        filters: None | Clause | list | dict = None,
    ):
        """Keep documents that don't exist in the database.

        Args:
            batch: Batch of documents to check
            class_name: Collection to check in
            match_keys: Keys to match documents
            keep_keys: Keys to keep in result
            filters: Additional query filters

        Returns:
            list: Documents that don't exist in the database
        """
        present_docs_keys = self.fetch_present_documents(
            batch=batch,
            class_name=class_name,
            match_keys=match_keys,
            keep_keys=keep_keys,
            flatten=False,
            filters=filters,
        )

        assert isinstance(present_docs_keys, dict)

        if any([len(v) > 1 for v in present_docs_keys.values()]):
            logger.warning(
                "fetch_present_documents returned multiple docs per filtering condition"
            )

        absent_indices = sorted(set(range(len(batch))) - set(present_docs_keys.keys()))
        batch_absent = [batch[j] for j in absent_indices]
        return batch_absent

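    # Illustrative sketch (editor's example, not part of the package): the
    # insert-if-missing pattern this method enables.
    #
    #     new_docs = conn.keep_absent_documents(
    #         batch=docs, class_name="users", match_keys=["email"], keep_keys=["_key"]
    #     )
    #     conn.upsert_docs_batch(new_docs, "users", match_keys=["email"])
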
    def update_to_numeric(self, collection_name, field):
        """Update a field to numeric type in all documents.

        Builds (but does not execute) the AQL update statement.

        Args:
            collection_name: Collection to update
            field: Field to convert to numeric

        Returns:
            str: AQL query string for the operation
        """
        s1 = f"FOR p IN {collection_name} FILTER p.{field} UPDATE p WITH {{"
        s2 = f"{field}: TO_NUMBER(p.{field}) "
        s3 = f"}} IN {collection_name}"
        q0 = s1 + s2 + s3
        return q0