graflo-1.3.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of graflo might be problematic.
Files changed (70)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +70 -0
  3. graflo/architecture/__init__.py +38 -0
  4. graflo/architecture/actor.py +1276 -0
  5. graflo/architecture/actor_util.py +450 -0
  6. graflo/architecture/edge.py +418 -0
  7. graflo/architecture/onto.py +376 -0
  8. graflo/architecture/onto_sql.py +54 -0
  9. graflo/architecture/resource.py +163 -0
  10. graflo/architecture/schema.py +135 -0
  11. graflo/architecture/transform.py +292 -0
  12. graflo/architecture/util.py +89 -0
  13. graflo/architecture/vertex.py +562 -0
  14. graflo/caster.py +736 -0
  15. graflo/cli/__init__.py +14 -0
  16. graflo/cli/ingest.py +203 -0
  17. graflo/cli/manage_dbs.py +197 -0
  18. graflo/cli/plot_schema.py +132 -0
  19. graflo/cli/xml2json.py +93 -0
  20. graflo/data_source/__init__.py +48 -0
  21. graflo/data_source/api.py +339 -0
  22. graflo/data_source/base.py +95 -0
  23. graflo/data_source/factory.py +304 -0
  24. graflo/data_source/file.py +148 -0
  25. graflo/data_source/memory.py +70 -0
  26. graflo/data_source/registry.py +82 -0
  27. graflo/data_source/sql.py +183 -0
  28. graflo/db/__init__.py +44 -0
  29. graflo/db/arango/__init__.py +22 -0
  30. graflo/db/arango/conn.py +1025 -0
  31. graflo/db/arango/query.py +180 -0
  32. graflo/db/arango/util.py +88 -0
  33. graflo/db/conn.py +377 -0
  34. graflo/db/connection/__init__.py +6 -0
  35. graflo/db/connection/config_mapping.py +18 -0
  36. graflo/db/connection/onto.py +717 -0
  37. graflo/db/connection/wsgi.py +29 -0
  38. graflo/db/manager.py +119 -0
  39. graflo/db/neo4j/__init__.py +16 -0
  40. graflo/db/neo4j/conn.py +639 -0
  41. graflo/db/postgres/__init__.py +37 -0
  42. graflo/db/postgres/conn.py +948 -0
  43. graflo/db/postgres/fuzzy_matcher.py +281 -0
  44. graflo/db/postgres/heuristics.py +133 -0
  45. graflo/db/postgres/inference_utils.py +428 -0
  46. graflo/db/postgres/resource_mapping.py +273 -0
  47. graflo/db/postgres/schema_inference.py +372 -0
  48. graflo/db/postgres/types.py +148 -0
  49. graflo/db/postgres/util.py +87 -0
  50. graflo/db/tigergraph/__init__.py +9 -0
  51. graflo/db/tigergraph/conn.py +2365 -0
  52. graflo/db/tigergraph/onto.py +26 -0
  53. graflo/db/util.py +49 -0
  54. graflo/filter/__init__.py +21 -0
  55. graflo/filter/onto.py +525 -0
  56. graflo/logging.conf +22 -0
  57. graflo/onto.py +312 -0
  58. graflo/plot/__init__.py +17 -0
  59. graflo/plot/plotter.py +616 -0
  60. graflo/util/__init__.py +23 -0
  61. graflo/util/chunker.py +807 -0
  62. graflo/util/merge.py +150 -0
  63. graflo/util/misc.py +37 -0
  64. graflo/util/onto.py +422 -0
  65. graflo/util/transform.py +454 -0
  66. graflo-1.3.7.dist-info/METADATA +243 -0
  67. graflo-1.3.7.dist-info/RECORD +70 -0
  68. graflo-1.3.7.dist-info/WHEEL +4 -0
  69. graflo-1.3.7.dist-info/entry_points.txt +5 -0
  70. graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
graflo/db/arango/conn.py
@@ -0,0 +1,1025 @@
"""ArangoDB connection implementation for graph database operations.

This module implements the Connection interface for ArangoDB, providing
specific functionality for graph operations in ArangoDB. It handles:
- Graph and collection management
- Document and edge operations
- Index creation and management
- AQL query execution
- Batch operations with upsert support

Key Features:
- Graph-based document organization
- Edge collection management
- Persistent, hash, skiplist, and fulltext indices
- Batch document and edge operations
- AQL query generation and execution

Example:
    >>> conn = ArangoConnection(config)
    >>> conn.init_db(schema, clean_start=True)
    >>> conn.upsert_docs_batch(docs, "users", match_keys=["email"])
"""

import json
import logging

from arango import ArangoClient

from graflo.architecture.edge import Edge
from graflo.architecture.onto import (
    Index,
    IndexType,
)
from graflo.architecture.schema import Schema
from graflo.architecture.vertex import VertexConfig
from graflo.db.arango.query import fetch_fields_query
from graflo.db.arango.util import render_filters
from graflo.db.conn import Connection
from graflo.db.util import get_data_from_cursor
from graflo.filter.onto import Clause
from graflo.onto import AggregationType, DBFlavor
from graflo.util.transform import pick_unique_dict

from ..connection.onto import ArangoConfig

logger = logging.getLogger(__name__)


def _json_serializer(obj):
    """JSON serializer for objects not serializable by default json code.

    Handles datetime, date, time, and other non-serializable types.
    Decimal should already be converted to float at the data source level.

    Args:
        obj: Object to serialize

    Returns:
        JSON-serializable representation
    """
    from datetime import date, datetime, time

    if isinstance(obj, (datetime, date, time)):
        return obj.isoformat()
    # Decimal should be converted to float at source (SQLDataSource)
    # But handle it here as a fallback
    from decimal import Decimal

    if isinstance(obj, Decimal):
        return float(obj)
    raise TypeError(f"Type {type(obj)} not serializable")

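# Editor's note - illustrative sketch, not part of the packaged module:
# _json_serializer is designed to be plugged into json.dumps via the
# `default` hook, which the encoder calls only for values it cannot
# serialize itself:
#
#     >>> import json
#     >>> from datetime import datetime
#     >>> from decimal import Decimal
#     >>> json.dumps(
#     ...     {"ts": datetime(2024, 1, 1), "amount": Decimal("9.99")},
#     ...     default=_json_serializer,
#     ... )
#     '{"ts": "2024-01-01T00:00:00", "amount": 9.99}'
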
class ArangoConnection(Connection):
    """ArangoDB-specific implementation of the Connection interface.

    This class provides ArangoDB-specific implementations for all database
    operations, including graph management, document operations, and query
    execution. It uses the ArangoDB Python driver for all operations.

    Attributes:
        conn: ArangoDB database connection instance
    """

    def __init__(self, config: ArangoConfig):
        """Initialize ArangoDB connection.

        Args:
            config: ArangoDB connection configuration containing URL, credentials,
                and database name
        """
        super().__init__()
        # Store config for later use
        self.config = config
        # Validate required config values
        if config.url is None:
            raise ValueError("ArangoDB connection requires a URL to be configured")
        if config.database is None:
            raise ValueError(
                "ArangoDB connection requires a database name to be configured"
            )

        # ArangoDB accepts an empty string for the password if None
        password = config.password if config.password is not None else ""
        # ArangoDB defaults to username "root" if None
        username = config.username if config.username is not None else "root"

        # Store client for system operations
        self.client = ArangoClient(
            hosts=config.url, request_timeout=config.request_timeout
        )
        # Connect to the configured database for regular operations
        self.conn = self.client.db(
            config.database,
            username=username,
            password=password,
        )
        # Store credentials for system operations
        self._username = username
        self._password = password

    def create_database(self, name: str):
        """Create a new ArangoDB database.

        Database creation/deletion operations must be performed from the
        _system database.

        Args:
            name: Name of the database to create
        """
        try:
            # Connect to _system database for system operations
            system_db = self.client.db(
                "_system", username=self._username, password=self._password
            )
            if not system_db.has_database(name):
                try:
                    system_db.create_database(name)
                    logger.info(f"Successfully created ArangoDB database '{name}'")
                except Exception as create_error:
                    logger.error(
                        f"Failed to create ArangoDB database '{name}': {create_error}",
                        exc_info=True,
                    )
                    raise
            else:
                logger.debug(f"ArangoDB database '{name}' already exists")
        except Exception as e:
            logger.error(
                f"Error creating ArangoDB database '{name}': {e}",
                exc_info=True,
            )
            raise

    def delete_database(self, name: str):
        """Delete an ArangoDB database.

        Database creation/deletion operations must be performed from the
        _system database.

        Args:
            name: Name of the database to delete
        """
        try:
            # Connect to _system database for system operations
            system_db = self.client.db(
                "_system", username=self._username, password=self._password
            )
            if system_db.has_database(name):
                try:
                    system_db.delete_database(name)
                    logger.info(f"Successfully deleted ArangoDB database '{name}'")
                except Exception as delete_error:
                    logger.error(
                        f"Failed to delete ArangoDB database '{name}': {delete_error}",
                        exc_info=True,
                    )
                    raise
            else:
                logger.debug(
                    f"ArangoDB database '{name}' does not exist, skipping deletion"
                )
        except Exception as e:
            logger.error(
                f"Error deleting ArangoDB database '{name}': {e}",
                exc_info=True,
            )
            raise

    def execute(self, query, **kwargs):
        """Execute an AQL query.

        Args:
            query: AQL query string to execute
            **kwargs: Additional query parameters (e.g. bind_vars),
                forwarded to the driver

        Returns:
            Cursor: ArangoDB cursor for the query results
        """
        # Forward kwargs (e.g. bind_vars, batch_size) to the driver
        cursor = self.conn.aql.execute(query, **kwargs)
        return cursor

    def close(self):
        """Close the ArangoDB connection."""
        # No-op: python-arango database handles do not expose a close()
        # method; HTTP sessions are managed by the client
        pass

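    # Editor's note - illustrative sketch, not part of the packaged module:
    # with kwargs forwarded as above, python-arango keyword arguments such
    # as bind_vars can be passed straight through, e.g.:
    #
    #     >>> cursor = conn.execute(
    #     ...     "FOR d IN @@col FILTER d.age >= @min RETURN d",
    #     ...     bind_vars={"@col": "users", "min": 21},
    #     ... )
    #     >>> docs = list(cursor)
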
    def init_db(self, schema: Schema, clean_start):
        """Initialize ArangoDB with the given schema.

        Checks if the database exists and creates it if it doesn't.
        Uses schema.general.name if database is not set in config.

        Args:
            schema: Schema containing graph structure definitions
            clean_start: If True, delete all existing collections before initialization
        """
        # Determine database name: use config.database if set, otherwise use schema.general.name
        db_name = self.config.database
        if not db_name:
            db_name = schema.general.name
            # Update config for subsequent operations
            self.config.database = db_name

        # Check if the database exists and create it if it doesn't,
        # working through the _system database
        try:
            system_db = self.client.db(
                "_system", username=self._username, password=self._password
            )
            if not system_db.has_database(db_name):
                logger.info(f"Database '{db_name}' does not exist, creating it...")
                try:
                    system_db.create_database(db_name)
                    logger.info(f"Successfully created database '{db_name}'")
                except Exception as create_error:
                    logger.error(
                        f"Failed to create database '{db_name}': {create_error}",
                        exc_info=True,
                    )
                    raise

            # Reconnect to the target database (newly created or existing)
            if (
                self.config.database != db_name
                or not hasattr(self, "_db_connected")
                or self._db_connected != db_name
            ):
                try:
                    self.conn = self.client.db(
                        db_name, username=self._username, password=self._password
                    )
                    self._db_connected = db_name
                    logger.debug(f"Connected to database '{db_name}'")
                except Exception as conn_error:
                    logger.error(
                        f"Failed to connect to database '{db_name}': {conn_error}",
                        exc_info=True,
                    )
                    raise
        except Exception as e:
            logger.error(
                f"Error during database initialization for '{db_name}': {e}",
                exc_info=True,
            )
            raise

        try:
            if clean_start:
                try:
                    self.delete_graph_structure([], [], delete_all=True)
                    logger.debug(f"Cleaned database '{db_name}' for fresh start")
                except Exception as clean_error:
                    logger.warning(
                        f"Error during clean_start for database '{db_name}': {clean_error}",
                        exc_info=True,
                    )
                    # Continue - may be first run or already clean

            try:
                self.define_schema(schema)
                logger.debug(f"Defined schema for database '{db_name}'")
            except Exception as schema_error:
                logger.error(
                    f"Failed to define schema for database '{db_name}': {schema_error}",
                    exc_info=True,
                )
                raise

            try:
                self.define_indexes(schema)
                logger.debug(f"Defined indexes for database '{db_name}'")
            except Exception as index_error:
                logger.error(
                    f"Failed to define indexes for database '{db_name}': {index_error}",
                    exc_info=True,
                )
                raise
        except Exception as e:
            logger.error(
                f"Error during database schema initialization for '{db_name}': {e}",
                exc_info=True,
            )
            raise

    def define_schema(self, schema: Schema):
        """Define ArangoDB collections based on schema.

        Args:
            schema: Schema containing collection definitions
        """
        self.define_vertex_collections(schema)
        self.define_edge_collections(schema.edge_config.edges_list(include_aux=True))

    def define_vertex_collections(self, schema: Schema):
        """Define vertex collections in ArangoDB.

        Creates vertex collections for both connected and disconnected vertices,
        organizing them into appropriate graphs.

        Args:
            schema: Schema containing vertex definitions
        """
        vertex_config = schema.vertex_config
        disconnected_vertex_collections = (
            set(vertex_config.vertex_set) - schema.edge_config.vertices
        )
        for item in schema.edge_config.edges_list():
            u, v = item.source, item.target
            gname = item.graph_name
            if not gname:
                logger.warning(
                    f"Edge {item.source} -> {item.target} has no graph_name, skipping"
                )
                continue
            logger.info(f"{item.source}, {item.target}, {gname}")
            if self.conn.has_graph(gname):
                g = self.conn.graph(gname)
            else:
                g = self.conn.create_graph(gname)  # type: ignore

            _ = self.create_collection(
                vertex_config.vertex_dbname(u), vertex_config.index(u), g
            )

            _ = self.create_collection(
                vertex_config.vertex_dbname(v), vertex_config.index(v), g
            )
        for v in disconnected_vertex_collections:
            _ = self.create_collection(
                vertex_config.vertex_dbname(v), vertex_config.index(v), None
            )

    def define_edge_collections(self, edges: list[Edge]):
        """Define edge collections in ArangoDB.

        Creates edge collections and their definitions in the appropriate graphs.

        Args:
            edges: List of edge configurations to create
        """
        for item in edges:
            gname = item.graph_name
            if not gname:
                logger.warning("Edge has no graph_name, skipping")
                continue
            if self.conn.has_graph(gname):
                g = self.conn.graph(gname)
            else:
                g = self.conn.create_graph(gname)  # type: ignore
            collection_name = item.database_name
            if not collection_name:
                logger.warning("Edge has no database_name, skipping")
                continue
            if not g.has_edge_definition(collection_name):
                _ = g.create_edge_definition(
                    edge_collection=collection_name,
                    from_vertex_collections=[item._source],
                    to_vertex_collections=[item._target],
                )

    def _add_index(self, general_collection, index: Index):
        """Add an index to an ArangoDB collection.

        Supports persistent, hash, skiplist, and fulltext indices.

        Args:
            general_collection: ArangoDB collection to add index to
            index: Index configuration to create

        Returns:
            IndexHandle: Handle to the created index
        """
        data = index.db_form(DBFlavor.ARANGO)
        if index.type == IndexType.PERSISTENT:
            ih = general_collection.add_index(data)
        elif index.type == IndexType.HASH:
            ih = general_collection.add_index(data)
        elif index.type == IndexType.SKIPLIST:
            ih = general_collection.add_skiplist_index(
                fields=index.fields, unique=index.unique
            )
        elif index.type == IndexType.FULLTEXT:
            ih = general_collection.add_index(
                data={"fields": index.fields, "type": "fulltext"}
            )
        else:
            ih = None
        return ih

    def define_vertex_indices(self, vertex_config: VertexConfig):
        """Define indices for vertex collections.

        Creates indices for each vertex collection based on the configuration.

        Args:
            vertex_config: Vertex configuration containing index definitions
        """
        for c in vertex_config.vertex_set:
            general_collection = self.conn.collection(vertex_config.vertex_dbname(c))
            ixs = general_collection.indexes()
            field_combinations = [tuple(ix["fields"]) for ix in ixs]
            for index_obj in vertex_config.indexes(c):
                if tuple(index_obj.fields) not in field_combinations:
                    self._add_index(general_collection, index_obj)

    def define_edge_indices(self, edges: list[Edge]):
        """Define indices for edge collections.

        Creates indices for each edge collection based on the configuration.

        Args:
            edges: List of edge configurations containing index definitions
        """
        for edge in edges:
            collection_name = edge.database_name
            if not collection_name:
                logger.warning("Edge has no database_name, skipping index creation")
                continue
            general_collection = self.conn.collection(collection_name)
            for index_obj in edge.indexes:
                self._add_index(general_collection, index_obj)

    def fetch_indexes(self, db_class_name: str | None = None):
        """Fetch all indices from the database.

        Args:
            db_class_name: Optional collection name to fetch indices for

        Returns:
            dict: Mapping of collection names to their indices
        """
        if db_class_name is None:
            classes = self.conn.collections()
        elif self.conn.has_collection(db_class_name):
            # Use the same dict shape as conn.collections() entries so the
            # loop below can treat both cases uniformly
            classes = [{"name": db_class_name}]
        else:
            classes = []

        r = {}
        for cname in classes:
            assert isinstance(cname["name"], str)
            c = self.conn.collection(cname["name"])
            r[cname["name"]] = c.indexes()
        return r

    def create_collection(self, db_class_name, index: None | Index = None, g=None):
        """Create a new ArangoDB collection.

        Args:
            db_class_name: Name of the collection to create
            index: Optional index to create on the collection
            g: Optional graph to create the collection in

        Returns:
            IndexHandle: Handle to the created index if one was created
        """
        if not self.conn.has_collection(db_class_name):
            if g is not None:
                _ = g.create_vertex_collection(db_class_name)
            else:
                self.conn.create_collection(db_class_name)
            general_collection = self.conn.collection(db_class_name)
            if index is not None and index.fields != ["_key"]:
                ih = self._add_index(general_collection, index)
                return ih
            else:
                return None

    def delete_graph_structure(self, vertex_types=(), graph_names=(), delete_all=False):
        """Delete graph structure (collections and graphs) from ArangoDB.

        In ArangoDB:
        - Collections: Container for vertices (vertex collections) and edges (edge collections)
        - Graphs: Named graphs that connect vertex and edge collections

        Args:
            vertex_types: Collection names to delete (vertex or edge collections)
            graph_names: Graph names to delete
            delete_all: If True, delete all non-system collections and graphs
        """
        cnames = vertex_types
        gnames = graph_names
        logger.info("collections (non system):")
        logger.info([c for c in self.conn.collections() if c["name"][0] != "_"])

        if delete_all:
            cnames = [c["name"] for c in self.conn.collections() if c["name"][0] != "_"]
            gnames = [g["name"] for g in self.conn.graphs()]

        for gn in gnames:
            if self.conn.has_graph(gn):
                self.conn.delete_graph(gn)

        logger.info("graphs (after delete operation):")
        logger.info(self.conn.graphs())

        for cn in cnames:
            if self.conn.has_collection(cn):
                self.conn.delete_collection(cn)

        logger.info("collections (after delete operation):")
        logger.info([c for c in self.conn.collections() if c["name"][0] != "_"])

        logger.info("graphs:")
        logger.info(self.conn.graphs())

    def get_collections(self):
        """Get all collections in the database.

        Returns:
            list: List of collection information dictionaries
        """
        return self.conn.collections()

    def upsert_docs_batch(
        self,
        docs,
        class_name,
        match_keys: list[str] | None = None,
        **kwargs,
    ):
        """Upsert a batch of documents using AQL.

        Performs an upsert operation on a batch of documents, using the specified
        match keys to determine whether to update existing documents or insert new ones.

        Args:
            docs: List of documents to upsert
            class_name: Collection name to upsert into
            match_keys: Keys to match for upsert operation
            **kwargs: Additional options:
                - dry: If True, don't execute the query
                - update_keys: Keys to update on match
                - filter_uniques: If True, filter duplicate documents
        """
        dry = kwargs.pop("dry", False)
        update_keys = kwargs.pop("update_keys", None)
        filter_uniques = kwargs.pop("filter_uniques", True)

        if isinstance(docs, list):
            if filter_uniques:
                docs = pick_unique_dict(docs)
            docs = json.dumps(docs, default=_json_serializer)
        if match_keys is None:
            upsert_clause = ""
            update_clause = ""
        else:
            upsert_clause = ", ".join([f'"{k}": doc.{k}' for k in match_keys])
            upsert_clause = f"UPSERT {{{upsert_clause}}}"

            if isinstance(update_keys, list):
                update_clause = ", ".join([f'"{k}": doc.{k}' for k in update_keys])
                update_clause = f"{{{update_clause}}}"
            elif update_keys == "doc":
                update_clause = "doc"
            else:
                update_clause = "{}"
            update_clause = f"UPDATE {update_clause}"

        options = "OPTIONS {exclusive: true, ignoreErrors: true}"

        q_update = f"""FOR doc in {docs}
            {upsert_clause}
            INSERT doc
            {update_clause}
            IN {class_name} {options}"""
        if not dry:
            self.execute(q_update)

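    # Editor's note - illustrative sketch, not part of the packaged module:
    # for a call like upsert_docs_batch(docs, "users", match_keys=["email"],
    # update_keys="doc"), the generated AQL has roughly this shape:
    #
    #     FOR doc in [{"email": "a@example.com", "name": "Ann"}]
    #         UPSERT {"email": doc.email}
    #         INSERT doc
    #         UPDATE doc
    #         IN users OPTIONS {exclusive: true, ignoreErrors: true}
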
    def insert_edges_batch(
        self,
        docs_edges,
        source_class,
        target_class,
        relation_name=None,
        collection_name=None,
        match_keys_source=("_key",),
        match_keys_target=("_key",),
        filter_uniques=True,
        uniq_weight_fields=None,
        uniq_weight_collections=None,
        upsert_option=False,
        head=None,
        **kwargs,
    ):
        """Insert a batch of edges using AQL.

        Creates edges between source and target vertices, with support for
        weight fields and unique constraints.

        Args:
            docs_edges: List of edge documents in format [{_source_aux: source_doc, _target_aux: target_doc}]
            source_class: Source vertex collection name
            target_class: Target vertex collection name
            relation_name: Optional relation name for the edges
            collection_name: Edge collection name
            match_keys_source: Keys to match source vertices
            match_keys_target: Keys to match target vertices
            filter_uniques: If True, filter duplicate edges
            uniq_weight_fields: Fields to consider for uniqueness
            uniq_weight_collections: Collections to consider for uniqueness
            upsert_option: If True, use upsert instead of insert
            head: Optional limit on number of edges to insert
            **kwargs: Additional options:
                - dry: If True, don't execute the query
        """
        dry = kwargs.pop("dry", False)

        if isinstance(docs_edges, list):
            if docs_edges:
                logger.debug(f" docs_edges[0] = {docs_edges[0]}")
            if head is not None:
                docs_edges = docs_edges[:head]
            if filter_uniques:
                docs_edges = pick_unique_dict(docs_edges)
            docs_edges_str = json.dumps(docs_edges)
        else:
            return ""

        if match_keys_source[0] == "_key":
            result_from = f'CONCAT("{source_class}/", edge[0]._key)'
            source_filter = ""
        else:
            result_from = "sources[0]._id"
            filter_source = " && ".join(
                [f"v.{k} == edge[0].{k}" for k in match_keys_source]
            )
            source_filter = (
                f"LET sources = (FOR v IN {source_class} FILTER"
                f" {filter_source} LIMIT 1 RETURN v)"
            )

        if match_keys_target[0] == "_key":
            result_to = f'CONCAT("{target_class}/", edge[1]._key)'
            target_filter = ""
        else:
            result_to = "targets[0]._id"
            filter_target = " && ".join(
                [f"v.{k} == edge[1].{k}" for k in match_keys_target]
            )
            target_filter = (
                f"LET targets = (FOR v IN {target_class} FILTER"
                f" {filter_target} LIMIT 1 RETURN v)"
            )

        doc_definition = f"MERGE({{_from : {result_from}, _to : {result_to}}}, edge[2])"

        logger.debug(f" source_filter = {source_filter}")
        logger.debug(f" target_filter = {target_filter}")
        logger.debug(f" doc = {doc_definition}")

        if upsert_option:
            ups_from = result_from if source_filter else "doc._from"
            ups_to = result_to if target_filter else "doc._to"

            weight_fs = []
            if uniq_weight_fields is not None:
                weight_fs += uniq_weight_fields
            if uniq_weight_collections is not None:
                weight_fs += uniq_weight_collections
            if relation_name is not None:
                weight_fs += ["relation"]

            if weight_fs:
                weights_clause = ", " + ", ".join(
                    [f"'{x}' : edge.{x}" for x in weight_fs]
                )
            else:
                weights_clause = ""

            upsert = f"{{'_from': {ups_from}, '_to': {ups_to}" + weights_clause + "}"
            logger.debug(f" upsert clause: {upsert}")
            clauses = f"UPSERT {upsert} INSERT doc UPDATE {{}}"
            options = "OPTIONS {exclusive: true}"
        else:
            if relation_name is None:
                doc_clause = "doc"
            else:
                doc_clause = f"MERGE(doc, {{'relation': '{relation_name}' }})"
            clauses = f"INSERT {doc_clause}"
            options = "OPTIONS {exclusive: true, ignoreErrors: true}"

        q_update = f"""
        FOR edge in {docs_edges_str} {source_filter} {target_filter}
            LET doc = {doc_definition}
            {clauses}
            in {collection_name} {options}"""
        if not dry:
            self.execute(q_update)

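    # Editor's note - illustrative sketch, not part of the packaged module:
    # the query above indexes edge[0], edge[1] and edge[2], i.e. each entry
    # of docs_edges is a [source_doc, target_doc, edge_attrs] triple. With
    # both match_keys left at ("_key",), a hypothetical call such as
    #
    #     >>> conn.insert_edges_batch(
    #     ...     [[{"_key": "u1"}, {"_key": "p1"}, {"weight": 2}]],
    #     ...     source_class="users",
    #     ...     target_class="products",
    #     ...     collection_name="users_products",
    #     ... )
    #
    # renders _from as CONCAT("users/", edge[0]._key) and _to as
    # CONCAT("products/", edge[1]._key) before the INSERT.
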
    def insert_return_batch(self, docs, class_name):
        """Build an AQL query that inserts documents and returns their keys.

        Args:
            docs: Documents to insert
            class_name: Collection to insert into

        Returns:
            str: AQL query string for the operation
        """
        docs = json.dumps(docs)
        query0 = f"""FOR doc in {docs}
            INSERT doc
            INTO {class_name}
            LET inserted = NEW
            RETURN {{_key: inserted._key}}
        """
        return query0

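    # Editor's note - illustrative sketch, not part of the packaged module:
    # unlike upsert_docs_batch, this method only builds the query; running
    # it is left to the caller, e.g.:
    #
    #     >>> q = conn.insert_return_batch([{"name": "Ann"}], "users")
    #     >>> keys = list(conn.execute(q))  # [{"_key": "..."}]
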
    def fetch_present_documents(
        self,
        batch,
        class_name,
        match_keys,
        keep_keys,
        flatten=False,
        filters: None | Clause | list | dict = None,
    ) -> list | dict:
        """Fetch documents that exist in the database.

        Args:
            batch: Batch of documents to check
            class_name: Collection to check in
            match_keys: Keys to match documents
            keep_keys: Keys to keep in result
            flatten: If True, flatten the result into a list
            filters: Additional query filters

        Returns:
            list | dict: Documents that exist in the database, either as a
                flat list or a dictionary mapping batch indices to documents
        """
        q0 = fetch_fields_query(
            collection_name=class_name,
            docs=batch,
            match_keys=match_keys,
            keep_keys=keep_keys,
            filters=filters,
        )
        # Each cursor row has the shape {"__i": <batch index>, "_group": [doc, ...]}
        cursor = self.execute(q0)

        if flatten:
            rdata = []
            for item in get_data_from_cursor(cursor):
                group = item.pop("_group", [])
                rdata += [sub_item for sub_item in group]
            return rdata
        else:
            rdata_dict = {}
            for item in get_data_from_cursor(cursor):
                __i = item.pop("__i")
                group = item.pop("_group")
                rdata_dict[__i] = group
            return rdata_dict

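    # Editor's note - illustrative sketch, not part of the packaged module:
    # the two return shapes, assuming only batch[0] and batch[2] already
    # exist in the collection:
    #
    #     >>> conn.fetch_present_documents(batch, "users", ["email"], ["_key"])
    #     {0: [{'_key': '123'}], 2: [{'_key': '456'}]}
    #     >>> conn.fetch_present_documents(
    #     ...     batch, "users", ["email"], ["_key"], flatten=True
    #     ... )
    #     [{'_key': '123'}, {'_key': '456'}]
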
    def fetch_docs(
        self,
        class_name,
        filters: None | Clause | list | dict = None,
        limit: int | None = None,
        return_keys: list | None = None,
        unset_keys: list | None = None,
        **kwargs,
    ):
        """Fetch documents from a collection.

        Args:
            class_name: Collection to fetch from
            filters: Query filters
            limit: Maximum number of documents to return
            return_keys: Keys to return
            unset_keys: Keys to unset

        Returns:
            list: Fetched documents
        """
        filter_clause = render_filters(filters, doc_name="d")

        if return_keys is None:
            if unset_keys is None:
                return_clause = "d"
            else:
                tmp_clause = ", ".join([f'"{item}"' for item in unset_keys])
                return_clause = f"UNSET(d, {tmp_clause})"
        else:
            if unset_keys is None:
                tmp_clause = ", ".join([f'"{item}"' for item in return_keys])
                return_clause = f"KEEP(d, {tmp_clause})"
            else:
                raise ValueError("both return_keys and unset_keys are set")

        if limit is not None and isinstance(limit, int):
            limit_clause = f"LIMIT {limit}"
        else:
            limit_clause = ""

        q = (
            f"FOR d in {class_name}"
            f" {filter_clause}"
            f" {limit_clause}"
            f" RETURN {return_clause}"
        )
        cursor = self.execute(q)
        return get_data_from_cursor(cursor)

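    # Editor's note - illustrative sketch, not part of the packaged module,
    # assuming render_filters yields an empty clause for filters=None:
    # fetch_docs("users", limit=10, return_keys=["name"]) renders roughly
    #
    #     FOR d in users  LIMIT 10 RETURN KEEP(d, "name")
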
    # TODO test
    def fetch_edges(
        self,
        from_type: str,
        from_id: str,
        edge_type: str | None = None,
        to_type: str | None = None,
        to_id: str | None = None,
        filters: list | dict | Clause | None = None,
        limit: int | None = None,
        return_keys: list | None = None,
        unset_keys: list | None = None,
        **kwargs,
    ):
        """Fetch edges from ArangoDB using AQL.

        Args:
            from_type: Source vertex collection name
            from_id: Source vertex ID (can be _key or _id)
            edge_type: Optional edge collection name to filter by
            to_type: Optional target vertex collection name to filter by
            to_id: Optional target vertex ID to filter by
            filters: Additional query filters
            limit: Maximum number of edges to return
            return_keys: Keys to return (projection)
            unset_keys: Keys to exclude (projection)
            **kwargs: Additional parameters

        Returns:
            list: List of fetched edges
        """
        # Convert from_id to _id format if needed
        if not from_id.startswith(from_type):
            # Assume it's a _key, convert to _id
            from_vertex_id = f"{from_type}/{from_id}"
        else:
            from_vertex_id = from_id

        # Build AQL query to fetch edges
        # Start with basic edge traversal
        if edge_type:
            edge_collection = edge_type
        else:
            # If no edge_type specified, we need to search all edge collections
            # This is a simplified version - in practice you might want to list all edge collections
            raise ValueError("edge_type is required for ArangoDB edge fetching")

        filter_clause = render_filters(filters, doc_name="e")
        filter_parts = []

        if to_type:
            filter_parts.append(f"e._to LIKE '{to_type}/%'")
        if to_id and to_type:
            if not to_id.startswith(to_type):
                to_vertex_id = f"{to_type}/{to_id}"
            else:
                to_vertex_id = to_id
            filter_parts.append(f"e._to == '{to_vertex_id}'")

        additional_filters = " && ".join(filter_parts)
        if filter_clause and additional_filters:
            filter_clause = f"{filter_clause} && {additional_filters}"
        elif additional_filters:
            filter_clause = additional_filters

        query = f"""
        FOR e IN {edge_collection}
            FILTER e._from == '{from_vertex_id}'
            {f"FILTER {filter_clause}" if filter_clause else ""}
            {f"LIMIT {limit}" if limit else ""}
            RETURN e
        """

        cursor = self.execute(query)
        result = list(get_data_from_cursor(cursor))

        # Apply projection
        if return_keys is not None:
            result = [
                {k: doc.get(k) for k in return_keys if k in doc} for doc in result
            ]
        elif unset_keys is not None:
            result = [
                {k: v for k, v in doc.items() if k not in unset_keys} for doc in result
            ]

        return result

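    # Editor's note - illustrative sketch, not part of the packaged module:
    # fetch_edges("users", "u1", edge_type="users_products",
    # to_type="products") builds roughly (again assuming no rendered
    # filters, and modulo whitespace):
    #
    #     FOR e IN users_products
    #         FILTER e._from == 'users/u1'
    #         FILTER e._to LIKE 'products/%'
    #         RETURN e
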
    def aggregate(
        self,
        class_name,
        aggregation_function: AggregationType,
        discriminant: str | None = None,
        aggregated_field: str | None = None,
        filters: None | Clause | list | dict = None,
    ):
        """Perform aggregation on a collection.

        Args:
            class_name: Collection to aggregate
            aggregation_function: Type of aggregation to perform
            discriminant: Field to group by
            aggregated_field: Field to aggregate
            filters: Query filters

        Returns:
            list: Aggregation results
        """
        filter_clause = render_filters(filters, doc_name="doc")

        if (
            aggregated_field is not None
            and aggregation_function != AggregationType.COUNT
        ):
            group_unit = f"g[*].doc.{aggregated_field}"
        else:
            group_unit = "g"

        if discriminant is not None:
            collect_clause = f"COLLECT value = doc['{discriminant}'] INTO g"
            return_clause = f"""{{ '{discriminant}' : value, '_value': {aggregation_function}({group_unit})}}"""
        else:
            if (
                aggregated_field is None
                and aggregation_function == AggregationType.COUNT
            ):
                collect_clause = (
                    f"COLLECT AGGREGATE value = {aggregation_function} (doc)"
                )
            else:
                collect_clause = (
                    "COLLECT AGGREGATE value ="
                    f" {aggregation_function}(doc['{aggregated_field}'])"
                )
            return_clause = """{ '_value' : value }"""

        q = f"""FOR doc IN {class_name}
            {filter_clause}
            {collect_clause}
            RETURN {return_clause}"""

        cursor = self.execute(q)
        data = get_data_from_cursor(cursor)
        return data

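    # Editor's note - illustrative sketch, not part of the packaged module,
    # assuming AggregationType.COUNT renders as "COUNT" in f-strings:
    # aggregate("users", AggregationType.COUNT, discriminant="country")
    # produces an AQL query of roughly this shape:
    #
    #     FOR doc IN users
    #         COLLECT value = doc['country'] INTO g
    #         RETURN { 'country' : value, '_value': COUNT(g)}
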
    def keep_absent_documents(
        self,
        batch,
        class_name,
        match_keys,
        keep_keys,
        filters: None | Clause | list | dict = None,
    ):
        """Keep documents that don't exist in the database.

        Args:
            batch: Batch of documents to check
            class_name: Collection to check in
            match_keys: Keys to match documents
            keep_keys: Keys to keep in result
            filters: Additional query filters

        Returns:
            list: Documents that don't exist in the database
        """
        present_docs_keys = self.fetch_present_documents(
            batch=batch,
            class_name=class_name,
            match_keys=match_keys,
            keep_keys=keep_keys,
            flatten=False,
            filters=filters,
        )

        assert isinstance(present_docs_keys, dict)

        if any([len(v) > 1 for v in present_docs_keys.values()]):
            logger.warning(
                "fetch_present_documents returned multiple docs per filtering condition"
            )

        absent_indices = sorted(set(range(len(batch))) - set(present_docs_keys.keys()))
        batch_absent = [batch[j] for j in absent_indices]
        return batch_absent

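    # Editor's note - illustrative sketch, not part of the packaged module:
    # a typical ingest pattern is to keep only unseen documents and then
    # write them in one batch:
    #
    #     >>> new_docs = conn.keep_absent_documents(
    #     ...     batch, "users", match_keys=["email"], keep_keys=["_key"]
    #     ... )
    #     >>> conn.upsert_docs_batch(new_docs, "users", match_keys=["email"])
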
    def update_to_numeric(self, collection_name, field):
        """Build an AQL query that converts a field to numeric type in all documents.

        Args:
            collection_name: Collection to update
            field: Field to convert to numeric

        Returns:
            str: AQL query string for the operation
        """
        s1 = f"FOR p IN {collection_name} FILTER p.{field} UPDATE p WITH {{"
        s2 = f"{field}: TO_NUMBER(p.{field}) "
        s3 = f"}} IN {collection_name}"
        q0 = s1 + s2 + s3
        return q0
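
    # Editor's note - illustrative sketch, not part of the packaged module:
    # like insert_return_batch, this only builds the query; executing it is
    # up to the caller:
    #
    #     >>> conn.execute(conn.update_to_numeric("users", "age"))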