graflo-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of graflo might be problematic.

Files changed (45)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +39 -0
  3. graflo/architecture/__init__.py +37 -0
  4. graflo/architecture/actor.py +974 -0
  5. graflo/architecture/actor_util.py +425 -0
  6. graflo/architecture/edge.py +295 -0
  7. graflo/architecture/onto.py +374 -0
  8. graflo/architecture/resource.py +161 -0
  9. graflo/architecture/schema.py +136 -0
  10. graflo/architecture/transform.py +292 -0
  11. graflo/architecture/util.py +93 -0
  12. graflo/architecture/vertex.py +277 -0
  13. graflo/caster.py +409 -0
  14. graflo/cli/__init__.py +14 -0
  15. graflo/cli/ingest.py +144 -0
  16. graflo/cli/manage_dbs.py +193 -0
  17. graflo/cli/plot_schema.py +132 -0
  18. graflo/cli/xml2json.py +93 -0
  19. graflo/db/__init__.py +32 -0
  20. graflo/db/arango/__init__.py +16 -0
  21. graflo/db/arango/conn.py +734 -0
  22. graflo/db/arango/query.py +180 -0
  23. graflo/db/arango/util.py +88 -0
  24. graflo/db/connection.py +304 -0
  25. graflo/db/manager.py +104 -0
  26. graflo/db/neo4j/__init__.py +16 -0
  27. graflo/db/neo4j/conn.py +432 -0
  28. graflo/db/util.py +49 -0
  29. graflo/filter/__init__.py +21 -0
  30. graflo/filter/onto.py +400 -0
  31. graflo/logging.conf +22 -0
  32. graflo/onto.py +186 -0
  33. graflo/plot/__init__.py +17 -0
  34. graflo/plot/plotter.py +556 -0
  35. graflo/util/__init__.py +23 -0
  36. graflo/util/chunker.py +739 -0
  37. graflo/util/merge.py +148 -0
  38. graflo/util/misc.py +37 -0
  39. graflo/util/onto.py +63 -0
  40. graflo/util/transform.py +406 -0
  41. graflo-1.1.0.dist-info/METADATA +157 -0
  42. graflo-1.1.0.dist-info/RECORD +45 -0
  43. graflo-1.1.0.dist-info/WHEEL +4 -0
  44. graflo-1.1.0.dist-info/entry_points.txt +5 -0
  45. graflo-1.1.0.dist-info/licenses/LICENSE +126 -0
graflo/db/arango/conn.py
@@ -0,0 +1,734 @@
+ """ArangoDB connection implementation for graph database operations.
+
+ This module implements the Connection interface for ArangoDB, providing
+ specific functionality for graph operations in ArangoDB. It handles:
+ - Graph and collection management
+ - Document and edge operations
+ - Index creation and management
+ - AQL query execution
+ - Batch operations with upsert support
+
+ Key Features:
+ - Graph-based document organization
+ - Edge collection management
+ - Persistent, hash, skiplist, and fulltext indices
+ - Batch document and edge operations
+ - AQL query generation and execution
+
+ Example:
+     >>> conn = ArangoConnection(config)
+     >>> conn.init_db(schema, clean_start=True)
+     >>> conn.upsert_docs_batch(docs, "users", match_keys=["email"])
+ """
+
+ import json
+ import logging
+ from typing import Optional
+
+ from arango import ArangoClient
+ from suthing import ArangoConnectionConfig
+
+ from graflo.architecture.edge import Edge
+ from graflo.architecture.onto import (
+     Index,
+     IndexType,
+ )
+ from graflo.architecture.schema import Schema
+ from graflo.architecture.vertex import VertexConfig
+ from graflo.db.arango.query import fetch_fields_query
+ from graflo.db.arango.util import render_filters
+ from graflo.db.connection import Connection
+ from graflo.db.util import get_data_from_cursor
+ from graflo.filter.onto import Clause
+ from graflo.onto import AggregationType, DBFlavor
+ from graflo.util.transform import pick_unique_dict
+
+ logger = logging.getLogger(__name__)
+
+
+ class ArangoConnection(Connection):
+     """ArangoDB-specific implementation of the Connection interface.
+
+     This class provides ArangoDB-specific implementations for all database
+     operations, including graph management, document operations, and query
+     execution. It uses the ArangoDB Python driver for all operations.
+
+     Attributes:
+         conn: ArangoDB database connection instance
+     """
+
+     def __init__(self, config: ArangoConnectionConfig):
+         """Initialize ArangoDB connection.
+
+         Args:
+             config: ArangoDB connection configuration containing URL, credentials,
+                 and database name
+         """
+         super().__init__()
+         client = ArangoClient(hosts=config.url, request_timeout=config.request_timeout)
+
+         self.conn = client.db(
+             config.database,
+             username=config.username,
+             password=config.password,
+         )
+
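A minimal connection sketch; the exact constructor signature of ArangoConnectionConfig lives in the suthing package, and the endpoint and credentials below are illustrative placeholders:

    from suthing import ArangoConnectionConfig
    from graflo.db.arango.conn import ArangoConnection

    config = ArangoConnectionConfig(
        url="http://localhost:8529",  # assumed local ArangoDB endpoint
        database="graflo_db",         # hypothetical database name
        username="root",
        password="secret",
        request_timeout=30,
    )
    conn = ArangoConnection(config)
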
+     def create_database(self, name: str):
+         """Create a new ArangoDB database.
+
+         Args:
+             name: Name of the database to create
+         """
+         if not self.conn.has_database(name):
+             self.conn.create_database(name)
+
+     def delete_database(self, name: str):
+         """Delete an ArangoDB database.
+
+         Args:
+             name: Name of the database to delete
+         """
+         # only drop databases that actually exist (the check was inverted)
+         if self.conn.has_database(name):
+             self.conn.delete_database(name)
+
+     def execute(self, query, **kwargs):
+         """Execute an AQL query.
+
+         Args:
+             query: AQL query string to execute
+             **kwargs: Additional query parameters, forwarded to the AQL executor
+
+         Returns:
+             Cursor: ArangoDB cursor for the query results
+         """
+         cursor = self.conn.aql.execute(query, **kwargs)
+         return cursor
+
+     def close(self):
+         """Close the ArangoDB connection."""
+         # the underlying python-arango database handle needs no explicit close
+         pass
+
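For instance, a raw AQL query can be run through this thin wrapper (collection name hypothetical); python-arango cursors are iterable:

    cursor = conn.execute("FOR d IN users LIMIT 3 RETURN d._key")
    keys = [k for k in cursor]
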
+     def init_db(self, schema: Schema, clean_start):
+         """Initialize ArangoDB with the given schema.
+
+         Args:
+             schema: Schema containing graph structure definitions
+             clean_start: If True, delete all existing collections before initialization
+         """
+         if clean_start:
+             self.delete_collections([], [], delete_all=True)
+         self.define_collections(schema)
+         self.define_indexes(schema)
+
+     def define_collections(self, schema: Schema):
+         """Define ArangoDB collections based on schema.
+
+         Args:
+             schema: Schema containing collection definitions
+         """
+         self.define_vertex_collections(schema)
+         self.define_edge_collections(schema.edge_config.edges_list(include_aux=True))
+
+     def define_vertex_collections(self, schema: Schema):
+         """Define vertex collections in ArangoDB.
+
+         Creates vertex collections for both connected and disconnected vertices,
+         organizing them into appropriate graphs.
+
+         Args:
+             schema: Schema containing vertex definitions
+         """
+         vertex_config = schema.vertex_config
+         disconnected_vertex_collections = (
+             set(vertex_config.vertex_set) - schema.edge_config.vertices
+         )
+         for item in schema.edge_config.edges_list():
+             u, v = item.source, item.target
+             gname = item.graph_name
+             logger.info(f"{item.source}, {item.target}, {gname}")
+             if self.conn.has_graph(gname):
+                 g = self.conn.graph(gname)
+             else:
+                 g = self.conn.create_graph(gname)  # type: ignore
+
+             _ = self.create_collection(
+                 vertex_config.vertex_dbname(u), vertex_config.index(u), g
+             )
+
+             _ = self.create_collection(
+                 vertex_config.vertex_dbname(v), vertex_config.index(v), g
+             )
+         for v in disconnected_vertex_collections:
+             _ = self.create_collection(
+                 vertex_config.vertex_dbname(v), vertex_config.index(v), None
+             )
+
+     def define_edge_collections(self, edges: list[Edge]):
+         """Define edge collections in ArangoDB.
+
+         Creates edge collections and their definitions in the appropriate graphs.
+
+         Args:
+             edges: List of edge configurations to create
+         """
+         for item in edges:
+             gname = item.graph_name
+             if self.conn.has_graph(gname):
+                 g = self.conn.graph(gname)
+             else:
+                 g = self.conn.create_graph(gname)  # type: ignore
+             if not g.has_edge_definition(item.collection_name):
+                 _ = g.create_edge_definition(
+                     edge_collection=item.collection_name,
+                     from_vertex_collections=[item._source_collection],
+                     to_vertex_collections=[item._target_collection],
+                 )
+
+     def _add_index(self, general_collection, index: Index):
+         """Add an index to an ArangoDB collection.
+
+         Supports persistent, hash, skiplist, and fulltext indices.
+
+         Args:
+             general_collection: ArangoDB collection to add index to
+             index: Index configuration to create
+
+         Returns:
+             IndexHandle: Handle to the created index
+         """
+         data = index.db_form(DBFlavor.ARANGO)
+         if index.type == IndexType.PERSISTENT:
+             ih = general_collection.add_index(data)
+         # elif, not if: a plain `if` let PERSISTENT fall through to the final
+         # else and reset ih to None
+         elif index.type == IndexType.HASH:
+             ih = general_collection.add_index(data)
+         elif index.type == IndexType.SKIPLIST:
+             ih = general_collection.add_skiplist_index(
+                 fields=index.fields, unique=index.unique
+             )
+         elif index.type == IndexType.FULLTEXT:
+             ih = general_collection.add_index(
+                 data={"fields": index.fields, "type": "fulltext"}
+             )
+         else:
+             ih = None
+         return ih
+
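A sketch of registering an extra index by hand, assuming Index takes keyword arguments matching the attributes read above (fields, unique, type); collection name hypothetical:

    from graflo.architecture.onto import Index, IndexType

    email_index = Index(fields=["email"], unique=True, type=IndexType.PERSISTENT)
    users = conn.conn.collection("users")  # underlying python-arango collection
    conn._add_index(users, email_index)
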
+     def define_vertex_indices(self, vertex_config: VertexConfig):
+         """Define indices for vertex collections.
+
+         Creates indices for each vertex collection based on the configuration.
+
+         Args:
+             vertex_config: Vertex configuration containing index definitions
+         """
+         for c in vertex_config.vertex_set:
+             general_collection = self.conn.collection(vertex_config.vertex_dbname(c))
+             ixs = general_collection.indexes()
+             field_combinations = [tuple(ix["fields"]) for ix in ixs]
+             for index_obj in vertex_config.indexes(c):
+                 if tuple(index_obj.fields) not in field_combinations:
+                     self._add_index(general_collection, index_obj)
+
+     def define_edge_indices(self, edges: list[Edge]):
+         """Define indices for edge collections.
+
+         Creates indices for each edge collection based on the configuration.
+
+         Args:
+             edges: List of edge configurations containing index definitions
+         """
+         for edge in edges:
+             general_collection = self.conn.collection(edge.collection_name)
+             for index_obj in edge.indexes:
+                 self._add_index(general_collection, index_obj)
+
+     def fetch_indexes(self, db_class_name: Optional[str] = None):
+         """Fetch all indices from the database.
+
+         Args:
+             db_class_name: Optional collection name to fetch indices for
+
+         Returns:
+             dict: Mapping of collection names to their indices
+         """
+         if db_class_name is None:
+             classes = self.conn.collections()
+         elif self.conn.has_collection(db_class_name):
+             # normalize to the dict shape returned by collections(); the loop
+             # below subscripts each entry with ["name"]
+             classes = [{"name": db_class_name}]
+         else:
+             classes = []
+
+         r = {}
+         for cname in classes:
+             assert isinstance(cname["name"], str)
+             c = self.conn.collection(cname["name"])
+             r[cname["name"]] = c.indexes()
+         return r
+
+     def create_collection(self, db_class_name, index: None | Index = None, g=None):
+         """Create a new ArangoDB collection.
+
+         Args:
+             db_class_name: Name of the collection to create
+             index: Optional index to create on the collection
+             g: Optional graph to create the collection in
+
+         Returns:
+             IndexHandle: Handle to the created index, or None if no index was created
+         """
+         if not self.conn.has_collection(db_class_name):
+             if g is not None:
+                 _ = g.create_vertex_collection(db_class_name)
+             else:
+                 self.conn.create_collection(db_class_name)
+             general_collection = self.conn.collection(db_class_name)
+             if index is not None and index.fields != ["_key"]:
+                 ih = self._add_index(general_collection, index)
+                 return ih
+             else:
+                 return None
+         return None
+
+     def delete_collections(self, cnames=(), gnames=(), delete_all=False):
+         """Delete collections and graphs from ArangoDB.
+
+         Args:
+             cnames: Collection names to delete
+             gnames: Graph names to delete
+             delete_all: If True, delete all non-system collections and graphs
+         """
+         logger.info("collections (non system):")
+         logger.info([c for c in self.conn.collections() if c["name"][0] != "_"])
+
+         if delete_all:
+             cnames = [c["name"] for c in self.conn.collections() if c["name"][0] != "_"]
+             gnames = [g["name"] for g in self.conn.graphs()]
+
+         for gn in gnames:
+             if self.conn.has_graph(gn):
+                 self.conn.delete_graph(gn)
+
+         logger.info("graphs (after delete operation):")
+         logger.info(self.conn.graphs())
+
+         for cn in cnames:
+             if self.conn.has_collection(cn):
+                 self.conn.delete_collection(cn)
+
+         logger.info("collections (after delete operation):")
+         logger.info([c for c in self.conn.collections() if c["name"][0] != "_"])
+
+         logger.info("graphs:")
+         logger.info(self.conn.graphs())
+
+     def get_collections(self):
+         """Get all collections in the database.
+
+         Returns:
+             list: List of collection information dictionaries
+         """
+         return self.conn.collections()
+
+     def upsert_docs_batch(
+         self,
+         docs,
+         class_name,
+         match_keys: list[str] | None = None,
+         **kwargs,
+     ):
+         """Upsert a batch of documents using AQL.
+
+         Performs an upsert operation on a batch of documents, using the specified
+         match keys to determine whether to update existing documents or insert new ones.
+
+         Args:
+             docs: List of documents to upsert
+             class_name: Collection name to upsert into
+             match_keys: Keys to match for upsert operation
+             **kwargs: Additional options:
+                 - dry: If True, don't execute the query
+                 - update_keys: Keys to update on match
+                 - filter_uniques: If True, filter duplicate documents
+         """
+         dry = kwargs.pop("dry", False)
+         update_keys = kwargs.pop("update_keys", None)
+         filter_uniques = kwargs.pop("filter_uniques", True)
+
+         if isinstance(docs, list):
+             if filter_uniques:
+                 docs = pick_unique_dict(docs)
+             docs = json.dumps(docs)
+         if match_keys is None:
+             upsert_clause = ""
+             update_clause = ""
+         else:
+             upsert_clause = ", ".join([f'"{k}": doc.{k}' for k in match_keys])
+             upsert_clause = f"UPSERT {{{upsert_clause}}}"
+
+             if isinstance(update_keys, list):
+                 update_clause = ", ".join([f'"{k}": doc.{k}' for k in update_keys])
+                 update_clause = f"{{{update_clause}}}"
+             elif update_keys == "doc":
+                 update_clause = "doc"
+             else:
+                 update_clause = "{}"
+             update_clause = f"UPDATE {update_clause}"
+
+         options = "OPTIONS {exclusive: true, ignoreErrors: true}"
+
+         q_update = f"""FOR doc in {docs}
+             {upsert_clause}
+             INSERT doc
+             {update_clause}
+             IN {class_name} {options}"""
+         if not dry:
+             self.execute(q_update)
+
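Taking the module-level example further, a two-document upsert matched on email expands into AQL of roughly this shape (collection and values illustrative):

    docs = [
        {"email": "a@example.com", "name": "Ann"},
        {"email": "b@example.com", "name": "Bob"},
    ]
    conn.upsert_docs_batch(docs, "users", match_keys=["email"], update_keys="doc")

    # Generated query, approximately:
    # FOR doc in [{"email": "a@example.com", ...}, {"email": "b@example.com", ...}]
    #     UPSERT {"email": doc.email}
    #     INSERT doc
    #     UPDATE doc
    #     IN users OPTIONS {exclusive: true, ignoreErrors: true}
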
+     def insert_edges_batch(
+         self,
+         docs_edges,
+         source_class,
+         target_class,
+         relation_name=None,
+         collection_name=None,
+         match_keys_source=("_key",),
+         match_keys_target=("_key",),
+         filter_uniques=True,
+         uniq_weight_fields=None,
+         uniq_weight_collections=None,
+         upsert_option=False,
+         head=None,
+         **kwargs,
+     ):
+         """Insert a batch of edges using AQL.
+
+         Creates edges between source and target vertices, with support for
+         weight fields and unique constraints.
+
+         Args:
+             docs_edges: List of edge entries, each a triple
+                 [source_doc, target_doc, edge_attrs]; the generated query reads
+                 them as edge[0], edge[1] and edge[2] respectively
+             source_class: Source vertex collection name
+             target_class: Target vertex collection name
+             relation_name: Optional relation name for the edges
+             collection_name: Edge collection name
+             match_keys_source: Keys to match source vertices
+             match_keys_target: Keys to match target vertices
+             filter_uniques: If True, filter duplicate edges
+             uniq_weight_fields: Fields to consider for uniqueness
+             uniq_weight_collections: Collections to consider for uniqueness
+             upsert_option: If True, use upsert instead of insert
+             head: Optional limit on number of edges to insert
+             **kwargs: Additional options:
+                 - dry: If True, don't execute the query
+         """
+         dry = kwargs.pop("dry", False)
+
+         if isinstance(docs_edges, list):
+             if docs_edges:
+                 logger.debug(f" docs_edges[0] = {docs_edges[0]}")
+             if head is not None:
+                 docs_edges = docs_edges[:head]
+             if filter_uniques:
+                 docs_edges = pick_unique_dict(docs_edges)
+             docs_edges_str = json.dumps(docs_edges)
+         else:
+             return ""
+
+         if match_keys_source[0] == "_key":
+             result_from = f'CONCAT("{source_class}/", edge[0]._key)'
+             source_filter = ""
+         else:
+             result_from = "sources[0]._id"
+             filter_source = " && ".join(
+                 [f"v.{k} == edge[0].{k}" for k in match_keys_source]
+             )
+             source_filter = (
+                 f"LET sources = (FOR v IN {source_class} FILTER"
+                 f" {filter_source} LIMIT 1 RETURN v)"
+             )
+
+         if match_keys_target[0] == "_key":
+             result_to = f'CONCAT("{target_class}/", edge[1]._key)'
+             target_filter = ""
+         else:
+             result_to = "targets[0]._id"
+             filter_target = " && ".join(
+                 [f"v.{k} == edge[1].{k}" for k in match_keys_target]
+             )
+             target_filter = (
+                 f"LET targets = (FOR v IN {target_class} FILTER"
+                 f" {filter_target} LIMIT 1 RETURN v)"
+             )
+
+         doc_definition = f"MERGE({{_from : {result_from}, _to : {result_to}}}, edge[2])"
+
+         logger.debug(f" source_filter = {source_filter}")
+         logger.debug(f" target_filter = {target_filter}")
+         logger.debug(f" doc = {doc_definition}")
+
+         if upsert_option:
+             ups_from = result_from if source_filter else "doc._from"
+             ups_to = result_to if target_filter else "doc._to"
+
+             weight_fs = []
+             if uniq_weight_fields is not None:
+                 weight_fs += uniq_weight_fields
+             if uniq_weight_collections is not None:
+                 weight_fs += uniq_weight_collections
+             if relation_name is not None:
+                 weight_fs += ["relation"]
+
+             if weight_fs:
+                 weights_clause = ", " + ", ".join(
+                     [f"'{x}' : edge.{x}" for x in weight_fs]
+                 )
+             else:
+                 weights_clause = ""
+
+             upsert = f"{{'_from': {ups_from}, '_to': {ups_to}" + weights_clause + "}"
+             logger.debug(f" upsert clause: {upsert}")
+             clauses = f"UPSERT {upsert} INSERT doc UPDATE {{}}"
+             options = "OPTIONS {exclusive: true}"
+         else:
+             if relation_name is None:
+                 doc_clause = "doc"
+             else:
+                 doc_clause = f"MERGE(doc, {{'relation': '{relation_name}' }})"
+             clauses = f"INSERT {doc_clause}"
+             options = "OPTIONS {exclusive: true, ignoreErrors: true}"
+
+         q_update = f"""
+             FOR edge in {docs_edges_str} {source_filter} {target_filter}
+             LET doc = {doc_definition}
+             {clauses}
+             in {collection_name} {options}"""
+         if not dry:
+             self.execute(q_update)
+
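The positional access above (edge[0], edge[1], edge[2]) implies each batch entry is a triple of source document, target document, and edge attributes; a sketch with hypothetical collections:

    docs_edges = [
        [{"_key": "u1"}, {"_key": "p9"}, {"weight": 0.7}],
        [{"_key": "u2"}, {"_key": "p3"}, {"weight": 0.2}],
    ]
    conn.insert_edges_batch(
        docs_edges,
        source_class="users",
        target_class="products",
        collection_name="users_products_edges",
    )
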
+     def insert_return_batch(self, docs, class_name):
+         """Build an AQL query that inserts documents and returns their keys.
+
+         Note that the query string is returned, not executed.
+
+         Args:
+             docs: Documents to insert
+             class_name: Collection to insert into
+
+         Returns:
+             str: AQL query string for the operation
+         """
+         docs = json.dumps(docs)
+         query0 = f"""FOR doc in {docs}
+             INSERT doc
+             INTO {class_name}
+             LET inserted = NEW
+             RETURN {{_key: inserted._key}}
+         """
+         return query0
+
+     def fetch_present_documents(
+         self,
+         batch,
+         class_name,
+         match_keys,
+         keep_keys,
+         flatten=False,
+         filters: None | Clause | list | dict = None,
+     ) -> list | dict:
+         """Fetch documents that exist in the database.
+
+         Args:
+             batch: Batch of documents to check
+             class_name: Collection to check in
+             match_keys: Keys to match documents
+             keep_keys: Keys to keep in result
+             flatten: If True, flatten the result into a list
+             filters: Additional query filters
+
+         Returns:
+             Union[list, dict]: Documents that exist in the database, either as a
+                 flat list or a dictionary mapping batch indices to documents
+         """
+         q0 = fetch_fields_query(
+             collection_name=class_name,
+             docs=batch,
+             match_keys=match_keys,
+             keep_keys=keep_keys,
+             filters=filters,
+         )
+         # each cursor item: {"__i": i, "_group": [doc]}
+         cursor = self.execute(q0)
+
+         if flatten:
+             rdata = []
+             for item in get_data_from_cursor(cursor):
+                 group = item.pop("_group", [])
+                 rdata += [sub_item for sub_item in group]
+             return rdata
+         else:
+             rdata_dict = {}
+             for item in get_data_from_cursor(cursor):
+                 __i = item.pop("__i")
+                 group = item.pop("_group")
+                 rdata_dict[__i] = group
+             return rdata_dict
+
+     def fetch_docs(
+         self,
+         class_name,
+         filters: None | Clause | list | dict = None,
+         limit: int | None = None,
+         return_keys: list | None = None,
+         unset_keys: list | None = None,
+     ):
+         """Fetch documents from a collection.
+
+         Args:
+             class_name: Collection to fetch from
+             filters: Query filters
+             limit: Maximum number of documents to return
+             return_keys: Keys to return (mutually exclusive with unset_keys)
+             unset_keys: Keys to unset (mutually exclusive with return_keys)
+
+         Returns:
+             list: Fetched documents
+         """
+         filter_clause = render_filters(filters, doc_name="d")
+
+         if return_keys is None:
+             if unset_keys is None:
+                 return_clause = "d"
+             else:
+                 tmp_clause = ", ".join([f'"{item}"' for item in unset_keys])
+                 return_clause = f"UNSET(d, {tmp_clause})"
+         else:
+             if unset_keys is None:
+                 tmp_clause = ", ".join([f'"{item}"' for item in return_keys])
+                 return_clause = f"KEEP(d, {tmp_clause})"
+             else:
+                 raise ValueError("both return_keys and unset_keys are set")
+
+         if limit is not None and isinstance(limit, int):
+             limit_clause = f"LIMIT {limit}"
+         else:
+             limit_clause = ""
+
+         q = (
+             f"FOR d in {class_name}"
+             f" {filter_clause}"
+             f" {limit_clause}"
+             f" RETURN {return_clause}"
+         )
+         cursor = self.execute(q)
+         return get_data_from_cursor(cursor)
+
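For example, fetching a capped, trimmed projection (collection and field names hypothetical):

    docs = conn.fetch_docs("users", limit=10, unset_keys=["_rev", "_id"])
    # roughly: FOR d in users  LIMIT 10 RETURN UNSET(d, "_rev", "_id")
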
+     def aggregate(
+         self,
+         class_name,
+         aggregation_function: AggregationType,
+         discriminant: str | None = None,
+         aggregated_field: str | None = None,
+         filters: None | Clause | list | dict = None,
+     ):
+         """Perform aggregation on a collection.
+
+         Args:
+             class_name: Collection to aggregate
+             aggregation_function: Type of aggregation to perform
+             discriminant: Field to group by
+             aggregated_field: Field to aggregate
+             filters: Query filters
+
+         Returns:
+             list: Aggregation results
+         """
+         filter_clause = render_filters(filters, doc_name="doc")
+
+         if (
+             aggregated_field is not None
+             and aggregation_function != AggregationType.COUNT
+         ):
+             group_unit = f"g[*].doc.{aggregated_field}"
+         else:
+             group_unit = "g"
+
+         if discriminant is not None:
+             collect_clause = f"COLLECT value = doc['{discriminant}'] INTO g"
+             return_clause = f"""{{ '{discriminant}' : value, '_value': {aggregation_function}({group_unit})}}"""
+         else:
+             if (
+                 aggregated_field is None
+                 and aggregation_function == AggregationType.COUNT
+             ):
+                 collect_clause = (
+                     f"COLLECT AGGREGATE value = {aggregation_function}(doc)"
+                 )
+             else:
+                 collect_clause = (
+                     "COLLECT AGGREGATE value ="
+                     f" {aggregation_function}(doc['{aggregated_field}'])"
+                 )
+             return_clause = """{ '_value' : value }"""
+
+         q = f"""FOR doc IN {class_name}
+             {filter_clause}
+             {collect_clause}
+             RETURN {return_clause}"""
+
+         cursor = self.execute(q)
+         data = get_data_from_cursor(cursor)
+         return data
+
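A grouped count, assuming AggregationType is a string-valued enum that interpolates as the AQL function name (field name hypothetical):

    from graflo.onto import AggregationType

    per_country = conn.aggregate(
        "users", AggregationType.COUNT, discriminant="country"
    )
    # roughly:
    # FOR doc IN users
    #     COLLECT value = doc['country'] INTO g
    #     RETURN { 'country' : value, '_value': COUNT(g)}
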
+     def keep_absent_documents(
+         self,
+         batch,
+         class_name,
+         match_keys,
+         keep_keys,
+         filters: None | Clause | list | dict = None,
+     ):
+         """Keep documents that don't exist in the database.
+
+         Args:
+             batch: Batch of documents to check
+             class_name: Collection to check in
+             match_keys: Keys to match documents
+             keep_keys: Keys to keep in result
+             filters: Additional query filters
+
+         Returns:
+             list: Documents that don't exist in the database
+         """
+         present_docs_keys = self.fetch_present_documents(
+             batch=batch,
+             class_name=class_name,
+             match_keys=match_keys,
+             keep_keys=keep_keys,
+             flatten=False,
+             filters=filters,
+         )
+
+         assert isinstance(present_docs_keys, dict)
+
+         if any(len(v) > 1 for v in present_docs_keys.values()):
+             logger.warning(
+                 "fetch_present_documents returned multiple docs per filtering condition"
+             )
+
+         absent_indices = sorted(set(range(len(batch))) - set(present_docs_keys.keys()))
+         batch_absent = [batch[j] for j in absent_indices]
+         return batch_absent
+
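This is the usual dedup step before an ingest; a sketch with hypothetical documents:

    batch = [{"email": "a@example.com"}, {"email": "new@example.com"}]
    fresh = conn.keep_absent_documents(
        batch, "users", match_keys=["email"], keep_keys=["email"]
    )
    # fresh retains only the entries whose email is not yet present in `users`
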
+     def update_to_numeric(self, collection_name, field):
+         """Build an AQL query that casts a field to a number in all documents.
+
+         Note that the query string is returned, not executed; documents where
+         the field is missing or falsy are skipped by the FILTER clause.
+
+         Args:
+             collection_name: Collection to update
+             field: Field to convert to numeric
+
+         Returns:
+             str: AQL query string for the operation
+         """
+         s1 = f"FOR p IN {collection_name} FILTER p.{field} UPDATE p WITH {{"
+         s2 = f"{field}: TO_NUMBER(p.{field}) "
+         s3 = f"}} IN {collection_name}"
+         q0 = s1 + s2 + s3
+         return q0
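
Since the query is returned rather than executed, a caller runs it explicitly (collection and field hypothetical):

    q = conn.update_to_numeric("products", "price")
    conn.execute(q)
    # q is roughly:
    # FOR p IN products FILTER p.price UPDATE p WITH {price: TO_NUMBER(p.price) } IN products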