mseep-txtai 9.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251) hide show
  1. mseep_txtai-9.1.1.dist-info/METADATA +262 -0
  2. mseep_txtai-9.1.1.dist-info/RECORD +251 -0
  3. mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
  4. mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
  5. mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
  6. txtai/__init__.py +16 -0
  7. txtai/agent/__init__.py +12 -0
  8. txtai/agent/base.py +54 -0
  9. txtai/agent/factory.py +39 -0
  10. txtai/agent/model.py +107 -0
  11. txtai/agent/placeholder.py +16 -0
  12. txtai/agent/tool/__init__.py +7 -0
  13. txtai/agent/tool/embeddings.py +69 -0
  14. txtai/agent/tool/factory.py +130 -0
  15. txtai/agent/tool/function.py +49 -0
  16. txtai/ann/__init__.py +7 -0
  17. txtai/ann/base.py +153 -0
  18. txtai/ann/dense/__init__.py +11 -0
  19. txtai/ann/dense/annoy.py +72 -0
  20. txtai/ann/dense/factory.py +76 -0
  21. txtai/ann/dense/faiss.py +233 -0
  22. txtai/ann/dense/hnsw.py +104 -0
  23. txtai/ann/dense/numpy.py +164 -0
  24. txtai/ann/dense/pgvector.py +323 -0
  25. txtai/ann/dense/sqlite.py +303 -0
  26. txtai/ann/dense/torch.py +38 -0
  27. txtai/ann/sparse/__init__.py +7 -0
  28. txtai/ann/sparse/factory.py +61 -0
  29. txtai/ann/sparse/ivfsparse.py +377 -0
  30. txtai/ann/sparse/pgsparse.py +56 -0
  31. txtai/api/__init__.py +18 -0
  32. txtai/api/application.py +134 -0
  33. txtai/api/authorization.py +53 -0
  34. txtai/api/base.py +159 -0
  35. txtai/api/cluster.py +295 -0
  36. txtai/api/extension.py +19 -0
  37. txtai/api/factory.py +40 -0
  38. txtai/api/responses/__init__.py +7 -0
  39. txtai/api/responses/factory.py +30 -0
  40. txtai/api/responses/json.py +56 -0
  41. txtai/api/responses/messagepack.py +51 -0
  42. txtai/api/route.py +41 -0
  43. txtai/api/routers/__init__.py +25 -0
  44. txtai/api/routers/agent.py +38 -0
  45. txtai/api/routers/caption.py +42 -0
  46. txtai/api/routers/embeddings.py +280 -0
  47. txtai/api/routers/entity.py +42 -0
  48. txtai/api/routers/extractor.py +28 -0
  49. txtai/api/routers/labels.py +47 -0
  50. txtai/api/routers/llm.py +61 -0
  51. txtai/api/routers/objects.py +42 -0
  52. txtai/api/routers/openai.py +191 -0
  53. txtai/api/routers/rag.py +61 -0
  54. txtai/api/routers/reranker.py +46 -0
  55. txtai/api/routers/segmentation.py +42 -0
  56. txtai/api/routers/similarity.py +48 -0
  57. txtai/api/routers/summary.py +46 -0
  58. txtai/api/routers/tabular.py +42 -0
  59. txtai/api/routers/textractor.py +42 -0
  60. txtai/api/routers/texttospeech.py +33 -0
  61. txtai/api/routers/transcription.py +42 -0
  62. txtai/api/routers/translation.py +46 -0
  63. txtai/api/routers/upload.py +36 -0
  64. txtai/api/routers/workflow.py +28 -0
  65. txtai/app/__init__.py +5 -0
  66. txtai/app/base.py +821 -0
  67. txtai/archive/__init__.py +9 -0
  68. txtai/archive/base.py +104 -0
  69. txtai/archive/compress.py +51 -0
  70. txtai/archive/factory.py +25 -0
  71. txtai/archive/tar.py +49 -0
  72. txtai/archive/zip.py +35 -0
  73. txtai/cloud/__init__.py +8 -0
  74. txtai/cloud/base.py +106 -0
  75. txtai/cloud/factory.py +70 -0
  76. txtai/cloud/hub.py +101 -0
  77. txtai/cloud/storage.py +125 -0
  78. txtai/console/__init__.py +5 -0
  79. txtai/console/__main__.py +22 -0
  80. txtai/console/base.py +264 -0
  81. txtai/data/__init__.py +10 -0
  82. txtai/data/base.py +138 -0
  83. txtai/data/labels.py +42 -0
  84. txtai/data/questions.py +135 -0
  85. txtai/data/sequences.py +48 -0
  86. txtai/data/texts.py +68 -0
  87. txtai/data/tokens.py +28 -0
  88. txtai/database/__init__.py +14 -0
  89. txtai/database/base.py +342 -0
  90. txtai/database/client.py +227 -0
  91. txtai/database/duckdb.py +150 -0
  92. txtai/database/embedded.py +76 -0
  93. txtai/database/encoder/__init__.py +8 -0
  94. txtai/database/encoder/base.py +37 -0
  95. txtai/database/encoder/factory.py +56 -0
  96. txtai/database/encoder/image.py +43 -0
  97. txtai/database/encoder/serialize.py +28 -0
  98. txtai/database/factory.py +77 -0
  99. txtai/database/rdbms.py +569 -0
  100. txtai/database/schema/__init__.py +6 -0
  101. txtai/database/schema/orm.py +99 -0
  102. txtai/database/schema/statement.py +98 -0
  103. txtai/database/sql/__init__.py +8 -0
  104. txtai/database/sql/aggregate.py +178 -0
  105. txtai/database/sql/base.py +189 -0
  106. txtai/database/sql/expression.py +404 -0
  107. txtai/database/sql/token.py +342 -0
  108. txtai/database/sqlite.py +57 -0
  109. txtai/embeddings/__init__.py +7 -0
  110. txtai/embeddings/base.py +1107 -0
  111. txtai/embeddings/index/__init__.py +14 -0
  112. txtai/embeddings/index/action.py +15 -0
  113. txtai/embeddings/index/autoid.py +92 -0
  114. txtai/embeddings/index/configuration.py +71 -0
  115. txtai/embeddings/index/documents.py +86 -0
  116. txtai/embeddings/index/functions.py +155 -0
  117. txtai/embeddings/index/indexes.py +199 -0
  118. txtai/embeddings/index/indexids.py +60 -0
  119. txtai/embeddings/index/reducer.py +104 -0
  120. txtai/embeddings/index/stream.py +67 -0
  121. txtai/embeddings/index/transform.py +205 -0
  122. txtai/embeddings/search/__init__.py +11 -0
  123. txtai/embeddings/search/base.py +344 -0
  124. txtai/embeddings/search/errors.py +9 -0
  125. txtai/embeddings/search/explain.py +120 -0
  126. txtai/embeddings/search/ids.py +61 -0
  127. txtai/embeddings/search/query.py +69 -0
  128. txtai/embeddings/search/scan.py +196 -0
  129. txtai/embeddings/search/terms.py +46 -0
  130. txtai/graph/__init__.py +10 -0
  131. txtai/graph/base.py +769 -0
  132. txtai/graph/factory.py +61 -0
  133. txtai/graph/networkx.py +275 -0
  134. txtai/graph/query.py +181 -0
  135. txtai/graph/rdbms.py +113 -0
  136. txtai/graph/topics.py +166 -0
  137. txtai/models/__init__.py +9 -0
  138. txtai/models/models.py +268 -0
  139. txtai/models/onnx.py +133 -0
  140. txtai/models/pooling/__init__.py +9 -0
  141. txtai/models/pooling/base.py +141 -0
  142. txtai/models/pooling/cls.py +28 -0
  143. txtai/models/pooling/factory.py +144 -0
  144. txtai/models/pooling/late.py +173 -0
  145. txtai/models/pooling/mean.py +33 -0
  146. txtai/models/pooling/muvera.py +164 -0
  147. txtai/models/registry.py +37 -0
  148. txtai/models/tokendetection.py +122 -0
  149. txtai/pipeline/__init__.py +17 -0
  150. txtai/pipeline/audio/__init__.py +11 -0
  151. txtai/pipeline/audio/audiomixer.py +58 -0
  152. txtai/pipeline/audio/audiostream.py +94 -0
  153. txtai/pipeline/audio/microphone.py +244 -0
  154. txtai/pipeline/audio/signal.py +186 -0
  155. txtai/pipeline/audio/texttoaudio.py +60 -0
  156. txtai/pipeline/audio/texttospeech.py +553 -0
  157. txtai/pipeline/audio/transcription.py +212 -0
  158. txtai/pipeline/base.py +23 -0
  159. txtai/pipeline/data/__init__.py +10 -0
  160. txtai/pipeline/data/filetohtml.py +206 -0
  161. txtai/pipeline/data/htmltomd.py +414 -0
  162. txtai/pipeline/data/segmentation.py +178 -0
  163. txtai/pipeline/data/tabular.py +155 -0
  164. txtai/pipeline/data/textractor.py +139 -0
  165. txtai/pipeline/data/tokenizer.py +112 -0
  166. txtai/pipeline/factory.py +77 -0
  167. txtai/pipeline/hfmodel.py +111 -0
  168. txtai/pipeline/hfpipeline.py +96 -0
  169. txtai/pipeline/image/__init__.py +7 -0
  170. txtai/pipeline/image/caption.py +55 -0
  171. txtai/pipeline/image/imagehash.py +90 -0
  172. txtai/pipeline/image/objects.py +80 -0
  173. txtai/pipeline/llm/__init__.py +11 -0
  174. txtai/pipeline/llm/factory.py +86 -0
  175. txtai/pipeline/llm/generation.py +173 -0
  176. txtai/pipeline/llm/huggingface.py +218 -0
  177. txtai/pipeline/llm/litellm.py +90 -0
  178. txtai/pipeline/llm/llama.py +152 -0
  179. txtai/pipeline/llm/llm.py +75 -0
  180. txtai/pipeline/llm/rag.py +477 -0
  181. txtai/pipeline/nop.py +14 -0
  182. txtai/pipeline/tensors.py +52 -0
  183. txtai/pipeline/text/__init__.py +13 -0
  184. txtai/pipeline/text/crossencoder.py +70 -0
  185. txtai/pipeline/text/entity.py +140 -0
  186. txtai/pipeline/text/labels.py +137 -0
  187. txtai/pipeline/text/lateencoder.py +103 -0
  188. txtai/pipeline/text/questions.py +48 -0
  189. txtai/pipeline/text/reranker.py +57 -0
  190. txtai/pipeline/text/similarity.py +83 -0
  191. txtai/pipeline/text/summary.py +98 -0
  192. txtai/pipeline/text/translation.py +298 -0
  193. txtai/pipeline/train/__init__.py +7 -0
  194. txtai/pipeline/train/hfonnx.py +196 -0
  195. txtai/pipeline/train/hftrainer.py +398 -0
  196. txtai/pipeline/train/mlonnx.py +63 -0
  197. txtai/scoring/__init__.py +12 -0
  198. txtai/scoring/base.py +188 -0
  199. txtai/scoring/bm25.py +29 -0
  200. txtai/scoring/factory.py +95 -0
  201. txtai/scoring/pgtext.py +181 -0
  202. txtai/scoring/sif.py +32 -0
  203. txtai/scoring/sparse.py +218 -0
  204. txtai/scoring/terms.py +499 -0
  205. txtai/scoring/tfidf.py +358 -0
  206. txtai/serialize/__init__.py +10 -0
  207. txtai/serialize/base.py +85 -0
  208. txtai/serialize/errors.py +9 -0
  209. txtai/serialize/factory.py +29 -0
  210. txtai/serialize/messagepack.py +42 -0
  211. txtai/serialize/pickle.py +98 -0
  212. txtai/serialize/serializer.py +46 -0
  213. txtai/util/__init__.py +7 -0
  214. txtai/util/resolver.py +32 -0
  215. txtai/util/sparsearray.py +62 -0
  216. txtai/util/template.py +16 -0
  217. txtai/vectors/__init__.py +8 -0
  218. txtai/vectors/base.py +476 -0
  219. txtai/vectors/dense/__init__.py +12 -0
  220. txtai/vectors/dense/external.py +55 -0
  221. txtai/vectors/dense/factory.py +121 -0
  222. txtai/vectors/dense/huggingface.py +44 -0
  223. txtai/vectors/dense/litellm.py +86 -0
  224. txtai/vectors/dense/llama.py +84 -0
  225. txtai/vectors/dense/m2v.py +67 -0
  226. txtai/vectors/dense/sbert.py +92 -0
  227. txtai/vectors/dense/words.py +211 -0
  228. txtai/vectors/recovery.py +57 -0
  229. txtai/vectors/sparse/__init__.py +7 -0
  230. txtai/vectors/sparse/base.py +90 -0
  231. txtai/vectors/sparse/factory.py +55 -0
  232. txtai/vectors/sparse/sbert.py +34 -0
  233. txtai/version.py +6 -0
  234. txtai/workflow/__init__.py +8 -0
  235. txtai/workflow/base.py +184 -0
  236. txtai/workflow/execute.py +99 -0
  237. txtai/workflow/factory.py +42 -0
  238. txtai/workflow/task/__init__.py +18 -0
  239. txtai/workflow/task/base.py +490 -0
  240. txtai/workflow/task/console.py +24 -0
  241. txtai/workflow/task/export.py +64 -0
  242. txtai/workflow/task/factory.py +89 -0
  243. txtai/workflow/task/file.py +28 -0
  244. txtai/workflow/task/image.py +36 -0
  245. txtai/workflow/task/retrieve.py +61 -0
  246. txtai/workflow/task/service.py +102 -0
  247. txtai/workflow/task/storage.py +110 -0
  248. txtai/workflow/task/stream.py +33 -0
  249. txtai/workflow/task/template.py +116 -0
  250. txtai/workflow/task/url.py +20 -0
  251. txtai/workflow/task/workflow.py +14 -0
txtai/graph/base.py ADDED
@@ -0,0 +1,769 @@
1
+ """
2
+ Graph module
3
+ """
4
+
5
+ from collections import Counter
6
+
7
+ from .topics import Topics
8
+
9
+
10
+ # pylint: disable=R0904
11
class Graph:
    """
    Base class for Graph instances. This class builds graph networks. Supports topic modeling
    and relationship traversal.

    Backend-specific operations (node/edge storage, search, persistence, community detection)
    raise NotImplementedError here and are expected to be provided by subclasses. The indexing
    logic (insert, delete, index, upsert, filter and the topic helpers) is implemented in this
    class on top of those operations.
    """

    def __init__(self, config):
        """
        Creates a new Graph.

        Args:
            config: graph configuration, None is treated as an empty configuration
        """

        # Graph configuration
        self.config = config if config is not None else {}

        # Graph backend
        self.backend = None

        # Topic modeling
        self.categories = None
        self.topics = None

        # Transform columns - read from self.config so a None config falls back to defaults
        columns = self.config.get("columns", {})
        self.text = columns.get("text", "text")
        self.object = columns.get("object", "object")

        # Attributes to copy - skips text/object/relationship fields - set to True to copy all
        self.copyattributes = self.config.get("copyattributes", False)

        # Relationships are manually-provided edges
        self.relationships = columns.get("relationships", "relationships")
        self.relations = {}

    def create(self):
        """
        Creates the graph network.
        """

        raise NotImplementedError

    def count(self):
        """
        Returns the total number of nodes in graph.

        Returns:
            total nodes in graph
        """

        raise NotImplementedError

    def scan(self, attribute=None, data=False):
        """
        Iterates over nodes that match a criteria. If no criteria specified, all nodes
        are returned.

        Args:
            attribute: if specified, nodes having this attribute are returned
            data: if True, attribute data is also returned

        Returns:
            node id iterator if data is False or (id, attribute dictionary) iterator if data is True
        """

        raise NotImplementedError

    def node(self, node):
        """
        Get node by id. Returns None if not found.

        Args:
            node: node id

        Returns:
            graph node
        """

        raise NotImplementedError

    def addnode(self, node, **attrs):
        """
        Adds a node to the graph.

        Args:
            node: node id
            attrs: node attributes
        """

        raise NotImplementedError

    def addnodes(self, nodes):
        """
        Adds nodes to the graph.

        Args:
            nodes: list of (node, attributes) to add
        """

        raise NotImplementedError

    def removenode(self, node):
        """
        Removes a node and all its edges from graph.

        Args:
            node: node id
        """

        raise NotImplementedError

    def hasnode(self, node):
        """
        Returns True if node found, False otherwise.

        Args:
            node: node id

        Returns:
            True if node found, False otherwise
        """

        raise NotImplementedError

    def attribute(self, node, field):
        """
        Gets a node attribute.

        Args:
            node: node id
            field: attribute name

        Returns:
            attribute value
        """

        raise NotImplementedError

    def addattribute(self, node, field, value):
        """
        Adds an attribute to node.

        Args:
            node: node id
            field: attribute name
            value: attribute value
        """

        raise NotImplementedError

    def removeattribute(self, node, field):
        """
        Removes an attribute from node.

        Args:
            node: node id
            field: attribute name

        Returns:
            attribute value or None if not present
        """

        raise NotImplementedError

    def edgecount(self):
        """
        Returns the total number of edges.

        Returns:
            total number of edges in graph
        """

        raise NotImplementedError

    def edges(self, node):
        """
        Gets edges of node by id.

        Args:
            node: node id

        Returns:
            edges of node - callers in this class treat the result as a mapping of
            {target node id: edge attributes} and accept None/empty when there are no edges
        """

        raise NotImplementedError

    def addedge(self, source, target, **attrs):
        """
        Adds an edge to graph.

        Args:
            source: node 1 id
            target: node 2 id
            attrs: edge attributes
        """

        raise NotImplementedError

    def addedges(self, edges):
        """
        Adds edges to graph.

        Args:
            edges: list of (source, target, attributes) to add
        """

        raise NotImplementedError

    def hasedge(self, source, target=None):
        """
        Returns True if edge found, False otherwise. If target is None, this method
        returns True if any edge is found.

        Args:
            source: node 1 id
            target: node 2 id

        Returns:
            True if edge found, False otherwise
        """

        raise NotImplementedError

    def centrality(self):
        """
        Runs a centrality algorithm on the graph.

        Returns:
            dict of {node id: centrality score}
        """

        raise NotImplementedError

    def pagerank(self):
        """
        Runs the pagerank algorithm on the graph.

        Returns:
            dict of {node id: page rank score}
        """

        raise NotImplementedError

    def showpath(self, source, target):
        """
        Gets the shortest path between source and target.

        Args:
            source: start node id
            target: end node id

        Returns:
            list of node ids representing the shortest path
        """

        raise NotImplementedError

    def isquery(self, queries):
        """
        Checks if queries are supported graph queries.

        Args:
            queries: queries to check

        Returns:
            True if all the queries are supported graph queries, False otherwise
        """

        raise NotImplementedError

    def parse(self, query):
        """
        Parses a graph query into query components.

        Args:
            query: graph query

        Returns:
            query components as a dictionary
        """

        raise NotImplementedError

    def search(self, query, limit=None, graph=False):
        """
        Searches graph for nodes matching query.

        Args:
            query: graph query
            limit: maximum results
            graph: return graph results if True

        Returns:
            list of dict if graph is set to False
            filtered graph if graph is set to True
        """

        raise NotImplementedError

    def batchsearch(self, queries, limit=None, graph=False):
        """
        Searches graph for nodes matching each query. Runs search once per query.

        Args:
            queries: graph queries
            limit: maximum results
            graph: return graph results if True

        Returns:
            list with one search result per query, in query order
        """

        return [self.search(query, limit, graph) for query in queries]

    def communities(self, config):
        """
        Run community detection on the graph.

        Args:
            config: configuration

        Returns:
            dictionary of {topic name:[ids]}
        """

        raise NotImplementedError

    def load(self, path):
        """
        Loads a graph at path.

        Args:
            path: path to graph
        """

        raise NotImplementedError

    def save(self, path):
        """
        Saves a graph at path.

        Args:
            path: path to save graph
        """

        raise NotImplementedError

    def loaddict(self, data):
        """
        Loads data from input dictionary into this graph.

        Args:
            data: input dictionary
        """

        raise NotImplementedError

    def savedict(self):
        """
        Saves graph data to a dictionary.

        Returns:
            dict
        """

        raise NotImplementedError

    def initialize(self):
        """
        Initialize graph instance. Creates the backend when one is not already set.
        """

        if not self.backend:
            self.backend = self.create()

    def close(self):
        """
        Closes this graph. Drops the backend, categories and topics references.
        """

        self.backend, self.categories, self.topics = None, None, None

    def insert(self, documents, index=0):
        """
        Insert graph nodes for each document.

        Documents without a text/object value are skipped and do not consume a node id.

        Args:
            documents: list of (id, data, tags)
            index: indexid offset, used for node ids
        """

        # Initialize graph backend
        self.initialize()

        # Loop-invariant settings: fields that are never copied and the explicit copy list, if any
        reserved = [self.text, self.object, self.relationships]
        search = self.copyattributes if isinstance(self.copyattributes, list) else []

        nodes = []
        for uid, document, _ in documents:
            # Manually provided relationships and attributes to copy
            relations, attributes = None, {}

            # Extract data from dictionary
            if isinstance(document, dict):
                # Extract relationships
                relations = document.get(self.relationships)

                # Attributes to copy, if any
                attributes = {k: v for k, v in document.items() if k not in reserved and (self.copyattributes is True or k in search)}

                # Require text or object field
                document = document.get(self.text, document.get(self.object))

            if document is not None:
                if isinstance(document, list):
                    # Join tokens as text
                    document = " ".join(document)

                # Create node
                nodes.append((index, {**{"id": uid, "data": document}, **attributes}))

                # Add relationships
                self.addrelations(index, relations)

                index += 1

        # Add nodes
        self.addnodes(nodes)

    def delete(self, ids):
        """
        Deletes ids from graph.

        Topic membership is updated for each deleted node. When a topic becomes empty, both the
        topic and its category are removed so that categories stay positionally aligned with topics.

        Args:
            ids: node ids to delete
        """

        for node in ids:
            # Remove existing node, if it exists
            if self.hasnode(node):
                # Delete from topics. Tolerate topic bookkeeping that is out of sync with the node attribute.
                topic = self.attribute(node, "topic")
                if topic and self.topics and topic in self.topics:
                    # Delete id from topic
                    members = self.topics[topic]
                    if node in members:
                        members.remove(node)

                    # Also delete topic, if it's empty
                    if not members:
                        # categories[i] labels the i-th topic, remove the matching entry before the topic
                        if self.categories:
                            position = list(self.topics).index(topic)
                            if position < len(self.categories):
                                del self.categories[position]

                        self.topics.pop(topic)

                # Delete node
                self.removenode(node)

    def index(self, search, ids, similarity):
        """
        Build relationships between graph nodes using a score-based search function.

        Args:
            search: batch search function - takes a list of queries and returns lists of (id, scores) to use as edge weights
            ids: ids function - internal id resolver
            similarity: batch similarity function - takes a list of text and labels and returns best matches
        """

        # Add relationship edges
        self.resolverelations(ids)

        # Infer node edges using search function
        self.inferedges(self.scan(), search)

        # Label categories/topics
        if "topics" in self.config:
            self.addtopics(similarity)

    def upsert(self, search, ids, similarity=None):
        """
        Adds relationships for new graph nodes using a score-based search function.

        Args:
            search: batch search function - takes a list of queries and returns lists of (id, scores) to use as edge weights
            ids: ids function - internal id resolver
            similarity: batch similarity function - takes a list of text and labels and returns best matches
        """

        # Detect if topics processing is enabled
        hastopics = "topics" in self.config

        # Add relationship edges
        self.resolverelations(ids)

        # Infer node edges using new/updated nodes, set updated flag for topic processing, if necessary
        self.inferedges(self.scan(attribute="data"), search, {"updated": True} if hastopics else None)

        # Infer topics with topics of connected nodes
        if hastopics:
            # Infer topics if there is at least one topic, otherwise rebuild
            if self.topics:
                self.infertopics()
            else:
                self.addtopics(similarity)

    def filter(self, nodes, graph=None):
        """
        Creates a subgraph of this graph using the list of input nodes. This method creates a new graph
        selecting only matching nodes, edges, topics and categories.

        Args:
            nodes: nodes to select as a list of ids or list of (id, score) tuples
            graph: optional graph used to store filtered results

        Returns:
            graph
        """

        # Set graph if available, otherwise create a new empty graph of the same type
        graph = graph if graph else type(self)(self.config)

        # Initialize subgraph
        graph.initialize()

        # nodes is walked twice (id set + copy loop), materialize so one-shot iterables also work
        nodes = list(nodes)

        nodeids = {node[0] if isinstance(node, tuple) else node for node in nodes}
        for node in nodes:
            # Unpack node and score, if available
            node, score = node if isinstance(node, tuple) else (node, None)

            # Add nodes
            graph.addnode(node, **self.node(node))

            # Add score if present
            if score is not None:
                graph.addattribute(node, "score", score)

            # Add edges, only keep edges where both ends are selected
            edges = self.edges(node)
            if edges:
                for target, attributes in edges.items():
                    if target in nodeids:
                        graph.addedge(node, target, **attributes)

        # Filter categories and topics
        if self.topics:
            topics = {}
            for i, (topic, ids) in enumerate(self.topics.items()):
                ids = [x for x in ids if x in nodeids]
                if ids:
                    topics[topic] = (self.categories[i] if self.categories else None, ids)

            # Sort by number of nodes descending
            topics = sorted(topics.items(), key=lambda x: len(x[1][1]), reverse=True)

            # Copy filtered categories and topics
            graph.categories = [category for _, (category, _) in topics] if self.categories else None
            graph.topics = {topic: ids for topic, (_, ids) in topics}

        return graph

    def addrelations(self, node, relations):
        """
        Add manually-provided relationships. Relationships are stored until resolverelations is called.

        Args:
            node: node id
            relations: list of relationships to add
        """

        # Add relationships, if any
        if relations:
            if node not in self.relations:
                self.relations[node] = []

            # Add each relationship
            for relation in relations:
                # Support both dict and string ids
                relation = {"id": relation} if not isinstance(relation, dict) else relation
                self.relations[node].append(relation)

    def resolverelations(self, ids):
        """
        Resolves ids and creates edges for manually-provided relationships.

        Args:
            ids: internal id resolver - called with an iterable of relationship ids, the result is read
                 as a mapping of {str(id): list of internal ids}
        """

        # Relationship edges
        edges = []

        # Resolve ids and create edges for relationships
        for node, relations in self.relations.items():
            # Resolve internal ids
            iids = ids(y["id"] for y in relations)

            # Add each edge
            for relation in relations:
                # Make copy of relation, the stored relation must keep its id
                relation = relation.copy()

                # Lookup targets for relationship
                targets = iids.get(str(relation.pop("id")))

                # Create edge for each instance of id - internal id pair
                if targets:
                    # Add weight, if not provided
                    relation.setdefault("weight", 1.0)

                    for target in targets:
                        # Add edge and all other attributes
                        edges.append((node, target, relation))

        # Add relationships
        if edges:
            self.addedges(edges)

        # Clear temporary relationship storage
        self.relations = {}

    def inferedges(self, nodes, search, attributes=None):
        """
        Infers edges for a list of nodes using a score-based search function.

        The "data" attribute of each node is removed and used as the search query. String data is
        stored back on the node as the "text" attribute.

        Args:
            nodes: list of nodes
            search: search function to use to identify edges
            attributes: dictionary of attributes to add to each node
        """

        # Read graph parameters
        batchsize, limit, minscore = self.config.get("batchsize", 256), self.config.get("limit", 15), self.config.get("minscore", 0.1)
        approximate = self.config.get("approximate", True)

        batch = []
        for node in nodes:
            # Get data attribute
            data = self.removeattribute(node, "data")

            # Set text field when data is a string
            if isinstance(data, str):
                self.addattribute(node, "text", data)

            # Add additional attributes, if specified
            if attributes:
                for field, value in attributes.items():
                    self.addattribute(node, field, value)

            # Skip nodes with existing edges when building an approximate network
            if not approximate or not self.hasedge(node):
                batch.append((node, data))

            # Process batch
            if len(batch) == batchsize:
                self.addbatch(search, batch, limit, minscore)
                batch = []

        if batch:
            self.addbatch(search, batch, limit, minscore)

    def addbatch(self, search, batch, limit, minscore):
        """
        Adds batch of documents to graph. This method runs the search function for each item in batch
        and adds node edges between the input and each search result.

        Args:
            search: search function to use to identify edges
            batch: batch to add as a list of (node id, data)
            limit: max edges to add per node
            minscore: min score to add node edge
        """

        edges = []
        results = search([data for _, data in batch], limit)
        for (source, _), result in zip(batch, results):
            # Add edges for each input node id and result node id pair that meets specified criteria
            for target, score in result:
                # Skip self matches and scores at or below the minimum
                if str(source) != str(target) and score > minscore:
                    edges.append((source, target, {"weight": score}))

        self.addedges(edges)

    def addtopics(self, similarity=None):
        """
        Identifies and adds topics using community detection.

        Args:
            similarity: similarity function for labeling categories
        """

        # Clear previous topics, if any
        self.cleartopics()

        # Use community detection to get topics
        topics = Topics(self.config["topics"])
        config = topics.config
        self.topics = topics(self)

        # Label each topic with a higher level category, categories[i] labels the i-th topic
        if "categories" in config and similarity:
            self.categories = []
            results = similarity(self.topics.keys(), config["categories"])
            for result in results:
                self.categories.append(config["categories"][result[0][0]])

        # Add topic-related node attributes
        for x, topic in enumerate(self.topics):
            for r, node in enumerate(self.topics[topic]):
                self.addattribute(node, "topic", topic)
                self.addattribute(node, "topicrank", r)

                if self.categories:
                    self.addattribute(node, "category", self.categories[x])

    def cleartopics(self):
        """
        Clears topic fields from all nodes.
        """

        # Clear previous topics, if any
        if self.topics:
            for node in self.scan():
                self.removeattribute(node, "topic")
                self.removeattribute(node, "topicrank")

                if self.categories:
                    self.removeattribute(node, "category")

            self.topics, self.categories = None, None

    def infertopics(self):
        """
        Infers topics for all nodes with an "updated" attribute. This method analyzes the direct node
        neighbors and sets the most commonly occurring topic and category for each node. Neighbors
        without a topic/category are ignored when counting.
        """

        # Iterate over nodes flagged as updated
        for node in self.scan(attribute="updated"):
            # Remove updated attribute
            self.removeattribute(node, "updated")

            # Get list of neighboring nodes
            ids = self.edges(node)
            ids = list(ids.keys()) if ids else []

            # Infer topic, neighbors that have no topic yet must not outvote labeled neighbors
            topics = Counter(value for value in (self.attribute(x, "topic") for x in ids) if value is not None)
            topic = topics.most_common(1)[0][0] if topics else None
            if topic:
                # Add id to topic list and set topic attribute
                self.topics[topic].append(node)
                self.addattribute(node, "topic", topic)

                # Set topic rank
                self.addattribute(node, "topicrank", len(self.topics[topic]) - 1)

                # Infer category, skip when no neighbor has one (i.e. categories are not in use)
                categories = Counter(value for value in (self.attribute(x, "category") for x in ids) if value is not None)
                if categories:
                    self.addattribute(node, "category", categories.most_common(1)[0][0])