meshagent-agents 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of meshagent-agents might be problematic.

@@ -0,0 +1,593 @@
+ from meshagent.agents import TaskRunner, RequiredToolkit, SingleRoomAgent
+ from meshagent.tools import Toolkit, Tool, ToolContext
+ from meshagent.api.room_server_client import TextDataType, VectorDataType, FloatDataType, IntDataType
+ from openai import AsyncOpenAI
+ from typing import Optional
+ from meshagent.api.chan import Chan
+
+ import hashlib
+ import chonkie
+ import asyncio
+ import logging
+
+ from functools import wraps
+
+ import os
+
+ # TODO: install chonkie, chonkie[semantic], openai
+
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+
+ # Debounce decorator for coroutines: repeated calls within `wait` seconds cancel the
+ # pending call, so only the most recent invocation runs after the delay.
+ def _async_debounce(wait):
+     def decorator(func):
+         task = None
+
+         @wraps(func)
+         async def debounced(*args, **kwargs):
+             nonlocal task
+
+             async def call_func():
+                 await asyncio.sleep(wait)
+                 await func(*args, **kwargs)
+
+             if task and not task.done():
+                 task.cancel()
+
+             task = asyncio.create_task(call_func())
+             return task
+
+         return debounced
+
+     return decorator
+
+ logger = logging.getLogger("indexer")
+ logger.setLevel(logging.INFO)
+
+ class Chunk:
+     def __init__(self, *, text: str, start: int, end: int):
+         self.text = text
+         self.start = start
+         self.end = end
+
+ class Chunker:
+
+     async def chunk(self, *, text: str, max_length: Optional[int] = None) -> list[Chunk]:
+         pass
+
+ class ChonkieChunker(Chunker):
+     def __init__(self, chunker: Optional[chonkie.BaseChunker] = None):
+         super().__init__()
+
+         if chunker is None:
+             chunker = chonkie.SemanticChunker()
+
+         self._chunker = chunker
+
+     async def chunk(self, *, text: str, max_length: Optional[int] = None) -> list[Chunk]:
+         chunks = await asyncio.to_thread(self._chunker.chunk, text=text)
+         mapped = []
+         for chunk in chunks:
+             mapped.append(Chunk(text=chunk.text, start=chunk.start_index, end=chunk.end_index))
+         return mapped
+
+
+ class Embedder:
+     def __init__(self, *, size: int, max_length: int):
+         self.size = size
+         self.max_length = max_length
+
+     async def embed(self, *, text: str) -> list[float]:
+         pass
+
+ class OpenAIEmbedder(Embedder):
+     def __init__(self, *, size: int, max_length: int, model: str, openai: Optional[AsyncOpenAI] = None):
+         if openai is None:
+             openai = AsyncOpenAI()
+
+         self._openai = openai
+         self._model = model
+
+         super().__init__(size=size, max_length=max_length)
+
+     async def embed(self, *, text):
+         return (await self._openai.embeddings.create(input=text, model=self._model, encoding_format="float")).data[0].embedding
+
+
+ class RagTool(Tool):
+     def __init__(self, *, name = "rag_search", table: str, title = "RAG search", description = "perform a RAG search", rules = None, thumbnail_url = None, embedder: Optional[Embedder] = None):
+
+         self.table = table
+
+         super().__init__(
+             name=name,
+             input_schema={
+                 "type" : "object",
+                 "additionalProperties" : False,
+                 "required" : [
+                     "query"
+                 ],
+                 "properties" : {
+                     "query" : {
+                         "type" : "string"
+                     }
+                 }
+             },
+             title=title,
+             description=description,
+             rules=rules, thumbnail_url=thumbnail_url)
+
+         self._embedder = embedder
+
+     async def execute(self, context: ToolContext, query: str):
+
+         if self._embedder is None:
+             results = await context.room.database.search(
+                 table=self.table,
+                 text=query,
+                 limit=10
+             )
+         else:
+             embedding = await self._embedder.embed(text=query)
+             results = await context.room.database.search(
+                 table=self.table,
+                 text=query,
+                 vector=embedding,
+                 limit=10
+             )
+
+         results = list(map(lambda r: f"from {r['url']}: {r['text']}", results))
+
+         return {
+             "results" : results
+         }
+
+
+ def open_ai_embedding_3_small():
+     return OpenAIEmbedder(model="text-embedding-3-small", max_length=8191, size=1536)
+
+ def open_ai_embedding_3_large():
+     return OpenAIEmbedder(model="text-embedding-3-large", max_length=8191, size=3072)
+
+ def open_ai_embedding_ada_2():
+     return OpenAIEmbedder(model="text-embedding-ada-002", max_length=8191, size=1536)
+
+
+ class RagToolkit(Toolkit):
+     def __init__(self, table: str, embedder: Optional[Embedder] = None):
+
+         if embedder is None:
+             embedder = open_ai_embedding_3_large()
+
+         super().__init__(
+             name="meshagent.rag",
+             title="RAG",
+             description="Searches against an index",
+             tools=[
+                 RagTool(table=table, embedder=embedder)
+             ]
+         )
+
+
+ class FileIndexEvent:
+     def __init__(self, *, path: str, deleted: bool):
+         self.path = path
+         self.deleted = deleted
+
+ class StorageIndexer(SingleRoomAgent):
+
+     def __init__(
+         self,
+         *,
+         name,
+         title=None,
+         description=None,
+         requires=None,
+         labels=None,
+         chunker: Optional[Chunker] = None,
+         embedder: Optional[Embedder] = None,
+         table: str = "storage_index",
+     ):
+         super().__init__(name=name, title=title, description=description, requires=requires, labels=labels)
+
+         self._chan = Chan[FileIndexEvent]()
+
+         if chunker is None:
+             chunker = ChonkieChunker()
+
+         if embedder is None:
+             embedder = open_ai_embedding_3_large()
+
+         self.chunker = chunker
+         self.embedder = embedder
+         self.table = table
+         self._vector_index_created = False
+         self._fts_created = False
+
+     async def read_file(self, *, path: str) -> str | None:
+         pass
+
+     @_async_debounce(10)
+     async def refresh_index(self):
+
+         self.room.developer.log_nowait(type="indexer.rebuild", data={})
+
+         indexes = await self.room.database.list_indexes(table=self.table)
+
+         logger.info(f"existing indexes {indexes}")
+
+         for index in indexes:
+
+             if "embedding" in index["columns"]:
+                 self._vector_index_created = True
+
+             if "text" in index["columns"]:
+                 self._fts_created = True
+
+         if not self._vector_index_created:
+             try:
+                 logger.info("attempting to create embedding index")
+                 await self.room.database.create_vector_index(table=self.table, column="embedding", replace=False)
+                 self._vector_index_created = True
+             except Exception:
+                 # Will fail if there aren't enough rows
+                 pass
+
+         if not self._fts_created:
+             try:
+                 logger.info("attempting to create fts index")
+                 await self.room.database.create_full_text_search_index(table=self.table, column="text", replace=False)
+                 self._fts_created = True
+             except Exception:
+                 # Will fail if there aren't enough rows
+                 pass
+
+         if self._fts_created or self._vector_index_created:
+             logger.info("optimizing existing index")
+             await self.room.database.optimize(table=self.table)
+
+
+     async def start(self, *, room):
+         await super().start(room=room)
+
+         room.storage.on("file.updated", self._on_file_updated)
+         room.storage.on("file.deleted", self._on_file_deleted)
+
+         await room.database.create_table_with_schema(
+             name=self.table,
+             schema={
+                 "url" : TextDataType(),
+                 "text" : TextDataType(),
+                 "embedding" : VectorDataType(
+                     size=self.embedder.size,
+                     element_type=FloatDataType()
+                 ),
+                 "sha" : TextDataType(),
+             },
+             mode="create_if_not_exists",
+             data=None
+         )
+
+         def index_task(task: asyncio.Task):
+
+             try:
+                 task.result()
+             except Exception as e:
+                 logger.error("Index task failed", exc_info=e)
+
+         self._index_task = asyncio.create_task(self._indexer())
+         self._index_task.add_done_callback(index_task)
+
+         await self.install_requirements()
+
+     async def stop(self):
+         await super().stop()
+         await self._chan.close()
+
+
+     async def _indexer(self):
+
+         async for e in self._chan:
+
+             try:
+                 if e.deleted:
+
+                     # todo: consider using SQLAlchemy or another library to do the escaping
+                     def escape_sql_string(value):
+                         if not isinstance(value, str):
+                             raise TypeError("Input must be a string")
+                         return value.replace("'", "''")
+
+                     self.room.developer.log_nowait(type="indexer.delete", data={"path": e.path})
+                     await self.room.database.delete(table=self.table, where=f"url='{escape_sql_string(e.path)}'")
+
+                 else:
+
+                     self.room.developer.log_nowait(type="indexer.index", data={"path": e.path})
+
+                     async def lookup_or_embed(*, sha: str, text: str) -> list[float]:
+
+                         # if we already indexed this chunk, let's use the existing embedding instead of generating a new one
+                         results = await self.room.database.search(
+                             table=self.table,
+                             where={
+                                 "sha" : sha,
+                             },
+                             limit=1
+                         )
+
+                         if len(results) != 0:
+                             logger.info(f"chunk found from {e.path} {sha}, reusing embedding")
+                             return results[0]["embedding"]
+
+                         logger.info(f"chunk not found from {e.path} {sha}, generating embedding")
+
+                         return await self.embedder.embed(text=text)
+
+                     basename = os.path.basename(e.path)
+
+                     chunk_sha = hashlib.sha256(basename.encode("utf-8")).hexdigest()
+
+                     rows = []
+                     # let's make the filename its own chunk
+                     rows.append(
+                         {
+                             "url" : e.path,
+                             "text" : basename,
+                             "sha" : chunk_sha,
+                             "embedding" : await lookup_or_embed(sha=chunk_sha, text=basename)
+                         }
+                     )
+
+                     text = await self.read_file(path=e.path)
+                     if text is not None:
+
+                         # the content will be transformed into additional chunks
+                         for chunk in await self.chunker.chunk(text=text, max_length=self.embedder.max_length):
+                             logger.info(f"processing chunk from {e.path}: {chunk.start}")
+                             chunk_sha = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest()
+                             rows.append(
+                                 {
+                                     "url" : e.path,
+                                     "text" : chunk.text,
+                                     "embedding" : await lookup_or_embed(sha=chunk_sha, text=chunk.text),
+                                     "sha" : chunk_sha,
+                                 }
+                             )
+
+                     await self.room.database.merge(table=self.table, on="sha", records=rows)
+                     await self.refresh_index()
+
+             except Exception as e:
+                 logger.error("error while indexing", exc_info=e)
+
+     def _on_file_deleted(self, path: str):
+         self._chan.send_nowait(FileIndexEvent(path=path, deleted=True))
+
+     def _on_file_updated(self, path: str):
+         self._chan.send_nowait(FileIndexEvent(path=path, deleted=False))
+
+
+ class SiteIndexer(TaskRunner):
+
+     def __init__(self,
+         *,
+         name,
+         chunker: Optional[Chunker] = None,
+         embedder: Optional[Embedder] = None,
+         title=None,
+         description=None,
+         requires=None,
+         supports_tools=None,
+         labels: Optional[list[str]] = None
+     ):
+
+         if chunker is None:
+             chunker = ChonkieChunker()
+
+         if embedder is None:
+             embedder = open_ai_embedding_3_large()
+
+         self.chunker = chunker
+         self.embedder = embedder
+
+         super().__init__(
+             name=name,
+             title=title,
+             description=description,
+             requires=[
+                 RequiredToolkit(
+                     name="meshagent.firecrawl",
+                     tools=[
+                         "firecrawl_queue"
+                     ]
+                 ),
+             ],
+             supports_tools=supports_tools,
+             input_schema={
+                 "type" : "object",
+                 "required" : [
+                     "queue", "table", "url"
+                 ],
+                 "additionalProperties" : False,
+                 "properties" : {
+                     "queue" : {
+                         "type" : "string",
+                         "description" : "default: firecrawl"
+                     },
+                     "table" : {
+                         "type" : "string",
+                         "description" : "default: index"
+                     },
+                     "url" : {
+                         "type" : "string",
+                         "description" : "the url of the site to crawl"
+                     }
+                 }
+             },
+             output_schema={
+                 "type" : "object",
+                 "required" : [],
+                 "additionalProperties" : False,
+                 "properties" : {},
+             },
+             labels=labels
+         )
+
+
+     async def ask(self, *, context, arguments):
+
+         queue = arguments["queue"]
+         table = arguments["table"]
+         url = arguments["url"]
+
+         tables = await context.room.database.list_tables()
+
+         # track whether the target table already exists so previously generated embeddings can be reused
+         exists = table in tables
+
+         async def lookup_or_embed(*, sha: str, text: str) -> list[float]:
+
+             # if we already indexed this chunk, let's use the existing embedding instead of generating a new one
+             if exists:
+
+                 results = await context.room.database.search(
+                     table=table,
+                     where={
+                         "sha" : sha,
+                     },
+                     limit=1
+                 )
+
+                 if len(results) != 0:
+                     logger.info(f"chunk found from {url} {sha}, reusing embedding")
+                     return results[0]["embedding"]
+
+             logger.info(f"chunk not found from {url} {sha}, generating embedding")
+
+             return await self.embedder.embed(text=text)
+
+
+         async def crawl():
+             logger.info(f"starting to crawl: {url}")
+             await context.room.agents.invoke_tool(
+                 toolkit="meshagent.firecrawl",
+                 tool="firecrawl_queue",
+                 arguments={
+                     "url" : url,
+                     "queue" : queue,
+                     "limit" : 100
+                 })
+
+             logger.info(f"done with crawl: {url}")
+             await context.room.queues.send(name=queue, message={ "done" : True })
+
+         def crawl_done(task: asyncio.Task):
+             try:
+                 task.result()
+             except Exception as e:
+                 logger.error("crawl failed", exc_info=e)
+
+         crawl_task = asyncio.create_task(crawl())
+         crawl_task.add_done_callback(crawl_done)
+
+         rows = []
+
+         id = 0
+
+         while True:
+             message = await context.room.queues.receive(name=queue, create=True, wait=True)
+
+             if message is None:
+                 break
+
+             if message.get("type", None) == "crawl.completed":
+                 break
+
+             if "data" in message:
+                 for data in message["data"]:
+                     try:
+                         url : str = data["metadata"]["url"]
+                         text : str = data["markdown"]
+                         title : str = data["metadata"]["title"]
+                         title_sha : str = hashlib.sha256(title.encode("utf-8")).hexdigest()
+
+                         logger.info(f"processing crawled page: {url}")
+
+                         # let's make the title its own chunk
+                         rows.append(
+                             {
+                                 "id" : id,
+                                 "url" : url,
+                                 "text" : title,
+                                 "sha" : title_sha,
+                                 "embedding" : await lookup_or_embed(sha=title_sha, text=title)
+                             }
+                         )
+
+                         id = id + 1
+
+                         # the content will be transformed into additional chunks
+                         for chunk in await self.chunker.chunk(text=text, max_length=self.embedder.max_length):
+                             logger.info(f"processing chunk from {url}: {chunk.text}")
+                             chunk_sha = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest()
+                             rows.append(
+                                 {
+                                     "id" : id,
+                                     "url" : url,
+                                     "text" : chunk.text,
+                                     "sha" : chunk_sha,
+                                     "embedding" : await lookup_or_embed(sha=chunk_sha, text=chunk.text)
+                                 }
+                             )
+
+                             id = id + 1
+
+                     except Exception as e:
+                         logger.error(f"failed to process: {url}", exc_info=e)
+
+         logger.info(f"saving crawl: {url}")
+
+         await context.room.database.create_table_with_schema(
+             name=table,
+             schema={
+                 "id" : IntDataType(),
+                 "url" : TextDataType(),
+                 "text" : TextDataType(),
+                 "embedding" : VectorDataType(
+                     size=self.embedder.size,
+                     element_type=FloatDataType()
+                 ),
+                 "sha" : TextDataType(),
+             },
+             mode="overwrite",
+             data=rows
+         )
+
+         if len(rows) > 255:
+             await context.room.database.create_vector_index(table=table, column="embedding")
+
+         await context.room.database.create_full_text_search_index(table=table, column="text")
+
+         return {}
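
For orientation, here is a minimal usage sketch (not part of the package) showing how the classes in this file might be wired together. The room bootstrap and the actual storage read are assumptions that depend on the surrounding meshagent application:

    class MyIndexer(StorageIndexer):
        async def read_file(self, *, path: str) -> str | None:
            # Assumption: fetch the file body from room storage here; the storage
            # read API is not shown in this file, so this stays a placeholder.
            ...

    async def run(room):
        # "room" is assumed to be an already-connected meshagent room client.
        indexer = MyIndexer(name="storage-indexer")  # defaults to the "storage_index" table
        await indexer.start(room=room)               # subscribes to file.updated / file.deleted

        # Expose a "rag_search" tool that queries the same table.
        toolkit = RagToolkit(table="storage_index")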