meshagent-agents 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
Potentially problematic release.
This version of meshagent-agents might be problematic.
- meshagent/agents/__init__.py +27 -2
- meshagent/agents/adapter.py +18 -9
- meshagent/agents/agent.py +317 -214
- meshagent/agents/chat.py +392 -267
- meshagent/agents/context.py +58 -30
- meshagent/agents/development.py +11 -13
- meshagent/agents/hosting.py +109 -46
- meshagent/agents/indexer.py +241 -224
- meshagent/agents/listener.py +55 -52
- meshagent/agents/mail.py +145 -109
- meshagent/agents/planning.py +294 -199
- meshagent/agents/prompt.py +14 -12
- meshagent/agents/pydantic.py +98 -61
- meshagent/agents/schemas/__init__.py +11 -0
- meshagent/agents/schemas/document.py +32 -21
- meshagent/agents/schemas/gallery.py +23 -14
- meshagent/agents/schemas/presentation.py +33 -17
- meshagent/agents/schemas/schema.py +99 -45
- meshagent/agents/schemas/super_editor_document.py +52 -46
- meshagent/agents/single_shot_writer.py +37 -31
- meshagent/agents/thread_schema.py +74 -32
- meshagent/agents/utils.py +20 -12
- meshagent/agents/version.py +1 -1
- meshagent/agents/worker.py +48 -28
- meshagent/agents/writer.py +36 -23
- meshagent_agents-0.0.39.dist-info/METADATA +64 -0
- meshagent_agents-0.0.39.dist-info/RECORD +30 -0
- meshagent_agents-0.0.37.dist-info/METADATA +0 -36
- meshagent_agents-0.0.37.dist-info/RECORD +0 -30
- {meshagent_agents-0.0.37.dist-info → meshagent_agents-0.0.39.dist-info}/WHEEL +0 -0
- {meshagent_agents-0.0.37.dist-info → meshagent_agents-0.0.39.dist-info}/licenses/LICENSE +0 -0
- {meshagent_agents-0.0.37.dist-info → meshagent_agents-0.0.39.dist-info}/top_level.txt +0 -0
meshagent/agents/indexer.py
CHANGED
@@ -1,6 +1,11 @@
 from meshagent.agents import TaskRunner, RequiredToolkit, SingleRoomAgent
 from meshagent.tools import Toolkit, Tool, ToolContext
-from meshagent.api.room_server_client import
+from meshagent.api.room_server_client import (
+    TextDataType,
+    VectorDataType,
+    FloatDataType,
+    IntDataType,
+)
 from openai import AsyncOpenAI
 from typing import Optional
 from meshagent.api.chan import Chan
@@ -16,7 +21,6 @@ import os

 # TODO: install chonkie, chonkie[semantic], openai

-from concurrent.futures import ThreadPoolExecutor, as_completed

 def _async_debounce(wait):
     def decorator(func):
@@ -40,6 +44,7 @@ def _async_debounce(wait):

     return decorator

+
 logger = logging.getLogger("indexer")


@@ -49,124 +54,141 @@ class Chunk:
         self.start = start
         self.end = end

-class Chunker:

-    async def chunk(self, *, text: str, max_length: Optional[int] = None) -> list[Chunk]:
+class Chunker:
+    async def chunk(
+        self, *, text: str, max_length: Optional[int] = None
+    ) -> list[Chunk]:
         pass

+
 class ChonkieChunker(Chunker):
     def __init__(self, chunker: Optional[chonkie.BaseChunker] = None):
         super().__init__()

-        if chunker == None:
+        if chunker is None:
             chunker = chonkie.SemanticChunker()

         self._chunker = chunker

-    async def chunk(self, *, text: str, max_length: Optional[int] = None) -> list[Chunk]:
+    async def chunk(
+        self, *, text: str, max_length: Optional[int] = None
+    ) -> list[Chunk]:
         chunks = await asyncio.to_thread(self._chunker.chunk, text=text)
         mapped = []
         for chunk in chunks:
-            mapped.append(Chunk(text=chunk.text, start=chunk.start_index, end=chunk.end_index))
+            mapped.append(
+                Chunk(text=chunk.text, start=chunk.start_index, end=chunk.end_index)
+            )
         return mapped
-
+

 class Embedder:
     def __init__(self, *, size: int, max_length: int):
         self.size = size
         self.max_length = max_length
-
+
     async def embed(self, *, text: str) -> list[float]:
         pass

+
 class OpenAIEmbedder(Embedder):
-    def __init__(self, *, size: int, max_length: int, model: str, openai: Optional[AsyncOpenAI] = None):
-        if openai == None:
+    def __init__(
+        self,
+        *,
+        size: int,
+        max_length: int,
+        model: str,
+        openai: Optional[AsyncOpenAI] = None,
+    ):
+        if openai is None:
             openai = AsyncOpenAI()
-
+
         self._openai = openai
         self._model = model

         super().__init__(size=size, max_length=max_length)

-
     async def embed(self, *, text):
-        return (
-            await self._openai.embeddings.create(input=text, model=self._model, encoding_format="float")).data[0].embedding
+        return (
+            (
+                await self._openai.embeddings.create(
+                    input=text, model=self._model, encoding_format="float"
+                )
+            )
+            .data[0]
+            .embedding
+        )


 class RagTool(Tool):
-    def __init__(
-
+    def __init__(
+        self,
+        *,
+        name="rag_search",
+        table: str,
+        title="RAG search",
+        description="perform a RAG search",
+        rules=None,
+        thumbnail_url=None,
+        embedder: Optional[Embedder] = None,
+    ):
         self.table = table

         super().__init__(
             name=name,
             input_schema={
-                "type":"object",
-                "additionalProperties" : False,
-                "required" : [
-                    "query"
-                ],
-                "properties" : {
-                    "query" : {
-                        "type" : "string"
-                    }
-                }
+                "type": "object",
+                "additionalProperties": False,
+                "required": ["query"],
+                "properties": {"query": {"type": "string"}},
             },
             title=title,
             description=description,
-            rules=rules,
-        )
+            rules=rules,
+            thumbnail_url=thumbnail_url,
+        )
+
         self._embedder = embedder

     async def execute(self, context: ToolContext, query: str):
-
-        if self._embedder == None:
+        if self._embedder is None:
             results = await context.room.database.search(
-                table=self.table,
-                text=query,
-                limit=10
+                table=self.table, text=query, limit=10
             )
         else:
             embedding = await self._embedder.embed(text=query)
             results = await context.room.database.search(
-                table=self.table,
-                text=query,
-                vector=embedding,
-                limit=10
+                table=self.table, text=query, vector=embedding, limit=10
             )

-        results = list(map(lambda r: f"from {r[
+        results = list(map(lambda r: f"from {r['url']}: {r['text']}", results))
+
+        return {"results": results}

-        return {
-            "results" : results
-        }
-

 def open_ai_embedding_3_small():
     return OpenAIEmbedder(model="text-embedding-3-small", max_length=8191, size=1536)

+
 def open_ai_embedding_3_large():
     return OpenAIEmbedder(model="text-embedding-3-large", max_length=8191, size=3072)

+
 def open_ai_embedding_ada_2():
     return OpenAIEmbedder(model="text-embedding-ada-002", max_length=8191, size=1536)


 class RagToolkit(Toolkit):
-    def __init__(self, table: str, embedder:Optional[Embedder] = None):
-
-        if embedder == None:
+    def __init__(self, table: str, embedder: Optional[Embedder] = None):
+        if embedder is None:
             embedder = open_ai_embedding_3_large()

         super().__init__(
             name="meshagent.rag",
             title="RAG",
             description="Searches against an index",
-            tools=[
-                RagTool(table=table, embedder=embedder)
-            ]
+            tools=[RagTool(table=table, embedder=embedder)],
         )

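Note on the hunk above: `Chunker` and `Embedder` are plain hook classes (their `chunk` and `embed` methods just `pass`), so a custom implementation only needs to match the new keyword-only signatures. A minimal sketch, not part of the package (the `FixedSizeChunker` class and its `window` parameter are hypothetical):

# Sketch only -- not from the diff; FixedSizeChunker and `window` are hypothetical.
from typing import Optional


class Chunk:
    def __init__(self, *, text: str, start: int, end: int):
        self.text = text
        self.start = start
        self.end = end


class Chunker:
    async def chunk(
        self, *, text: str, max_length: Optional[int] = None
    ) -> list[Chunk]:
        pass


class FixedSizeChunker(Chunker):
    """Splits text into fixed-size windows."""

    def __init__(self, *, window: int = 512):
        self.window = window

    async def chunk(
        self, *, text: str, max_length: Optional[int] = None
    ) -> list[Chunk]:
        # never emit a chunk longer than the embedder can accept
        size = min(self.window, max_length) if max_length else self.window
        return [
            Chunk(text=text[i : i + size], start=i, end=min(i + size, len(text)))
            for i in range(0, len(text), size)
        ]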
@@ -175,8 +197,8 @@ class FileIndexEvent:
         self.path = path
         self.deleted = deleted

-class StorageIndexer(SingleRoomAgent):

+class StorageIndexer(SingleRoomAgent):
     def __init__(
         self,
         *,
@@ -184,19 +206,25 @@ class StorageIndexer(SingleRoomAgent):
         title=None,
         description=None,
         requires=None,
-        labels = None,
+        labels=None,
         chunker: Optional[Chunker] = None,
-        embedder:Optional[Embedder] = None,
+        embedder: Optional[Embedder] = None,
         table: str = "storage_index",
-    ):
-        super().__init__(name=name, title=title, description=description, requires=requires, labels=labels)
+    ):
+        super().__init__(
+            name=name,
+            title=title,
+            description=description,
+            requires=requires,
+            labels=labels,
+        )

         self._chan = Chan[FileIndexEvent]()
-
-        if chunker == None:
+
+        if chunker is None:
             chunker = ChonkieChunker()

-        if embedder == None:
+        if embedder is None:
             embedder = open_ai_embedding_3_large()

         self.chunker = chunker
@@ -207,46 +235,47 @@ class StorageIndexer(SingleRoomAgent):

     async def read_file(self, *, path: str) -> str | None:
         pass
-
+
     @_async_debounce(10)
     async def refresh_index(self):
-
         self.room.developer.log_nowait(type="indexer.rebuild", data={})

         indexes = await self.room.database.list_indexes(table=self.table)

         logger.info(f"existing indexes {indexes}")
-
-        for index in indexes:

+        for index in indexes:
             if "embedding" in index["columns"]:
                 self._vector_index_created = True
-
+
             if "text" in index["columns"]:
-                self._fts_created
+                self._fts_created = True

-        if self._vector_index_created == False:
+        if not self._vector_index_created:
             try:
                 logger.info("attempting to create embedding index")
-                await self.room.database.create_vector_index(table=self.table, column="embedding", replace=False)
+                await self.room.database.create_vector_index(
+                    table=self.table, column="embedding", replace=False
+                )
                 self._vector_index_created = True
-            except Exception as e:
+            except Exception:
                 # Will fail if there aren't enough rows
                 pass

-        if self._fts_created == False:
+        if not self._fts_created:
             try:
                 logger.info("attempting to create fts index")
-                await self.room.database.create_full_text_search_index(table=self.table, column="text", replace=False)
+                await self.room.database.create_full_text_search_index(
+                    table=self.table, column="text", replace=False
+                )
                 self._fts_created = True
-            except Exception as e:
+            except Exception:
                 # Will fail if there aren't enough rows
                 pass

-        if self._fts_created:
+        if self._fts_created or self._vector_index_created:
             logger.info("optimizing existing index")
             await self.room.database.optimize(table=self.table)
-

     async def start(self, *, room):
         await super().start(room=room)
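`refresh_index` stays wrapped in `@_async_debounce(10)`, so a burst of file events triggers at most one index rebuild roughly every ten seconds. The decorator's body sits outside this diff; a common shape for such a decorator, sketched here under that assumption:

# Sketch only -- the real _async_debounce body is outside this diff. A typical
# async debounce: each call cancels the previously scheduled run and
# reschedules the wrapped coroutine `wait` seconds in the future.
import asyncio
import functools


def _async_debounce(wait):
    def decorator(func):
        pending: asyncio.Task | None = None

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            nonlocal pending
            if pending is not None and not pending.done():
                pending.cancel()

            async def delayed():
                await asyncio.sleep(wait)
                await func(*args, **kwargs)

            pending = asyncio.create_task(delayed())

        return wrapper

    return decorator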
@@ -257,74 +286,72 @@ class StorageIndexer(SingleRoomAgent):
         await room.database.create_table_with_schema(
             name=self.table,
             schema={
-                "url" : TextDataType(),
-                "text" : TextDataType(),
-                "embedding" : VectorDataType(
-                    size=self.embedder.size,
-                    element_type=FloatDataType()
+                "url": TextDataType(),
+                "text": TextDataType(),
+                "embedding": VectorDataType(
+                    size=self.embedder.size, element_type=FloatDataType()
                 ),
-                "sha" : TextDataType(),
+                "sha": TextDataType(),
             },
             mode="create_if_not_exists",
-            data=None
+            data=None,
         )

-
         def index_task(task: asyncio.Task):
-
             try:
-                task.result()
+                task.result()
             except Exception as e:
                 logger.error("Index task failed", exc_info=e)
-

         self._index_task = asyncio.create_task(self._indexer())
         self._index_task.add_done_callback(index_task)
-
+
     async def stop(self):
         await super().stop()
         await self._chan.close()
-
-
-    async def _indexer(self):

+    async def _indexer(self):
         async for e in self._chan:
-
             try:
                 if e.deleted:
-
                     # todo: consider using sql_alchemy or a library to do the escaping
                     def escape_sql_string(value):
                         if not isinstance(value, str):
                             raise TypeError("Input must be a string")
                         return value.replace("'", "''")

-                    self.room.developer.log_nowait(type="indexer.delete", data={"path": e.path})
-
-                    await self.room.database.delete(table=self.table, where=f"url='{escape_sql_string(e.path)}'")
+                    self.room.developer.log_nowait(
+                        type="indexer.delete", data={"path": e.path}
+                    )
+                    await self.room.database.delete(
+                        table=self.table, where=f"url='{escape_sql_string(e.path)}'"
+                    )

                 else:
-
-                    self.room.developer.log_nowait(type="indexer.index", data={"path": e.path})
-
+                    self.room.developer.log_nowait(
+                        type="indexer.index", data={"path": e.path}
+                    )

                     async def lookup_or_embed(*, sha: str, text: str) -> list[float]:
-
                         # if we already indexed this chunk, lets use the existing embedding instead of generating a new one
                         results = await self.room.database.search(
                             table=self.table,
                             where={
-                                "sha" : sha,
+                                "sha": sha,
                             },
-                            limit=1
+                            limit=1,
                         )

                         if len(results) != 0:
-                            logger.info(f"chunk found from {e.path} {sha}, reusing embedding")
+                            logger.info(
+                                f"chunk found from {e.path} {sha}, reusing embedding"
+                            )
                             return results[0]["embedding"]
-
-                        logger.info(f"chunk not found from {e.path} {sha}, generating embedding")
-
+
+                        logger.info(
+                            f"chunk not found from {e.path} {sha}, generating embedding"
+                        )
+
                         return await self.embedder.embed(text=text)

                     basename = os.path.basename(e.path)
@@ -335,67 +362,69 @@ class StorageIndexer(SingleRoomAgent):
                     # let's make the filename it's own chunk
                     rows.append(
                         {
-                            "url" : e.path,
-                            "text" : basename,
-                            "sha" : chunk_sha,
-                            "embedding" : await lookup_or_embed(sha=chunk_sha, text=basename)
+                            "url": e.path,
+                            "text": basename,
+                            "sha": chunk_sha,
+                            "embedding": await lookup_or_embed(
+                                sha=chunk_sha, text=basename
+                            ),
                         }
                     )
-
-
+
                     text = await self.read_file(path=e.path)
-                    if text != None:
-
+                    if text is not None:
                         # the content will be transformed into additional chunks
-                        for chunk in await self.chunker.chunk(text=text, max_length=self.embedder.max_length):
-                            logger.info(f"processing chunk from {e.path}: {chunk.start}")
-                            chunk_sha = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest()
+                        for chunk in await self.chunker.chunk(
+                            text=text, max_length=self.embedder.max_length
+                        ):
+                            logger.info(
+                                f"processing chunk from {e.path}: {chunk.start}"
+                            )
+                            chunk_sha = hashlib.sha256(
+                                chunk.text.encode("utf-8")
+                            ).hexdigest()
                             rows.append(
                                 {
-                                    "url" : e.path,
-                                    "text" : chunk.text,
-                                    "embedding" : await lookup_or_embed(sha=chunk_sha, text=chunk.text),
-                                    "sha" : chunk_sha,
+                                    "url": e.path,
+                                    "text": chunk.text,
+                                    "embedding": await lookup_or_embed(
+                                        sha=chunk_sha, text=chunk.text
+                                    ),
+                                    "sha": chunk_sha,
                                 }
                             )
-                    await self.room.database.merge(table=self.table, on="sha", records=rows)
+                    await self.room.database.merge(
+                        table=self.table, on="sha", records=rows
+                    )
                     await self.refresh_index()

-
-
             except Exception as e:
                 logger.error("error while indexing", exc_info=e)

-
     def _on_file_deleted(self, path: str, participant_id: str):
         self._chan.send_nowait(FileIndexEvent(path=path, deleted=True))
-
+
     def _on_file_updated(self, path: str, participant_id: str):
         self._chan.send_nowait(FileIndexEvent(path=path, deleted=False))

-
-
-

 class SiteIndexer(TaskRunner):
-
-
-
+    def __init__(
+        self,
+        *,
         name,
         chunker: Optional[Chunker] = None,
-        embedder:Optional[Embedder] = None,
+        embedder: Optional[Embedder] = None,
         title=None,
         description=None,
         requires=None,
-        supports_tools = None,
-        labels: Optional[list[str]] = None
-
+        supports_tools=None,
+        labels: Optional[list[str]] = None,
     ):
-
-        if chunker == None:
+        if chunker is None:
             chunker = ChonkieChunker()

-        if embedder == None:
+        if embedder is None:
             embedder = open_ai_embedding_3_large()

         self.chunker = chunker
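`StorageIndexer.read_file` remains an empty hook in the hunks above, so concrete indexers subclass it and decide how to fetch file contents. A hypothetical subclass that serves UTF-8 text from a local directory (the `LocalStorageIndexer` name and `root` parameter are illustrative, not part of the package; a real subclass would read from room storage):

# Hypothetical subclass -- illustrative only, not part of meshagent.
import os

from meshagent.agents.indexer import StorageIndexer


class LocalStorageIndexer(StorageIndexer):
    def __init__(self, *, name: str, root: str, **kwargs):
        super().__init__(name=name, **kwargs)
        self._root = root

    async def read_file(self, *, path: str) -> str | None:
        full = os.path.join(self._root, path)
        if not os.path.isfile(full):
            return None
        try:
            with open(full, encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError:
            # binary content is skipped; only the filename chunk gets indexed
            return None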
@@ -406,47 +435,29 @@ class SiteIndexer(TaskRunner):
             title=title,
             description=description,
             requires=[
-                RequiredToolkit(
-                    name="meshagent.firecrawl",
-                    tools=[
-                        "firecrawl_queue"
-                    ]
-                ),
+                RequiredToolkit(name="meshagent.firecrawl", tools=["firecrawl_queue"]),
             ],
             supports_tools=supports_tools,
             input_schema={
-                "type"
-                "required"
-
-
-
-
-                    "
-
-                        "description" : "default: firecrawl"
-                    },
-                    "table" : {
-                        "type" : "string",
-                        "description" : "default: index"
-                    },
-                    "url" : {
-                        "type" : "string",
-                        "description" : "default: index"
-                    }
-                }
+                "type": "object",
+                "required": ["queue", "table", "url"],
+                "additionalProperties": False,
+                "properties": {
+                    "queue": {"type": "string", "description": "default: firecrawl"},
+                    "table": {"type": "string", "description": "default: index"},
+                    "url": {"type": "string", "description": "default: index"},
+                },
             },
             output_schema={
-                "type" : "object",
-                "required" : [],
-                "additionalProperties" : False,
-                "properties" : {}
+                "type": "object",
+                "required": [],
+                "additionalProperties": False,
+                "properties": {},
             },
-            labels=labels
+            labels=labels,
         )

-
     async def ask(self, *, context, arguments):
-
         queue = arguments["queue"]
         table = arguments["table"]
         url = arguments["url"]
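The tightened `input_schema` above now requires all three arguments and rejects extra keys, so a task request for `SiteIndexer.ask` would carry a payload shaped like this (the values are examples only):

# Example payload matching the new input_schema; values are illustrative.
arguments = {
    "queue": "firecrawl",               # queue the crawl results arrive on
    "table": "index",                   # table the indexed chunks are written to
    "url": "https://example.com/docs",  # site to crawl and index
}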
@@ -459,129 +470,135 @@ class SiteIndexer(TaskRunner):
         except ValueError:
             pass

-
         async def lookup_or_embed(*, sha: str, text: str) -> list[float]:
-
             # if we already indexed this chunk, lets use the existing embedding instead of generating a new one
             if exists:
-
                 results = await self.room.database.search(
                     table=self.table,
                     where={
-                        "sha" : sha,
+                        "sha": sha,
                     },
-                    limit=1
+                    limit=1,
                 )

-
             if len(results) != 0:
                 logger.info(f"chunk found from {url} {sha}, reusing embedding")
                 return results[0]["embedding"]
-
+
             logger.info(f"chunk not found from {url} {sha}, generating embedding")
-
+
             return await self.embedder.embed(text=text)
-
-
+
         async def crawl():
             logger.info(f"starting to crawl: {url}")
             await context.room.agents.invoke_tool(
                 toolkit="meshagent.firecrawl",
                 tool="firecrawl_queue",
-                arguments={
-                    "url" : url,
-                    "queue" : queue,
-                    "limit" : 100
-                })
-
+                arguments={"url": url, "queue": queue, "limit": 100},
+            )
+
             logger.info(f"done with crawl: {url}")
-            await context.room.queues.send(name=queue, message={
-                "done" : True})
+            await context.room.queues.send(name=queue, message={"done": True})
+
         def crawl_done(task: asyncio.Task):
             try:
                 task.result()
             except Exception as e:
                 logger.error("crawl failed", exc_info=e)

-
         crawl_task = asyncio.create_task(crawl())
         crawl_task.add_done_callback(crawl_done)
-
+
         rows = []

         id = 0
-
+
         while True:
-            message = await context.room.queues.receive(name=queue, create=True, wait=True)
-
-            if message == None:
+            message = await context.room.queues.receive(
+                name=queue, create=True, wait=True
+            )
+
+            if message is None:
                 break

             if message.get("type", None) == "crawl.completed":
                 break
-
+
             if "data" in message:
                 for data in message["data"]:
                     try:
-                        url = data["metadata"]["url"]
-                        text = data["markdown"]
-                        title = data["metadata"]["title"]
-                        title_sha = hashlib.sha256(text.encode("utf-8")).hexdigest()
+                        url: str = data["metadata"]["url"]
+                        text: str = data["markdown"]
+                        title: str = data["metadata"]["title"]
+                        title_sha: str = hashlib.sha256(
+                            text.encode("utf-8")
+                        ).hexdigest()

                         logger.info(f"processing crawled page: {url}")
-
+
                         # let's make the title it's own chunk
                         rows.append(
-                            {
-                                "id" : id,
-                                "url" : url,
-                                "text" : title,
-                                "sha" : title_sha,
-                                "embedding" : await lookup_or_embed(sha=title_sha, text=title)
-                            }
-                        )
-
+                            {
+                                "id": id,
+                                "url": url,
+                                "text": title,
+                                "sha": title_sha,
+                                "embedding": await lookup_or_embed(
+                                    sha=title_sha, text=title
+                                ),
+                            }
+                        )
+
                         id = id + 1
-
+
                         # the content will be transformed into additional chunks
-                        for chunk in await self.chunker.chunk(text=text, max_length=self.embedder.max_length):
+                        for chunk in await self.chunker.chunk(
+                            text=text, max_length=self.embedder.max_length
+                        ):
                             logger.info(f"processing chunk from {url}: {chunk.text}")
-                            chunk_sha = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest()
+                            chunk_sha = hashlib.sha256(
+                                chunk.text.encode("utf-8")
+                            ).hexdigest()
                             rows.append(
                                 {
-                                    "id" : id,
-                                    "url" : url,
-                                    "text" : chunk.text,
-                                    "embedding" : await lookup_or_embed(sha=chunk_sha, text=chunk.text)
+                                    "id": id,
+                                    "url": url,
+                                    "text": chunk.text,
+                                    "embedding": await lookup_or_embed(
+                                        sha=chunk_sha, text=chunk.text
+                                    ),
                                 }
                             )
-
+
                             id = id + 1

                     except Exception as e:
                         logger.error(f"failed to process: {url}", exc_info=e)

         logger.info(f"saving crawl: {url}")
-
+
         await context.room.database.create_table_with_schema(
             name=table,
             schema={
-                "id" : IntDataType(),
-                "url" : TextDataType(),
-                "text" : TextDataType(),
-                "embedding" : VectorDataType(
-                    size=self.embedder.size,
-                    element_type=FloatDataType()
+                "id": IntDataType(),
+                "url": TextDataType(),
+                "text": TextDataType(),
+                "embedding": VectorDataType(
+                    size=self.embedder.size, element_type=FloatDataType()
                 ),
-                "sha" : TextDataType(),
+                "sha": TextDataType(),
             },
             mode="overwrite",
-            data=rows
+            data=rows,
        )

         if len(rows) > 255:
-            await context.room.database.create_vector_index(table=table, column="embedding")
+            await context.room.database.create_vector_index(
+                table=table, column="embedding"
+            )

-        await context.room.database.create_full_text_search_index(table=table, column="text")
+        await context.room.database.create_full_text_search_index(
+            table=table, column="text"
+        )

         return {}
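Both indexers key chunks by the SHA-256 of their text so that re-indexing unchanged content reuses stored embeddings instead of re-calling the embedding API. The pattern, reduced to a standalone sketch (the `db` and `embedder` parameters stand in for the room database client and an `Embedder` instance):

# Sketch of the lookup_or_embed pattern used by both indexers; `db` and
# `embedder` are stand-ins, not meshagent APIs.
import hashlib


async def lookup_or_embed(db, embedder, *, table: str, text: str) -> list[float]:
    # chunks are keyed by the SHA-256 of their text, so unchanged content
    # already present in the table reuses its stored embedding
    sha = hashlib.sha256(text.encode("utf-8")).hexdigest()
    rows = await db.search(table=table, where={"sha": sha}, limit=1)
    if rows:
        return rows[0]["embedding"]
    # otherwise pay for exactly one embedding call for this chunk
    return await embedder.embed(text=text)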