meshagent-agents 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of meshagent-agents might be problematic.
- meshagent/agents/__init__.py +5 -0
- meshagent/agents/adapter.py +39 -0
- meshagent/agents/agent.py +427 -0
- meshagent/agents/chat.py +316 -0
- meshagent/agents/context.py +90 -0
- meshagent/agents/development.py +32 -0
- meshagent/agents/hosting.py +117 -0
- meshagent/agents/indexer.py +593 -0
- meshagent/agents/listener.py +155 -0
- meshagent/agents/planning.py +603 -0
- meshagent/agents/prompt.py +49 -0
- meshagent/agents/pydantic.py +137 -0
- meshagent/agents/schema.py +50 -0
- meshagent/agents/single_shot_writer.py +92 -0
- meshagent/agents/version.py +1 -0
- meshagent/agents/worker.py +126 -0
- meshagent/agents/writer.py +82 -0
- meshagent_agents-0.0.1.dist-info/LICENSE +201 -0
- meshagent_agents-0.0.1.dist-info/METADATA +29 -0
- meshagent_agents-0.0.1.dist-info/RECORD +22 -0
- meshagent_agents-0.0.1.dist-info/WHEEL +5 -0
- meshagent_agents-0.0.1.dist-info/top_level.txt +1 -0
meshagent/agents/indexer.py (new file, +593 lines)

@@ -0,0 +1,593 @@

```python
from meshagent.agents import TaskRunner, RequiredToolkit, SingleRoomAgent
from meshagent.tools import Toolkit, Tool, ToolContext
from meshagent.api.room_server_client import TextDataType, VectorDataType, FloatDataType, IntDataType
from openai import AsyncOpenAI
from typing import Optional
from meshagent.api.chan import Chan

import hashlib
import chonkie
import asyncio
import logging

from functools import wraps

import os

# TODO: install chonkie, chonkie[semantic], openai

from concurrent.futures import ThreadPoolExecutor, as_completed


def _async_debounce(wait):
    def decorator(func):
        task = None

        @wraps(func)
        async def debounced(*args, **kwargs):
            nonlocal task

            async def call_func():
                await asyncio.sleep(wait)
                await func(*args, **kwargs)

            if task and not task.done():
                task.cancel()

            task = asyncio.create_task(call_func())
            return task

        return debounced

    return decorator
```
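The decorator coalesces bursts of calls: each new call cancels the pending timer task and starts a fresh one, so only the last call in a burst actually runs, `wait` seconds after things settle. A minimal sketch of that behavior (importing the private helper from `meshagent.agents.indexer` is an assumption about the module path):

```python
import asyncio

from meshagent.agents.indexer import _async_debounce  # private helper; path assumed

calls = 0

@_async_debounce(0.5)
async def rebuild():
    global calls
    calls += 1

async def main():
    # three rapid calls collapse into a single execution after the 0.5s window
    for _ in range(3):
        await rebuild()       # schedules (or reschedules) the delayed call
        await asyncio.sleep(0.1)
    await asyncio.sleep(1.0)  # let the final timer fire
    assert calls == 1

asyncio.run(main())
```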
```python
logger = logging.getLogger("indexer")
logger.setLevel(logging.INFO)


class Chunk:
    def __init__(self, *, text: str, start: int, end: int):
        self.text = text
        self.start = start
        self.end = end


class Chunker:

    async def chunk(self, *, text: str, max_length: Optional[int] = None) -> list[Chunk]:
        pass


class ChonkieChunker(Chunker):
    def __init__(self, chunker: Optional[chonkie.BaseChunker] = None):
        super().__init__()

        if chunker is None:
            chunker = chonkie.SemanticChunker()

        self._chunker = chunker

    async def chunk(self, *, text: str, max_length: Optional[int] = None) -> list[Chunk]:
        chunks = await asyncio.to_thread(self._chunker.chunk, text=text)
        mapped = []
        for chunk in chunks:
            mapped.append(Chunk(text=chunk.text, start=chunk.start_index, end=chunk.end_index))
        return mapped
```
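`ChonkieChunker` runs chonkie's synchronous chunkers on a worker thread and maps the results onto this module's `Chunk` type, and it accepts any chonkie chunker in place of the semantic default. A sketch that swaps in a fixed-size strategy (assuming chonkie's `TokenChunker` and its `chunk_size` parameter; check the chonkie docs for the exact API):

```python
import asyncio

import chonkie
from meshagent.agents.indexer import ChonkieChunker

async def main():
    # assumption: chonkie.TokenChunker(chunk_size=...) is available
    chunker = ChonkieChunker(chunker=chonkie.TokenChunker(chunk_size=512))
    for chunk in await chunker.chunk(text="some long document text..."):
        print(chunk.start, chunk.end, chunk.text[:40])

asyncio.run(main())
```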
```python
class Embedder:
    def __init__(self, *, size: int, max_length: int):
        self.size = size
        self.max_length = max_length

    async def embed(self, *, text: str) -> list[float]:
        pass


class OpenAIEmbedder(Embedder):
    def __init__(self, *, size: int, max_length: int, model: str, openai: Optional[AsyncOpenAI] = None):
        if openai is None:
            openai = AsyncOpenAI()

        self._openai = openai
        self._model = model

        super().__init__(size=size, max_length=max_length)

    async def embed(self, *, text):
        return (await self._openai.embeddings.create(input=text, model=self._model, encoding_format="float")).data[0].embedding
```
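`Embedder` is the extension point for other providers: a subclass declares its vector size and maximum input length, then implements `embed`. A hypothetical sketch adapting sentence-transformers (the model name, its 384 dimensions, and its roughly 256-token window are assumptions about that library, not part of this package):

```python
import asyncio

from meshagent.agents.indexer import Embedder

class LocalEmbedder(Embedder):
    def __init__(self):
        # assumption: sentence-transformers is installed; all-MiniLM-L6-v2
        # produces 384-dimensional vectors and truncates around 256 tokens
        from sentence_transformers import SentenceTransformer
        self._model = SentenceTransformer("all-MiniLM-L6-v2")
        super().__init__(size=384, max_length=256)

    async def embed(self, *, text: str) -> list[float]:
        # encoding is CPU/GPU-bound, so keep it off the event loop
        vector = await asyncio.to_thread(self._model.encode, text)
        return vector.tolist()
```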
```python
class RagTool(Tool):
    def __init__(self, *, name="rag_search", table: str, title="RAG search", description="perform a RAG search", rules=None, thumbnail_url=None, embedder: Optional[Embedder] = None):

        self.table = table

        super().__init__(
            name=name,
            input_schema={
                "type": "object",
                "additionalProperties": False,
                "required": [
                    "query"
                ],
                "properties": {
                    "query": {
                        "type": "string"
                    }
                }
            },
            title=title,
            description=description,
            rules=rules, thumbnail_url=thumbnail_url)

        self._embedder = embedder

    async def execute(self, context: ToolContext, query: str):

        # without an embedder this is a plain full-text search; with one, the
        # query's embedding is passed alongside the text for a hybrid search
        if self._embedder is None:
            results = await context.room.database.search(
                table=self.table,
                text=query,
                limit=10
            )
        else:
            embedding = await self._embedder.embed(text=query)
            results = await context.room.database.search(
                table=self.table,
                text=query,
                vector=embedding,
                limit=10
            )

        results = list(map(lambda r: f"from {r['url']}: {r['text']}", results))

        return {
            "results": results
        }
```
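Per the input schema, a caller supplies a single `query` string. A sketch of invoking the tool through a room, mirroring the `invoke_tool` call this file itself makes against the firecrawl toolkit (a connected `room` client is assumed; the toolkit and tool names are the defaults registered by `RagToolkit` below):

```python
async def search_index(room):
    # "meshagent.rag" / "rag_search" are the defaults used by RagToolkit/RagTool
    return await room.agents.invoke_tool(
        toolkit="meshagent.rag",
        tool="rag_search",
        arguments={"query": "how do I configure the indexer?"},
    )
```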
```python
def open_ai_embedding_3_small():
    return OpenAIEmbedder(model="text-embedding-3-small", max_length=8191, size=1536)


def open_ai_embedding_3_large():
    return OpenAIEmbedder(model="text-embedding-3-large", max_length=8191, size=3072)


def open_ai_embedding_ada_2():
    return OpenAIEmbedder(model="text-embedding-ada-002", max_length=8191, size=1536)
```
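Each factory pins a model name to its published vector size and input limit, which keeps the table schema and the generated embeddings in agreement. A quick sketch (requires `OPENAI_API_KEY` in the environment):

```python
import asyncio

from meshagent.agents.indexer import open_ai_embedding_3_small

async def main():
    embedder = open_ai_embedding_3_small()
    vector = await embedder.embed(text="hello world")
    assert len(vector) == embedder.size  # 1536 for text-embedding-3-small

asyncio.run(main())
```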
```python
class RagToolkit(Toolkit):
    def __init__(self, table: str, embedder: Optional[Embedder] = None):

        if embedder is None:
            embedder = open_ai_embedding_3_large()

        super().__init__(
            name="meshagent.rag",
            title="RAG",
            description="Searches against an index",
            tools=[
                RagTool(table=table, embedder=embedder)
            ]
        )
```
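The toolkit should be constructed with the same embedding model that was used at index time; vectors from different models (or of different sizes) are not comparable. A sketch, using the default `StorageIndexer` table defined below:

```python
from meshagent.agents.indexer import RagToolkit, open_ai_embedding_3_large

# search the default StorageIndexer table with the indexer's default model;
# constructing the embedder requires OPENAI_API_KEY in the environment
toolkit = RagToolkit(table="storage_index", embedder=open_ai_embedding_3_large())
```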
```python
class FileIndexEvent:
    def __init__(self, *, path: str, deleted: bool):
        self.path = path
        self.deleted = deleted


class StorageIndexer(SingleRoomAgent):

    def __init__(
        self,
        *,
        name,
        title=None,
        description=None,
        requires=None,
        labels=None,
        chunker: Optional[Chunker] = None,
        embedder: Optional[Embedder] = None,
        table: str = "storage_index",
    ):
        super().__init__(name=name, title=title, description=description, requires=requires, labels=labels)

        self._chan = Chan[FileIndexEvent]()

        if chunker is None:
            chunker = ChonkieChunker()

        if embedder is None:
            embedder = open_ai_embedding_3_large()

        self.chunker = chunker
        self.embedder = embedder
        self.table = table
        self._vector_index_created = False
        self._fts_created = False

    async def read_file(self, *, path: str) -> str | None:
        pass

    # debounced: a burst of file events triggers at most one rebuild, ten
    # seconds after the last call
    @_async_debounce(10)
    async def refresh_index(self):

        self.room.developer.log_nowait(type="indexer.rebuild", data={})

        indexes = await self.room.database.list_indexes(table=self.table)

        logger.info(f"existing indexes {indexes}")

        for index in indexes:

            if "embedding" in index["columns"]:
                self._vector_index_created = True

            if "text" in index["columns"]:
                self._fts_created = True

        if not self._vector_index_created:
            try:
                logger.info("attempting to create embedding index")
                await self.room.database.create_vector_index(table=self.table, column="embedding", replace=False)
                self._vector_index_created = True
            except Exception:
                # Will fail if there aren't enough rows
                pass

        if not self._fts_created:
            try:
                logger.info("attempting to create fts index")
                await self.room.database.create_full_text_search_index(table=self.table, column="text", replace=False)
                self._fts_created = True
            except Exception:
                # Will fail if there aren't enough rows
                pass

        if self._fts_created or self._vector_index_created:
            logger.info("optimizing existing index")
            await self.room.database.optimize(table=self.table)

    async def start(self, *, room):
        await super().start(room=room)

        room.storage.on("file.updated", self._on_file_updated)
        room.storage.on("file.deleted", self._on_file_deleted)

        await room.database.create_table_with_schema(
            name=self.table,
            schema={
                "url": TextDataType(),
                "text": TextDataType(),
                "embedding": VectorDataType(
                    size=self.embedder.size,
                    element_type=FloatDataType()
                ),
                "sha": TextDataType(),
            },
            mode="create_if_not_exists",
            data=None
        )

        def index_task(task: asyncio.Task):
            try:
                task.result()
            except Exception as e:
                logger.error("Index task failed", exc_info=e)

        self._index_task = asyncio.create_task(self._indexer())
        self._index_task.add_done_callback(index_task)

        await self.install_requirements()

    async def stop(self):
        await super().stop()
        await self._chan.close()

    async def _indexer(self):

        async for e in self._chan:

            try:
                if e.deleted:

                    # todo: consider using sql_alchemy or a library to do the escaping
                    def escape_sql_string(value):
                        if not isinstance(value, str):
                            raise TypeError("Input must be a string")
                        return value.replace("'", "''")

                    self.room.developer.log_nowait(type="indexer.delete", data={"path": e.path})
                    await self.room.database.delete(table=self.table, where=f"url='{escape_sql_string(e.path)}'")

                else:

                    self.room.developer.log_nowait(type="indexer.index", data={"path": e.path})

                    async def lookup_or_embed(*, sha: str, text: str) -> list[float]:

                        # if we already indexed this chunk, let's use the existing embedding instead of generating a new one
                        results = await self.room.database.search(
                            table=self.table,
                            where={
                                "sha": sha,
                            },
                            limit=1
                        )

                        if len(results) != 0:
                            logger.info(f"chunk found from {e.path} {sha}, reusing embedding")
                            return results[0]["embedding"]

                        logger.info(f"chunk not found from {e.path} {sha}, generating embedding")

                        return await self.embedder.embed(text=text)

                    basename = os.path.basename(e.path)

                    chunk_sha = hashlib.sha256(basename.encode("utf-8")).hexdigest()

                    rows = []
                    # let's make the filename its own chunk
                    rows.append(
                        {
                            "url": e.path,
                            "text": basename,
                            "sha": chunk_sha,
                            "embedding": await lookup_or_embed(sha=chunk_sha, text=basename)
                        }
                    )

                    text = await self.read_file(path=e.path)
                    if text is not None:

                        # the content will be transformed into additional chunks
                        for chunk in await self.chunker.chunk(text=text, max_length=self.embedder.max_length):
                            logger.info(f"processing chunk from {e.path}: {chunk.start}")
                            chunk_sha = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest()
                            rows.append(
                                {
                                    "url": e.path,
                                    "text": chunk.text,
                                    "embedding": await lookup_or_embed(sha=chunk_sha, text=chunk.text),
                                    "sha": chunk_sha,
                                }
                            )

                    # upsert keyed on the content hash so unchanged chunks are not duplicated
                    await self.room.database.merge(table=self.table, on="sha", records=rows)
                    await self.refresh_index()

            except Exception as ex:
                logger.error("error while indexing", exc_info=ex)

    def _on_file_deleted(self, path: str):
        self._chan.send_nowait(FileIndexEvent(path=path, deleted=True))

    def _on_file_updated(self, path: str):
        self._chan.send_nowait(FileIndexEvent(path=path, deleted=False))
```
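`read_file` is the hook subclasses implement; returning `None` still indexes the filename chunk but skips content chunking. A hypothetical subclass sketch (the `room.storage.download` call is an assumption: this file only uses the storage API for events, so check the real meshagent storage client before relying on it):

```python
from meshagent.agents.indexer import StorageIndexer

class MarkdownIndexer(StorageIndexer):
    """Hypothetical: index the content of markdown files only."""

    async def read_file(self, *, path: str) -> str | None:
        if not path.endswith(".md"):
            return None  # non-markdown files still get a filename chunk
        # assumption: storage exposes an async download returning bytes;
        # the actual meshagent API may differ
        data = await self.room.storage.download(path=path)
        return data.decode("utf-8")

indexer = MarkdownIndexer(name="markdown-indexer")
```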
```python
class SiteIndexer(TaskRunner):

    def __init__(self,
                 *,
                 name,
                 chunker: Optional[Chunker] = None,
                 embedder: Optional[Embedder] = None,
                 title=None,
                 description=None,
                 requires=None,
                 supports_tools=None,
                 labels: Optional[list[str]] = None
                 ):

        if chunker is None:
            chunker = ChonkieChunker()

        if embedder is None:
            embedder = open_ai_embedding_3_large()

        self.chunker = chunker
        self.embedder = embedder

        super().__init__(
            name=name,
            title=title,
            description=description,
            requires=[
                RequiredToolkit(
                    name="meshagent.firecrawl",
                    tools=[
                        "firecrawl_queue"
                    ]
                ),
            ],
            supports_tools=supports_tools,
            input_schema={
                "type": "object",
                "required": [
                    "queue", "table", "url"
                ],
                "additionalProperties": False,
                "properties": {
                    "queue": {
                        "type": "string",
                        "description": "default: firecrawl"
                    },
                    "table": {
                        "type": "string",
                        "description": "default: index"
                    },
                    "url": {
                        "type": "string",
                        "description": "the root url to crawl"
                    }
                }
            },
            output_schema={
                "type": "object",
                "required": [],
                "additionalProperties": False,
                "properties": {},
            },
            labels=labels
        )

    async def ask(self, *, context, arguments):

        queue = arguments["queue"]
        table = arguments["table"]
        url = arguments["url"]

        tables = await context.room.database.list_tables()

        # membership test rather than list.index(): a table at position zero
        # would otherwise read as False
        exists = table in tables

        async def lookup_or_embed(*, sha: str, text: str) -> list[float]:

            # if we already indexed this chunk, let's use the existing embedding instead of generating a new one
            if exists:

                results = await context.room.database.search(
                    table=table,
                    where={
                        "sha": sha,
                    },
                    limit=1
                )

                if len(results) != 0:
                    logger.info(f"chunk found from {url} {sha}, reusing embedding")
                    return results[0]["embedding"]

            logger.info(f"chunk not found from {url} {sha}, generating embedding")

            return await self.embedder.embed(text=text)

        async def crawl():
            logger.info(f"starting to crawl: {url}")
            await context.room.agents.invoke_tool(
                toolkit="meshagent.firecrawl",
                tool="firecrawl_queue",
                arguments={
                    "url": url,
                    "queue": queue,
                    "limit": 100
                })

            logger.info(f"done with crawl: {url}")
            await context.room.queues.send(name=queue, message={"done": True})

        def crawl_done(task: asyncio.Task):
            try:
                task.result()
            except Exception as e:
                logger.error("crawl failed", exc_info=e)

        # crawl in the background while the loop below drains the queue
        crawl_task = asyncio.create_task(crawl())
        crawl_task.add_done_callback(crawl_done)

        rows = []

        id = 0

        while True:
            message = await context.room.queues.receive(name=queue, create=True, wait=True)

            if message is None:
                break

            if message.get("type", None) == "crawl.completed":
                break

            if "data" in message:
                for data in message["data"]:
                    try:
                        url: str = data["metadata"]["url"]
                        text: str = data["markdown"]
                        title: str = data["metadata"]["title"]
                        # hash the title itself so the dedup key matches the embedded text
                        title_sha: str = hashlib.sha256(title.encode("utf-8")).hexdigest()

                        logger.info(f"processing crawled page: {url}")

                        # let's make the title its own chunk
                        rows.append(
                            {
                                "id": id,
                                "url": url,
                                "text": title,
                                "sha": title_sha,
                                "embedding": await lookup_or_embed(sha=title_sha, text=title)
                            }
                        )

                        id = id + 1

                        # the content will be transformed into additional chunks
                        for chunk in await self.chunker.chunk(text=text, max_length=self.embedder.max_length):
                            logger.info(f"processing chunk from {url}: {chunk.text}")
                            chunk_sha = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest()
                            rows.append(
                                {
                                    "id": id,
                                    "url": url,
                                    "text": chunk.text,
                                    "sha": chunk_sha,
                                    "embedding": await lookup_or_embed(sha=chunk_sha, text=chunk.text)
                                }
                            )

                            id = id + 1

                    except Exception as e:
                        logger.error(f"failed to process: {url}", exc_info=e)

        logger.info(f"saving crawl: {url}")

        await context.room.database.create_table_with_schema(
            name=table,
            schema={
                "id": IntDataType(),
                "url": TextDataType(),
                "text": TextDataType(),
                "embedding": VectorDataType(
                    size=self.embedder.size,
                    element_type=FloatDataType()
                ),
                "sha": TextDataType(),
            },
            mode="overwrite",
            data=rows
        )

        # the vector index needs a minimum number of rows before it can be built
        if len(rows) > 255:
            await context.room.database.create_vector_index(table=table, column="embedding")

        await context.room.database.create_full_text_search_index(table=table, column="text")

        return {}
```
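As a `TaskRunner`, `SiteIndexer` receives a JSON payload matching the input schema above; how the task is dispatched depends on the meshagent runtime, so only the argument shape is shown here (values are illustrative):

```python
# argument payload matching SiteIndexer's input_schema
arguments = {
    "queue": "firecrawl",          # queue the firecrawl toolkit streams pages into
    "table": "docs_index",         # table that will be overwritten with the crawl
    "url": "https://example.com",  # root url to crawl (capped at 100 pages)
}
```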