meshagent-agents 0.0.37__py3-none-any.whl → 0.0.39__py3-none-any.whl

This diff shows the changes between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of meshagent-agents might be problematic. Click here for more details.

@@ -1,6 +1,11 @@
1
1
  from meshagent.agents import TaskRunner, RequiredToolkit, SingleRoomAgent
2
2
  from meshagent.tools import Toolkit, Tool, ToolContext
3
- from meshagent.api.room_server_client import TextDataType, VectorDataType, FloatDataType, IntDataType
3
+ from meshagent.api.room_server_client import (
4
+ TextDataType,
5
+ VectorDataType,
6
+ FloatDataType,
7
+ IntDataType,
8
+ )
4
9
  from openai import AsyncOpenAI
5
10
  from typing import Optional
6
11
  from meshagent.api.chan import Chan
@@ -16,7 +21,6 @@ import os
16
21
 
17
22
  # TODO: install chonkie, chonkie[semantic], openai
18
23
 
19
- from concurrent.futures import ThreadPoolExecutor, as_completed
20
24
 
21
25
  def _async_debounce(wait):
22
26
  def decorator(func):
@@ -40,6 +44,7 @@ def _async_debounce(wait):
40
44
 
41
45
  return decorator
42
46
 
47
+
43
48
  logger = logging.getLogger("indexer")
44
49
 
45
50
 
@@ -49,124 +54,141 @@ class Chunk:
49
54
  self.start = start
50
55
  self.end = end
51
56
 
52
- class Chunker:
53
57
 
54
- async def chunk(self, *, text: str, max_length: Optional[int] = None) -> list[Chunk]:
58
+ class Chunker:
59
+ async def chunk(
60
+ self, *, text: str, max_length: Optional[int] = None
61
+ ) -> list[Chunk]:
55
62
  pass
56
63
 
64
+
57
65
  class ChonkieChunker(Chunker):
58
66
  def __init__(self, chunker: Optional[chonkie.BaseChunker] = None):
59
67
  super().__init__()
60
68
 
61
- if chunker == None:
69
+ if chunker is None:
62
70
  chunker = chonkie.SemanticChunker()
63
71
 
64
72
  self._chunker = chunker
65
73
 
66
- async def chunk(self, *, text: str, max_length: Optional[int] = None) -> list[Chunk]:
74
+ async def chunk(
75
+ self, *, text: str, max_length: Optional[int] = None
76
+ ) -> list[Chunk]:
67
77
  chunks = await asyncio.to_thread(self._chunker.chunk, text=text)
68
78
  mapped = []
69
79
  for chunk in chunks:
70
- mapped.append(Chunk(text=chunk.text, start=chunk.start_index, end=chunk.end_index))
80
+ mapped.append(
81
+ Chunk(text=chunk.text, start=chunk.start_index, end=chunk.end_index)
82
+ )
71
83
  return mapped
72
-
84
+
73
85
 
74
86
  class Embedder:
75
87
  def __init__(self, *, size: int, max_length: int):
76
88
  self.size = size
77
89
  self.max_length = max_length
78
-
90
+
79
91
  async def embed(self, *, text: str) -> list[float]:
80
92
  pass
81
93
 
94
+
82
95
  class OpenAIEmbedder(Embedder):
83
- def __init__(self, *, size: int, max_length: int, model: str, openai: Optional[AsyncOpenAI] = None, ):
84
- if openai == None:
96
+ def __init__(
97
+ self,
98
+ *,
99
+ size: int,
100
+ max_length: int,
101
+ model: str,
102
+ openai: Optional[AsyncOpenAI] = None,
103
+ ):
104
+ if openai is None:
85
105
  openai = AsyncOpenAI()
86
-
106
+
87
107
  self._openai = openai
88
108
  self._model = model
89
109
 
90
110
  super().__init__(size=size, max_length=max_length)
91
111
 
92
-
93
112
  async def embed(self, *, text):
94
- return (await self._openai.embeddings.create(input=text, model=self._model, encoding_format="float")).data[0].embedding
95
-
113
+ return (
114
+ (
115
+ await self._openai.embeddings.create(
116
+ input=text, model=self._model, encoding_format="float"
117
+ )
118
+ )
119
+ .data[0]
120
+ .embedding
121
+ )
96
122
 
97
123
 
98
124
  class RagTool(Tool):
99
- def __init__(self, *, name = "rag_search", table: str, title = "RAG search", description = "perform a RAG search", rules = None, thumbnail_url = None, embedder: Optional[Embedder] = None):
100
-
125
+ def __init__(
126
+ self,
127
+ *,
128
+ name="rag_search",
129
+ table: str,
130
+ title="RAG search",
131
+ description="perform a RAG search",
132
+ rules=None,
133
+ thumbnail_url=None,
134
+ embedder: Optional[Embedder] = None,
135
+ ):
101
136
  self.table = table
102
137
 
103
138
  super().__init__(
104
139
  name=name,
105
140
  input_schema={
106
- "type":"object",
107
- "additionalProperties" : False,
108
- "required" : [
109
- "query"
110
- ],
111
- "properties" : {
112
- "query" : {
113
- "type" : "string"
114
- }
115
- }
141
+ "type": "object",
142
+ "additionalProperties": False,
143
+ "required": ["query"],
144
+ "properties": {"query": {"type": "string"}},
116
145
  },
117
146
  title=title,
118
147
  description=description,
119
- rules=rules, thumbnail_url=thumbnail_url)
120
-
148
+ rules=rules,
149
+ thumbnail_url=thumbnail_url,
150
+ )
151
+
121
152
  self._embedder = embedder
122
153
 
123
154
  async def execute(self, context: ToolContext, query: str):
124
-
125
- if self._embedder == None:
155
+ if self._embedder is None:
126
156
  results = await context.room.database.search(
127
- table=self.table,
128
- text=query,
129
- limit=10
157
+ table=self.table, text=query, limit=10
130
158
  )
131
159
  else:
132
160
  embedding = await self._embedder.embed(text=query)
133
161
  results = await context.room.database.search(
134
- table=self.table,
135
- text=query,
136
- vector=embedding,
137
- limit=10
162
+ table=self.table, text=query, vector=embedding, limit=10
138
163
  )
139
164
 
140
- results = list(map(lambda r: f"from {r["url"]}: {r["text"]}", results))
165
+ results = list(map(lambda r: f"from {r['url']}: {r['text']}", results))
166
+
167
+ return {"results": results}
141
168
 
142
- return {
143
- "results" : results
144
- }
145
-
146
169
 
147
170
  def open_ai_embedding_3_small():
148
171
  return OpenAIEmbedder(model="text-embedding-3-small", max_length=8191, size=1536)
149
172
 
173
+
150
174
  def open_ai_embedding_3_large():
151
175
  return OpenAIEmbedder(model="text-embedding-3-large", max_length=8191, size=3072)
152
176
 
177
+
153
178
  def open_ai_embedding_ada_2():
154
179
  return OpenAIEmbedder(model="text-embedding-ada-002", max_length=8191, size=1536)
155
180
 
156
181
 
157
182
  class RagToolkit(Toolkit):
158
- def __init__(self, table: str, embedder:Optional[Embedder] = None):
159
-
160
- if embedder == None:
183
+ def __init__(self, table: str, embedder: Optional[Embedder] = None):
184
+ if embedder is None:
161
185
  embedder = open_ai_embedding_3_large()
162
186
 
163
187
  super().__init__(
164
188
  name="meshagent.rag",
165
189
  title="RAG",
166
190
  description="Searches against an index",
167
- tools=[
168
- RagTool(table=table, embedder=embedder)
169
- ]
191
+ tools=[RagTool(table=table, embedder=embedder)],
170
192
  )
171
193
 
172
194
 
@@ -175,8 +197,8 @@ class FileIndexEvent:
175
197
  self.path = path
176
198
  self.deleted = deleted
177
199
 
178
- class StorageIndexer(SingleRoomAgent):
179
200
 
201
+ class StorageIndexer(SingleRoomAgent):
180
202
  def __init__(
181
203
  self,
182
204
  *,
@@ -184,19 +206,25 @@ class StorageIndexer(SingleRoomAgent):
184
206
  title=None,
185
207
  description=None,
186
208
  requires=None,
187
- labels = None,
209
+ labels=None,
188
210
  chunker: Optional[Chunker] = None,
189
- embedder:Optional[Embedder] = None,
211
+ embedder: Optional[Embedder] = None,
190
212
  table: str = "storage_index",
191
- ):
192
- super().__init__(name=name, title=title, description=description, requires=requires, labels=labels)
213
+ ):
214
+ super().__init__(
215
+ name=name,
216
+ title=title,
217
+ description=description,
218
+ requires=requires,
219
+ labels=labels,
220
+ )
193
221
 
194
222
  self._chan = Chan[FileIndexEvent]()
195
-
196
- if chunker == None:
223
+
224
+ if chunker is None:
197
225
  chunker = ChonkieChunker()
198
226
 
199
- if embedder == None:
227
+ if embedder is None:
200
228
  embedder = open_ai_embedding_3_large()
201
229
 
202
230
  self.chunker = chunker
@@ -207,46 +235,47 @@ class StorageIndexer(SingleRoomAgent):
207
235
 
208
236
  async def read_file(self, *, path: str) -> str | None:
209
237
  pass
210
-
238
+
211
239
  @_async_debounce(10)
212
240
  async def refresh_index(self):
213
-
214
241
  self.room.developer.log_nowait(type="indexer.rebuild", data={})
215
242
 
216
243
  indexes = await self.room.database.list_indexes(table=self.table)
217
244
 
218
245
  logger.info(f"existing indexes {indexes}")
219
-
220
- for index in indexes:
221
246
 
247
+ for index in indexes:
222
248
  if "embedding" in index["columns"]:
223
249
  self._vector_index_created = True
224
-
250
+
225
251
  if "text" in index["columns"]:
226
- self._fts_created = True
252
+ self._fts_created = True
227
253
 
228
- if self._vector_index_created == False:
254
+ if not self._vector_index_created:
229
255
  try:
230
256
  logger.info("attempting to create embedding index")
231
- await self.room.database.create_vector_index(table=self.table, column="embedding", replace=False)
257
+ await self.room.database.create_vector_index(
258
+ table=self.table, column="embedding", replace=False
259
+ )
232
260
  self._vector_index_created = True
233
- except Exception as e:
261
+ except Exception:
234
262
  # Will fail if there aren't enough rows
235
263
  pass
236
264
 
237
- if self._fts_created == False:
265
+ if not self._fts_created:
238
266
  try:
239
267
  logger.info("attempting to create fts index")
240
- await self.room.database.create_full_text_search_index(table=self.table, column="text", replace=False)
268
+ await self.room.database.create_full_text_search_index(
269
+ table=self.table, column="text", replace=False
270
+ )
241
271
  self._fts_created = True
242
- except Exception as e:
272
+ except Exception:
243
273
  # Will fail if there aren't enough rows
244
274
  pass
245
275
 
246
- if self._fts_created == True or self._vector_index_created == True:
276
+ if self._fts_created or self._vector_index_created:
247
277
  logger.info("optimizing existing index")
248
278
  await self.room.database.optimize(table=self.table)
249
-
250
279
 
251
280
  async def start(self, *, room):
252
281
  await super().start(room=room)
@@ -257,74 +286,72 @@ class StorageIndexer(SingleRoomAgent):
257
286
  await room.database.create_table_with_schema(
258
287
  name=self.table,
259
288
  schema={
260
- "url" : TextDataType(),
261
- "text" : TextDataType(),
262
- "embedding" : VectorDataType(
263
- size=self.embedder.size,
264
- element_type=FloatDataType()
289
+ "url": TextDataType(),
290
+ "text": TextDataType(),
291
+ "embedding": VectorDataType(
292
+ size=self.embedder.size, element_type=FloatDataType()
265
293
  ),
266
- "sha" : TextDataType(),
294
+ "sha": TextDataType(),
267
295
  },
268
296
  mode="create_if_not_exists",
269
- data=None
297
+ data=None,
270
298
  )
271
299
 
272
-
273
300
  def index_task(task: asyncio.Task):
274
-
275
301
  try:
276
- result = task.result()
302
+ task.result()
277
303
  except Exception as e:
278
304
  logger.error("Index task failed", exc_info=e)
279
-
280
305
 
281
306
  self._index_task = asyncio.create_task(self._indexer())
282
307
  self._index_task.add_done_callback(index_task)
283
-
308
+
284
309
  async def stop(self):
285
310
  await super().stop()
286
311
  await self._chan.close()
287
-
288
-
289
- async def _indexer(self):
290
312
 
313
+ async def _indexer(self):
291
314
  async for e in self._chan:
292
-
293
315
  try:
294
316
  if e.deleted:
295
-
296
317
  # todo: consider using sql_alchemy or a library to do the escaping
297
318
  def escape_sql_string(value):
298
319
  if not isinstance(value, str):
299
320
  raise TypeError("Input must be a string")
300
321
  return value.replace("'", "''")
301
322
 
302
- self.room.developer.log_nowait(type="indexer.delete", data={"path": e.path})
303
- await self.room.database.delete(table=self.table, where=f"url='{escape_sql_string(e.path)}'")
304
-
323
+ self.room.developer.log_nowait(
324
+ type="indexer.delete", data={"path": e.path}
325
+ )
326
+ await self.room.database.delete(
327
+ table=self.table, where=f"url='{escape_sql_string(e.path)}'"
328
+ )
305
329
 
306
330
  else:
307
-
308
- self.room.developer.log_nowait(type="indexer.index", data={"path": e.path})
309
-
331
+ self.room.developer.log_nowait(
332
+ type="indexer.index", data={"path": e.path}
333
+ )
310
334
 
311
335
  async def lookup_or_embed(*, sha: str, text: str) -> list[float]:
312
-
313
336
  # if we already indexed this chunk, lets use the existing embedding instead of generating a new one
314
337
  results = await self.room.database.search(
315
338
  table=self.table,
316
339
  where={
317
- "sha" : sha,
340
+ "sha": sha,
318
341
  },
319
- limit=1
342
+ limit=1,
320
343
  )
321
344
 
322
345
  if len(results) != 0:
323
- logger.info(f"chunk found from {e.path} {sha}, reusing embedding")
346
+ logger.info(
347
+ f"chunk found from {e.path} {sha}, reusing embedding"
348
+ )
324
349
  return results[0]["embedding"]
325
-
326
- logger.info(f"chunk not found from {e.path} {sha}, generating embedding")
327
-
350
+
351
+ logger.info(
352
+ f"chunk not found from {e.path} {sha}, generating embedding"
353
+ )
354
+
328
355
  return await self.embedder.embed(text=text)
329
356
 
330
357
  basename = os.path.basename(e.path)
@@ -335,67 +362,69 @@ class StorageIndexer(SingleRoomAgent):
335
362
  # let's make the filename it's own chunk
336
363
  rows.append(
337
364
  {
338
- "url" : e.path,
339
- "text" : basename,
340
- "sha" : chunk_sha,
341
- "embedding" : await lookup_or_embed(sha=chunk_sha, text=basename)
365
+ "url": e.path,
366
+ "text": basename,
367
+ "sha": chunk_sha,
368
+ "embedding": await lookup_or_embed(
369
+ sha=chunk_sha, text=basename
370
+ ),
342
371
  }
343
372
  )
344
-
345
-
373
+
346
374
  text = await self.read_file(path=e.path)
347
- if text != None:
348
-
375
+ if text is not None:
349
376
  # the content will be transformed into additional chunks
350
- for chunk in await self.chunker.chunk(text=text, max_length = self.embedder.max_length):
351
- logger.info(f"processing chunk from {e.path}: {chunk.start}")
352
- chunk_sha = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest()
377
+ for chunk in await self.chunker.chunk(
378
+ text=text, max_length=self.embedder.max_length
379
+ ):
380
+ logger.info(
381
+ f"processing chunk from {e.path}: {chunk.start}"
382
+ )
383
+ chunk_sha = hashlib.sha256(
384
+ chunk.text.encode("utf-8")
385
+ ).hexdigest()
353
386
  rows.append(
354
387
  {
355
- "url" : e.path,
356
- "text" : chunk.text,
357
- "embedding" : await lookup_or_embed(sha=chunk_sha, text=chunk.text),
358
- "sha" : chunk_sha,
388
+ "url": e.path,
389
+ "text": chunk.text,
390
+ "embedding": await lookup_or_embed(
391
+ sha=chunk_sha, text=chunk.text
392
+ ),
393
+ "sha": chunk_sha,
359
394
  }
360
395
  )
361
- await self.room.database.merge(table=self.table, on="sha", records=rows)
396
+ await self.room.database.merge(
397
+ table=self.table, on="sha", records=rows
398
+ )
362
399
  await self.refresh_index()
363
400
 
364
-
365
-
366
401
  except Exception as e:
367
402
  logger.error("error while indexing", exc_info=e)
368
403
 
369
-
370
404
  def _on_file_deleted(self, path: str, participant_id: str):
371
405
  self._chan.send_nowait(FileIndexEvent(path=path, deleted=True))
372
-
406
+
373
407
  def _on_file_updated(self, path: str, participant_id: str):
374
408
  self._chan.send_nowait(FileIndexEvent(path=path, deleted=False))
375
409
 
376
-
377
-
378
-
379
410
 
380
411
  class SiteIndexer(TaskRunner):
381
-
382
- def __init__(self,
383
- *,
412
+ def __init__(
413
+ self,
414
+ *,
384
415
  name,
385
416
  chunker: Optional[Chunker] = None,
386
- embedder:Optional[Embedder] = None,
417
+ embedder: Optional[Embedder] = None,
387
418
  title=None,
388
419
  description=None,
389
420
  requires=None,
390
- supports_tools = None,
391
- labels: Optional[list[str]] = None
392
-
421
+ supports_tools=None,
422
+ labels: Optional[list[str]] = None,
393
423
  ):
394
-
395
- if chunker == None:
424
+ if chunker is None:
396
425
  chunker = ChonkieChunker()
397
426
 
398
- if embedder == None:
427
+ if embedder is None:
399
428
  embedder = open_ai_embedding_3_large()
400
429
 
401
430
  self.chunker = chunker
@@ -406,47 +435,29 @@ class SiteIndexer(TaskRunner):
406
435
  title=title,
407
436
  description=description,
408
437
  requires=[
409
- RequiredToolkit(
410
- name="meshagent.firecrawl",
411
- tools=[
412
- "firecrawl_queue"
413
- ]
414
- ),
438
+ RequiredToolkit(name="meshagent.firecrawl", tools=["firecrawl_queue"]),
415
439
  ],
416
440
  supports_tools=supports_tools,
417
441
  input_schema={
418
- "type" : "object",
419
- "required" : [
420
- "queue", "table", "url"
421
- ],
422
- "additionalProperties" : False,
423
- "properties" : {
424
- "queue" : {
425
- "type" : "string",
426
- "description" : "default: firecrawl"
427
- },
428
- "table" : {
429
- "type" : "string",
430
- "description" : "default: index"
431
- },
432
- "url" : {
433
- "type" : "string",
434
- "description" : "default: index"
435
- }
436
- }
442
+ "type": "object",
443
+ "required": ["queue", "table", "url"],
444
+ "additionalProperties": False,
445
+ "properties": {
446
+ "queue": {"type": "string", "description": "default: firecrawl"},
447
+ "table": {"type": "string", "description": "default: index"},
448
+ "url": {"type": "string", "description": "default: index"},
449
+ },
437
450
  },
438
451
  output_schema={
439
- "type" : "object",
440
- "required" : [],
441
- "additionalProperties" : False,
442
- "properties" : {},
452
+ "type": "object",
453
+ "required": [],
454
+ "additionalProperties": False,
455
+ "properties": {},
443
456
  },
444
- labels=labels
457
+ labels=labels,
445
458
  )
446
459
 
447
-
448
460
  async def ask(self, *, context, arguments):
449
-
450
461
  queue = arguments["queue"]
451
462
  table = arguments["table"]
452
463
  url = arguments["url"]
@@ -459,129 +470,135 @@ class SiteIndexer(TaskRunner):
459
470
  except ValueError:
460
471
  pass
461
472
 
462
-
463
473
  async def lookup_or_embed(*, sha: str, text: str) -> list[float]:
464
-
465
474
  # if we already indexed this chunk, lets use the existing embedding instead of generating a new one
466
475
  if exists:
467
-
468
476
  results = await self.room.database.search(
469
477
  table=self.table,
470
478
  where={
471
- "sha" : sha,
479
+ "sha": sha,
472
480
  },
473
- limit=1
481
+ limit=1,
474
482
  )
475
483
 
476
-
477
484
  if len(results) != 0:
478
485
  logger.info(f"chunk found from {url} {sha}, reusing embedding")
479
486
  return results[0]["embedding"]
480
-
487
+
481
488
  logger.info(f"chunk not found from {url} {sha}, generating embedding")
482
-
489
+
483
490
  return await self.embedder.embed(text=text)
484
-
485
-
491
+
486
492
  async def crawl():
487
493
  logger.info(f"starting to crawl: {url}")
488
494
  await context.room.agents.invoke_tool(
489
495
  toolkit="meshagent.firecrawl",
490
496
  tool="firecrawl_queue",
491
- arguments={
492
- "url" : url,
493
- "queue": queue,
494
- "limit" : 100
495
- })
496
-
497
+ arguments={"url": url, "queue": queue, "limit": 100},
498
+ )
499
+
497
500
  logger.info(f"done with crawl: {url}")
498
- await context.room.queues.send(name=queue, message={ "done" : True })
499
-
501
+ await context.room.queues.send(name=queue, message={"done": True})
502
+
500
503
  def crawl_done(task: asyncio.Task):
501
504
  try:
502
505
  task.result()
503
506
  except Exception as e:
504
507
  logger.error("crawl failed", exc_info=e)
505
508
 
506
-
507
509
  crawl_task = asyncio.create_task(crawl())
508
510
  crawl_task.add_done_callback(crawl_done)
509
-
511
+
510
512
  rows = []
511
513
 
512
514
  id = 0
513
-
515
+
514
516
  while True:
515
- message = await context.room.queues.receive(name=queue, create=True, wait=True)
516
-
517
- if message == None:
517
+ message = await context.room.queues.receive(
518
+ name=queue, create=True, wait=True
519
+ )
520
+
521
+ if message is None:
518
522
  break
519
523
 
520
524
  if message.get("type", None) == "crawl.completed":
521
525
  break
522
-
526
+
523
527
  if "data" in message:
524
528
  for data in message["data"]:
525
529
  try:
526
- url : str = data["metadata"]["url"]
527
- text : str = data["markdown"]
528
- title : str = data["metadata"]["title"]
529
- title_sha : str = hashlib.sha256(text.encode("utf-8")).hexdigest()
530
+ url: str = data["metadata"]["url"]
531
+ text: str = data["markdown"]
532
+ title: str = data["metadata"]["title"]
533
+ title_sha: str = hashlib.sha256(
534
+ text.encode("utf-8")
535
+ ).hexdigest()
530
536
 
531
537
  logger.info(f"processing crawled page: {url}")
532
-
538
+
533
539
  # let's make the title it's own chunk
534
540
  rows.append(
535
- {
536
- "id" : id,
537
- "url" : url,
538
- "text" : title,
539
- "sha" : title_sha,
540
- "embedding" : await lookup_or_embed(sha=title_sha, text=title)
541
- }
542
- )
543
-
541
+ {
542
+ "id": id,
543
+ "url": url,
544
+ "text": title,
545
+ "sha": title_sha,
546
+ "embedding": await lookup_or_embed(
547
+ sha=title_sha, text=title
548
+ ),
549
+ }
550
+ )
551
+
544
552
  id = id + 1
545
-
553
+
546
554
  # the content will be transformed into additional chunks
547
- for chunk in await self.chunker.chunk(text=text, max_length = self.embedder.max_length):
555
+ for chunk in await self.chunker.chunk(
556
+ text=text, max_length=self.embedder.max_length
557
+ ):
548
558
  logger.info(f"processing chunk from {url}: {chunk.text}")
549
- chunk_sha = hashlib.sha256(chunk.text.encode("utf-8")).hexdigest()
559
+ chunk_sha = hashlib.sha256(
560
+ chunk.text.encode("utf-8")
561
+ ).hexdigest()
550
562
  rows.append(
551
563
  {
552
- "id" : id,
553
- "url" : url,
554
- "text" : chunk.text,
555
- "embedding" : await lookup_or_embed(sha=chunk_sha, text=chunk.text)
564
+ "id": id,
565
+ "url": url,
566
+ "text": chunk.text,
567
+ "embedding": await lookup_or_embed(
568
+ sha=chunk_sha, text=chunk.text
569
+ ),
556
570
  }
557
571
  )
558
-
572
+
559
573
  id = id + 1
560
574
 
561
575
  except Exception as e:
562
576
  logger.error(f"failed to process: {url}", exc_info=e)
563
577
 
564
578
  logger.info(f"saving crawl: {url}")
565
-
579
+
566
580
  await context.room.database.create_table_with_schema(
567
581
  name=table,
568
582
  schema={
569
- "id" : IntDataType(),
570
- "url" : TextDataType(),
571
- "text" : TextDataType(),
572
- "embedding" : VectorDataType(
573
- size=self.embedder.size,
574
- element_type=FloatDataType()
583
+ "id": IntDataType(),
584
+ "url": TextDataType(),
585
+ "text": TextDataType(),
586
+ "embedding": VectorDataType(
587
+ size=self.embedder.size, element_type=FloatDataType()
575
588
  ),
576
- "sha" : TextDataType(),
589
+ "sha": TextDataType(),
577
590
  },
578
591
  mode="overwrite",
579
- data=rows
592
+ data=rows,
580
593
  )
581
594
 
582
595
  if len(rows) > 255:
583
- await context.room.database.create_vector_index(table=table, column="embedding")
596
+ await context.room.database.create_vector_index(
597
+ table=table, column="embedding"
598
+ )
584
599
 
585
- await context.room.database.create_full_text_search_index(table=table, column="text")
600
+ await context.room.database.create_full_text_search_index(
601
+ table=table, column="text"
602
+ )
586
603
 
587
604
  return {}