typeagent-py 0.1.0 (typeagent_py-0.1.0-py3-none-any.whl)

This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package exactly as it appears in its public registry.
Files changed (55)
  1. typeagent/aitools/auth.py +61 -0
  2. typeagent/aitools/embeddings.py +232 -0
  3. typeagent/aitools/utils.py +244 -0
  4. typeagent/aitools/vectorbase.py +175 -0
  5. typeagent/knowpro/answer_context_schema.py +49 -0
  6. typeagent/knowpro/answer_response_schema.py +34 -0
  7. typeagent/knowpro/answers.py +577 -0
  8. typeagent/knowpro/collections.py +759 -0
  9. typeagent/knowpro/common.py +9 -0
  10. typeagent/knowpro/convknowledge.py +112 -0
  11. typeagent/knowpro/convsettings.py +94 -0
  12. typeagent/knowpro/convutils.py +49 -0
  13. typeagent/knowpro/date_time_schema.py +32 -0
  14. typeagent/knowpro/field_helpers.py +87 -0
  15. typeagent/knowpro/fuzzyindex.py +144 -0
  16. typeagent/knowpro/interfaces.py +818 -0
  17. typeagent/knowpro/knowledge.py +88 -0
  18. typeagent/knowpro/kplib.py +125 -0
  19. typeagent/knowpro/query.py +1128 -0
  20. typeagent/knowpro/search.py +628 -0
  21. typeagent/knowpro/search_query_schema.py +165 -0
  22. typeagent/knowpro/searchlang.py +729 -0
  23. typeagent/knowpro/searchlib.py +345 -0
  24. typeagent/knowpro/secindex.py +100 -0
  25. typeagent/knowpro/serialization.py +390 -0
  26. typeagent/knowpro/textlocindex.py +179 -0
  27. typeagent/knowpro/utils.py +17 -0
  28. typeagent/mcp/server.py +139 -0
  29. typeagent/podcasts/podcast.py +473 -0
  30. typeagent/podcasts/podcast_import.py +105 -0
  31. typeagent/storage/__init__.py +25 -0
  32. typeagent/storage/memory/__init__.py +13 -0
  33. typeagent/storage/memory/collections.py +68 -0
  34. typeagent/storage/memory/convthreads.py +81 -0
  35. typeagent/storage/memory/messageindex.py +178 -0
  36. typeagent/storage/memory/propindex.py +289 -0
  37. typeagent/storage/memory/provider.py +84 -0
  38. typeagent/storage/memory/reltermsindex.py +318 -0
  39. typeagent/storage/memory/semrefindex.py +660 -0
  40. typeagent/storage/memory/timestampindex.py +176 -0
  41. typeagent/storage/sqlite/__init__.py +31 -0
  42. typeagent/storage/sqlite/collections.py +362 -0
  43. typeagent/storage/sqlite/messageindex.py +382 -0
  44. typeagent/storage/sqlite/propindex.py +119 -0
  45. typeagent/storage/sqlite/provider.py +293 -0
  46. typeagent/storage/sqlite/reltermsindex.py +328 -0
  47. typeagent/storage/sqlite/schema.py +248 -0
  48. typeagent/storage/sqlite/semrefindex.py +156 -0
  49. typeagent/storage/sqlite/timestampindex.py +146 -0
  50. typeagent/storage/utils.py +41 -0
  51. typeagent_py-0.1.0.dist-info/METADATA +28 -0
  52. typeagent_py-0.1.0.dist-info/RECORD +55 -0
  53. typeagent_py-0.1.0.dist-info/WHEEL +5 -0
  54. typeagent_py-0.1.0.dist-info/licenses/LICENSE +21 -0
  55. typeagent_py-0.1.0.dist-info/top_level.txt +1 -0
typeagent/storage/sqlite/messageindex.py
@@ -0,0 +1,382 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""SQLite-based message text index implementation."""
+
+import json
+import sqlite3
+import typing
+
+import numpy as np
+
+from ...aitools.embeddings import NormalizedEmbedding
+from ...aitools.vectorbase import ScoredInt, VectorBase
+
+from ...knowpro.convsettings import MessageTextIndexSettings
+from ...knowpro import interfaces
+from ...knowpro.interfaces import TextLocationData, TextToTextLocationIndexData
+from ...knowpro.textlocindex import ScoredTextLocation
+
+from ...storage.memory.messageindex import IMessageTextEmbeddingIndex
+
+from .schema import deserialize_embedding, serialize_embedding
+
+
+class SqliteMessageTextIndex(IMessageTextEmbeddingIndex):
+    """SQLite-backed message text index with embedding support."""
+
+    def __init__(
+        self,
+        db: sqlite3.Connection,
+        settings: MessageTextIndexSettings,
+        message_collection: interfaces.IMessageCollection | None = None,
+    ):
+        self.db = db
+        self.settings = settings
+        self._message_collection = message_collection
+        self._vectorbase = VectorBase(settings=settings.embedding_index_settings)
+        if self._size():
+            cursor = self.db.cursor()
+            cursor.execute("SELECT embedding FROM MessageTextIndex")
+            for row in cursor.fetchall():
+                self._vectorbase.add_embedding(None, deserialize_embedding(row[0]))
+
+    async def size(self) -> int:
+        return self._size()
+
+    def _size(self) -> int:
+        cursor = self.db.cursor()
+        cursor.execute("SELECT COUNT(*) FROM MessageTextIndex")
+        return cursor.fetchone()[0]
+
+    async def add_messages_starting_at(
+        self,
+        start_message_ordinal: int,
+        messages: list[interfaces.IMessage],
+    ) -> None:
+        """Add messages to the text index starting at the given ordinal."""
+        chunks_to_embed: list[tuple[int, int, str]] = []
+        for msg_ord, message in enumerate(messages, start_message_ordinal):
+            for chunk_ord, chunk in enumerate(message.text_chunks):
+                chunks_to_embed.append((msg_ord, chunk_ord, chunk))
+
+        embeddings = await self._vectorbase.get_embeddings(
+            [chunk for _, _, chunk in chunks_to_embed], cache=False
+        )
+
+        insertion_data: list[tuple[int, int, bytes, int]] = []
+        for idx, ((msg_ord, chunk_ord, _), embedding) in enumerate(
+            zip(chunks_to_embed, embeddings)
+        ):
+            # Get the current VectorBase size to determine the index position
+            current_size = len(self._vectorbase)
+            index_position = current_size + idx
+            insertion_data.append(
+                (msg_ord, chunk_ord, serialize_embedding(embedding), index_position)
+            )
+
+        # Bulk insert text chunks together with their embeddings and index positions
+        cursor = self.db.cursor()
+        if insertion_data:
+            cursor.executemany(
+                """
+                INSERT INTO MessageTextIndex
+                (msg_id, chunk_ordinal, embedding, index_position)
+                VALUES (?, ?, ?, ?)
+                """,
+                insertion_data,
+            )
+
+    async def add_messages(
+        self,
+        messages: typing.Iterable[interfaces.IMessage],
+    ) -> None:
+        """Add messages to the text index (backward compatibility method)."""
+        message_list = list(messages)
+        if not message_list:
+            return
+
+        # Get the current collection size to determine starting ordinal
+        start_ordinal = await self.size()
+
+        await self.add_messages_starting_at(start_ordinal, message_list)
+
+    async def rebuild_from_all_messages(self) -> None:
+        """Rebuild the entire message text index from all messages in the collection."""
+        if self._message_collection is None:
+            return
+
+        # Clear existing index
+        await self.clear()
+
+        # Add all messages with their ordinals
+        message_list = await self._message_collection.get_slice(
+            0, await self._message_collection.size()
+        )
+
+        if message_list:
+            await self.add_messages_starting_at(0, message_list)
+
+        print(f"DEBUG: Rebuilt message text index with {await self.size()} entries")
+
+    async def lookup_text(
+        self, text: str, max_matches: int | None = None, min_score: float | None = None
+    ) -> list[ScoredTextLocation]:
+        """Look up text using VectorBase."""
+        fuzzy_results = await self._vectorbase.fuzzy_lookup(
+            text, max_hits=max_matches, min_score=min_score
+        )
+        return self._vectorbase_lookup_to_scored_locations(fuzzy_results)
+
+    def _vectorbase_lookup_to_scored_locations(
+        self,
+        fuzzy_results: list[ScoredInt],
+        predicate: typing.Callable[[interfaces.MessageOrdinal], bool] | None = None,
+    ) -> list[ScoredTextLocation]:
+        """Convert VectorBase fuzzy results to scored text locations using optimized DB query."""
+        if not fuzzy_results:
+            return []
+
+        # Fetch the rows corresponding to fuzzy_results
+        cursor = self.db.cursor()
+        index_positions = [scored_int.item for scored_int in fuzzy_results]
+        placeholders = ",".join("?" * len(index_positions))
+        cursor.execute(
+            f"""
+            SELECT msg_id, chunk_ordinal, index_position
+            FROM MessageTextIndex
+            WHERE index_position IN ({placeholders})
+            ORDER BY index_position
+            """,
+            index_positions,
+        )
+        rows = cursor.fetchall()
+
+        # Create a mapping from index_position to (msg_id, chunk_ordinal)
+        position_to_location = {
+            index_position: (msg_id, chunk_ordinal)
+            for msg_id, chunk_ordinal, index_position in rows
+        }
+
+        # Build scored locations, applying predicate filter if provided
+        scored_locations = []
+        for scored_int in fuzzy_results:
+            if scored_int.item in position_to_location:
+                msg_id, chunk_ordinal = position_to_location[scored_int.item]
+
+                # Apply predicate filter if provided
+                if predicate is None or predicate(msg_id):
+                    text_location = interfaces.TextLocation(
+                        message_ordinal=msg_id,
+                        chunk_ordinal=chunk_ordinal,
+                    )
+                    scored_locations.append(
+                        ScoredTextLocation(text_location, scored_int.score)
+                    )
+
+        return scored_locations
+
+    def _scored_locations_to_message_ordinals(
+        self,
+        scored_locations: list[ScoredTextLocation],
+        max_matches: int | None = None,
+    ) -> list[interfaces.ScoredMessageOrdinal]:
+        """Convert scored text locations to scored message ordinals by grouping chunks."""
+        # Group by message and take the best score per message
+        message_scores: dict[int, float] = {}
+        for scored_loc in scored_locations:
+            msg_ord = scored_loc.text_location.message_ordinal
+            if msg_ord not in message_scores:
+                message_scores[msg_ord] = scored_loc.score
+            else:
+                # Take the best score for this message
+                message_scores[msg_ord] = max(message_scores[msg_ord], scored_loc.score)
+
+        # Convert to list and sort by score
+        result = [
+            interfaces.ScoredMessageOrdinal(msg_ordinal, score)
+            for msg_ordinal, score in message_scores.items()
+        ]
+        result.sort(key=lambda x: x.score, reverse=True)
+
+        # Apply max_matches limit to final results
+        if max_matches is not None:
+            result = result[:max_matches]
+
+        return result

+    async def lookup_messages(
+        self,
+        message_text: str,
+        max_matches: int | None = None,
+        threshold_score: float | None = None,
+    ) -> list[interfaces.ScoredMessageOrdinal]:
+        """Look up messages by text content."""
+        scored_locations = await self.lookup_text(message_text, None, threshold_score)
+        return self._scored_locations_to_message_ordinals(scored_locations, max_matches)
+
+    async def lookup_messages_in_subset(
+        self,
+        message_text: str,
+        ordinals_to_search: list[interfaces.MessageOrdinal],
+        max_matches: int | None = None,
+        threshold_score: float | None = None,
+    ) -> list[interfaces.ScoredMessageOrdinal]:
+        """Look up messages in a subset of ordinals."""
+        # Get all matches first
+        all_matches = await self.lookup_messages(message_text, None, threshold_score)
+
+        # Filter to only include the specified ordinals
+        ordinals_set = set(ordinals_to_search)
+        filtered_matches = [
+            match for match in all_matches if match.message_ordinal in ordinals_set
+        ]
+
+        # Apply max_matches limit
+        if max_matches is not None:
+            filtered_matches = filtered_matches[:max_matches]
+
+        return filtered_matches
+
+    async def generate_embedding(self, text: str) -> NormalizedEmbedding:
+        """Generate an embedding for the given text."""
+        return await self._vectorbase.get_embedding(text)
+
+    def lookup_by_embedding(
+        self,
+        text_embedding: NormalizedEmbedding,
+        max_matches: int | None = None,
+        threshold_score: float | None = None,
+        predicate: typing.Callable[[interfaces.MessageOrdinal], bool] | None = None,
+    ) -> list[interfaces.ScoredMessageOrdinal]:
+        """Look up messages by embedding using optimized VectorBase similarity search."""
+        fuzzy_results = self._vectorbase.fuzzy_lookup_embedding(
+            text_embedding, max_hits=max_matches, min_score=threshold_score
+        )
+        scored_locations = self._vectorbase_lookup_to_scored_locations(
+            fuzzy_results, predicate
+        )
+        return self._scored_locations_to_message_ordinals(scored_locations, max_matches)
+
+    def lookup_in_subset_by_embedding(
+        self,
+        text_embedding: NormalizedEmbedding,
+        ordinals_to_search: list[interfaces.MessageOrdinal],
+        max_matches: int | None = None,
+        threshold_score: float | None = None,
+    ) -> list[interfaces.ScoredMessageOrdinal]:
+        """Look up messages in a subset by embedding (synchronous version)."""
+        ordinals_set = set(ordinals_to_search)
+        return self.lookup_by_embedding(
+            text_embedding,
+            max_matches,
+            threshold_score,
+            predicate=lambda ordinal: ordinal in ordinals_set,
+        )
+
+    async def is_empty(self) -> bool:
+        """Check if the index is empty."""
+        size = await self.size()
+        return size == 0
+
+    async def serialize(self) -> interfaces.MessageTextIndexData:
+        """Serialize the message text index."""
+        # Get all data from the MessageTextIndex table
+        cursor = self.db.cursor()
+        cursor.execute(
+            """
+            SELECT msg_id, chunk_ordinal, embedding
+            FROM MessageTextIndex
+            ORDER BY msg_id, chunk_ordinal
+            """
+        )
+
+        # Build the text locations and embeddings
+        text_locations = []
+        embeddings_list = []
+
+        from ..sqlite.schema import deserialize_embedding
+
+        for msg_id, chunk_ordinal, embedding_blob in cursor.fetchall():
+            # Create text location data
+            text_location = TextLocationData(
+                messageOrdinal=msg_id, chunkOrdinal=chunk_ordinal
+            )
+            text_locations.append(text_location)
+
+            if embedding_blob:
+                embedding = deserialize_embedding(embedding_blob)
+                embeddings_list.append(embedding)
+            else:
+                # Handle case where embedding is None
+                embeddings_list.append(None)
+
+        if text_locations:
+            # Convert embeddings to numpy array if we have any
+            valid_embeddings = [e for e in embeddings_list if e is not None]
+            if valid_embeddings:
+                embeddings_array = np.array(valid_embeddings, dtype=np.float32)
+            else:
+                embeddings_array = None
+
+            index_data = TextToTextLocationIndexData(
+                textLocations=text_locations, embeddings=embeddings_array
+            )
+            return interfaces.MessageTextIndexData(indexData=index_data)
+
+        return {}
+
+    async def deserialize(self, data: interfaces.MessageTextIndexData) -> None:
+        """Deserialize message text index data."""
+        cursor = self.db.cursor()
+
+        # Clear existing data
+        cursor.execute("DELETE FROM MessageTextIndex")
+
+        # Get the index data
+        index_data = data.get("indexData")
+        if not index_data:
+            return
+
+        text_locations = index_data.get("textLocations", [])
+        if not text_locations:
+            return
+
+        embeddings = index_data.get("embeddings")
+        if embeddings is None:
+            return
+
+        # Prepare all insertion data for bulk operation
+        insertion_data: list[tuple[int, int, bytes, int]] = []
+        for idx, (text_location, embedding) in enumerate(
+            zip(text_locations, embeddings, strict=True)
+        ):
+            msg_id = text_location["messageOrdinal"]
+            chunk_ordinal = text_location["chunkOrdinal"]
+            assert embedding is not None
+            embedding_blob = serialize_embedding(embedding)
+            # Get the current VectorBase size to determine the index position
+            current_size = len(self._vectorbase)
+            index_position = current_size + idx
+            insertion_data.append(
+                (msg_id, chunk_ordinal, embedding_blob, index_position)
+            )
+
+        # Bulk insert all the data
+        if insertion_data:
+            cursor.executemany(
+                """
+                INSERT INTO MessageTextIndex
+                (msg_id, chunk_ordinal, embedding, index_position)
+                VALUES (?, ?, ?, ?)
+                """,
+                insertion_data,
+            )
+
+        # Update VectorBase
+        self._vectorbase.add_embeddings(embeddings)
+
+    async def clear(self) -> None:
+        """Clear the message text index."""
+        cursor = self.db.cursor()
+        cursor.execute("DELETE FROM MessageTextIndex")
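For orientation, a minimal usage sketch of SqliteMessageTextIndex follows. It assumes the MessageTextIndex table has already been created in the database (the package's sqlite provider and schema.py are responsible for that), that MessageTextIndexSettings can be constructed with its defaults, and that an embedding backend is configured; the database path and query text are illustrative only.

import asyncio
import sqlite3

from typeagent.knowpro.convsettings import MessageTextIndexSettings
from typeagent.storage.sqlite.messageindex import SqliteMessageTextIndex


async def main() -> None:
    # Illustrative database path; the MessageTextIndex table must already exist.
    db = sqlite3.connect("conversation.db")
    settings = MessageTextIndexSettings()  # assumed to be constructible with defaults
    index = SqliteMessageTextIndex(db, settings)

    # Rank stored messages by fuzzy similarity of their chunks to the query text.
    # Chunk-level scores are grouped per message; the best score per message wins.
    matches = await index.lookup_messages("team meeting notes", max_matches=5)
    for match in matches:
        print(match.message_ordinal, match.score)


asyncio.run(main())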
typeagent/storage/sqlite/propindex.py
@@ -0,0 +1,119 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""SQLite-based property index implementation."""
+
+import sqlite3
+
+from ...knowpro import interfaces
+from ...knowpro.interfaces import ScoredSemanticRefOrdinal
+
+
+class SqlitePropertyIndex(interfaces.IPropertyToSemanticRefIndex):
+    """SQLite-backed implementation of property to semantic ref index."""
+
+    def __init__(self, db: sqlite3.Connection):
+        self.db = db
+
+    async def size(self) -> int:
+        cursor = self.db.cursor()
+        cursor.execute(
+            "SELECT COUNT(*) FROM (SELECT DISTINCT prop_name, value_str FROM PropertyIndex)"
+        )
+        return cursor.fetchone()[0]
+
+    async def get_values(self) -> list[str]:
+        cursor = self.db.cursor()
+        cursor.execute(
+            "SELECT DISTINCT value_str FROM PropertyIndex ORDER BY value_str"
+        )
+        return [row[0] for row in cursor.fetchall()]
+
+    async def add_property(
+        self,
+        property_name: str,
+        value: str,
+        semantic_ref_ordinal: (
+            interfaces.SemanticRefOrdinal | interfaces.ScoredSemanticRefOrdinal
+        ),
+    ) -> None:
+        # Extract semref_id and score from the ordinal
+        if isinstance(semantic_ref_ordinal, interfaces.ScoredSemanticRefOrdinal):
+            semref_id = semantic_ref_ordinal.semantic_ref_ordinal
+            score = semantic_ref_ordinal.score
+        else:
+            semref_id = semantic_ref_ordinal
+            score = 1.0
+
+        # Normalize property name and value (to match in-memory implementation)
+        from ...storage.memory.propindex import (
+            make_property_term_text,
+            split_property_term_text,
+        )
+
+        term_text = make_property_term_text(property_name, value)
+        term_text = term_text.lower()  # Matches PropertyIndex._prepare_term_text
+        property_name, value = split_property_term_text(term_text)
+        # Remove "prop." prefix that was added by make_property_term_text
+        if property_name.startswith("prop."):
+            property_name = property_name[5:]
+
+        cursor = self.db.cursor()
+        cursor.execute(
+            """
+            INSERT INTO PropertyIndex (prop_name, value_str, score, semref_id)
+            VALUES (?, ?, ?, ?)
+            """,
+            (property_name, value, score, semref_id),
+        )
+
+    async def clear(self) -> None:
+        cursor = self.db.cursor()
+        cursor.execute("DELETE FROM PropertyIndex")
+
+    async def lookup_property(
+        self,
+        property_name: str,
+        value: str,
+    ) -> list[interfaces.ScoredSemanticRefOrdinal] | None:
+        # Normalize property name and value (to match in-memory implementation)
+        from ...storage.memory.propindex import (
+            make_property_term_text,
+            split_property_term_text,
+        )
+
+        term_text = make_property_term_text(property_name, value)
+        term_text = term_text.lower()  # Matches PropertyIndex._prepare_term_text
+        property_name, value = split_property_term_text(term_text)
+        # Remove "prop." prefix that was added by make_property_term_text
+        if property_name.startswith("prop."):
+            property_name = property_name[5:]
+
+        cursor = self.db.cursor()
+        cursor.execute(
+            "SELECT semref_id, score FROM PropertyIndex WHERE prop_name = ? AND value_str = ?",
+            (property_name, value),
+        )
+
+        results = [
+            ScoredSemanticRefOrdinal(semref_id, score)
+            for semref_id, score in cursor.fetchall()
+        ]
+
+        return results if results else None
+
+    async def remove_property(self, prop_name: str, semref_id: int) -> None:
+        """Remove all properties for a specific property name and semantic ref."""
+        cursor = self.db.cursor()
+        cursor.execute(
+            "DELETE FROM PropertyIndex WHERE prop_name = ? AND semref_id = ?",
+            (prop_name, semref_id),
+        )
+
+    async def remove_all_for_semref(self, semref_id: int) -> None:
+        """Remove all properties for a specific semantic ref."""
+        cursor = self.db.cursor()
+        cursor.execute(
+            "DELETE FROM PropertyIndex WHERE semref_id = ?",
+            (semref_id,),
+        )
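A similar minimal sketch for SqlitePropertyIndex, assuming the PropertyIndex table already exists in the database; the path, property values, and semantic ref ordinal below are illustrative only.

import asyncio
import sqlite3

from typeagent.knowpro.interfaces import ScoredSemanticRefOrdinal
from typeagent.storage.sqlite.propindex import SqlitePropertyIndex


async def main() -> None:
    # Illustrative database path; the PropertyIndex table must already exist.
    db = sqlite3.connect("conversation.db")
    index = SqlitePropertyIndex(db)

    # Property name and value are normalized (lowercased) the same way as the
    # in-memory PropertyIndex before storage, so the lookup below is case-insensitive.
    await index.add_property("name", "Alice", ScoredSemanticRefOrdinal(7, 0.9))

    hits = await index.lookup_property("name", "alice")
    if hits:
        for hit in hits:
            print(hit.semantic_ref_ordinal, hit.score)


asyncio.run(main())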