langroid 0.3.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the differences between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
langroid/agent/base.py CHANGED
@@ -784,15 +784,51 @@ class Agent(ABC):
784
784
  # ]
785
785
  # }
786
786
 
787
+ if not isinstance(json_data, dict):
788
+ return None
789
+
787
790
  properties = json_data.get("properties")
788
- if properties is not None:
791
+ if isinstance(properties, dict):
789
792
  json_data = properties
790
793
  request = json_data.get("request")
791
- if (
792
- request is None
793
- or not (isinstance(request, str))
794
- or request not in self.llm_tools_handled
795
- ):
794
+
795
+ if request is None:
796
+ handled = [self.llm_tools_map[r] for r in self.llm_tools_handled]
797
+ default_keys = set(ToolMessage.__fields__.keys())
798
+ request_keys = set(json_data.keys())
799
+
800
+ def maybe_parse(tool: type[ToolMessage]) -> Optional[ToolMessage]:
801
+ all_keys = set(tool.__fields__.keys())
802
+ non_inherited_keys = all_keys.difference(default_keys)
803
+ # If the request has any keys not valid for the tool and
804
+ # does not specify some key specific to the type
805
+ # (e.g. not just `purpose`), the LLM must explicitly specify `request`
806
+ if not (
807
+ request_keys.issubset(all_keys)
808
+ and len(request_keys.intersection(non_inherited_keys)) > 0
809
+ ):
810
+ return None
811
+
812
+ try:
813
+ return tool.parse_obj(json_data)
814
+ except ValidationError:
815
+ return None
816
+
817
+ candidate_tools = list(
818
+ filter(
819
+ lambda t: t is not None,
820
+ map(maybe_parse, handled),
821
+ )
822
+ )
823
+
824
+ # If only one valid candidate exists, we infer
825
+ # "request" to be the only possible value
826
+ if len(candidate_tools) == 1:
827
+ return candidate_tools[0]
828
+ else:
829
+ return None
830
+
831
+ if not isinstance(request, str) or request not in self.llm_tools_handled:
796
832
  return None
797
833
 
798
834
  message_class = self.llm_tools_map.get(request)
@@ -427,11 +427,11 @@ class ChatAgent(Agent):
427
427
  but the Assistant fn-calling seems to pay attn to these,
428
428
  and if we don't want this, we should set this to False.)
429
429
  """
430
+ if require_recipient and message_class is not None:
431
+ message_class = message_class.require_recipient()
430
432
  super().enable_message_handling(message_class) # enables handling only
431
433
  tools = self._get_tool_list(message_class)
432
434
  if message_class is not None:
433
- if require_recipient:
434
- message_class = message_class.require_recipient()
435
435
  request = message_class.default_value("request")
436
436
  llm_function = message_class.llm_function_schema(defaults=include_defaults)
437
437
  self.llm_functions_map[request] = llm_function
@@ -538,12 +538,13 @@ class DocChatAgent(ChatAgent):
538
538
  ]
539
539
 
540
540
  def get_field_values(self, fields: list[str]) -> Dict[str, str]:
541
- """Get string-listing of possible values of each filterable field,
541
+ """Get string-listing of possible values of each field,
542
542
  e.g.
543
543
  {
544
544
  "genre": "crime, drama, mystery, ... (10 more)",
545
545
  "certificate": "R, PG-13, PG, R",
546
546
  }
547
+ The field names may have "metadata." prefix, e.g. "metadata.genre".
547
548
  """
548
549
  field_values: Dict[str, Set[str]] = {}
549
550
  # make empty set for each field
@@ -556,8 +557,11 @@ class DocChatAgent(ChatAgent):
556
557
  for d in docs:
557
558
  # extract fields from d
558
559
  doc_field_vals = extract_fields(d, fields)
559
- for field, val in doc_field_vals.items():
560
- field_values[field].add(val)
560
+ # the `field` returned by extract_fields may contain only the last
561
+ # part of the field name, e.g. "genre" instead of "metadata.genre",
562
+ # so we use the orig_field name to fill in the values
563
+ for (field, val), orig_field in zip(doc_field_vals.items(), fields):
564
+ field_values[orig_field].add(val)
561
565
  # For each field make a string showing list of possible values,
562
566
  # truncate to 20 values, and if there are more, indicate how many
563
567
  # more there are, e.g. Genre: crime, drama, mystery, ... (20 more)
@@ -680,7 +684,13 @@ class DocChatAgent(ChatAgent):
680
684
  )
681
685
  return response
682
686
  if query_str == "":
683
- return None
687
+ return ChatDocument(
688
+ content=NO_ANSWER + " since query was empty",
689
+ metadata=ChatDocMetaData(
690
+ source="No query provided",
691
+ sender=Entity.LLM,
692
+ ),
693
+ )
684
694
  elif query_str == "?" and self.response is not None:
685
695
  return self.justify_response()
686
696
  elif (query_str.startswith(("summar", "?")) and self.response is None) or (
@@ -22,7 +22,6 @@ from langroid.mytypes import DocMetaData, Document
22
22
  from langroid.parsing.table_loader import describe_dataframe
23
23
  from langroid.utils.constants import DONE, NO_ANSWER
24
24
  from langroid.utils.pydantic_utils import (
25
- clean_schema,
26
25
  dataframe_to_documents,
27
26
  )
28
27
  from langroid.vector_store.lancedb import LanceDB
@@ -41,24 +40,26 @@ class LanceDocChatAgent(DocChatAgent):
41
40
  def _get_clean_vecdb_schema(self) -> str:
42
41
  """Get a cleaned schema of the vector-db, to pass to the LLM
43
42
  as part of instructions on how to generate a SQL filter."""
43
+
44
+ tbl_pandas = (
45
+ self.vecdb.client.open_table(self.vecdb.config.collection_name)
46
+ .search()
47
+ .limit(1)
48
+ .to_pandas(flatten=True)
49
+ )
44
50
  if len(self.config.filter_fields) == 0:
45
- filterable_fields = (
46
- self.vecdb.client.open_table(self.vecdb.config.collection_name)
47
- .search()
48
- .limit(1)
49
- .to_pandas(flatten=True)
50
- .columns.tolist()
51
- )
51
+ filterable_fields = tbl_pandas.columns.tolist()
52
52
  # drop id, vector, metadata.id, metadata.window_ids, metadata.is_chunk
53
- for fields in [
54
- "id",
55
- "vector",
56
- "metadata.id",
57
- "metadata.window_ids",
58
- "metadata.is_chunk",
59
- ]:
60
- if fields in filterable_fields:
61
- filterable_fields.remove(fields)
53
+ filterable_fields = list(
54
+ set(filterable_fields)
55
+ - {
56
+ "id",
57
+ "vector",
58
+ "metadata.id",
59
+ "metadata.window_ids",
60
+ "metadata.is_chunk",
61
+ }
62
+ )
62
63
  logger.warning(
63
64
  f"""
64
65
  No filter_fields set in config, so using these fields as filterable fields:
@@ -69,15 +70,7 @@ class LanceDocChatAgent(DocChatAgent):
69
70
 
70
71
  if self.from_dataframe:
71
72
  return self.df_description
72
- schema_dict = clean_schema(
73
- self.vecdb.schema,
74
- excludes=["id", "vector"],
75
- )
76
- # intersect config.filter_fields with schema_dict.keys() in case
77
- # there are extraneous fields in config.filter_fields
78
- filter_fields_set = set(
79
- self.config.filter_fields or schema_dict.keys()
80
- ).intersection(schema_dict.keys())
73
+ filter_fields_set = set(self.config.filter_fields)
81
74
 
82
75
  # remove 'content' from filter_fields_set, even if it's not in filter_fields_set
83
76
  filter_fields_set.discard("content")
@@ -85,10 +78,14 @@ class LanceDocChatAgent(DocChatAgent):
85
78
  # possible values of filterable fields
86
79
  filter_field_values = self.get_field_values(list(filter_fields_set))
87
80
 
81
+ schema_dict: Dict[str, Dict[str, Any]] = dict(
82
+ (field, {}) for field in filter_fields_set
83
+ )
88
84
  # add field values to schema_dict as another field `values` for each field
89
85
  for field, values in filter_field_values.items():
90
- if field in schema_dict:
91
- schema_dict[field]["values"] = values
86
+ schema_dict[field]["values"] = values
87
+ dtype = tbl_pandas[field].dtype.name
88
+ schema_dict[field]["dtype"] = dtype
92
89
  # if self.config.filter_fields is set, restrict to these:
93
90
  if len(self.config.filter_fields) > 0:
94
91
  schema_dict = {
@@ -37,20 +37,30 @@ class QueryPlanCriticConfig(LanceQueryPlanAgentConfig):
37
37
  system_message = f"""
38
38
  You are an expert at carefully planning a query that needs to be answered
39
39
  based on a large collection of documents. These docs have a special `content` field
40
- and additional FILTERABLE fields in the SCHEMA below:
40
+ and additional FILTERABLE fields in the SCHEMA below, along with the
41
+ SAMPLE VALUES for each field, and the DTYPE in PANDAS TERMINOLOGY.
41
42
 
42
43
  {{doc_schema}}
43
44
 
45
+ The ORIGINAL QUERY is handled by a QUERY PLANNER who sends the PLAN to an ASSISTANT,
46
+ who returns an ANSWER.
47
+
44
48
  You will receive a QUERY PLAN consisting of:
45
- - ORIGINAL QUERY,
46
- - SQL-Like FILTER, WHICH CAN BE EMPTY (and it's fine if results sound reasonable)
49
+ - ORIGINAL QUERY from the user, which a QUERY PLANNER processes,
50
+ to create a QUERY PLAN, to be handled by an ASSISTANT.
51
+ - PANDAS-LIKE FILTER, WHICH CAN BE EMPTY (and it's fine if results sound reasonable)
47
52
  FILTER SHOULD ONLY BE USED IF EXPLICITLY REQUIRED BY THE QUERY.
48
- - REPHRASED QUERY that will be used to match against the CONTENT (not filterable)
49
- of the documents.
53
+ - REPHRASED QUERY (CANNOT BE EMPTY) that will be used to match against the
54
+ CONTENT (not filterable) of the documents.
50
55
  In general the REPHRASED QUERY should be relied upon to match the CONTENT
51
56
  of the docs. Thus the REPHRASED QUERY itself acts like a
52
57
  SEMANTIC/LEXICAL/FUZZY FILTER since the Assistant is able to use it to match
53
- the CONTENT of the docs in various ways (semantic, lexical, fuzzy, etc.).
58
+ the CONTENT of the docs in various ways (semantic, lexical, fuzzy, etc.).
59
+ Keep in mind that the ASSISTANT does NOT know anything about the FILTER fields,
60
+ so the REPHRASED QUERY should NOT mention ANY FILTER fields.
61
+ The assistant will answer based on documents whose CONTENTS match the QUERY,
62
+ possibly REPHRASED.
63
+ !!!!****THE REPHRASED QUERY SHOULD NEVER BE EMPTY****!!!
54
64
  - DATAFRAME CALCULATION, which must be a SINGLE LINE calculation (or empty),
55
65
  [NOTE ==> This calculation is applied AFTER the FILTER and REPHRASED QUERY.],
56
66
  - ANSWER received from an assistant that used this QUERY PLAN.
@@ -43,23 +43,27 @@ class LanceQueryPlanAgentConfig(ChatAgentConfig):
43
43
  You will receive a QUERY, to be answered based on an EXTREMELY LARGE collection
44
44
  of documents you DO NOT have access to, but your ASSISTANT does.
45
45
  You only know that these documents have a special `content` field
46
- and additional FILTERABLE fields in the SCHEMA below:
46
+ and additional FILTERABLE fields in the SCHEMA below, along with the
47
+ SAMPLE VALUES for each field, and the DTYPE in PANDAS TERMINOLOGY.
47
48
 
48
49
  {{doc_schema}}
49
50
 
50
51
  Based on the QUERY and the above SCHEMA, your task is to determine a QUERY PLAN,
51
52
  consisting of:
52
- - a FILTER (can be empty string) that would help the ASSISTANT to answer the query.
53
+ - a PANDAS-TYPE FILTER (can be empty string) that would help the ASSISTANT to
54
+ answer the query.
53
55
  Remember the FILTER can refer to ANY fields in the above SCHEMA
54
56
  EXCEPT the `content` field of the documents.
55
57
  ONLY USE A FILTER IF EXPLICITLY MENTIONED IN THE QUERY.
56
58
  TO get good results, for STRING MATCHES, consider using LIKE instead of =, e.g.
57
59
  "CEO LIKE '%Jobs%'" instead of "CEO = 'Steve Jobs'"
58
- - a possibly REPHRASED QUERY to be answerable given the FILTER.
60
+ YOUR FILTER MUST BE A PANDAS-TYPE FILTER, respecting the shown DTYPES.
61
+ - a possibly REPHRASED QUERY (CANNOT BE EMPTY) to be answerable given the FILTER.
59
62
  Keep in mind that the ASSISTANT does NOT know anything about the FILTER fields,
60
63
  so the REPHRASED QUERY should NOT mention ANY FILTER fields.
61
64
  The assistant will answer based on documents whose CONTENTS match the QUERY,
62
65
  possibly REPHRASED.
66
+ !!!!****THE REPHRASED QUERY SHOULD NEVER BE EMPTY****!!!
63
67
  - an OPTIONAL SINGLE-LINE Pandas-dataframe calculation/aggregation string
64
68
  that can be used to calculate the answer to the original query,
65
69
  e.g. "df["rating"].mean()",
@@ -99,7 +103,7 @@ class LanceQueryPlanAgentConfig(ChatAgentConfig):
99
103
  hence this computation will give the total deaths in shoplifting crimes.
100
104
  ------------- END OF EXAMPLE ----------------
101
105
 
102
- The FILTER must be a SQL-like condition, e.g.
106
+ The FILTER must be a PANDAS-like condition, e.g.
103
107
  "year > 2000 AND genre = 'ScienceFiction'".
104
108
  To ensure you get useful results, you should make your FILTER
105
109
  NOT TOO STRICT, e.g. look for approximate match using LIKE, etc.
@@ -1,16 +1,21 @@
1
1
  import logging
2
2
 
3
3
  from langroid.agent.tool_message import ToolMessage
4
- from langroid.pydantic_v1 import BaseModel
4
+ from langroid.pydantic_v1 import BaseModel, Field
5
5
 
6
6
  logger = logging.getLogger(__name__)
7
7
 
8
8
 
9
9
  class QueryPlan(BaseModel):
10
- original_query: str
11
- query: str
12
- filter: str
13
- dataframe_calc: str = ""
10
+ original_query: str = Field(..., description="The original query for reference")
11
+ query: str = Field(..., description="A possibly NON-EMPTY rephrased query")
12
+ filter: str = Field(
13
+ "",
14
+ description="Filter condition if needed (or empty if no filter is needed)",
15
+ )
16
+ dataframe_calc: str = Field(
17
+ "", description="An optional Pandas-dataframe calculation/aggregation string"
18
+ )
14
19
 
15
20
 
16
21
  class QueryPlanTool(ToolMessage):
@@ -19,8 +24,9 @@ class QueryPlanTool(ToolMessage):
19
24
  Given a user's query, generate a query <plan> consisting of:
20
25
  - <original_query> - the original query for reference
21
26
  - <filter> condition if needed (or empty string if no filter is needed)
22
- - <query> - a possibly rephrased query that can be used to match the CONTENT
23
- of the documents (can be same as <original_query> if no rephrasing is needed)
27
+ - <query> - a possibly NON-EMPTY rephrased query that can be used to match the
28
+ CONTENT of the documents
29
+ (can be same as <original_query> if no rephrasing is needed)
24
30
  - <dataframe_calc> - a Pandas-dataframe calculation/aggregation string
25
31
  that can be used to calculate the answer
26
32
  (or empty string if no calculation is needed).
@@ -34,7 +40,7 @@ class QueryPlanAnswerTool(ToolMessage):
34
40
  Assemble query <plan> and <answer>
35
41
  """
36
42
  plan: QueryPlan
37
- answer: str
43
+ answer: str = Field(..., description="The answer received from the assistant")
38
44
 
39
45
 
40
46
  class QueryPlanFeedbackTool(ToolMessage):
@@ -35,12 +35,10 @@ class ToolMessage(ABC, BaseModel):
35
35
  request (str): name of agent method to map to.
36
36
  purpose (str): purpose of agent method, expressed in general terms.
37
37
  (This is used when auto-generating the tool instruction to the LLM)
38
- result (str): example of result of agent method.
39
38
  """
40
39
 
41
40
  request: str
42
41
  purpose: str
43
- result: str = ""
44
42
 
45
43
  class Config:
46
44
  arbitrary_types_allowed = False
@@ -48,7 +46,7 @@ class ToolMessage(ABC, BaseModel):
48
46
  validate_assignment = True
49
47
  # do not include these fields in the generated schema
50
48
  # since we don't require the LLM to specify them
51
- schema_extra = {"exclude": {"purpose", "result"}}
49
+ schema_extra = {"exclude": {"purpose"}}
52
50
 
53
51
  @classmethod
54
52
  def instructions(cls) -> str:
@@ -110,13 +108,13 @@ class ToolMessage(ABC, BaseModel):
110
108
  return "\n\n".join(examples_jsons)
111
109
 
112
110
  def to_json(self) -> str:
113
- return self.json(indent=4, exclude={"result", "purpose"})
111
+ return self.json(indent=4, exclude={"purpose"})
114
112
 
115
113
  def json_example(self) -> str:
116
- return self.json(indent=4, exclude={"result", "purpose"})
114
+ return self.json(indent=4, exclude={"purpose"})
117
115
 
118
116
  def dict_example(self) -> Dict[str, Any]:
119
- return self.dict(exclude={"result", "purpose"})
117
+ return self.dict(exclude={"purpose"})
120
118
 
121
119
  @classmethod
122
120
  def default_value(cls, f: str) -> Any:
@@ -220,9 +218,7 @@ class ToolMessage(ABC, BaseModel):
220
218
  if "description" not in parameters["properties"][name]:
221
219
  parameters["properties"][name]["description"] = description
222
220
 
223
- excludes = (
224
- ["result", "purpose"] if request else ["request", "result", "purpose"]
225
- )
221
+ excludes = ["purpose"] if request else ["request", "purpose"]
226
222
  # exclude 'excludes' from parameters["properties"]:
227
223
  parameters["properties"] = {
228
224
  field: details
@@ -263,5 +259,5 @@ class ToolMessage(ABC, BaseModel):
263
259
  Returns:
264
260
  Dict[str, Any]: simplified schema
265
261
  """
266
- schema = generate_simple_schema(cls, exclude=["result", "purpose"])
262
+ schema = generate_simple_schema(cls, exclude=["purpose"])
267
263
  return schema
@@ -9,8 +9,6 @@ from typing import (
9
9
  Tuple,
10
10
  Type,
11
11
  TypeVar,
12
- get_args,
13
- get_origin,
14
12
  no_type_check,
15
13
  )
16
14
 
@@ -313,54 +311,6 @@ def pydantic_obj_from_flat_dict(
313
311
  return model(**nested_data)
314
312
 
315
313
 
316
- def clean_schema(model: Type[BaseModel], excludes: List[str] = []) -> Dict[str, Any]:
317
- """
318
- Generate a simple schema for a given Pydantic model,
319
- including inherited fields, with an option to exclude certain fields.
320
- Handles cases where fields are Lists or other generic types and includes
321
- field descriptions if available.
322
-
323
- Args:
324
- model (Type[BaseModel]): The Pydantic model class.
325
- excludes (List[str]): A list of field names to exclude.
326
-
327
- Returns:
328
- Dict[str, Any]: A dictionary representing the simple schema.
329
- """
330
- schema = {}
331
-
332
- for field_name, field_info in model.__fields__.items():
333
- if field_name in excludes:
334
- continue
335
-
336
- field_type = field_info.outer_type_
337
- description = field_info.field_info.description or ""
338
-
339
- # Handle generic types like List[...]
340
- if get_origin(field_type):
341
- inner_types = get_args(field_type)
342
- inner_type_names = [
343
- t.__name__ if hasattr(t, "__name__") else str(t) for t in inner_types
344
- ]
345
- field_type_str = (
346
- f"{get_origin(field_type).__name__}" f'[{", ".join(inner_type_names)}]'
347
- )
348
- schema[field_name] = {"type": field_type_str, "description": description}
349
- elif issubclass(field_type, BaseModel):
350
- # Directly use the nested model's schema,
351
- # integrating it into the current level
352
- nested_schema = clean_schema(field_type, excludes)
353
- schema[field_name] = {**nested_schema, "description": description}
354
- else:
355
- # For basic types, use 'type'
356
- schema[field_name] = {
357
- "type": field_type.__name__,
358
- "description": description,
359
- }
360
-
361
- return schema
362
-
363
-
364
314
  @contextmanager
365
315
  def temp_update(
366
316
  pydantic_object: BaseModel, updates: Dict[str, Any]
@@ -1,14 +1,14 @@
1
1
  import copy
2
2
  import logging
3
3
  from abc import ABC, abstractmethod
4
- from typing import Dict, List, Optional, Sequence, Tuple
4
+ from typing import Dict, List, Optional, Sequence, Tuple, Type
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
8
8
 
9
9
  from langroid.embedding_models.base import EmbeddingModel, EmbeddingModelsConfig
10
10
  from langroid.embedding_models.models import OpenAIEmbeddingsConfig
11
- from langroid.mytypes import Document
11
+ from langroid.mytypes import DocMetaData, Document
12
12
  from langroid.pydantic_v1 import BaseSettings
13
13
  from langroid.utils.algorithms.graph import components, topological_sort
14
14
  from langroid.utils.configuration import settings
@@ -32,6 +32,9 @@ class VectorStoreConfig(BaseSettings):
32
32
  timeout: int = 60
33
33
  host: str = "127.0.0.1"
34
34
  port: int = 6333
35
+ # used when parsing search results back as Document objects
36
+ document_class: Type[Document] = Document
37
+ metadata_class: Type[DocMetaData] = DocMetaData
35
38
  # compose_file: str = "langroid/vector_store/docker-compose-qdrant.yml"
36
39
 
37
40
 
@@ -113,8 +116,7 @@ class VectorStore(ABC):
113
116
  """
114
117
 
115
118
  self.config.collection_name = collection_name
116
- if collection_name not in self.list_collections() or replace:
117
- self.create_collection(collection_name, replace=replace)
119
+ self.config.replace_collection = replace
118
120
 
119
121
  @abstractmethod
120
122
  def create_collection(self, collection_name: str, replace: bool = False) -> None:
@@ -8,7 +8,7 @@ from langroid.embedding_models.base import (
8
8
  )
9
9
  from langroid.embedding_models.models import OpenAIEmbeddingsConfig
10
10
  from langroid.exceptions import LangroidImportError
11
- from langroid.mytypes import DocMetaData, Document
11
+ from langroid.mytypes import Document
12
12
  from langroid.utils.configuration import settings
13
13
  from langroid.utils.output.printing import print_long_text
14
14
  from langroid.vector_store.base import VectorStore, VectorStoreConfig
@@ -200,7 +200,9 @@ class ChromaDB(VectorStore):
200
200
  else:
201
201
  m["window_ids"] = m["window_ids"].split(",")
202
202
  docs = [
203
- Document(content=d, metadata=DocMetaData(**m))
203
+ self.config.document_class(
204
+ content=d, metadata=self.config.metadata_class(**m)
205
+ )
204
206
  for d, m in zip(contents, metadatas)
205
207
  ]
206
208
  return docs
@@ -32,13 +32,7 @@ from langroid.utils.configuration import settings
32
32
  from langroid.utils.pydantic_utils import (
33
33
  dataframe_to_document_model,
34
34
  dataframe_to_documents,
35
- extend_document_class,
36
- extra_metadata,
37
- flatten_pydantic_instance,
38
- flatten_pydantic_model,
39
- nested_dict_from_flat,
40
35
  )
41
- from langroid.utils.system import pydantic_major_version
42
36
  from langroid.vector_store.base import VectorStore, VectorStoreConfig
43
37
 
44
38
  try:
@@ -58,10 +52,6 @@ class LanceDBConfig(VectorStoreConfig):
58
52
  storage_path: str = ".lancedb/data"
59
53
  embedding: EmbeddingModelsConfig = OpenAIEmbeddingsConfig()
60
54
  distance: str = "cosine"
61
- # document_class is used to store in lancedb with right schema,
62
- # and also to retrieve the right type of Documents when searching.
63
- document_class: Type[Document] = Document
64
- flatten: bool = False # flatten Document class into LanceSchema ?
65
55
 
66
56
 
67
57
  class LanceDB(VectorStore):
@@ -78,7 +68,6 @@ class LanceDB(VectorStore):
78
68
  self.port = config.port
79
69
  self.is_from_dataframe = False # were docs ingested from a dataframe?
80
70
  self.df_metadata_columns: List[str] = [] # metadata columns from dataframe
81
- self._setup_schemas(config.document_class)
82
71
 
83
72
  load_dotenv()
84
73
  if self.config.cloud:
@@ -104,40 +93,6 @@ class LanceDB(VectorStore):
104
93
  uri=new_storage_path,
105
94
  )
106
95
 
107
- # Note: Only create collection if a non-null collection name is provided.
108
- # This is useful to delay creation of vecdb until we have a suitable
109
- # collection name (e.g. we could get it from the url or folder path).
110
- if config.collection_name is not None:
111
- self.create_collection(
112
- config.collection_name, replace=config.replace_collection
113
- )
114
-
115
- def _setup_schemas(self, doc_cls: Type[Document] | None) -> None:
116
- try:
117
- doc_cls = doc_cls or self.config.document_class
118
- self.unflattened_schema = self._create_lance_schema(doc_cls)
119
- self.schema = (
120
- self._create_flat_lance_schema(doc_cls)
121
- if self.config.flatten
122
- else self.unflattened_schema
123
- )
124
- except (AttributeError, TypeError) as e:
125
- pydantic_version = pydantic_major_version()
126
- if pydantic_version > 1:
127
- raise ValueError(
128
- f"""
129
- {e}
130
- ====
131
- You are using Pydantic v{pydantic_version},
132
- which is not yet compatible with Langroid's LanceDB integration.
133
- To use Lancedb with Langroid, please install the
134
- latest pydantic 1.x instead of pydantic v2, e.g.
135
- pip install "pydantic<2.0.0"
136
- """
137
- )
138
- else:
139
- raise e
140
-
141
96
  def clear_empty_collections(self) -> int:
142
97
  coll_names = self.list_collections()
143
98
  n_deletes = 0
@@ -234,91 +189,8 @@ class LanceDB(VectorStore):
234
189
  ) # type: ignore
235
190
  return NewModel # type: ignore
236
191
 
237
- def _create_flat_lance_schema(self, doc_cls: Type[Document]) -> Type[BaseModel]:
238
- """
239
- Flat version of the lance_schema, as nested Pydantic schemas are not yet
240
- supported by LanceDB.
241
- """
242
- if not has_lancedb:
243
- raise LangroidImportError("lancedb", "lancedb")
244
- lance_model = self._create_lance_schema(doc_cls)
245
- FlatModel = flatten_pydantic_model(lance_model, base_model=LanceModel)
246
- return FlatModel
247
-
248
192
  def create_collection(self, collection_name: str, replace: bool = False) -> None:
249
- """
250
- Create a collection with the given name, optionally replacing an existing
251
- collection if `replace` is True.
252
- Args:
253
- collection_name (str): Name of the collection to create.
254
- replace (bool): Whether to replace an existing collection
255
- with the same name. Defaults to False.
256
- """
257
- self.config.collection_name = collection_name
258
- collections = self.list_collections()
259
- if collection_name in collections:
260
- coll = self.client.open_table(collection_name)
261
- if coll.head().shape[0] > 0:
262
- logger.warning(f"Non-empty Collection {collection_name} already exists")
263
- if not replace:
264
- logger.warning("Not replacing collection")
265
- return
266
- else:
267
- logger.warning("Recreating fresh collection")
268
- try:
269
- self.client.create_table(
270
- collection_name, schema=self.schema, mode="overwrite"
271
- )
272
- except (AttributeError, TypeError) as e:
273
- pydantic_version = pydantic_major_version()
274
- if pydantic_version > 1:
275
- raise ValueError(
276
- f"""
277
- {e}
278
- ====
279
- You are using Pydantic v{pydantic_version},
280
- which is not yet compatible with Langroid's LanceDB integration.
281
- To use Lancedb with Langroid, please install the
282
- latest pydantic 1.x instead of pydantic v2, e.g.
283
- pip install "pydantic<2.0.0"
284
- """
285
- )
286
- else:
287
- raise e
288
-
289
- if settings.debug:
290
- level = logger.getEffectiveLevel()
291
- logger.setLevel(logging.INFO)
292
- logger.setLevel(level)
293
-
294
- def _maybe_set_doc_class_schema(self, doc: Document) -> None:
295
- """
296
- Set the config.document_class and self.schema based on doc if needed
297
- Args:
298
- doc: an instance of Document, to be added to a collection
299
- """
300
- extra_metadata_fields = extra_metadata(doc, self.config.document_class)
301
- if len(extra_metadata_fields) > 0:
302
- logger.warning(
303
- f"""
304
- Added documents contain extra metadata fields:
305
- {extra_metadata_fields}
306
- which were not present in the original config.document_class.
307
- Trying to change document_class and corresponding schemas.
308
- Overriding LanceDBConfig.document_class with an auto-generated
309
- Pydantic class that includes these extra fields.
310
- If this fails, or you see odd results, it is recommended that you
311
- define a subclass of Document, with metadata of class derived from
312
- DocMetaData, with extra fields defined via
313
- `Field(..., description="...")` declarations,
314
- and set this document class as the value of the
315
- LanceDBConfig.document_class attribute.
316
- """
317
- )
318
-
319
- doc_cls = extend_document_class(doc)
320
- self.config.document_class = doc_cls
321
- self._setup_schemas(doc_cls)
193
+ self.config.replace_collection = replace
322
194
 
323
195
  def add_documents(self, documents: Sequence[Document]) -> None:
324
196
  super().maybe_add_ids(documents)
@@ -329,39 +201,52 @@ class LanceDB(VectorStore):
329
201
  coll_name = self.config.collection_name
330
202
  if coll_name is None:
331
203
  raise ValueError("No collection name set, cannot ingest docs")
332
- self._maybe_set_doc_class_schema(documents[0])
204
+ # self._maybe_set_doc_class_schema(documents[0])
205
+ table_exists = False
333
206
  if (
334
- coll_name not in colls
335
- or self.client.open_table(coll_name).head(1).shape[0] == 0
207
+ coll_name in colls
208
+ and self.client.open_table(coll_name).head(1).shape[0] > 0
336
209
  ):
337
- # collection either doesn't exist or is empty, so replace it,
338
- self.create_collection(coll_name, replace=True)
210
+ # collection exists and is not empty:
211
+ # if replace_collection is True, we'll overwrite the existing collection,
212
+ # else we'll append to it.
213
+ if self.config.replace_collection:
214
+ self.client.drop_table(coll_name)
215
+ else:
216
+ table_exists = True
339
217
 
340
218
  ids = [str(d.id()) for d in documents]
341
219
  # don't insert all at once, batch in chunks of b,
342
220
  # else we get an API error
343
221
  b = self.config.batch_size
344
222
 
345
- def make_batches() -> Generator[List[BaseModel], None, None]:
223
+ def make_batches() -> Generator[List[Dict[str, Any]], None, None]:
346
224
  for i in range(0, len(ids), b):
347
225
  batch = [
348
- self.unflattened_schema(
226
+ dict(
349
227
  id=ids[i + j],
350
228
  vector=embedding_vecs[i + j],
351
229
  **doc.dict(),
352
230
  )
353
231
  for j, doc in enumerate(documents[i : i + b])
354
232
  ]
355
- if self.config.flatten:
356
- batch = [
357
- flatten_pydantic_instance(instance) # type: ignore
358
- for instance in batch
359
- ]
360
233
  yield batch
361
234
 
362
- tbl = self.client.open_table(self.config.collection_name)
363
235
  try:
364
- tbl.add(make_batches())
236
+ if table_exists:
237
+ tbl = self.client.open_table(coll_name)
238
+ tbl.add(make_batches())
239
+ else:
240
+ batch_gen = make_batches()
241
+ batch = next(batch_gen)
242
+ # use first batch to create table...
243
+ tbl = self.client.create_table(
244
+ coll_name,
245
+ data=batch,
246
+ mode="create",
247
+ )
248
+ # ... and add the rest
249
+ tbl.add(batch_gen)
365
250
  except Exception as e:
366
251
  logger.error(
367
252
  f"""
@@ -427,7 +312,6 @@ class LanceDB(VectorStore):
427
312
  exclude=["vector"],
428
313
  )
429
314
  self.config.document_class = doc_cls # type: ignore
430
- self._setup_schemas(doc_cls) # type: ignore
431
315
  else:
432
316
  # collection exists and is not empty, so append to it
433
317
  tbl = self.client.open_table(self.config.collection_name)
@@ -452,35 +336,19 @@ class LanceDB(VectorStore):
452
336
  return self._records_to_docs(records)
453
337
 
454
338
  def _records_to_docs(self, records: List[Dict[str, Any]]) -> List[Document]:
455
- if self.config.flatten:
456
- docs = [
457
- self.unflattened_schema(**nested_dict_from_flat(rec)) for rec in records
458
- ]
459
- else:
460
- try:
461
- docs = [self.schema(**rec) for rec in records]
462
- except ValidationError as e:
463
- raise ValueError(
464
- f"""
465
- Error validating LanceDB result: {e}
466
- HINT: This could happen when you're re-using an
467
- existing LanceDB store with a different schema.
468
- Try deleting your local lancedb storage at `{self.config.storage_path}`
469
- re-ingesting your documents and/or replacing the collections.
470
- """
471
- )
472
-
473
- doc_cls = self.config.document_class
474
- doc_cls_field_names = doc_cls.__fields__.keys()
475
- return [
476
- doc_cls(
477
- **{
478
- field_name: getattr(doc, field_name)
479
- for field_name in doc_cls_field_names
480
- }
339
+ try:
340
+ docs = [self.config.document_class(**rec) for rec in records]
341
+ except ValidationError as e:
342
+ raise ValueError(
343
+ f"""
344
+ Error validating LanceDB result: {e}
345
+ HINT: This could happen when you're re-using an
346
+ existing LanceDB store with a different schema.
347
+ Try deleting your local lancedb storage at `{self.config.storage_path}`
348
+ re-ingesting your documents and/or replacing the collections.
349
+ """
481
350
  )
482
- for doc in docs
483
- ]
351
+ return docs
484
352
 
485
353
  def get_all_documents(self, where: str = "") -> List[Document]:
486
354
  if self.config.collection_name is None:
@@ -380,7 +380,11 @@ class QdrantDB(VectorStore):
380
380
  with_payload=True,
381
381
  with_vectors=False,
382
382
  )
383
- docs += [Document(**record.payload) for record in results] # type: ignore
383
+ docs += [
384
+ self.config.document_class(**record.payload) # type: ignore
385
+ for record in results
386
+ ]
387
+ # ignore
384
388
  if next_page_offset is None:
385
389
  break
386
390
  offset = next_page_offset # type: ignore
@@ -451,7 +455,7 @@ class QdrantDB(VectorStore):
451
455
  ] # 2D list -> 1D list
452
456
  scores = [match.score for match in search_result if match is not None]
453
457
  docs = [
454
- Document(**(match.payload)) # type: ignore
458
+ self.config.document_class(**(match.payload)) # type: ignore
455
459
  for match in search_result
456
460
  if match is not None
457
461
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: langroid
3
- Version: 0.3.1
3
+ Version: 0.5.0
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  License: MIT
6
6
  Author: Prasad Chalasani
@@ -1,22 +1,22 @@
1
1
  langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
2
2
  langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
3
- langroid/agent/base.py,sha256=eeYZ-NYbrepOjUVQS9K0nDhE8x2gKUNjgxFTA24mook,37560
3
+ langroid/agent/base.py,sha256=x6SbInDGJUL_kusr-ligYsCwuaid2CmcRkzlucOXyw0,38999
4
4
  langroid/agent/batch.py,sha256=feRA_yRG768ElOQjrKEefcRv6Aefd_yY7qktuYUQDwc,10040
5
5
  langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  langroid/agent/callbacks/chainlit.py,sha256=UKG2_v4ktfkEaGvdouVRHEqQejEYya2Rli8jrP65TmA,22055
7
- langroid/agent/chat_agent.py,sha256=bTQrIMbN8JxxtnVNC-xzODVLvH3SHmy5vijRjY3cCUE,41564
7
+ langroid/agent/chat_agent.py,sha256=M5tdp1HuFthhMChLNd5XKBWxoiMSTkOuXlM8JoRLiUk,41586
8
8
  langroid/agent/chat_document.py,sha256=MwtNABK28tfSzqCeQlxoauT8uPn8oldU7dlnrX8aQ10,11232
9
9
  langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
11
11
  langroid/agent/openai_assistant.py,sha256=3saI9PwF8IZNJcjqyUy-rj73TInAzdlk14LiOvT_Dkc,33548
12
12
  langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
13
- langroid/agent/special/doc_chat_agent.py,sha256=CXFLfDMEabaBZwZwFgNOaG3E3S86xcBM4txrsMD_70I,54014
14
- langroid/agent/special/lance_doc_chat_agent.py,sha256=USp0U3eTaJzwF_3bdqE7CedSLbaqAi2tm-VzygcyLaA,10175
13
+ langroid/agent/special/doc_chat_agent.py,sha256=8NPAhMnHkFUolQ8EHos40tz5Vwuz_m33NjUfjheXWXY,54569
14
+ langroid/agent/special/lance_doc_chat_agent.py,sha256=Hjpu6u9UPAFMg5J6K97PRFaLbNrGhInC0N9oGi09CeY,10006
15
15
  langroid/agent/special/lance_rag/__init__.py,sha256=QTbs0IVE2ZgDg8JJy1zN97rUUg4uEPH7SLGctFNumk4,174
16
- langroid/agent/special/lance_rag/critic_agent.py,sha256=ufTdpHSeHgCzN85Q0sfWOrpBpsCjGVZdAg5yOH1ogU8,7296
16
+ langroid/agent/special/lance_rag/critic_agent.py,sha256=S3NA3OAO7XaXjCrmwhKB7qCPlgRZFvDxiB5Qra65Zhs,7959
17
17
  langroid/agent/special/lance_rag/lance_rag_task.py,sha256=l_HQgrYY-CX2FwIsS961aEF3bYog3GDYo98fj0C0mSk,2889
18
- langroid/agent/special/lance_rag/query_planner_agent.py,sha256=M4RC_0f98_pwVL7ygrr1VI80LgJiFcmKjJFH0M4tccI,9830
19
- langroid/agent/special/lance_tools.py,sha256=BksGrrNgGgyYWP0HnfAuXMc0KzXooFOzY2l5rDDMtQ8,1467
18
+ langroid/agent/special/lance_rag/query_planner_agent.py,sha256=QB8UYITUCkgSPturEwu_3i4kU8jXxW_jXNGSLlH5tMc,10109
19
+ langroid/agent/special/lance_tools.py,sha256=BznV_r3LAFyybvBRa9KQ0oU7mPM3uQVfri7PFp7M_qc,1894
20
20
  langroid/agent/special/neo4j/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  langroid/agent/special/neo4j/csv_kg_chat.py,sha256=dRsAgMBa1H_EMI2YYgJR2Xyv1D7e4o3G9M64mTewq_c,6409
22
22
  langroid/agent/special/neo4j/neo4j_chat_agent.py,sha256=Y4Zu-m8WKO1xjeBRarV_m4y00Y5n_NR2B-hepjZp_cY,13104
@@ -34,7 +34,7 @@ langroid/agent/special/sql/utils/tools.py,sha256=vFYysk6Vi7HJjII8B4RitA3pt_z3gkS
34
34
  langroid/agent/special/table_chat_agent.py,sha256=d9v2wsblaRx7oMnKhLV7uO_ujvk9gh59pSGvBXyeyNc,9659
35
35
  langroid/agent/task.py,sha256=vKM2dmRYSH4i_VA0lf2axUtZcTGU44rVHz6EyxI4kG0,73990
36
36
  langroid/agent/team.py,sha256=88VNRSmK35WEl620GfBzuIrBASXYSeBZ8yDKX-nP_Bo,75778
37
- langroid/agent/tool_message.py,sha256=wIyZnUcZpxkiRPvM9O3MO3b5BBAdLEEan9kqPbvtApc,9743
37
+ langroid/agent/tool_message.py,sha256=ggxmIZO_wi6x5uD-YWml07Bfgms-ohOSKHyQQdJFi4o,9571
38
38
  langroid/agent/tools/__init__.py,sha256=e-63cfwQNk_ftRKQwgDAJQK16QLbRVWDBILeXIc7wLk,402
39
39
  langroid/agent/tools/duckduckgo_search_tool.py,sha256=NhsCaGZkdv28nja7yveAhSK_w6l_Ftym8agbrdzqgfo,1935
40
40
  langroid/agent/tools/extract_tool.py,sha256=u5lL9rKBzaLBOrRyLnTAZ97pQ1uxyLP39XsWMnpaZpw,3789
@@ -118,20 +118,20 @@ langroid/utils/output/citations.py,sha256=PSY2cpti8W-ZGFMAgj1lYoEIZy0lsniLpCliMs
118
118
  langroid/utils/output/printing.py,sha256=yzPJZN-8_jyOJmI9N_oLwEDfjMwVgk3IDiwnZ4eK_AE,2962
119
119
  langroid/utils/output/status.py,sha256=rzbE7mDJcgNNvdtylCseQcPGCGghtJvVq3lB-OPJ49E,1049
120
120
  langroid/utils/pandas_utils.py,sha256=UctS986Jtl_MvU5rA7-GfrjEHXP7MNu8ePhepv0bTn0,755
121
- langroid/utils/pydantic_utils.py,sha256=FKC8VKXH2uBEpFjnnMgIcEsQn6hs31ftea8zv5pMK9g,21740
121
+ langroid/utils/pydantic_utils.py,sha256=X35qxjE4sSIi-oBMkI1s9fiUIJbpXHLmJqcJ7zsy0jg,19914
122
122
  langroid/utils/system.py,sha256=nvKeeUAj4eviR4kYpcr9h-HYdhqUNMTRBTHBOhz0GdU,5182
123
123
  langroid/utils/web/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
124
  langroid/utils/web/login.py,sha256=1iz9eUAHa87vpKIkzwkmFa00avwFWivDSAr7QUhK7U0,2528
125
125
  langroid/vector_store/__init__.py,sha256=6xBjb_z4QtUy4vz4RuFbcbSwmHrggHL8-q0DwCf3PMM,972
126
- langroid/vector_store/base.py,sha256=tuEPaxJcuU_39sRnUjjNd8D8n8IjP6jrbwQv_ecNpSw,13532
127
- langroid/vector_store/chromadb.py,sha256=bZ5HjwgKgfJj1PUHsatYsrHv-v0dpOfMR2l0tJ2H0_A,7890
128
- langroid/vector_store/lancedb.py,sha256=9x7e_5zo7nLhMbhjYby2ZpBJ-vyawcC0_XAuatfHJf8,20517
126
+ langroid/vector_store/base.py,sha256=pkc4n0yWGVk7iRUOLFkU_ID5NiBFfAcA3lBlPNX79pU,13623
127
+ langroid/vector_store/chromadb.py,sha256=KMfHrgovQEOeJR_LsMpGM8BteJ50wpisDu608RhU3SU,7940
128
+ langroid/vector_store/lancedb.py,sha256=MLubJBhtNIFX6zY0qANqCoB6MlL-oZiJCg9gZp2H2rs,14620
129
129
  langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3HmhHQICXLs,11663
130
130
  langroid/vector_store/momento.py,sha256=qR-zBF1RKVHQZPZQYW_7g-XpTwr46p8HJuYPCkfJbM4,10534
131
131
  langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
132
- langroid/vector_store/qdrantdb.py,sha256=HkcK6jOf-FEDoOiG94MpsYDJr98T7vZkDyG__1BlnWI,17354
133
- pyproject.toml,sha256=x0YGXi9ennkubMYlFO-Eeyp6h2YE_aOBbeRJrUtTm34,7063
134
- langroid-0.3.1.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
135
- langroid-0.3.1.dist-info/METADATA,sha256=9WLpuCfOtRfjB30PZa2jwGmnlotxXRZgHqt6UWiNh4E,54402
136
- langroid-0.3.1.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
137
- langroid-0.3.1.dist-info/RECORD,,
132
+ langroid/vector_store/qdrantdb.py,sha256=v88lqFkepADvlN6lByUj9I4NEKa9X9lWH16uTPPbYrE,17457
133
+ pyproject.toml,sha256=pZsOBzFd2HoJ_P1_r3XSbCuD-wAllBqu6xr75947ITU,7063
134
+ langroid-0.5.0.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
135
+ langroid-0.5.0.dist-info/METADATA,sha256=WRQVNy4M8RgAEw5hQwEh8YgjGX3RoqtbdQP6gxZ6ya4,54402
136
+ langroid-0.5.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
137
+ langroid-0.5.0.dist-info/RECORD,,
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "langroid"
3
- version = "0.3.1"
3
+ version = "0.5.0"
4
4
  description = "Harness LLMs with Multi-Agent Programming"
5
5
  authors = ["Prasad Chalasani <pchalasani@gmail.com>"]
6
6
  readme = "README.md"