langroid 0.32.2__py3-none-any.whl → 0.33.4__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective registries. It is provided for informational purposes only.
Files changed (129)
  1. {langroid-0.32.2.dist-info → langroid-0.33.4.dist-info}/METADATA +149 -123
  2. langroid-0.33.4.dist-info/RECORD +7 -0
  3. {langroid-0.32.2.dist-info → langroid-0.33.4.dist-info}/WHEEL +1 -1
  4. langroid-0.33.4.dist-info/entry_points.txt +4 -0
  5. pyproject.toml +317 -212
  6. langroid/__init__.py +0 -106
  7. langroid/agent/__init__.py +0 -41
  8. langroid/agent/base.py +0 -1983
  9. langroid/agent/batch.py +0 -398
  10. langroid/agent/callbacks/__init__.py +0 -0
  11. langroid/agent/callbacks/chainlit.py +0 -598
  12. langroid/agent/chat_agent.py +0 -1899
  13. langroid/agent/chat_document.py +0 -454
  14. langroid/agent/openai_assistant.py +0 -882
  15. langroid/agent/special/__init__.py +0 -59
  16. langroid/agent/special/arangodb/__init__.py +0 -0
  17. langroid/agent/special/arangodb/arangodb_agent.py +0 -656
  18. langroid/agent/special/arangodb/system_messages.py +0 -186
  19. langroid/agent/special/arangodb/tools.py +0 -107
  20. langroid/agent/special/arangodb/utils.py +0 -36
  21. langroid/agent/special/doc_chat_agent.py +0 -1466
  22. langroid/agent/special/lance_doc_chat_agent.py +0 -262
  23. langroid/agent/special/lance_rag/__init__.py +0 -9
  24. langroid/agent/special/lance_rag/critic_agent.py +0 -198
  25. langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
  26. langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
  27. langroid/agent/special/lance_tools.py +0 -61
  28. langroid/agent/special/neo4j/__init__.py +0 -0
  29. langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
  30. langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
  31. langroid/agent/special/neo4j/system_messages.py +0 -120
  32. langroid/agent/special/neo4j/tools.py +0 -32
  33. langroid/agent/special/relevance_extractor_agent.py +0 -127
  34. langroid/agent/special/retriever_agent.py +0 -56
  35. langroid/agent/special/sql/__init__.py +0 -17
  36. langroid/agent/special/sql/sql_chat_agent.py +0 -654
  37. langroid/agent/special/sql/utils/__init__.py +0 -21
  38. langroid/agent/special/sql/utils/description_extractors.py +0 -190
  39. langroid/agent/special/sql/utils/populate_metadata.py +0 -85
  40. langroid/agent/special/sql/utils/system_message.py +0 -35
  41. langroid/agent/special/sql/utils/tools.py +0 -64
  42. langroid/agent/special/table_chat_agent.py +0 -263
  43. langroid/agent/task.py +0 -2095
  44. langroid/agent/tool_message.py +0 -393
  45. langroid/agent/tools/__init__.py +0 -38
  46. langroid/agent/tools/duckduckgo_search_tool.py +0 -50
  47. langroid/agent/tools/file_tools.py +0 -234
  48. langroid/agent/tools/google_search_tool.py +0 -39
  49. langroid/agent/tools/metaphor_search_tool.py +0 -67
  50. langroid/agent/tools/orchestration.py +0 -303
  51. langroid/agent/tools/recipient_tool.py +0 -235
  52. langroid/agent/tools/retrieval_tool.py +0 -32
  53. langroid/agent/tools/rewind_tool.py +0 -137
  54. langroid/agent/tools/segment_extract_tool.py +0 -41
  55. langroid/agent/xml_tool_message.py +0 -382
  56. langroid/cachedb/__init__.py +0 -17
  57. langroid/cachedb/base.py +0 -58
  58. langroid/cachedb/momento_cachedb.py +0 -108
  59. langroid/cachedb/redis_cachedb.py +0 -153
  60. langroid/embedding_models/__init__.py +0 -39
  61. langroid/embedding_models/base.py +0 -74
  62. langroid/embedding_models/models.py +0 -461
  63. langroid/embedding_models/protoc/__init__.py +0 -0
  64. langroid/embedding_models/protoc/embeddings.proto +0 -19
  65. langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
  66. langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
  67. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
  68. langroid/embedding_models/remote_embeds.py +0 -153
  69. langroid/exceptions.py +0 -65
  70. langroid/language_models/__init__.py +0 -53
  71. langroid/language_models/azure_openai.py +0 -153
  72. langroid/language_models/base.py +0 -678
  73. langroid/language_models/config.py +0 -18
  74. langroid/language_models/mock_lm.py +0 -124
  75. langroid/language_models/openai_gpt.py +0 -1964
  76. langroid/language_models/prompt_formatter/__init__.py +0 -16
  77. langroid/language_models/prompt_formatter/base.py +0 -40
  78. langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
  79. langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
  80. langroid/language_models/utils.py +0 -151
  81. langroid/mytypes.py +0 -84
  82. langroid/parsing/__init__.py +0 -52
  83. langroid/parsing/agent_chats.py +0 -38
  84. langroid/parsing/code_parser.py +0 -121
  85. langroid/parsing/document_parser.py +0 -718
  86. langroid/parsing/para_sentence_split.py +0 -62
  87. langroid/parsing/parse_json.py +0 -155
  88. langroid/parsing/parser.py +0 -313
  89. langroid/parsing/repo_loader.py +0 -790
  90. langroid/parsing/routing.py +0 -36
  91. langroid/parsing/search.py +0 -275
  92. langroid/parsing/spider.py +0 -102
  93. langroid/parsing/table_loader.py +0 -94
  94. langroid/parsing/url_loader.py +0 -111
  95. langroid/parsing/urls.py +0 -273
  96. langroid/parsing/utils.py +0 -373
  97. langroid/parsing/web_search.py +0 -155
  98. langroid/prompts/__init__.py +0 -9
  99. langroid/prompts/dialog.py +0 -17
  100. langroid/prompts/prompts_config.py +0 -5
  101. langroid/prompts/templates.py +0 -141
  102. langroid/pydantic_v1/__init__.py +0 -10
  103. langroid/pydantic_v1/main.py +0 -4
  104. langroid/utils/__init__.py +0 -19
  105. langroid/utils/algorithms/__init__.py +0 -3
  106. langroid/utils/algorithms/graph.py +0 -103
  107. langroid/utils/configuration.py +0 -98
  108. langroid/utils/constants.py +0 -30
  109. langroid/utils/git_utils.py +0 -252
  110. langroid/utils/globals.py +0 -49
  111. langroid/utils/logging.py +0 -135
  112. langroid/utils/object_registry.py +0 -66
  113. langroid/utils/output/__init__.py +0 -20
  114. langroid/utils/output/citations.py +0 -41
  115. langroid/utils/output/printing.py +0 -99
  116. langroid/utils/output/status.py +0 -40
  117. langroid/utils/pandas_utils.py +0 -30
  118. langroid/utils/pydantic_utils.py +0 -602
  119. langroid/utils/system.py +0 -286
  120. langroid/utils/types.py +0 -93
  121. langroid/vector_store/__init__.py +0 -50
  122. langroid/vector_store/base.py +0 -357
  123. langroid/vector_store/chromadb.py +0 -214
  124. langroid/vector_store/lancedb.py +0 -401
  125. langroid/vector_store/meilisearch.py +0 -299
  126. langroid/vector_store/momento.py +0 -278
  127. langroid/vector_store/qdrantdb.py +0 -468
  128. langroid-0.32.2.dist-info/RECORD +0 -128
  129. {langroid-0.32.2.dist-info → langroid-0.33.4.dist-info/licenses}/LICENSE +0 -0
langroid/utils/pandas_utils.py
@@ -1,30 +0,0 @@
- from typing import Any
-
- import pandas as pd
-
-
- def stringify(x: Any) -> str:
-     # Convert x to DataFrame if it is not one already
-     if isinstance(x, pd.Series):
-         df = x.to_frame()
-     elif not isinstance(x, pd.DataFrame):
-         return str(x)
-     else:
-         df = x
-
-     # Truncate long text columns to 1000 characters
-     for col in df.columns:
-         if df[col].dtype == object:
-             df[col] = df[col].apply(
-                 lambda item: (
-                     (item[:1000] + "...")
-                     if isinstance(item, str) and len(item) > 1000
-                     else item
-                 )
-             )
-
-     # Limit to 10 rows
-     df = df.head(10)
-
-     # Convert to string
-     return df.to_string(index=False)  # type: ignore
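The hunk above deletes langroid/utils/pandas_utils.py in full (item 117 in the list, +0 -30). For anyone still pinned to a 0.32.x wheel, a minimal usage sketch of the removed stringify helper; the sample DataFrame and column names are hypothetical, and the import only resolves on wheels where the module still exists:

import pandas as pd

from langroid.utils.pandas_utils import stringify  # not present in the 0.33.4 wheel per this diff

# 13 rows, one cell far beyond the 1000-character cap
df = pd.DataFrame({"text": ["x" * 2000] + ["short"] * 12, "n": list(range(13))})
print(stringify(df))  # long cell truncated to 1000 chars + "...", at most 10 rows printed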
langroid/utils/pydantic_utils.py
@@ -1,602 +0,0 @@
- import logging
- from collections.abc import MutableMapping
- from contextlib import contextmanager
- from typing import (
-     Any,
-     Dict,
-     Generator,
-     List,
-     Optional,
-     Tuple,
-     Type,
-     TypeVar,
-     no_type_check,
- )
-
- import numpy as np
- import pandas as pd
-
- from langroid.mytypes import DocMetaData, Document
- from langroid.pydantic_v1 import BaseModel, ValidationError, create_model
-
- logger = logging.getLogger(__name__)
-
-
- def flatten_dict(
-     d: MutableMapping[str, Any], parent_key: str = "", sep: str = "."
- ) -> Dict[str, Any]:
-     """Flatten a nested dictionary, using a separator in the keys.
-     Useful for pydantic_v1 models with nested fields -- first use
-     dct = mdl.model_dump()
-     to get a nested dictionary, then use this function to flatten it.
-     """
-     items: List[Tuple[str, Any]] = []
-     for k, v in d.items():
-         new_key = f"{parent_key}{sep}{k}" if parent_key else k
-         if isinstance(v, MutableMapping):
-             items.extend(flatten_dict(v, new_key, sep=sep).items())
-         else:
-             items.append((new_key, v))
-     return dict(items)
-
-
- def has_field(model_class: Type[BaseModel], field_name: str) -> bool:
-     """Check if a Pydantic model class has a field with the given name."""
-     return field_name in model_class.__fields__
-
-
- def _recursive_purge_dict_key(d: Dict[str, Any], k: str) -> None:
-     """Remove a key from a dictionary recursively"""
-     if isinstance(d, dict):
-         for key in list(d.keys()):
-             if key == k and "type" in d.keys():
-                 del d[key]
-             else:
-                 _recursive_purge_dict_key(d[key], k)
-
-
- @no_type_check
- def _flatten_pydantic_model_ignore_defaults(
-     model: Type[BaseModel],
-     base_model: Type[BaseModel] = BaseModel,
- ) -> Type[BaseModel]:
-     """
-     Given a possibly nested Pydantic class, return a flattened version of it,
-     by constructing top-level fields, whose names are formed from the path
-     through the nested structure, separated by double underscores.
-
-     This version ignores inherited defaults, so it is incomplete.
-     But retaining it as it is simpler and may be useful in some cases.
-     The full version is `flatten_pydantic_model`, see below.
-
-     Args:
-         model (Type[BaseModel]): The Pydantic model to flatten.
-         base_model (Type[BaseModel], optional): The base model to use for the
-             flattened model. Defaults to BaseModel.
-
-     Returns:
-         Type[BaseModel]: The flattened Pydantic model.
-     """
-
-     flattened_fields: Dict[str, Tuple[Any, ...]] = {}
-     models_to_process = [(model, "")]
-
-     while models_to_process:
-         current_model, current_prefix = models_to_process.pop()
-
-         for name, field in current_model.__annotations__.items():
-             if issubclass(field, BaseModel):
-                 new_prefix = (
-                     f"{current_prefix}{name}__" if current_prefix else f"{name}__"
-                 )
-                 models_to_process.append((field, new_prefix))
-             else:
-                 flattened_name = f"{current_prefix}{name}"
-                 flattened_fields[flattened_name] = (field, ...)
-
-     return create_model(
-         "FlatModel",
-         __base__=base_model,
-         **flattened_fields,
-     )
-
-
- def flatten_pydantic_model(
-     model: Type[BaseModel],
-     base_model: Type[BaseModel] = BaseModel,
- ) -> Type[BaseModel]:
-     """
-     Given a possibly nested Pydantic class, return a flattened version of it,
-     by constructing top-level fields, whose names are formed from the path
-     through the nested structure, separated by double underscores.
-
-     Args:
-         model (Type[BaseModel]): The Pydantic model to flatten.
-         base_model (Type[BaseModel], optional): The base model to use for the
-             flattened model. Defaults to BaseModel.
-
-     Returns:
-         Type[BaseModel]: The flattened Pydantic model.
-     """
-
-     flattened_fields: Dict[str, Any] = {}
-     models_to_process = [(model, "")]
-
-     while models_to_process:
-         current_model, current_prefix = models_to_process.pop()
-
-         for name, field in current_model.__fields__.items():
-             if isinstance(field.outer_type_, type) and issubclass(
-                 field.outer_type_, BaseModel
-             ):
-                 new_prefix = (
-                     f"{current_prefix}{name}__" if current_prefix else f"{name}__"
-                 )
-                 models_to_process.append((field.outer_type_, new_prefix))
-             else:
-                 flattened_name = f"{current_prefix}{name}"
-
-                 if field.default_factory is not None:
-                     flattened_fields[flattened_name] = (
-                         field.outer_type_,
-                         field.default_factory,
-                     )
-                 elif field.default is not None:
-                     flattened_fields[flattened_name] = (
-                         field.outer_type_,
-                         field.default,
-                     )
-                 else:
-                     flattened_fields[flattened_name] = (field.outer_type_, ...)
-
-     return create_model("FlatModel", __base__=base_model, **flattened_fields)
-
-
- def get_field_names(model: Type[BaseModel]) -> List[str]:
-     """Get all field names from a possibly nested Pydantic model."""
-     mdl = flatten_pydantic_model(model)
-     fields = list(mdl.__fields__.keys())
-     # fields may be like a__b__c , so we only want the last part
-     return [f.split("__")[-1] for f in fields]
-
-
- def generate_simple_schema(
-     model: Type[BaseModel], exclude: List[str] = []
- ) -> Dict[str, Any]:
-     """
-     Generates a JSON schema for a Pydantic model,
-     with options to exclude specific fields.
-
-     This function traverses the Pydantic model's fields, including nested models,
-     to generate a dictionary representing the JSON schema. Fields specified in
-     the exclude list will not be included in the generated schema.
-
-     Args:
-         model (Type[BaseModel]): The Pydantic model class to generate the schema for.
-         exclude (List[str]): A list of string field names to be excluded from the
-             generated schema. Defaults to an empty list.
-
-     Returns:
-         Dict[str, Any]: A dictionary representing the JSON schema of the provided model,
-             with specified fields excluded.
-     """
-     if hasattr(model, "__fields__"):
-         output: Dict[str, Any] = {}
-         for field_name, field in model.__fields__.items():
-             if field_name in exclude:
-                 continue  # Skip excluded fields
-
-             field_type = field.type_
-             if issubclass(field_type, BaseModel):
-                 # Recursively generate schema for nested models
-                 output[field_name] = generate_simple_schema(field_type, exclude)
-             else:
-                 # Represent the type as a string here
-                 output[field_name] = {"type": field_type.__name__}
-         return output
-     else:
-         # Non-model type, return a simplified representation
-         return {"type": model.__name__}
-
-
- def flatten_pydantic_instance(
-     instance: BaseModel,
-     prefix: str = "",
-     force_str: bool = False,
- ) -> Dict[str, Any]:
-     """
-     Given a possibly nested Pydantic instance, return a flattened version of it,
-     as a dict where nested traversal paths are translated to keys a__b__c.
-
-     Args:
-         instance (BaseModel): The Pydantic instance to flatten.
-         prefix (str, optional): The prefix to use for the top-level fields.
-         force_str (bool, optional): Whether to force all values to be strings.
-
-     Returns:
-         Dict[str, Any]: The flattened dict.
-
-     """
-     flat_data: Dict[str, Any] = {}
-     for name, value in instance.dict().items():
-         # Assuming nested pydantic model will be a dict here
-         if isinstance(value, dict):
-             nested_flat_data = flatten_pydantic_instance(
-                 instance.__fields__[name].type_(**value),
-                 prefix=f"{prefix}{name}__",
-                 force_str=force_str,
-             )
-             flat_data.update(nested_flat_data)
-         else:
-             flat_data[f"{prefix}{name}"] = str(value) if force_str else value
-     return flat_data
-
-
- def extract_fields(doc: BaseModel, fields: List[str]) -> Dict[str, Any]:
-     """
-     Extract specified fields from a Pydantic object.
-     Supports dotted field names, e.g. "metadata.author".
-     Dotted fields are matched exactly according to the corresponding path.
-     Non-dotted fields are matched against the last part of the path.
-     Clashes ignored.
-     Args:
-         doc (BaseModel): The Pydantic object.
-         fields (List[str]): The list of fields to extract.
-
-     Returns:
-         Dict[str, Any]: A dictionary of field names and values.
-
-     """
-
-     def get_value(obj: BaseModel, path: str) -> Any | None:
-         for part in path.split("."):
-             if hasattr(obj, part):
-                 obj = getattr(obj, part)
-             else:
-                 return None
-         return obj
-
-     def traverse(obj: BaseModel, result: Dict[str, Any], prefix: str = "") -> None:
-         for k, v in obj.__dict__.items():
-             key = f"{prefix}.{k}" if prefix else k
-             if isinstance(v, BaseModel):
-                 traverse(v, result, key)
-             else:
-                 result[key] = v
-
-     result: Dict[str, Any] = {}
-
-     # Extract values for dotted field names and use last part as key
-     for field in fields:
-         if "." in field:
-             value = get_value(doc, field)
-             if value is not None:
-                 key = field.split(".")[-1]
-                 result[key] = value
-
-     # Traverse the object to get non-dotted fields
-     all_fields: Dict[str, Any] = {}
-     traverse(doc, all_fields)
-
-     # Add non-dotted fields to the result,
-     # avoid overwriting if already present from dotted names
-     for field in [f for f in fields if "." not in f]:
-         for key, value in all_fields.items():
-             if key.split(".")[-1] == field and field not in result:
-                 result[field] = value
-
-     return result
-
-
- def nested_dict_from_flat(
-     flat_data: Dict[str, Any],
-     sub_dict: str = "",
- ) -> Dict[str, Any]:
-     """
-     Given a flattened version of a nested dict, reconstruct the nested dict.
-     Field names in the flattened dict are assumed to be of the form
-     "field1__field2__field3", going from top level down.
-
-     Args:
-         flat_data (Dict[str, Any]): The flattened dict.
-         sub_dict (str, optional): The name of the sub-dict to extract from the
-             flattened dict. Defaults to "" (extract the whole dict).
-
-     Returns:
-         Dict[str, Any]: The nested dict.
-
-     """
-     nested_data: Dict[str, Any] = {}
-     for key, value in flat_data.items():
-         if sub_dict != "" and not key.startswith(sub_dict + "__"):
-             continue
-         keys = key.split("__")
-         d = nested_data
-         for k in keys[:-1]:
-             d = d.setdefault(k, {})
-         d[keys[-1]] = value
-     if sub_dict != "":  # e.g. "payload"
-         nested_data = nested_data[sub_dict]
-     return nested_data
-
-
- def pydantic_obj_from_flat_dict(
-     flat_data: Dict[str, Any],
-     model: Type[BaseModel],
-     sub_dict: str = "",
- ) -> BaseModel:
-     """Flattened dict with a__b__c style keys -> nested dict -> pydantic object"""
-     nested_data = nested_dict_from_flat(flat_data, sub_dict)
-     return model(**nested_data)
-
-
- @contextmanager
- def temp_update(
-     pydantic_object: BaseModel, updates: Dict[str, Any]
- ) -> Generator[None, None, None]:
-     original_values = {}
-     try:
-         for field, value in updates.items():
-             if hasattr(pydantic_object, field):
-                 # Save original value
-                 original_values[field] = getattr(pydantic_object, field)
-                 setattr(pydantic_object, field, value)
-             else:
-                 # Raise error for non-existent field
-                 raise AttributeError(
-                     f"The field '{field}' does not exist in the "
-                     f"Pydantic model '{pydantic_object.__class__.__name__}'."
-                 )
-         yield
-     except ValidationError as e:
-         # Handle validation error
-         print(f"Validation error: {e}")
-     finally:
-         # Restore original values
-         for field, value in original_values.items():
-             setattr(pydantic_object, field, value)
-
-
- T = TypeVar("T", bound=BaseModel)
-
-
- @contextmanager
- def temp_params(config: T, field: str, temp: T) -> Generator[None, None, None]:
-     """Context manager to temporarily override `field` in a `config`"""
-     original_vals = getattr(config, field)
-     try:
-         # Apply temporary settings
-         setattr(config, field, temp)
-         yield
-     finally:
-         # Revert to original settings
-         setattr(config, field, original_vals)
-
-
- def numpy_to_python_type(numpy_type: Type[Any]) -> Type[Any]:
-     """Converts a numpy data type to its Python equivalent."""
-     type_mapping = {
-         np.float64: float,
-         np.float32: float,
-         np.int64: int,
-         np.int32: int,
-         np.bool_: bool,
-         # Add other numpy types as necessary
-     }
-     return type_mapping.get(numpy_type, numpy_type)
-
-
- def dataframe_to_pydantic_model(df: pd.DataFrame) -> Type[BaseModel]:
-     """Make a Pydantic model from a dataframe."""
-     fields = {col: (type(df[col].iloc[0]), ...) for col in df.columns}
-     return create_model("DataFrameModel", __base__=BaseModel, **fields)  # type: ignore
-
-
- def dataframe_to_pydantic_objects(df: pd.DataFrame) -> List[BaseModel]:
-     """Make a list of Pydantic objects from a dataframe."""
-     Model = dataframe_to_pydantic_model(df)
-     return [Model(**row.to_dict()) for index, row in df.iterrows()]
-
-
- def first_non_null(series: pd.Series) -> Any | None:
-     """Find the first non-null item in a pandas Series."""
-     for item in series:
-         if item is not None:
-             return item
-     return None
-
-
- def dataframe_to_document_model(
-     df: pd.DataFrame,
-     content: str = "content",
-     metadata: List[str] = [],
-     exclude: List[str] = [],
- ) -> Type[BaseModel]:
-     """
-     Make a subclass of Document from a dataframe.
-
-     Args:
-         df (pd.DataFrame): The dataframe.
-         content (str): The name of the column containing the content,
-             which will map to the Document.content field.
-         metadata (List[str]): A list of column names containing metadata;
-             these will be included in the Document.metadata field.
-         exclude (List[str]): A list of column names to exclude from the model.
-             (e.g. "vector" when lance is used to add an embedding vector to the df)
-
-     Returns:
-         Type[BaseModel]: A pydantic model subclassing Document.
-     """
-
-     # Remove excluded columns
-     df = df.drop(columns=exclude, inplace=False)
-     # Check if metadata_cols is empty
-
-     if metadata:
-         # Define fields for the dynamic subclass of DocMetaData
-         metadata_fields = {
-             col: (
-                 Optional[numpy_to_python_type(type(first_non_null(df[col])))],
-                 None,  # Optional[numpy_to_python_type(type(first_non_null(df[col])))],
-             )
-             for col in metadata
-         }
-         DynamicMetaData = create_model(  # type: ignore
-             "DynamicMetaData", __base__=DocMetaData, **metadata_fields
-         )
-     else:
-         # Use the base DocMetaData class directly
-         DynamicMetaData = DocMetaData
-
-     # Define additional top-level fields for DynamicDocument
-     additional_fields = {
-         col: (
-             Optional[numpy_to_python_type(type(first_non_null(df[col])))],
-             None,  # Optional[numpy_to_python_type(type(first_non_null(df[col])))],
-         )
-         for col in df.columns
-         if col not in metadata and col != content
-     }
-
-     # Create a dynamic subclass of Document
-     DynamicDocumentFields = {
-         **{"metadata": (DynamicMetaData, ...)},
-         **additional_fields,
-     }
-     DynamicDocument = create_model(  # type: ignore
-         "DynamicDocument", __base__=Document, **DynamicDocumentFields
-     )
-
-     def from_df_row(
-         cls: type[BaseModel],
-         row: pd.Series,
-         content: str = "content",
-         metadata: List[str] = [],
-     ) -> BaseModel | None:
-         content_val = row[content] if (content and content in row) else ""
-         metadata_values = (
-             {col: row[col] for col in metadata if col in row} if metadata else {}
-         )
-         additional_values = {
-             col: row[col] for col in additional_fields if col in row and col != content
-         }
-         metadata = DynamicMetaData(**metadata_values)
-         return cls(content=content_val, metadata=metadata, **additional_values)
-
-     # Bind the method to the class
-     DynamicDocument.from_df_row = classmethod(from_df_row)
-
-     return DynamicDocument  # type: ignore
-
-
- def dataframe_to_documents(
-     df: pd.DataFrame,
-     content: str = "content",
-     metadata: List[str] = [],
-     doc_cls: Type[BaseModel] | None = None,
- ) -> List[Document]:
-     """
-     Make a list of Document objects from a dataframe.
-     Args:
-         df (pd.DataFrame): The dataframe.
-         content (str): The name of the column containing the content,
-             which will map to the Document.content field.
-         metadata (List[str]): A list of column names containing metadata;
-             these will be included in the Document.metadata field.
-         doc_cls (Type[BaseModel], optional): A Pydantic model subclassing
-             Document. Defaults to None.
-     Returns:
-         List[Document]: The list of Document objects.
-     """
-     Model = doc_cls or dataframe_to_document_model(df, content, metadata)
-     docs = [
-         Model.from_df_row(row, content, metadata)  # type: ignore
-         for _, row in df.iterrows()
-     ]
-     return [m for m in docs if m is not None]
-
-
- def extra_metadata(document: Document, doc_cls: Type[Document] = Document) -> List[str]:
-     """
-     Checks for extra fields in a document's metadata that are not defined in the
-     original metadata schema.
-
-     Args:
-         document (Document): The document instance to check for extra fields.
-         doc_cls (Type[Document]): The class type derived from Document, used
-             as a reference to identify extra fields in the document's metadata.
-
-     Returns:
-         List[str]: A list of strings representing the keys of the extra fields found
-             in the document's metadata.
-     """
-     # Convert metadata to dict, including extra fields.
-     metadata_fields = set(document.metadata.dict().keys())
-
-     # Get defined fields in the metadata of doc_cls
-     defined_fields = set(doc_cls.__fields__["metadata"].type_.__fields__.keys())
-
-     # Identify extra fields not in defined fields.
-     extra_fields = list(metadata_fields - defined_fields)
-
-     return extra_fields
-
-
- def extend_document_class(d: Document) -> Type[Document]:
-     """Generates a new pydantic class based on a given document instance.
-
-     This function dynamically creates a new pydantic class with additional
-     fields based on the "extra" metadata fields present in the given document
-     instance. The new class is a subclass of the original Document class, with
-     the original metadata fields retained and extra fields added as normal
-     fields to the metadata.
-
-     Args:
-         d: An instance of the Document class.
-
-     Returns:
-         A new subclass of the Document class that includes the additional fields
-         found in the metadata of the given document instance.
-     """
-     # Extract the fields from the original metadata class, including types,
-     # correctly handling special types like List[str].
-     original_metadata_fields = {
-         k: (v.outer_type_ if v.shape != 1 else v.type_, ...)
-         for k, v in DocMetaData.__fields__.items()
-     }
-     # Extract extra fields from the metadata instance with their types
-     extra_fields = {
-         k: (type(v), ...)
-         for k, v in d.metadata.__dict__.items()
-         if k not in DocMetaData.__fields__
-     }
-
-     # Combine original and extra fields for the new metadata class
-     combined_fields = {**original_metadata_fields, **extra_fields}
-
-     # Create a new metadata class with combined fields
-     NewMetadataClass = create_model(  # type: ignore
-         "ExtendedDocMetadata", **combined_fields, __base__=DocMetaData
-     )
-     # NewMetadataClass.__config__.arbitrary_types_allowed = True
-
-     # Create a new document class using the new metadata class
-     NewDocumentClass = create_model(
-         "ExtendedDocument",
-         content=(str, ...),
-         metadata=(NewMetadataClass, ...),
-         __base__=Document,
-     )
-
-     return NewDocumentClass
-
-
- class PydanticWrapper(BaseModel):
-     value: Any
-
-
- def get_pydantic_wrapper(value_type: type) -> type[PydanticWrapper]:
-     class WrappedValue(PydanticWrapper):
-         value: value_type  # type: ignore
-
-     return WrappedValue
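The hunk above deletes langroid/utils/pydantic_utils.py in full (item 118, +0 -602). As a reference for code pinned to 0.32.x, a small round-trip sketch of the flatten/unflatten helpers defined in that module; the Inner/Outer models are hypothetical, and the imports only resolve on wheels that still ship these modules (both langroid.pydantic_v1 and langroid.utils.pydantic_utils are absent from the 0.33.4 wheel per this diff):

from langroid.pydantic_v1 import BaseModel
from langroid.utils.pydantic_utils import (
    flatten_pydantic_instance,
    nested_dict_from_flat,
    pydantic_obj_from_flat_dict,
)

class Inner(BaseModel):
    author: str
    year: int

class Outer(BaseModel):
    content: str
    meta: Inner

obj = Outer(content="hello", meta=Inner(author="Ada", year=1843))
flat = flatten_pydantic_instance(obj)
# nested paths become double-underscore keys:
# {'content': 'hello', 'meta__author': 'Ada', 'meta__year': 1843}
assert nested_dict_from_flat(flat) == obj.dict()
assert pydantic_obj_from_flat_dict(flat, Outer) == obj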