langroid 0.33.6__py3-none-any.whl → 0.33.8__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (129)
  1. langroid/__init__.py +106 -0
  2. langroid/agent/__init__.py +41 -0
  3. langroid/agent/base.py +1983 -0
  4. langroid/agent/batch.py +398 -0
  5. langroid/agent/callbacks/__init__.py +0 -0
  6. langroid/agent/callbacks/chainlit.py +598 -0
  7. langroid/agent/chat_agent.py +1899 -0
  8. langroid/agent/chat_document.py +454 -0
  9. langroid/agent/openai_assistant.py +882 -0
  10. langroid/agent/special/__init__.py +59 -0
  11. langroid/agent/special/arangodb/__init__.py +0 -0
  12. langroid/agent/special/arangodb/arangodb_agent.py +656 -0
  13. langroid/agent/special/arangodb/system_messages.py +186 -0
  14. langroid/agent/special/arangodb/tools.py +107 -0
  15. langroid/agent/special/arangodb/utils.py +36 -0
  16. langroid/agent/special/doc_chat_agent.py +1466 -0
  17. langroid/agent/special/lance_doc_chat_agent.py +262 -0
  18. langroid/agent/special/lance_rag/__init__.py +9 -0
  19. langroid/agent/special/lance_rag/critic_agent.py +198 -0
  20. langroid/agent/special/lance_rag/lance_rag_task.py +82 -0
  21. langroid/agent/special/lance_rag/query_planner_agent.py +260 -0
  22. langroid/agent/special/lance_tools.py +61 -0
  23. langroid/agent/special/neo4j/__init__.py +0 -0
  24. langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
  25. langroid/agent/special/neo4j/neo4j_chat_agent.py +433 -0
  26. langroid/agent/special/neo4j/system_messages.py +120 -0
  27. langroid/agent/special/neo4j/tools.py +32 -0
  28. langroid/agent/special/relevance_extractor_agent.py +127 -0
  29. langroid/agent/special/retriever_agent.py +56 -0
  30. langroid/agent/special/sql/__init__.py +17 -0
  31. langroid/agent/special/sql/sql_chat_agent.py +654 -0
  32. langroid/agent/special/sql/utils/__init__.py +21 -0
  33. langroid/agent/special/sql/utils/description_extractors.py +190 -0
  34. langroid/agent/special/sql/utils/populate_metadata.py +85 -0
  35. langroid/agent/special/sql/utils/system_message.py +35 -0
  36. langroid/agent/special/sql/utils/tools.py +64 -0
  37. langroid/agent/special/table_chat_agent.py +263 -0
  38. langroid/agent/task.py +2099 -0
  39. langroid/agent/tool_message.py +393 -0
  40. langroid/agent/tools/__init__.py +38 -0
  41. langroid/agent/tools/duckduckgo_search_tool.py +50 -0
  42. langroid/agent/tools/file_tools.py +234 -0
  43. langroid/agent/tools/google_search_tool.py +39 -0
  44. langroid/agent/tools/metaphor_search_tool.py +68 -0
  45. langroid/agent/tools/orchestration.py +303 -0
  46. langroid/agent/tools/recipient_tool.py +235 -0
  47. langroid/agent/tools/retrieval_tool.py +32 -0
  48. langroid/agent/tools/rewind_tool.py +137 -0
  49. langroid/agent/tools/segment_extract_tool.py +41 -0
  50. langroid/agent/xml_tool_message.py +382 -0
  51. langroid/cachedb/__init__.py +17 -0
  52. langroid/cachedb/base.py +58 -0
  53. langroid/cachedb/momento_cachedb.py +108 -0
  54. langroid/cachedb/redis_cachedb.py +153 -0
  55. langroid/embedding_models/__init__.py +39 -0
  56. langroid/embedding_models/base.py +74 -0
  57. langroid/embedding_models/models.py +461 -0
  58. langroid/embedding_models/protoc/__init__.py +0 -0
  59. langroid/embedding_models/protoc/embeddings.proto +19 -0
  60. langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
  61. langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
  62. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
  63. langroid/embedding_models/remote_embeds.py +153 -0
  64. langroid/exceptions.py +71 -0
  65. langroid/language_models/__init__.py +53 -0
  66. langroid/language_models/azure_openai.py +153 -0
  67. langroid/language_models/base.py +678 -0
  68. langroid/language_models/config.py +18 -0
  69. langroid/language_models/mock_lm.py +124 -0
  70. langroid/language_models/openai_gpt.py +1964 -0
  71. langroid/language_models/prompt_formatter/__init__.py +16 -0
  72. langroid/language_models/prompt_formatter/base.py +40 -0
  73. langroid/language_models/prompt_formatter/hf_formatter.py +132 -0
  74. langroid/language_models/prompt_formatter/llama2_formatter.py +75 -0
  75. langroid/language_models/utils.py +151 -0
  76. langroid/mytypes.py +84 -0
  77. langroid/parsing/__init__.py +52 -0
  78. langroid/parsing/agent_chats.py +38 -0
  79. langroid/parsing/code_parser.py +121 -0
  80. langroid/parsing/document_parser.py +718 -0
  81. langroid/parsing/para_sentence_split.py +62 -0
  82. langroid/parsing/parse_json.py +155 -0
  83. langroid/parsing/parser.py +313 -0
  84. langroid/parsing/repo_loader.py +790 -0
  85. langroid/parsing/routing.py +36 -0
  86. langroid/parsing/search.py +275 -0
  87. langroid/parsing/spider.py +102 -0
  88. langroid/parsing/table_loader.py +94 -0
  89. langroid/parsing/url_loader.py +115 -0
  90. langroid/parsing/urls.py +273 -0
  91. langroid/parsing/utils.py +373 -0
  92. langroid/parsing/web_search.py +156 -0
  93. langroid/prompts/__init__.py +9 -0
  94. langroid/prompts/dialog.py +17 -0
  95. langroid/prompts/prompts_config.py +5 -0
  96. langroid/prompts/templates.py +141 -0
  97. langroid/pydantic_v1/__init__.py +10 -0
  98. langroid/pydantic_v1/main.py +4 -0
  99. langroid/utils/__init__.py +19 -0
  100. langroid/utils/algorithms/__init__.py +3 -0
  101. langroid/utils/algorithms/graph.py +103 -0
  102. langroid/utils/configuration.py +98 -0
  103. langroid/utils/constants.py +30 -0
  104. langroid/utils/git_utils.py +252 -0
  105. langroid/utils/globals.py +49 -0
  106. langroid/utils/logging.py +135 -0
  107. langroid/utils/object_registry.py +66 -0
  108. langroid/utils/output/__init__.py +20 -0
  109. langroid/utils/output/citations.py +41 -0
  110. langroid/utils/output/printing.py +99 -0
  111. langroid/utils/output/status.py +40 -0
  112. langroid/utils/pandas_utils.py +30 -0
  113. langroid/utils/pydantic_utils.py +602 -0
  114. langroid/utils/system.py +286 -0
  115. langroid/utils/types.py +93 -0
  116. langroid/vector_store/__init__.py +50 -0
  117. langroid/vector_store/base.py +359 -0
  118. langroid/vector_store/chromadb.py +214 -0
  119. langroid/vector_store/lancedb.py +406 -0
  120. langroid/vector_store/meilisearch.py +299 -0
  121. langroid/vector_store/momento.py +278 -0
  122. langroid/vector_store/qdrantdb.py +468 -0
  123. {langroid-0.33.6.dist-info → langroid-0.33.8.dist-info}/METADATA +95 -94
  124. langroid-0.33.8.dist-info/RECORD +127 -0
  125. {langroid-0.33.6.dist-info → langroid-0.33.8.dist-info}/WHEEL +1 -1
  126. langroid-0.33.6.dist-info/RECORD +0 -7
  127. langroid-0.33.6.dist-info/entry_points.txt +0 -4
  128. pyproject.toml +0 -356
  129. {langroid-0.33.6.dist-info → langroid-0.33.8.dist-info}/licenses/LICENSE +0 -0
langroid/parsing/para_sentence_split.py
@@ -0,0 +1,62 @@
+ import re
+ from typing import Callable, List
+
+ from bs4 import BeautifulSoup
+
+
+ def remove_extra_whitespace(s: str) -> str:
+     lines = s.split("\n")
+     cleaned_lines = [" ".join(line.split()) for line in lines]
+     return "\n".join(cleaned_lines)
+
+
+ def custom_sent_tokenize(text: str) -> List[str]:
+     sentences = [
+         sentence.strip()
+         for sentence in re.split(r"\.\s|\.\n", text)
+         if sentence.strip()
+     ]
+     # append a period if the sentence does not end with one
+     return [s + "." if s[-1] != "." else s for s in sentences]
+
+
+ def create_chunks(
+     text: str, chunk_size: int, length_fn: Callable[[str], int]
+ ) -> List[str]:
+     def _chunk_sentences(sentences: List[str], chunk_size: int) -> List[str]:
+         chunks = []
+         current_chunk: List[str] = []
+         current_chunk_length = 0
+
+         for sentence in sentences:
+             sentence_length = length_fn(sentence)
+             if current_chunk_length + sentence_length > chunk_size:
+                 if current_chunk:
+                     chunks.append(" ".join(current_chunk))
+                 current_chunk = [sentence]
+                 current_chunk_length = sentence_length
+             else:
+                 current_chunk.append(sentence)
+                 current_chunk_length += sentence_length
+
+         if current_chunk:
+             new_chunk = " ".join(current_chunk).strip()
+             if new_chunk:
+                 chunks.append(" ".join(current_chunk).strip())
+
+         return chunks
+
+     soup = BeautifulSoup(text, "html.parser")
+     text = soup.get_text()
+     # First, try to split the document into paragraphs
+     paragraphs = text.split("\n\n")
+
+     # If paragraphs are too long, split them into sentences
+     if any(length_fn(p) > chunk_size for p in paragraphs):
+         sentences = custom_sent_tokenize(text)
+         chunks = _chunk_sentences(sentences, chunk_size)
+     else:
+         chunks = paragraphs
+
+     chunks = [chunk.strip() for chunk in chunks if chunk.strip() != ""]
+     return chunks
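
For orientation, the new para_sentence_split helpers above can be exercised on their own; a minimal sketch (the sample text and the word-count length function are illustrative, not part of the package):

from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace

text = remove_extra_whitespace(
    "First paragraph,   with   extra   spaces.\n\n"
    "Second paragraph. It has two sentences."
)
# A simple word count stands in for a token counter here; the Parser class
# later in this diff passes its tiktoken-based num_tokens method instead.
chunks = create_chunks(text, chunk_size=10, length_fn=lambda s: len(s.split()))
# -> roughly one chunk per paragraph, since neither exceeds 10 "tokens"
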
langroid/parsing/parse_json.py
@@ -0,0 +1,155 @@
+ import ast
+ import json
+ from datetime import datetime
+ from typing import Any, Dict, Iterator, List, Union
+
+ import yaml
+ from json_repair import repair_json
+ from pyparsing import nestedExpr, originalTextFor
+
+
+ def is_valid_json(json_str: str) -> bool:
+     """Check if the input string is a valid JSON.
+
+     Args:
+         json_str (str): The input string to check.
+
+     Returns:
+         bool: True if the input string is a valid JSON, False otherwise.
+     """
+     try:
+         json.loads(json_str)
+         return True
+     except ValueError:
+         return False
+
+
+ def flatten(nested_list) -> Iterator[str]:  # type: ignore
+     """Flatten a nested list into a single list of strings"""
+     for item in nested_list:
+         if isinstance(item, (list, tuple)):
+             for subitem in flatten(item):
+                 yield subitem
+         else:
+             yield item
+
+
+ def get_json_candidates(s: str) -> List[str]:
+     """Get top-level JSON candidates, i.e. strings between curly braces."""
+     # Define the grammar for matching curly braces
+     curly_braces = originalTextFor(nestedExpr("{", "}"))
+
+     # Parse the string
+     try:
+         results = curly_braces.searchString(s)
+         # Properly convert nested lists to strings
+         return [r[0] for r in results]
+     except Exception:
+         return []
+
+
+ def parse_imperfect_json(json_string: str) -> Union[Dict[str, Any], List[Any]]:
+     if not json_string.strip():
+         raise ValueError("Empty string is not valid JSON")
+
+     # First, try parsing with ast.literal_eval
+     try:
+         result = ast.literal_eval(json_string)
+         if isinstance(result, (dict, list)):
+             return result
+     except (ValueError, SyntaxError):
+         pass
+
+     # If ast.literal_eval fails or returns non-dict/list, try repair_json
+     json_repaired_obj = repair_json(json_string, return_objects=True)
+     if isinstance(json_repaired_obj, (dict, list)):
+         return json_repaired_obj
+     else:
+         try:
+             # fallback on yaml
+             yaml_result = yaml.safe_load(json_string)
+             if isinstance(yaml_result, (dict, list)):
+                 return yaml_result
+         except yaml.YAMLError:
+             pass
+
+     # If all methods fail, raise ValueError
+     raise ValueError(f"Unable to parse as JSON: {json_string}")
+
+
+ def try_repair_json_yaml(s: str) -> str | None:
+     """
+     Attempt to load as json, and if it fails, try repairing the JSON.
+     If that fails, replace any \n with space as a last resort.
+     NOTE - replacing \n with space will result in format loss,
+     which may matter in generated code (e.g. python, toml, etc)
+     """
+     s_repaired_obj = repair_json(s, return_objects=True)
+     if isinstance(s_repaired_obj, list):
+         if len(s_repaired_obj) > 0:
+             s_repaired_obj = s_repaired_obj[0]
+         else:
+             s_repaired_obj = None
+     if s_repaired_obj is not None:
+         return json.dumps(s_repaired_obj)  # type: ignore
+     else:
+         try:
+             yaml_result = yaml.safe_load(s)
+             if isinstance(yaml_result, dict):
+                 return json.dumps(yaml_result)
+         except yaml.YAMLError:
+             pass
+         # If it still fails, replace any \n with space as a last resort
+         s = s.replace("\n", " ")
+         if is_valid_json(s):
+             return s
+         else:
+             return None  # all failed
+
+
+ def extract_top_level_json(s: str) -> List[str]:
+     """Extract all top-level JSON-formatted substrings from a given string.
+
+     Args:
+         s (str): The input string to search for JSON substrings.
+
+     Returns:
+         List[str]: A list of top-level JSON-formatted substrings.
+     """
+     # Find JSON object and array candidates
+     json_candidates = get_json_candidates(s)
+     maybe_repaired_jsons = map(try_repair_json_yaml, json_candidates)
+
+     return [candidate for candidate in maybe_repaired_jsons if candidate is not None]
+
+
+ def top_level_json_field(s: str, f: str) -> Any:
+     """
+     Extract the value of a field f from a top-level JSON object.
+     If there are multiple, just return the first.
+
+     Args:
+         s (str): The input string to search for JSON substrings.
+         f (str): The field to extract from the JSON object.
+
+     Returns:
+         str: The value of the field f in the top-level JSON object, if any.
+             Otherwise, return an empty string.
+     """
+
+     jsons = extract_top_level_json(s)
+     if len(jsons) == 0:
+         return ""
+     for j in jsons:
+         json_data = json.loads(j)
+         if f in json_data:
+             return json_data[f]
+
+     return ""
+
+
+ def datetime_to_json(obj: Any) -> Any:
+     if isinstance(obj, datetime):
+         return obj.isoformat()
+     # Let json.dumps() handle the raising of TypeError for non-serializable objects
+     return obj
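
The parse_json helpers above target JSON-ish output from LLMs; a minimal sketch of their intended use (the sample strings are illustrative, and the exact repaired output is indicative only):

from langroid.parsing.parse_json import extract_top_level_json, parse_imperfect_json

# A malformed JSON object embedded in prose: single quotes and a trailing comma.
llm_output = "Tool call: {'request': 'square', 'number': 12,} as requested."
print(extract_top_level_json(llm_output))
# -> something like ['{"request": "square", "number": 12}']

# parse_imperfect_json falls back from ast.literal_eval to json_repair to yaml.
print(parse_imperfect_json("{'a': 1, 'b': [2, 3]}"))  # -> {'a': 1, 'b': [2, 3]}
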
langroid/parsing/parser.py
@@ -0,0 +1,313 @@
+ import logging
+ from enum import Enum
+ from typing import Dict, List, Literal
+
+ import tiktoken
+
+ from langroid.mytypes import Document
+ from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
+ from langroid.pydantic_v1 import BaseSettings
+ from langroid.utils.object_registry import ObjectRegistry
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.WARNING)
+
+
+ class Splitter(str, Enum):
+     TOKENS = "tokens"
+     PARA_SENTENCE = "para_sentence"
+     SIMPLE = "simple"
+
+
+ class PdfParsingConfig(BaseSettings):
+     library: Literal[
+         "fitz",
+         "pdfplumber",
+         "pypdf",
+         "unstructured",
+         "pdf2image",
+     ] = "pdfplumber"
+
+
+ class DocxParsingConfig(BaseSettings):
+     library: Literal["python-docx", "unstructured"] = "unstructured"
+
+
+ class DocParsingConfig(BaseSettings):
+     library: Literal["unstructured"] = "unstructured"
+
+
+ class ParsingConfig(BaseSettings):
+     splitter: str = Splitter.TOKENS
+     chunk_size: int = 200  # aim for this many tokens per chunk
+     overlap: int = 50  # overlap between chunks
+     max_chunks: int = 10_000
+     # aim to have at least this many chars per chunk when truncating due to punctuation
+     min_chunk_chars: int = 350
+     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
+     n_similar_docs: int = 4
+     n_neighbor_ids: int = 5  # window size to store around each chunk
+     separators: List[str] = ["\n\n", "\n", " ", ""]
+     token_encoding_model: str = "text-embedding-ada-002"
+     pdf: PdfParsingConfig = PdfParsingConfig()
+     docx: DocxParsingConfig = DocxParsingConfig()
+     doc: DocParsingConfig = DocParsingConfig()
+
+
+ class Parser:
+     def __init__(self, config: ParsingConfig):
+         self.config = config
+         try:
+             self.tokenizer = tiktoken.encoding_for_model(config.token_encoding_model)
+         except Exception:
+             self.tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")
+
+     def num_tokens(self, text: str) -> int:
+         tokens = self.tokenizer.encode(text)
+         return len(tokens)
+
+     def truncate_tokens(self, text: str, max_tokens: int) -> str:
+         tokens = self.tokenizer.encode(text)
+         if len(tokens) <= max_tokens:
+             return text
+         return self.tokenizer.decode(tokens[:max_tokens])
+
+     def add_window_ids(self, chunks: List[Document]) -> None:
+         """Chunks may belong to multiple docs, but for each doc,
+         they appear consecutively. Add window_ids in metadata"""
+
+         # discard empty chunks
+         chunks = [c for c in chunks if c.content.strip() != ""]
+         if len(chunks) == 0:
+             return
+         # The original metadata.id (if any) is ignored since it will be same for all
+         # chunks and is useless. We want a distinct id for each chunk.
+         # ASSUMPTION: all chunks c of a doc have same c.metadata.id !
+         orig_ids = [c.metadata.id for c in chunks]
+         ids = [ObjectRegistry.new_id() for c in chunks]
+         id2chunk = {id: c for id, c in zip(ids, chunks)}
+
+         # group the ids by orig_id
+         # (each distinct orig_id refers to a different document)
+         orig_id_to_ids: Dict[str, List[str]] = {}
+         for orig_id, id in zip(orig_ids, ids):
+             if orig_id not in orig_id_to_ids:
+                 orig_id_to_ids[orig_id] = []
+             orig_id_to_ids[orig_id].append(id)
+
+         # now each orig_id maps to a sequence of ids within a single doc
+
+         k = self.config.n_neighbor_ids
+         for orig, ids in orig_id_to_ids.items():
+             # ids are consecutive chunks in a single doc
+             n = len(ids)
+             window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
+             for i, _ in enumerate(ids):
+                 c = id2chunk[ids[i]]
+                 c.metadata.window_ids = window_ids[i]
+                 c.metadata.id = ids[i]
+                 c.metadata.is_chunk = True
+
+     def split_simple(self, docs: List[Document]) -> List[Document]:
+         if len(self.config.separators) == 0:
+             raise ValueError("Must have at least one separator")
+         final_docs = []
+
+         for d in docs:
+             if d.content.strip() == "":
+                 continue
+             chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
+             # note we are ensuring we COPY the document metadata into each chunk,
+             # which ensures all chunks of a given doc have same metadata
+             # (and in particular same metadata.id, which is important later for
+             # add_window_ids)
+             chunk_docs = [
+                 Document(
+                     content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                 )
+                 for c in chunks
+                 if c.strip() != ""
+             ]
+             self.add_window_ids(chunk_docs)
+             final_docs += chunk_docs
+         return final_docs
+
+     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
+         chunks = docs
+         while True:
+             un_splittables = 0
+             split_chunks = []
+             for c in chunks:
+                 if c.content.strip() == "":
+                     continue
+                 if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
+                     # small chunk: no need to split
+                     split_chunks.append(c)
+                     continue
+                 splits = self._split_para_sentence_once([c])
+                 un_splittables += len(splits) == 1
+                 split_chunks += splits
+             if len(split_chunks) == len(chunks):
+                 if un_splittables > 0:
+                     max_len = max([self.num_tokens(p.content) for p in chunks])
+                     logger.warning(
+                         f"""
+                         Unable to split {un_splittables} chunks
+                         using chunk_size = {self.config.chunk_size}.
+                         Max chunk size is {max_len} tokens.
+                         """
+                     )
+                 break  # we won't be able to shorten them with current settings
+             chunks = split_chunks.copy()
+
+         self.add_window_ids(chunks)
+         return chunks
+
+     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
+         final_chunks = []
+         for d in docs:
+             if d.content.strip() == "":
+                 continue
+             chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
+             # note we are ensuring we COPY the document metadata into each chunk,
+             # which ensures all chunks of a given doc have same metadata
+             # (and in particular same metadata.id, which is important later for
+             # add_window_ids)
+             chunk_docs = [
+                 Document(
+                     content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                 )
+                 for c in chunks
+                 if c.strip() != ""
+             ]
+             final_chunks += chunk_docs
+
+         return final_chunks
+
+     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
+         final_docs = []
+         for d in docs:
+             chunks = self.chunk_tokens(d.content)
+             # note we are ensuring we COPY the document metadata into each chunk,
+             # which ensures all chunks of a given doc have same metadata
+             # (and in particular same metadata.id, which is important later for
+             # add_window_ids)
+             chunk_docs = [
+                 Document(
+                     content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                 )
+                 for c in chunks
+                 if c.strip() != ""
+             ]
+             self.add_window_ids(chunk_docs)
+             final_docs += chunk_docs
+         return final_docs
+
+     def chunk_tokens(
+         self,
+         text: str,
+     ) -> List[str]:
+         """
+         Split a text into chunks of ~CHUNK_SIZE tokens,
+         based on punctuation and newline boundaries.
+         Adapted from
+         https://github.com/openai/chatgpt-retrieval-plugin/blob/main/services/chunks.py
+
+         Args:
+             text: The text to split into chunks.
+
+         Returns:
+             A list of text chunks, each of which is a string of tokens
+             roughly self.config.chunk_size tokens long.
+         """
+         # Return an empty list if the text is empty or whitespace
+         if not text or text.isspace():
+             return []
+
+         # Tokenize the text
+         tokens = self.tokenizer.encode(text, disallowed_special=())
+
+         # Initialize an empty list of chunks
+         chunks = []
+
+         # Initialize a counter for the number of chunks
+         num_chunks = 0
+
+         # Loop until all tokens are consumed
+         while tokens and num_chunks < self.config.max_chunks:
+             # Take the first chunk_size tokens as a chunk
+             chunk = tokens[: self.config.chunk_size]
+
+             # Decode the chunk into text
+             chunk_text = self.tokenizer.decode(chunk)
+
+             # Skip the chunk if it is empty or whitespace
+             if not chunk_text or chunk_text.isspace():
+                 # Remove the tokens corresponding to the chunk text
+                 # from remaining tokens
+                 tokens = tokens[len(chunk) :]
+                 # Continue to the next iteration of the loop
+                 continue
+
+             # Find the last period or punctuation mark in the chunk
+             last_punctuation = max(
+                 chunk_text.rfind("."),
+                 chunk_text.rfind("?"),
+                 chunk_text.rfind("!"),
+                 chunk_text.rfind("\n"),
+             )
+
+             # If there is a punctuation mark, and the last punctuation index is
+             # after MIN_CHUNK_SIZE_CHARS
+             if (
+                 last_punctuation != -1
+                 and last_punctuation > self.config.min_chunk_chars
+             ):
+                 # Truncate the chunk text at the punctuation mark
+                 chunk_text = chunk_text[: last_punctuation + 1]
+
+             # Remove any newline characters and strip any leading or
+             # trailing whitespace
+             chunk_text_to_append = chunk_text.replace("\n", " ").strip()
+
+             if len(chunk_text_to_append) > self.config.discard_chunk_chars:
+                 # Append the chunk text to the list of chunks
+                 chunks.append(chunk_text_to_append)
+
+             # Remove the tokens corresponding to the chunk text
+             # from the remaining tokens
+             tokens = tokens[
+                 len(self.tokenizer.encode(chunk_text, disallowed_special=())) :
+             ]
+
+             # Increment the number of chunks
+             num_chunks += 1
+
+         # There may be remaining tokens, but we discard them
+         # since we have already reached the maximum number of chunks
+
+         return chunks
+
+     def split(self, docs: List[Document]) -> List[Document]:
+         if len(docs) == 0:
+             return []
+         # create ids in metadata of docs if absent:
+         # we need this to distinguish docs later in add_window_ids
+         for d in docs:
+             if d.metadata.id in [None, ""]:
+                 d.metadata.id = ObjectRegistry.new_id()
+         # some docs are already splits, so don't split them further!
+         chunked_docs = [d for d in docs if d.metadata.is_chunk]
+         big_docs = [d for d in docs if not d.metadata.is_chunk]
+         if len(big_docs) == 0:
+             return chunked_docs
+         if self.config.splitter == Splitter.PARA_SENTENCE:
+             big_doc_chunks = self.split_para_sentence(big_docs)
+         elif self.config.splitter == Splitter.TOKENS:
+             big_doc_chunks = self.split_chunk_tokens(big_docs)
+         elif self.config.splitter == Splitter.SIMPLE:
+             big_doc_chunks = self.split_simple(big_docs)
+         else:
+             raise ValueError(f"Unknown splitter: {self.config.splitter}")
+
+         return chunked_docs + big_doc_chunks
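
Taken together, ParsingConfig and Parser above drive document chunking; a minimal sketch of splitting a Document (DocMetaData and its source field are assumed from langroid/mytypes.py in this package; the sample text is illustrative):

from langroid.mytypes import DocMetaData, Document
from langroid.parsing.parser import Parser, ParsingConfig, Splitter

parser = Parser(ParsingConfig(splitter=Splitter.TOKENS, chunk_size=100))
doc = Document(content="A long document ...", metadata=DocMetaData(source="example"))
chunks = parser.split([doc])
for c in chunks:
    # each chunk gets a fresh metadata.id and a window of neighboring chunk ids
    print(c.metadata.id, c.metadata.window_ids, parser.num_tokens(c.content))
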