langroid 0.31.2__py3-none-any.whl → 0.33.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info}/METADATA +150 -124
  2. langroid-0.33.3.dist-info/RECORD +7 -0
  3. {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info}/WHEEL +1 -1
  4. langroid-0.33.3.dist-info/entry_points.txt +4 -0
  5. pyproject.toml +317 -212
  6. langroid/__init__.py +0 -106
  7. langroid/agent/.chainlit/config.toml +0 -121
  8. langroid/agent/.chainlit/translations/bn.json +0 -231
  9. langroid/agent/.chainlit/translations/en-US.json +0 -229
  10. langroid/agent/.chainlit/translations/gu.json +0 -231
  11. langroid/agent/.chainlit/translations/he-IL.json +0 -231
  12. langroid/agent/.chainlit/translations/hi.json +0 -231
  13. langroid/agent/.chainlit/translations/kn.json +0 -231
  14. langroid/agent/.chainlit/translations/ml.json +0 -231
  15. langroid/agent/.chainlit/translations/mr.json +0 -231
  16. langroid/agent/.chainlit/translations/ta.json +0 -231
  17. langroid/agent/.chainlit/translations/te.json +0 -231
  18. langroid/agent/.chainlit/translations/zh-CN.json +0 -229
  19. langroid/agent/__init__.py +0 -41
  20. langroid/agent/base.py +0 -1981
  21. langroid/agent/batch.py +0 -398
  22. langroid/agent/callbacks/__init__.py +0 -0
  23. langroid/agent/callbacks/chainlit.py +0 -598
  24. langroid/agent/chat_agent.py +0 -1899
  25. langroid/agent/chat_document.py +0 -454
  26. langroid/agent/helpers.py +0 -0
  27. langroid/agent/junk +0 -13
  28. langroid/agent/openai_assistant.py +0 -882
  29. langroid/agent/special/__init__.py +0 -59
  30. langroid/agent/special/arangodb/__init__.py +0 -0
  31. langroid/agent/special/arangodb/arangodb_agent.py +0 -656
  32. langroid/agent/special/arangodb/system_messages.py +0 -186
  33. langroid/agent/special/arangodb/tools.py +0 -107
  34. langroid/agent/special/arangodb/utils.py +0 -36
  35. langroid/agent/special/doc_chat_agent.py +0 -1466
  36. langroid/agent/special/lance_doc_chat_agent.py +0 -262
  37. langroid/agent/special/lance_rag/__init__.py +0 -9
  38. langroid/agent/special/lance_rag/critic_agent.py +0 -198
  39. langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
  40. langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
  41. langroid/agent/special/lance_tools.py +0 -61
  42. langroid/agent/special/neo4j/__init__.py +0 -0
  43. langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
  44. langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
  45. langroid/agent/special/neo4j/system_messages.py +0 -120
  46. langroid/agent/special/neo4j/tools.py +0 -32
  47. langroid/agent/special/relevance_extractor_agent.py +0 -127
  48. langroid/agent/special/retriever_agent.py +0 -56
  49. langroid/agent/special/sql/__init__.py +0 -17
  50. langroid/agent/special/sql/sql_chat_agent.py +0 -654
  51. langroid/agent/special/sql/utils/__init__.py +0 -21
  52. langroid/agent/special/sql/utils/description_extractors.py +0 -190
  53. langroid/agent/special/sql/utils/populate_metadata.py +0 -85
  54. langroid/agent/special/sql/utils/system_message.py +0 -35
  55. langroid/agent/special/sql/utils/tools.py +0 -64
  56. langroid/agent/special/table_chat_agent.py +0 -263
  57. langroid/agent/structured_message.py +0 -9
  58. langroid/agent/task.py +0 -2093
  59. langroid/agent/tool_message.py +0 -393
  60. langroid/agent/tools/__init__.py +0 -38
  61. langroid/agent/tools/duckduckgo_search_tool.py +0 -50
  62. langroid/agent/tools/file_tools.py +0 -234
  63. langroid/agent/tools/google_search_tool.py +0 -39
  64. langroid/agent/tools/metaphor_search_tool.py +0 -67
  65. langroid/agent/tools/orchestration.py +0 -303
  66. langroid/agent/tools/recipient_tool.py +0 -235
  67. langroid/agent/tools/retrieval_tool.py +0 -32
  68. langroid/agent/tools/rewind_tool.py +0 -137
  69. langroid/agent/tools/segment_extract_tool.py +0 -41
  70. langroid/agent/typed_task.py +0 -19
  71. langroid/agent/xml_tool_message.py +0 -382
  72. langroid/agent_config.py +0 -0
  73. langroid/cachedb/__init__.py +0 -17
  74. langroid/cachedb/base.py +0 -58
  75. langroid/cachedb/momento_cachedb.py +0 -108
  76. langroid/cachedb/redis_cachedb.py +0 -153
  77. langroid/embedding_models/__init__.py +0 -39
  78. langroid/embedding_models/base.py +0 -74
  79. langroid/embedding_models/clustering.py +0 -189
  80. langroid/embedding_models/models.py +0 -461
  81. langroid/embedding_models/protoc/__init__.py +0 -0
  82. langroid/embedding_models/protoc/embeddings.proto +0 -19
  83. langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
  84. langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
  85. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
  86. langroid/embedding_models/remote_embeds.py +0 -153
  87. langroid/exceptions.py +0 -65
  88. langroid/experimental/team-save.py +0 -391
  89. langroid/language_models/.chainlit/config.toml +0 -121
  90. langroid/language_models/.chainlit/translations/en-US.json +0 -231
  91. langroid/language_models/__init__.py +0 -53
  92. langroid/language_models/azure_openai.py +0 -153
  93. langroid/language_models/base.py +0 -678
  94. langroid/language_models/config.py +0 -18
  95. langroid/language_models/mock_lm.py +0 -124
  96. langroid/language_models/openai_gpt.py +0 -1923
  97. langroid/language_models/prompt_formatter/__init__.py +0 -16
  98. langroid/language_models/prompt_formatter/base.py +0 -40
  99. langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
  100. langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
  101. langroid/language_models/utils.py +0 -147
  102. langroid/mytypes.py +0 -84
  103. langroid/parsing/__init__.py +0 -52
  104. langroid/parsing/agent_chats.py +0 -38
  105. langroid/parsing/code-parsing.md +0 -86
  106. langroid/parsing/code_parser.py +0 -121
  107. langroid/parsing/config.py +0 -0
  108. langroid/parsing/document_parser.py +0 -718
  109. langroid/parsing/image_text.py +0 -32
  110. langroid/parsing/para_sentence_split.py +0 -62
  111. langroid/parsing/parse_json.py +0 -155
  112. langroid/parsing/parser.py +0 -313
  113. langroid/parsing/repo_loader.py +0 -790
  114. langroid/parsing/routing.py +0 -36
  115. langroid/parsing/search.py +0 -275
  116. langroid/parsing/spider.py +0 -102
  117. langroid/parsing/table_loader.py +0 -94
  118. langroid/parsing/url_loader.py +0 -111
  119. langroid/parsing/url_loader_cookies.py +0 -73
  120. langroid/parsing/urls.py +0 -273
  121. langroid/parsing/utils.py +0 -373
  122. langroid/parsing/web_search.py +0 -155
  123. langroid/prompts/__init__.py +0 -9
  124. langroid/prompts/chat-gpt4-system-prompt.md +0 -68
  125. langroid/prompts/dialog.py +0 -17
  126. langroid/prompts/prompts_config.py +0 -5
  127. langroid/prompts/templates.py +0 -141
  128. langroid/pydantic_v1/__init__.py +0 -10
  129. langroid/pydantic_v1/main.py +0 -4
  130. langroid/utils/.chainlit/config.toml +0 -121
  131. langroid/utils/.chainlit/translations/en-US.json +0 -231
  132. langroid/utils/__init__.py +0 -19
  133. langroid/utils/algorithms/__init__.py +0 -3
  134. langroid/utils/algorithms/graph.py +0 -103
  135. langroid/utils/configuration.py +0 -98
  136. langroid/utils/constants.py +0 -30
  137. langroid/utils/docker.py +0 -37
  138. langroid/utils/git_utils.py +0 -252
  139. langroid/utils/globals.py +0 -49
  140. langroid/utils/llms/__init__.py +0 -0
  141. langroid/utils/llms/strings.py +0 -8
  142. langroid/utils/logging.py +0 -135
  143. langroid/utils/object_registry.py +0 -66
  144. langroid/utils/output/__init__.py +0 -20
  145. langroid/utils/output/citations.py +0 -41
  146. langroid/utils/output/printing.py +0 -99
  147. langroid/utils/output/status.py +0 -40
  148. langroid/utils/pandas_utils.py +0 -30
  149. langroid/utils/pydantic_utils.py +0 -602
  150. langroid/utils/system.py +0 -286
  151. langroid/utils/types.py +0 -93
  152. langroid/utils/web/__init__.py +0 -0
  153. langroid/utils/web/login.py +0 -83
  154. langroid/vector_store/__init__.py +0 -50
  155. langroid/vector_store/base.py +0 -357
  156. langroid/vector_store/chromadb.py +0 -214
  157. langroid/vector_store/lancedb.py +0 -401
  158. langroid/vector_store/meilisearch.py +0 -299
  159. langroid/vector_store/momento.py +0 -278
  160. langroid/vector_store/qdrant_cloud.py +0 -6
  161. langroid/vector_store/qdrantdb.py +0 -468
  162. langroid-0.31.2.dist-info/RECORD +0 -162
  163. {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info/licenses}/LICENSE +0 -0
langroid/parsing/image_text.py
@@ -1,32 +0,0 @@
- from typing import Union
-
- import pytesseract
- from pdf2image import convert_from_bytes, convert_from_path
-
-
- def pdf_image_to_text(input_data: Union[str, bytes]) -> str:
-     """
-     Converts a PDF that contains images to text using OCR.
-
-     Args:
-         input_data (Union[str, bytes]): The file path to the PDF or a bytes-like object
-             of the PDF content.
-
-     Returns:
-         str: The extracted text from the PDF.
-     """
-
-     # Check if the input is a file path (str) or bytes, and
-     # convert PDF to images accordingly
-     if isinstance(input_data, str):
-         images = convert_from_path(input_data)
-     elif isinstance(input_data, bytes):
-         images = convert_from_bytes(input_data)
-     else:
-         raise ValueError("input_data must be a file path (str) or bytes-like object")
-
-     text = ""
-     for image in images:
-         text += pytesseract.image_to_string(image)
-
-     return text
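For context, the removed module exposed a single OCR helper. Below is a minimal usage sketch, assuming langroid 0.31.2 with pytesseract and pdf2image (plus the tesseract and poppler binaries) installed; the file name "scanned.pdf" is a placeholder:

from langroid.parsing.image_text import pdf_image_to_text

# OCR a scanned PDF given its path ("scanned.pdf" is a placeholder) ...
text = pdf_image_to_text("scanned.pdf")

# ... or given raw PDF bytes
with open("scanned.pdf", "rb") as f:
    text_from_bytes = pdf_image_to_text(f.read())

print(text[:200])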
langroid/parsing/para_sentence_split.py
@@ -1,62 +0,0 @@
- import re
- from typing import Callable, List
-
- from bs4 import BeautifulSoup
-
-
- def remove_extra_whitespace(s: str) -> str:
-     lines = s.split("\n")
-     cleaned_lines = [" ".join(line.split()) for line in lines]
-     return "\n".join(cleaned_lines)
-
-
- def custom_sent_tokenize(text: str) -> List[str]:
-     sentences = [
-         sentence.strip()
-         for sentence in re.split(r"\.\s|\.\n", text)
-         if sentence.strip()
-     ]
-     # append a period if the sentence does not end with one
-     return [s + "." if s[-1] != "." else s for s in sentences]
-
-
- def create_chunks(
-     text: str, chunk_size: int, length_fn: Callable[[str], int]
- ) -> List[str]:
-     def _chunk_sentences(sentences: List[str], chunk_size: int) -> List[str]:
-         chunks = []
-         current_chunk: List[str] = []
-         current_chunk_length = 0
-
-         for sentence in sentences:
-             sentence_length = length_fn(sentence)
-             if current_chunk_length + sentence_length > chunk_size:
-                 if current_chunk:
-                     chunks.append(" ".join(current_chunk))
-                 current_chunk = [sentence]
-                 current_chunk_length = sentence_length
-             else:
-                 current_chunk.append(sentence)
-                 current_chunk_length += sentence_length
-
-         if current_chunk:
-             new_chunk = " ".join(current_chunk).strip()
-             if new_chunk:
-                 chunks.append(new_chunk)
-
-         return chunks
-
-     soup = BeautifulSoup(text, "html.parser")
-     text = soup.get_text()
-     # First, try to split the document into paragraphs
-     paragraphs = text.split("\n\n")
-
-     # If paragraphs are too long, split them into sentences
-     if any(length_fn(p) > chunk_size for p in paragraphs):
-         sentences = custom_sent_tokenize(text)
-         chunks = _chunk_sentences(sentences, chunk_size)
-     else:
-         chunks = paragraphs
-
-     chunks = [chunk.strip() for chunk in chunks if chunk.strip() != ""]
-     return chunks
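For context, a minimal sketch of how the removed paragraph/sentence chunking helpers could be called (langroid 0.31.2 assumed; the sample text is made up, and length_fn=len simply measures chunk_size in characters rather than tokens):

from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace

text = "First paragraph. It has two sentences.\n\nSecond paragraph, a bit longer than the first."
# chunk_size is measured by length_fn; here we just count characters
chunks = create_chunks(remove_extra_whitespace(text), chunk_size=60, length_fn=len)
for i, chunk in enumerate(chunks):
    print(i, chunk)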
langroid/parsing/parse_json.py
@@ -1,155 +0,0 @@
- import ast
- import json
- from datetime import datetime
- from typing import Any, Dict, Iterator, List, Union
-
- import yaml
- from json_repair import repair_json
- from pyparsing import nestedExpr, originalTextFor
-
-
- def is_valid_json(json_str: str) -> bool:
-     """Check if the input string is a valid JSON.
-
-     Args:
-         json_str (str): The input string to check.
-
-     Returns:
-         bool: True if the input string is a valid JSON, False otherwise.
-     """
-     try:
-         json.loads(json_str)
-         return True
-     except ValueError:
-         return False
-
-
- def flatten(nested_list) -> Iterator[str]:  # type: ignore
-     """Flatten a nested list into a single list of strings"""
-     for item in nested_list:
-         if isinstance(item, (list, tuple)):
-             for subitem in flatten(item):
-                 yield subitem
-         else:
-             yield item
-
-
- def get_json_candidates(s: str) -> List[str]:
-     """Get top-level JSON candidates, i.e. strings between curly braces."""
-     # Define the grammar for matching curly braces
-     curly_braces = originalTextFor(nestedExpr("{", "}"))
-
-     # Parse the string
-     try:
-         results = curly_braces.searchString(s)
-         # Properly convert nested lists to strings
-         return [r[0] for r in results]
-     except Exception:
-         return []
-
-
- def parse_imperfect_json(json_string: str) -> Union[Dict[str, Any], List[Any]]:
-     if not json_string.strip():
-         raise ValueError("Empty string is not valid JSON")
-
-     # First, try parsing with ast.literal_eval
-     try:
-         result = ast.literal_eval(json_string)
-         if isinstance(result, (dict, list)):
-             return result
-     except (ValueError, SyntaxError):
-         pass
-
-     # If ast.literal_eval fails or returns non-dict/list, try repair_json
-     json_repaired_obj = repair_json(json_string, return_objects=True)
-     if isinstance(json_repaired_obj, (dict, list)):
-         return json_repaired_obj
-     else:
-         try:
-             # fallback on yaml
-             yaml_result = yaml.safe_load(json_string)
-             if isinstance(yaml_result, (dict, list)):
-                 return yaml_result
-         except yaml.YAMLError:
-             pass
-
-     # If all methods fail, raise ValueError
-     raise ValueError(f"Unable to parse as JSON: {json_string}")
-
-
- def try_repair_json_yaml(s: str) -> str | None:
-     """
-     Attempt to load as json, and if it fails, try repairing the JSON.
-     If that fails, replace any \n with space as a last resort.
-     NOTE - replacing \n with space will result in format loss,
-     which may matter in generated code (e.g. python, toml, etc)
-     """
-     s_repaired_obj = repair_json(s, return_objects=True)
-     if isinstance(s_repaired_obj, list):
-         if len(s_repaired_obj) > 0:
-             s_repaired_obj = s_repaired_obj[0]
-         else:
-             s_repaired_obj = None
-     if s_repaired_obj is not None:
-         return json.dumps(s_repaired_obj)  # type: ignore
-     else:
-         try:
-             yaml_result = yaml.safe_load(s)
-             if isinstance(yaml_result, dict):
-                 return json.dumps(yaml_result)
-         except yaml.YAMLError:
-             pass
-         # If it still fails, replace any \n with space as a last resort
-         s = s.replace("\n", " ")
-         if is_valid_json(s):
-             return s
-         else:
-             return None  # all failed
-
-
- def extract_top_level_json(s: str) -> List[str]:
-     """Extract all top-level JSON-formatted substrings from a given string.
-
-     Args:
-         s (str): The input string to search for JSON substrings.
-
-     Returns:
-         List[str]: A list of top-level JSON-formatted substrings.
-     """
-     # Find JSON object and array candidates
-     json_candidates = get_json_candidates(s)
-     maybe_repaired_jsons = map(try_repair_json_yaml, json_candidates)
-
-     return [candidate for candidate in maybe_repaired_jsons if candidate is not None]
-
-
- def top_level_json_field(s: str, f: str) -> Any:
-     """
-     Extract the value of a field f from a top-level JSON object.
-     If there are multiple, just return the first.
-
-     Args:
-         s (str): The input string to search for JSON substrings.
-         f (str): The field to extract from the JSON object.
-
-     Returns:
-         str: The value of the field f in the top-level JSON object, if any.
-             Otherwise, return an empty string.
-     """
-
-     jsons = extract_top_level_json(s)
-     if len(jsons) == 0:
-         return ""
-     for j in jsons:
-         json_data = json.loads(j)
-         if f in json_data:
-             return json_data[f]
-
-     return ""
-
-
- def datetime_to_json(obj: Any) -> Any:
-     if isinstance(obj, datetime):
-         return obj.isoformat()
-     # Let json.dumps() handle the raising of TypeError for non-serializable objects
-     return obj
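For context, a minimal sketch of the removed JSON-extraction helpers (langroid 0.31.2 assumed; the LLM-style output string is invented for illustration):

from langroid.parsing.parse_json import extract_top_level_json, parse_imperfect_json

llm_output = 'Here is the plan: {"tool": "search", "query": "langroid docs",} -- done.'
# find top-level {...} candidates and repair them into valid JSON strings
print(extract_top_level_json(llm_output))
# tolerantly parse "almost JSON" (note the trailing comma) into a Python dict
print(parse_imperfect_json('{"a": 1, "b": [1, 2,]}'))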
langroid/parsing/parser.py
@@ -1,313 +0,0 @@
- import logging
- from enum import Enum
- from typing import Dict, List, Literal
-
- import tiktoken
-
- from langroid.mytypes import Document
- from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
- from langroid.pydantic_v1 import BaseSettings
- from langroid.utils.object_registry import ObjectRegistry
-
- logger = logging.getLogger(__name__)
- logger.setLevel(logging.WARNING)
-
-
- class Splitter(str, Enum):
-     TOKENS = "tokens"
-     PARA_SENTENCE = "para_sentence"
-     SIMPLE = "simple"
-
-
- class PdfParsingConfig(BaseSettings):
-     library: Literal[
-         "fitz",
-         "pdfplumber",
-         "pypdf",
-         "unstructured",
-         "pdf2image",
-     ] = "pdfplumber"
-
-
- class DocxParsingConfig(BaseSettings):
-     library: Literal["python-docx", "unstructured"] = "unstructured"
-
-
- class DocParsingConfig(BaseSettings):
-     library: Literal["unstructured"] = "unstructured"
-
-
- class ParsingConfig(BaseSettings):
-     splitter: str = Splitter.TOKENS
-     chunk_size: int = 200  # aim for this many tokens per chunk
-     overlap: int = 50  # overlap between chunks
-     max_chunks: int = 10_000
-     # aim to have at least this many chars per chunk when truncating due to punctuation
-     min_chunk_chars: int = 350
-     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
-     n_similar_docs: int = 4
-     n_neighbor_ids: int = 5  # window size to store around each chunk
-     separators: List[str] = ["\n\n", "\n", " ", ""]
-     token_encoding_model: str = "text-embedding-ada-002"
-     pdf: PdfParsingConfig = PdfParsingConfig()
-     docx: DocxParsingConfig = DocxParsingConfig()
-     doc: DocParsingConfig = DocParsingConfig()
-
-
- class Parser:
-     def __init__(self, config: ParsingConfig):
-         self.config = config
-         try:
-             self.tokenizer = tiktoken.encoding_for_model(config.token_encoding_model)
-         except Exception:
-             self.tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")
-
-     def num_tokens(self, text: str) -> int:
-         tokens = self.tokenizer.encode(text)
-         return len(tokens)
-
-     def truncate_tokens(self, text: str, max_tokens: int) -> str:
-         tokens = self.tokenizer.encode(text)
-         if len(tokens) <= max_tokens:
-             return text
-         return self.tokenizer.decode(tokens[:max_tokens])
-
-     def add_window_ids(self, chunks: List[Document]) -> None:
-         """Chunks may belong to multiple docs, but for each doc,
-         they appear consecutively. Add window_ids in metadata"""
-
-         # discard empty chunks
-         chunks = [c for c in chunks if c.content.strip() != ""]
-         if len(chunks) == 0:
-             return
-         # The original metadata.id (if any) is ignored since it will be same for all
-         # chunks and is useless. We want a distinct id for each chunk.
-         # ASSUMPTION: all chunks c of a doc have same c.metadata.id !
-         orig_ids = [c.metadata.id for c in chunks]
-         ids = [ObjectRegistry.new_id() for c in chunks]
-         id2chunk = {id: c for id, c in zip(ids, chunks)}
-
-         # group the ids by orig_id
-         # (each distinct orig_id refers to a different document)
-         orig_id_to_ids: Dict[str, List[str]] = {}
-         for orig_id, id in zip(orig_ids, ids):
-             if orig_id not in orig_id_to_ids:
-                 orig_id_to_ids[orig_id] = []
-             orig_id_to_ids[orig_id].append(id)
-
-         # now each orig_id maps to a sequence of ids within a single doc
-
-         k = self.config.n_neighbor_ids
-         for orig, ids in orig_id_to_ids.items():
-             # ids are consecutive chunks in a single doc
-             n = len(ids)
-             window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
-             for i, _ in enumerate(ids):
-                 c = id2chunk[ids[i]]
-                 c.metadata.window_ids = window_ids[i]
-                 c.metadata.id = ids[i]
-                 c.metadata.is_chunk = True
-
-     def split_simple(self, docs: List[Document]) -> List[Document]:
-         if len(self.config.separators) == 0:
-             raise ValueError("Must have at least one separator")
-         final_docs = []
-
-         for d in docs:
-             if d.content.strip() == "":
-                 continue
-             chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
-             # note we are ensuring we COPY the document metadata into each chunk,
-             # which ensures all chunks of a given doc have same metadata
-             # (and in particular same metadata.id, which is important later for
-             # add_window_ids)
-             chunk_docs = [
-                 Document(
-                     content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
-                 )
-                 for c in chunks
-                 if c.strip() != ""
-             ]
-             self.add_window_ids(chunk_docs)
-             final_docs += chunk_docs
-         return final_docs
-
-     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
-         chunks = docs
-         while True:
-             un_splittables = 0
-             split_chunks = []
-             for c in chunks:
-                 if c.content.strip() == "":
-                     continue
-                 if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
-                     # small chunk: no need to split
-                     split_chunks.append(c)
-                     continue
-                 splits = self._split_para_sentence_once([c])
-                 un_splittables += len(splits) == 1
-                 split_chunks += splits
-             if len(split_chunks) == len(chunks):
-                 if un_splittables > 0:
-                     max_len = max([self.num_tokens(p.content) for p in chunks])
-                     logger.warning(
-                         f"""
-                         Unable to split {un_splittables} chunks
-                         using chunk_size = {self.config.chunk_size}.
-                         Max chunk size is {max_len} tokens.
-                         """
-                     )
-                 break  # we won't be able to shorten them with current settings
-             chunks = split_chunks.copy()
-
-         self.add_window_ids(chunks)
-         return chunks
-
-     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
-         final_chunks = []
-         for d in docs:
-             if d.content.strip() == "":
-                 continue
-             chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
-             # note we are ensuring we COPY the document metadata into each chunk,
-             # which ensures all chunks of a given doc have same metadata
-             # (and in particular same metadata.id, which is important later for
-             # add_window_ids)
-             chunk_docs = [
-                 Document(
-                     content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
-                 )
-                 for c in chunks
-                 if c.strip() != ""
-             ]
-             final_chunks += chunk_docs
-
-         return final_chunks
-
-     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
-         final_docs = []
-         for d in docs:
-             chunks = self.chunk_tokens(d.content)
-             # note we are ensuring we COPY the document metadata into each chunk,
-             # which ensures all chunks of a given doc have same metadata
-             # (and in particular same metadata.id, which is important later for
-             # add_window_ids)
-             chunk_docs = [
-                 Document(
-                     content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
-                 )
-                 for c in chunks
-                 if c.strip() != ""
-             ]
-             self.add_window_ids(chunk_docs)
-             final_docs += chunk_docs
-         return final_docs
-
-     def chunk_tokens(
-         self,
-         text: str,
-     ) -> List[str]:
-         """
-         Split a text into chunks of ~CHUNK_SIZE tokens,
-         based on punctuation and newline boundaries.
-         Adapted from
-         https://github.com/openai/chatgpt-retrieval-plugin/blob/main/services/chunks.py
-
-         Args:
-             text: The text to split into chunks.
-
-         Returns:
-             A list of text chunks, each of which is a string of tokens
-             roughly self.config.chunk_size tokens long.
-         """
-         # Return an empty list if the text is empty or whitespace
-         if not text or text.isspace():
-             return []
-
-         # Tokenize the text
-         tokens = self.tokenizer.encode(text, disallowed_special=())
-
-         # Initialize an empty list of chunks
-         chunks = []
-
-         # Initialize a counter for the number of chunks
-         num_chunks = 0
-
-         # Loop until all tokens are consumed
-         while tokens and num_chunks < self.config.max_chunks:
-             # Take the first chunk_size tokens as a chunk
-             chunk = tokens[: self.config.chunk_size]
-
-             # Decode the chunk into text
-             chunk_text = self.tokenizer.decode(chunk)
-
-             # Skip the chunk if it is empty or whitespace
-             if not chunk_text or chunk_text.isspace():
-                 # Remove the tokens corresponding to the chunk text
-                 # from remaining tokens
-                 tokens = tokens[len(chunk) :]
-                 # Continue to the next iteration of the loop
-                 continue
-
-             # Find the last period or punctuation mark in the chunk
-             last_punctuation = max(
-                 chunk_text.rfind("."),
-                 chunk_text.rfind("?"),
-                 chunk_text.rfind("!"),
-                 chunk_text.rfind("\n"),
-             )
-
-             # If there is a punctuation mark, and the last punctuation index is
-             # after MIN_CHUNK_SIZE_CHARS
-             if (
-                 last_punctuation != -1
-                 and last_punctuation > self.config.min_chunk_chars
-             ):
-                 # Truncate the chunk text at the punctuation mark
-                 chunk_text = chunk_text[: last_punctuation + 1]
-
-             # Remove any newline characters and strip any leading or
-             # trailing whitespace
-             chunk_text_to_append = chunk_text.replace("\n", " ").strip()
-
-             if len(chunk_text_to_append) > self.config.discard_chunk_chars:
-                 # Append the chunk text to the list of chunks
-                 chunks.append(chunk_text_to_append)
-
-             # Remove the tokens corresponding to the chunk text
-             # from the remaining tokens
-             tokens = tokens[
-                 len(self.tokenizer.encode(chunk_text, disallowed_special=())) :
-             ]
-
-             # Increment the number of chunks
-             num_chunks += 1
-
-         # There may be remaining tokens, but we discard them
-         # since we have already reached the maximum number of chunks
-
-         return chunks
-
-     def split(self, docs: List[Document]) -> List[Document]:
-         if len(docs) == 0:
-             return []
-         # create ids in metadata of docs if absent:
-         # we need this to distinguish docs later in add_window_ids
-         for d in docs:
-             if d.metadata.id in [None, ""]:
-                 d.metadata.id = ObjectRegistry.new_id()
-         # some docs are already splits, so don't split them further!
-         chunked_docs = [d for d in docs if d.metadata.is_chunk]
-         big_docs = [d for d in docs if not d.metadata.is_chunk]
-         if len(big_docs) == 0:
-             return chunked_docs
-         if self.config.splitter == Splitter.PARA_SENTENCE:
-             big_doc_chunks = self.split_para_sentence(big_docs)
-         elif self.config.splitter == Splitter.TOKENS:
-             big_doc_chunks = self.split_chunk_tokens(big_docs)
-         elif self.config.splitter == Splitter.SIMPLE:
-             big_doc_chunks = self.split_simple(big_docs)
-         else:
-             raise ValueError(f"Unknown splitter: {self.config.splitter}")
-
-         return chunked_docs + big_doc_chunks
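For context, a minimal sketch of driving the removed Parser for token-based splitting (langroid 0.31.2 assumed; Document comes from langroid.mytypes as imported in the module above, DocMetaData(source=...) is assumed to be the matching metadata class from that same module, and the sample text is a placeholder):

from langroid.mytypes import DocMetaData, Document
from langroid.parsing.parser import Parser, ParsingConfig, Splitter

# split into chunks of roughly 100 tokens each
cfg = ParsingConfig(splitter=Splitter.TOKENS, chunk_size=100)
parser = Parser(cfg)

text = "Langroid is a framework for building LLM-powered multi-agent applications. " * 40
docs = [Document(content=text, metadata=DocMetaData(source="example"))]
chunks = parser.split(docs)
print(len(chunks), parser.num_tokens(chunks[0].content))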