langroid 0.31.1__py3-none-any.whl → 0.33.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/METADATA +150 -124
- langroid-0.33.3.dist-info/RECORD +7 -0
- {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info}/WHEEL +1 -1
- langroid-0.33.3.dist-info/entry_points.txt +4 -0
- pyproject.toml +317 -212
- langroid/__init__.py +0 -106
- langroid/agent/.chainlit/config.toml +0 -121
- langroid/agent/.chainlit/translations/bn.json +0 -231
- langroid/agent/.chainlit/translations/en-US.json +0 -229
- langroid/agent/.chainlit/translations/gu.json +0 -231
- langroid/agent/.chainlit/translations/he-IL.json +0 -231
- langroid/agent/.chainlit/translations/hi.json +0 -231
- langroid/agent/.chainlit/translations/kn.json +0 -231
- langroid/agent/.chainlit/translations/ml.json +0 -231
- langroid/agent/.chainlit/translations/mr.json +0 -231
- langroid/agent/.chainlit/translations/ta.json +0 -231
- langroid/agent/.chainlit/translations/te.json +0 -231
- langroid/agent/.chainlit/translations/zh-CN.json +0 -229
- langroid/agent/__init__.py +0 -41
- langroid/agent/base.py +0 -1981
- langroid/agent/batch.py +0 -398
- langroid/agent/callbacks/__init__.py +0 -0
- langroid/agent/callbacks/chainlit.py +0 -598
- langroid/agent/chat_agent.py +0 -1899
- langroid/agent/chat_document.py +0 -454
- langroid/agent/helpers.py +0 -0
- langroid/agent/junk +0 -13
- langroid/agent/openai_assistant.py +0 -882
- langroid/agent/special/__init__.py +0 -59
- langroid/agent/special/arangodb/__init__.py +0 -0
- langroid/agent/special/arangodb/arangodb_agent.py +0 -656
- langroid/agent/special/arangodb/system_messages.py +0 -186
- langroid/agent/special/arangodb/tools.py +0 -107
- langroid/agent/special/arangodb/utils.py +0 -36
- langroid/agent/special/doc_chat_agent.py +0 -1466
- langroid/agent/special/lance_doc_chat_agent.py +0 -262
- langroid/agent/special/lance_rag/__init__.py +0 -9
- langroid/agent/special/lance_rag/critic_agent.py +0 -198
- langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
- langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
- langroid/agent/special/lance_tools.py +0 -61
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
- langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
- langroid/agent/special/neo4j/system_messages.py +0 -120
- langroid/agent/special/neo4j/tools.py +0 -32
- langroid/agent/special/relevance_extractor_agent.py +0 -127
- langroid/agent/special/retriever_agent.py +0 -56
- langroid/agent/special/sql/__init__.py +0 -17
- langroid/agent/special/sql/sql_chat_agent.py +0 -654
- langroid/agent/special/sql/utils/__init__.py +0 -21
- langroid/agent/special/sql/utils/description_extractors.py +0 -190
- langroid/agent/special/sql/utils/populate_metadata.py +0 -85
- langroid/agent/special/sql/utils/system_message.py +0 -35
- langroid/agent/special/sql/utils/tools.py +0 -64
- langroid/agent/special/table_chat_agent.py +0 -263
- langroid/agent/structured_message.py +0 -9
- langroid/agent/task.py +0 -2093
- langroid/agent/tool_message.py +0 -393
- langroid/agent/tools/__init__.py +0 -38
- langroid/agent/tools/duckduckgo_search_tool.py +0 -50
- langroid/agent/tools/file_tools.py +0 -234
- langroid/agent/tools/google_search_tool.py +0 -39
- langroid/agent/tools/metaphor_search_tool.py +0 -67
- langroid/agent/tools/orchestration.py +0 -303
- langroid/agent/tools/recipient_tool.py +0 -235
- langroid/agent/tools/retrieval_tool.py +0 -32
- langroid/agent/tools/rewind_tool.py +0 -137
- langroid/agent/tools/segment_extract_tool.py +0 -41
- langroid/agent/typed_task.py +0 -19
- langroid/agent/xml_tool_message.py +0 -382
- langroid/agent_config.py +0 -0
- langroid/cachedb/__init__.py +0 -17
- langroid/cachedb/base.py +0 -58
- langroid/cachedb/momento_cachedb.py +0 -108
- langroid/cachedb/redis_cachedb.py +0 -153
- langroid/embedding_models/__init__.py +0 -39
- langroid/embedding_models/base.py +0 -74
- langroid/embedding_models/clustering.py +0 -189
- langroid/embedding_models/models.py +0 -461
- langroid/embedding_models/protoc/__init__.py +0 -0
- langroid/embedding_models/protoc/embeddings.proto +0 -19
- langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
- langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
- langroid/embedding_models/remote_embeds.py +0 -153
- langroid/exceptions.py +0 -65
- langroid/experimental/team-save.py +0 -391
- langroid/language_models/.chainlit/config.toml +0 -121
- langroid/language_models/.chainlit/translations/en-US.json +0 -231
- langroid/language_models/__init__.py +0 -53
- langroid/language_models/azure_openai.py +0 -153
- langroid/language_models/base.py +0 -678
- langroid/language_models/config.py +0 -18
- langroid/language_models/mock_lm.py +0 -124
- langroid/language_models/openai_gpt.py +0 -1923
- langroid/language_models/prompt_formatter/__init__.py +0 -16
- langroid/language_models/prompt_formatter/base.py +0 -40
- langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
- langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
- langroid/language_models/utils.py +0 -147
- langroid/mytypes.py +0 -84
- langroid/parsing/__init__.py +0 -52
- langroid/parsing/agent_chats.py +0 -38
- langroid/parsing/code-parsing.md +0 -86
- langroid/parsing/code_parser.py +0 -121
- langroid/parsing/config.py +0 -0
- langroid/parsing/document_parser.py +0 -718
- langroid/parsing/image_text.py +0 -32
- langroid/parsing/para_sentence_split.py +0 -62
- langroid/parsing/parse_json.py +0 -155
- langroid/parsing/parser.py +0 -313
- langroid/parsing/repo_loader.py +0 -790
- langroid/parsing/routing.py +0 -36
- langroid/parsing/search.py +0 -275
- langroid/parsing/spider.py +0 -102
- langroid/parsing/table_loader.py +0 -94
- langroid/parsing/url_loader.py +0 -111
- langroid/parsing/url_loader_cookies.py +0 -73
- langroid/parsing/urls.py +0 -273
- langroid/parsing/utils.py +0 -373
- langroid/parsing/web_search.py +0 -155
- langroid/prompts/__init__.py +0 -9
- langroid/prompts/chat-gpt4-system-prompt.md +0 -68
- langroid/prompts/dialog.py +0 -17
- langroid/prompts/prompts_config.py +0 -5
- langroid/prompts/templates.py +0 -141
- langroid/pydantic_v1/__init__.py +0 -10
- langroid/pydantic_v1/main.py +0 -4
- langroid/utils/.chainlit/config.toml +0 -121
- langroid/utils/.chainlit/translations/en-US.json +0 -231
- langroid/utils/__init__.py +0 -19
- langroid/utils/algorithms/__init__.py +0 -3
- langroid/utils/algorithms/graph.py +0 -103
- langroid/utils/configuration.py +0 -98
- langroid/utils/constants.py +0 -30
- langroid/utils/docker.py +0 -37
- langroid/utils/git_utils.py +0 -252
- langroid/utils/globals.py +0 -49
- langroid/utils/llms/__init__.py +0 -0
- langroid/utils/llms/strings.py +0 -8
- langroid/utils/logging.py +0 -135
- langroid/utils/object_registry.py +0 -66
- langroid/utils/output/__init__.py +0 -20
- langroid/utils/output/citations.py +0 -41
- langroid/utils/output/printing.py +0 -99
- langroid/utils/output/status.py +0 -40
- langroid/utils/pandas_utils.py +0 -30
- langroid/utils/pydantic_utils.py +0 -602
- langroid/utils/system.py +0 -286
- langroid/utils/types.py +0 -93
- langroid/utils/web/__init__.py +0 -0
- langroid/utils/web/login.py +0 -83
- langroid/vector_store/__init__.py +0 -50
- langroid/vector_store/base.py +0 -357
- langroid/vector_store/chromadb.py +0 -214
- langroid/vector_store/lancedb.py +0 -401
- langroid/vector_store/meilisearch.py +0 -299
- langroid/vector_store/momento.py +0 -278
- langroid/vector_store/qdrant_cloud.py +0 -6
- langroid/vector_store/qdrantdb.py +0 -468
- langroid-0.31.1.dist-info/RECORD +0 -162
- {langroid-0.31.1.dist-info → langroid-0.33.3.dist-info/licenses}/LICENSE +0 -0
langroid/parsing/image_text.py
DELETED
@@ -1,32 +0,0 @@
-from typing import Union
-
-import pytesseract
-from pdf2image import convert_from_bytes, convert_from_path
-
-
-def pdf_image_to_text(input_data: Union[str, bytes]) -> str:
-    """
-    Converts a PDF that contains images to text using OCR.
-
-    Args:
-        input_data (Union[str, bytes]): The file path to the PDF or a bytes-like object
-            of the PDF content.
-
-    Returns:
-        str: The extracted text from the PDF.
-    """
-
-    # Check if the input is a file path (str) or bytes, and
-    # convert PDF to images accordingly
-    if isinstance(input_data, str):
-        images = convert_from_path(input_data)
-    elif isinstance(input_data, bytes):
-        images = convert_from_bytes(input_data)
-    else:
-        raise ValueError("input_data must be a file path (str) or bytes-like object")
-
-    text = ""
-    for image in images:
-        text += pytesseract.image_to_string(image)
-
-    return text
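For context, a minimal usage sketch (not part of the package diff) of the removed pdf_image_to_text helper, assuming the pytesseract and pdf2image Python packages plus the tesseract and poppler system binaries are installed; the "scanned.pdf" path is a hypothetical placeholder:

# Hypothetical usage of the pdf_image_to_text helper as it existed in 0.31.1.
from langroid.parsing.image_text import pdf_image_to_text

# OCR a scanned PDF given a file path ("scanned.pdf" is a placeholder)
text = pdf_image_to_text("scanned.pdf")

# The same helper also accepts raw PDF bytes
with open("scanned.pdf", "rb") as f:
    text_from_bytes = pdf_image_to_text(f.read())

print(text[:200])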
langroid/parsing/para_sentence_split.py
DELETED
@@ -1,62 +0,0 @@
-import re
-from typing import Callable, List
-
-from bs4 import BeautifulSoup
-
-
-def remove_extra_whitespace(s: str) -> str:
-    lines = s.split("\n")
-    cleaned_lines = [" ".join(line.split()) for line in lines]
-    return "\n".join(cleaned_lines)
-
-
-def custom_sent_tokenize(text: str) -> List[str]:
-    sentences = [
-        sentence.strip()
-        for sentence in re.split(r"\.\s|\.\n", text)
-        if sentence.strip()
-    ]
-    # append a period if the sentence does not end with one
-    return [s + "." if s[-1] != "." else s for s in sentences]
-
-
-def create_chunks(
-    text: str, chunk_size: int, length_fn: Callable[[str], int]
-) -> List[str]:
-    def _chunk_sentences(sentences: List[str], chunk_size: int) -> List[str]:
-        chunks = []
-        current_chunk: List[str] = []
-        current_chunk_length = 0
-
-        for sentence in sentences:
-            sentence_length = length_fn(sentence)
-            if current_chunk_length + sentence_length > chunk_size:
-                if current_chunk:
-                    chunks.append(" ".join(current_chunk))
-                current_chunk = [sentence]
-                current_chunk_length = sentence_length
-            else:
-                current_chunk.append(sentence)
-                current_chunk_length += sentence_length
-
-        if current_chunk:
-            new_chunk = " ".join(current_chunk).strip()
-            if new_chunk:
-                chunks.append(" ".join(current_chunk).strip())
-
-        return chunks
-
-    soup = BeautifulSoup(text, "html.parser")
-    text = soup.get_text()
-    # First, try to split the document into paragraphs
-    paragraphs = text.split("\n\n")
-
-    # If paragraphs are too long, split them into sentences
-    if any(length_fn(p) > chunk_size for p in paragraphs):
-        sentences = custom_sent_tokenize(text)
-        chunks = _chunk_sentences(sentences, chunk_size)
-    else:
-        chunks = paragraphs
-
-    chunks = [chunk.strip() for chunk in chunks if chunk.strip() != ""]
-    return chunks
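As a rough illustration of the removed create_chunks helper (a sketch, not part of the diff): it first splits on blank lines and only falls back to sentence-level chunking when a paragraph exceeds chunk_size as measured by length_fn. Plain character counting is used below for simplicity; langroid's Parser passes a token counter instead.

# Sketch exercising create_chunks with a character-count length function (an assumption;
# the real caller, Parser._split_para_sentence_once, passes Parser.num_tokens).
from langroid.parsing.para_sentence_split import create_chunks

text = (
    "First paragraph. It has two short sentences.\n\n"
    "Second paragraph, which is deliberately a bit longer than the first one."
)
chunks = create_chunks(text, chunk_size=40, length_fn=len)
for c in chunks:
    print(repr(c))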
langroid/parsing/parse_json.py
DELETED
@@ -1,155 +0,0 @@
-import ast
-import json
-from datetime import datetime
-from typing import Any, Dict, Iterator, List, Union
-
-import yaml
-from json_repair import repair_json
-from pyparsing import nestedExpr, originalTextFor
-
-
-def is_valid_json(json_str: str) -> bool:
-    """Check if the input string is a valid JSON.
-
-    Args:
-        json_str (str): The input string to check.
-
-    Returns:
-        bool: True if the input string is a valid JSON, False otherwise.
-    """
-    try:
-        json.loads(json_str)
-        return True
-    except ValueError:
-        return False
-
-
-def flatten(nested_list) -> Iterator[str]:  # type: ignore
-    """Flatten a nested list into a single list of strings"""
-    for item in nested_list:
-        if isinstance(item, (list, tuple)):
-            for subitem in flatten(item):
-                yield subitem
-        else:
-            yield item
-
-
-def get_json_candidates(s: str) -> List[str]:
-    """Get top-level JSON candidates, i.e. strings between curly braces."""
-    # Define the grammar for matching curly braces
-    curly_braces = originalTextFor(nestedExpr("{", "}"))
-
-    # Parse the string
-    try:
-        results = curly_braces.searchString(s)
-        # Properly convert nested lists to strings
-        return [r[0] for r in results]
-    except Exception:
-        return []
-
-
-def parse_imperfect_json(json_string: str) -> Union[Dict[str, Any], List[Any]]:
-    if not json_string.strip():
-        raise ValueError("Empty string is not valid JSON")
-
-    # First, try parsing with ast.literal_eval
-    try:
-        result = ast.literal_eval(json_string)
-        if isinstance(result, (dict, list)):
-            return result
-    except (ValueError, SyntaxError):
-        pass
-
-    # If ast.literal_eval fails or returns non-dict/list, try repair_json
-    json_repaired_obj = repair_json(json_string, return_objects=True)
-    if isinstance(json_repaired_obj, (dict, list)):
-        return json_repaired_obj
-    else:
-        try:
-            # fallback on yaml
-            yaml_result = yaml.safe_load(json_string)
-            if isinstance(yaml_result, (dict, list)):
-                return yaml_result
-        except yaml.YAMLError:
-            pass
-
-    # If all methods fail, raise ValueError
-    raise ValueError(f"Unable to parse as JSON: {json_string}")
-
-
-def try_repair_json_yaml(s: str) -> str | None:
-    """
-    Attempt to load as json, and if it fails, try repairing the JSON.
-    If that fails, replace any \n with space as a last resort.
-    NOTE - replacing \n with space will result in format loss,
-    which may matter in generated code (e.g. python, toml, etc)
-    """
-    s_repaired_obj = repair_json(s, return_objects=True)
-    if isinstance(s_repaired_obj, list):
-        if len(s_repaired_obj) > 0:
-            s_repaired_obj = s_repaired_obj[0]
-        else:
-            s_repaired_obj = None
-    if s_repaired_obj is not None:
-        return json.dumps(s_repaired_obj)  # type: ignore
-    else:
-        try:
-            yaml_result = yaml.safe_load(s)
-            if isinstance(yaml_result, dict):
-                return json.dumps(yaml_result)
-        except yaml.YAMLError:
-            pass
-        # If it still fails, replace any \n with space as a last resort
-        s = s.replace("\n", " ")
-        if is_valid_json(s):
-            return s
-        else:
-            return None  # all failed
-
-
-def extract_top_level_json(s: str) -> List[str]:
-    """Extract all top-level JSON-formatted substrings from a given string.
-
-    Args:
-        s (str): The input string to search for JSON substrings.
-
-    Returns:
-        List[str]: A list of top-level JSON-formatted substrings.
-    """
-    # Find JSON object and array candidates
-    json_candidates = get_json_candidates(s)
-    maybe_repaired_jsons = map(try_repair_json_yaml, json_candidates)
-
-    return [candidate for candidate in maybe_repaired_jsons if candidate is not None]
-
-
-def top_level_json_field(s: str, f: str) -> Any:
-    """
-    Extract the value of a field f from a top-level JSON object.
-    If there are multiple, just return the first.
-
-    Args:
-        s (str): The input string to search for JSON substrings.
-        f (str): The field to extract from the JSON object.
-
-    Returns:
-        str: The value of the field f in the top-level JSON object, if any.
-            Otherwise, return an empty string.
-    """
-
-    jsons = extract_top_level_json(s)
-    if len(jsons) == 0:
-        return ""
-    for j in jsons:
-        json_data = json.loads(j)
-        if f in json_data:
-            return json_data[f]
-
-    return ""
-
-
-def datetime_to_json(obj: Any) -> Any:
-    if isinstance(obj, datetime):
-        return obj.isoformat()
-    # Let json.dumps() handle the raising of TypeError for non-serializable objects
-    return obj
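To show what the removed JSON utilities did, a brief hedged sketch; the llm_reply string below is invented for illustration:

# Sketch of extract_top_level_json and parse_imperfect_json (0.31.1 API).
from langroid.parsing.parse_json import extract_top_level_json, parse_imperfect_json

# A malformed tool call embedded in surrounding text (trailing comma is intentional)
llm_reply = 'Calling a tool: {"request": "search", "query": "langroid", "num": 3,} -- done.'

# Candidates between braces are found, repaired, and returned as valid JSON strings
print(extract_top_level_json(llm_reply))
# e.g. ['{"request": "search", "query": "langroid", "num": 3}']

# Python-literal style input is tolerated via the ast.literal_eval path
print(parse_imperfect_json("{'a': 1, 'b': [1, 2]}"))  # {'a': 1, 'b': [1, 2]}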
langroid/parsing/parser.py
DELETED
@@ -1,313 +0,0 @@
-import logging
-from enum import Enum
-from typing import Dict, List, Literal
-
-import tiktoken
-
-from langroid.mytypes import Document
-from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
-from langroid.pydantic_v1 import BaseSettings
-from langroid.utils.object_registry import ObjectRegistry
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
-
-
-class Splitter(str, Enum):
-    TOKENS = "tokens"
-    PARA_SENTENCE = "para_sentence"
-    SIMPLE = "simple"
-
-
-class PdfParsingConfig(BaseSettings):
-    library: Literal[
-        "fitz",
-        "pdfplumber",
-        "pypdf",
-        "unstructured",
-        "pdf2image",
-    ] = "pdfplumber"
-
-
-class DocxParsingConfig(BaseSettings):
-    library: Literal["python-docx", "unstructured"] = "unstructured"
-
-
-class DocParsingConfig(BaseSettings):
-    library: Literal["unstructured"] = "unstructured"
-
-
-class ParsingConfig(BaseSettings):
-    splitter: str = Splitter.TOKENS
-    chunk_size: int = 200  # aim for this many tokens per chunk
-    overlap: int = 50  # overlap between chunks
-    max_chunks: int = 10_000
-    # aim to have at least this many chars per chunk when truncating due to punctuation
-    min_chunk_chars: int = 350
-    discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
-    n_similar_docs: int = 4
-    n_neighbor_ids: int = 5  # window size to store around each chunk
-    separators: List[str] = ["\n\n", "\n", " ", ""]
-    token_encoding_model: str = "text-embedding-ada-002"
-    pdf: PdfParsingConfig = PdfParsingConfig()
-    docx: DocxParsingConfig = DocxParsingConfig()
-    doc: DocParsingConfig = DocParsingConfig()
-
-
-class Parser:
-    def __init__(self, config: ParsingConfig):
-        self.config = config
-        try:
-            self.tokenizer = tiktoken.encoding_for_model(config.token_encoding_model)
-        except Exception:
-            self.tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")
-
-    def num_tokens(self, text: str) -> int:
-        tokens = self.tokenizer.encode(text)
-        return len(tokens)
-
-    def truncate_tokens(self, text: str, max_tokens: int) -> str:
-        tokens = self.tokenizer.encode(text)
-        if len(tokens) <= max_tokens:
-            return text
-        return self.tokenizer.decode(tokens[:max_tokens])
-
-    def add_window_ids(self, chunks: List[Document]) -> None:
-        """Chunks may belong to multiple docs, but for each doc,
-        they appear consecutively. Add window_ids in metadata"""
-
-        # discard empty chunks
-        chunks = [c for c in chunks if c.content.strip() != ""]
-        if len(chunks) == 0:
-            return
-        # The original metadata.id (if any) is ignored since it will be same for all
-        # chunks and is useless. We want a distinct id for each chunk.
-        # ASSUMPTION: all chunks c of a doc have same c.metadata.id !
-        orig_ids = [c.metadata.id for c in chunks]
-        ids = [ObjectRegistry.new_id() for c in chunks]
-        id2chunk = {id: c for id, c in zip(ids, chunks)}
-
-        # group the ids by orig_id
-        # (each distinct orig_id refers to a different document)
-        orig_id_to_ids: Dict[str, List[str]] = {}
-        for orig_id, id in zip(orig_ids, ids):
-            if orig_id not in orig_id_to_ids:
-                orig_id_to_ids[orig_id] = []
-            orig_id_to_ids[orig_id].append(id)
-
-        # now each orig_id maps to a sequence of ids within a single doc
-
-        k = self.config.n_neighbor_ids
-        for orig, ids in orig_id_to_ids.items():
-            # ids are consecutive chunks in a single doc
-            n = len(ids)
-            window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
-            for i, _ in enumerate(ids):
-                c = id2chunk[ids[i]]
-                c.metadata.window_ids = window_ids[i]
-                c.metadata.id = ids[i]
-                c.metadata.is_chunk = True
-
-    def split_simple(self, docs: List[Document]) -> List[Document]:
-        if len(self.config.separators) == 0:
-            raise ValueError("Must have at least one separator")
-        final_docs = []
-
-        for d in docs:
-            if d.content.strip() == "":
-                continue
-            chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
-            # note we are ensuring we COPY the document metadata into each chunk,
-            # which ensures all chunks of a given doc have same metadata
-            # (and in particular same metadata.id, which is important later for
-            # add_window_ids)
-            chunk_docs = [
-                Document(
-                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
-                )
-                for c in chunks
-                if c.strip() != ""
-            ]
-            self.add_window_ids(chunk_docs)
-            final_docs += chunk_docs
-        return final_docs
-
-    def split_para_sentence(self, docs: List[Document]) -> List[Document]:
-        chunks = docs
-        while True:
-            un_splittables = 0
-            split_chunks = []
-            for c in chunks:
-                if c.content.strip() == "":
-                    continue
-                if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
-                    # small chunk: no need to split
-                    split_chunks.append(c)
-                    continue
-                splits = self._split_para_sentence_once([c])
-                un_splittables += len(splits) == 1
-                split_chunks += splits
-            if len(split_chunks) == len(chunks):
-                if un_splittables > 0:
-                    max_len = max([self.num_tokens(p.content) for p in chunks])
-                    logger.warning(
-                        f"""
-                        Unable to split {un_splittables} chunks
-                        using chunk_size = {self.config.chunk_size}.
-                        Max chunk size is 58,336 tokens.
-                        """
-                    )
-                break  # we won't be able to shorten them with current settings
-            chunks = split_chunks.copy()
-
-        self.add_window_ids(chunks)
-        return chunks
-
-    def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
-        final_chunks = []
-        for d in docs:
-            if d.content.strip() == "":
-                continue
-            chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
-            # note we are ensuring we COPY the document metadata into each chunk,
-            # which ensures all chunks of a given doc have same metadata
-            # (and in particular same metadata.id, which is important later for
-            # add_window_ids)
-            chunk_docs = [
-                Document(
-                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
-                )
-                for c in chunks
-                if c.strip() != ""
-            ]
-            final_chunks += chunk_docs
-
-        return final_chunks
-
-    def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
-        final_docs = []
-        for d in docs:
-            chunks = self.chunk_tokens(d.content)
-            # note we are ensuring we COPY the document metadata into each chunk,
-            # which ensures all chunks of a given doc have same metadata
-            # (and in particular same metadata.id, which is important later for
-            # add_window_ids)
-            chunk_docs = [
-                Document(
-                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
-                )
-                for c in chunks
-                if c.strip() != ""
-            ]
-            self.add_window_ids(chunk_docs)
-            final_docs += chunk_docs
-        return final_docs
-
-    def chunk_tokens(
-        self,
-        text: str,
-    ) -> List[str]:
-        """
-        Split a text into chunks of ~CHUNK_SIZE tokens,
-        based on punctuation and newline boundaries.
-        Adapted from
-        https://github.com/openai/chatgpt-retrieval-plugin/blob/main/services/chunks.py
-
-        Args:
-            text: The text to split into chunks.
-
-        Returns:
-            A list of text chunks, each of which is a string of tokens
-            roughly self.config.chunk_size tokens long.
-        """
-        # Return an empty list if the text is empty or whitespace
-        if not text or text.isspace():
-            return []
-
-        # Tokenize the text
-        tokens = self.tokenizer.encode(text, disallowed_special=())
-
-        # Initialize an empty list of chunks
-        chunks = []
-
-        # Initialize a counter for the number of chunks
-        num_chunks = 0
-
-        # Loop until all tokens are consumed
-        while tokens and num_chunks < self.config.max_chunks:
-            # Take the first chunk_size tokens as a chunk
-            chunk = tokens[: self.config.chunk_size]
-
-            # Decode the chunk into text
-            chunk_text = self.tokenizer.decode(chunk)
-
-            # Skip the chunk if it is empty or whitespace
-            if not chunk_text or chunk_text.isspace():
-                # Remove the tokens corresponding to the chunk text
-                # from remaining tokens
-                tokens = tokens[len(chunk) :]
-                # Continue to the next iteration of the loop
-                continue
-
-            # Find the last period or punctuation mark in the chunk
-            last_punctuation = max(
-                chunk_text.rfind("."),
-                chunk_text.rfind("?"),
-                chunk_text.rfind("!"),
-                chunk_text.rfind("\n"),
-            )
-
-            # If there is a punctuation mark, and the last punctuation index is
-            # after MIN_CHUNK_SIZE_CHARS
-            if (
-                last_punctuation != -1
-                and last_punctuation > self.config.min_chunk_chars
-            ):
-                # Truncate the chunk text at the punctuation mark
-                chunk_text = chunk_text[: last_punctuation + 1]
-
-            # Remove any newline characters and strip any leading or
-            # trailing whitespace
-            chunk_text_to_append = chunk_text.replace("\n", " ").strip()
-
-            if len(chunk_text_to_append) > self.config.discard_chunk_chars:
-                # Append the chunk text to the list of chunks
-                chunks.append(chunk_text_to_append)
-
-            # Remove the tokens corresponding to the chunk text
-            # from the remaining tokens
-            tokens = tokens[
-                len(self.tokenizer.encode(chunk_text, disallowed_special=())) :
-            ]
-
-            # Increment the number of chunks
-            num_chunks += 1
-
-        # There may be remaining tokens, but we discard them
-        # since we have already reached the maximum number of chunks
-
-        return chunks
-
-    def split(self, docs: List[Document]) -> List[Document]:
-        if len(docs) == 0:
-            return []
-        # create ids in metadata of docs if absent:
-        # we need this to distinguish docs later in add_window_ids
-        for d in docs:
-            if d.metadata.id in [None, ""]:
-                d.metadata.id = ObjectRegistry.new_id()
-        # some docs are already splits, so don't split them further!
-        chunked_docs = [d for d in docs if d.metadata.is_chunk]
-        big_docs = [d for d in docs if not d.metadata.is_chunk]
-        if len(big_docs) == 0:
-            return chunked_docs
-        if self.config.splitter == Splitter.PARA_SENTENCE:
-            big_doc_chunks = self.split_para_sentence(big_docs)
-        elif self.config.splitter == Splitter.TOKENS:
-            big_doc_chunks = self.split_chunk_tokens(big_docs)
-        elif self.config.splitter == Splitter.SIMPLE:
-            big_doc_chunks = self.split_simple(big_docs)
-        else:
-            raise ValueError(f"Unknown splitter: {self.config.splitter}")
-
-        return chunked_docs + big_doc_chunks