langroid 0.49.0__py3-none-any.whl → 0.50.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/special/doc_chat_agent.py +1 -1
- langroid/language_models/openai_gpt.py +56 -7
- langroid/mytypes.py +28 -1
- langroid/parsing/document_parser.py +0 -3
- langroid/parsing/md_parser.py +574 -0
- langroid/parsing/parser.py +34 -10
- langroid/parsing/url_loader.py +20 -3
- langroid/utils/output/citations.py +2 -2
- {langroid-0.49.0.dist-info → langroid-0.50.0.dist-info}/METADATA +2 -1
- {langroid-0.49.0.dist-info → langroid-0.50.0.dist-info}/RECORD +12 -11
- {langroid-0.49.0.dist-info → langroid-0.50.0.dist-info}/WHEEL +0 -0
- {langroid-0.49.0.dist-info → langroid-0.50.0.dist-info}/licenses/LICENSE +0 -0
langroid/agent/special/doc_chat_agent.py CHANGED
```diff
@@ -174,7 +174,7 @@ class DocChatAgentConfig(ChatAgentConfig):
         "https://ai.googleblog.com/2022/11/characterizing-emergent-phenomena-in.html",
     ]
     parsing: ParsingConfig = ParsingConfig(  # modify as needed
-        splitter=Splitter.
+        splitter=Splitter.MARKDOWN,
         chunk_size=1000,  # aim for this many tokens per chunk
         overlap=100,  # overlap between chunks
         max_chunks=10_000,
```
langroid/language_models/openai_gpt.py CHANGED
```diff
@@ -1059,8 +1059,8 @@ class OpenAIGPT(LanguageModel):
                 )
                 if is_break:
                     break
-        except Exception:
-            pass
+        except Exception as e:
+            logging.warning("Error while processing stream response: %s", str(e))
 
         print("")
         # TODO- get usage info in stream mode (?)
@@ -1121,8 +1121,8 @@ class OpenAIGPT(LanguageModel):
                 )
                 if is_break:
                     break
-        except Exception:
-            pass
+        except Exception as e:
+            logging.warning("Error while processing stream response: %s", str(e))
 
         print("")
         # TODO- get usage info in stream mode (?)
```
```diff
@@ -1703,11 +1703,32 @@ class OpenAIGPT(LanguageModel):
         if self.config.litellm and settings.debug:
             kwargs["logger_fn"] = litellm_logging_fn
         result = completion_call(**kwargs)
-
-        if not self.get_stream():
+
+        if self.get_stream():
+            # If streaming, cannot cache result
             # since it is a generator. Instead,
             # we hold on to the hashed_key and
             # cache the result later
+
+            # Test if this is a stream with an exception by
+            # trying to get first chunk: Some providers like LiteLLM
+            # produce a valid stream object `result` instead of throwing a
+            # rate-limit error, and if we don't catch it here,
+            # we end up returning an empty response and not
+            # using the retry mechanism in the decorator.
+            try:
+                # try to get the first chunk to check for errors
+                test_iter = iter(result)
+                first_chunk = next(test_iter)
+                # If we get here without error, recreate the stream
+                result = chain([first_chunk], test_iter)
+            except StopIteration:
+                # Empty stream is fine
+                pass
+            except Exception as e:
+                # Propagate any errors in the stream
+                raise e
+        else:
             self._cache_store(hashed_key, result.model_dump())
         return cached, hashed_key, result
 
```
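The pattern above is worth seeing in isolation: force the first chunk of a lazy stream so provider errors surface where a retry decorator can catch them, then stitch that chunk back on with `itertools.chain`. A minimal sketch (the helper name `peek_stream` is illustrative, not a langroid API):

```python
from itertools import chain


def peek_stream(stream):
    # Advance the stream once, eagerly: errors raised while producing
    # the first chunk propagate here instead of at consumption time.
    it = iter(stream)
    try:
        first = next(it)
    except StopIteration:
        return iter(())  # empty stream is fine
    # Re-attach the consumed chunk so callers still see the full stream.
    return chain([first], it)


# A generator that fails lazily: without peeking, the error would only
# appear when the caller starts iterating (past any retry decorator).
def flaky():
    raise RuntimeError("rate limit")
    yield "never reached"


try:
    stream = peek_stream(flaky())
except RuntimeError as e:
    print(f"caught eagerly: {e}")  # -> caught eagerly: rate limit

print("".join(peek_stream(iter(["hello ", "world"]))))  # -> hello world
```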
```diff
@@ -1734,7 +1755,35 @@ class OpenAIGPT(LanguageModel):
             kwargs["logger_fn"] = litellm_logging_fn
         # If it's not in the cache, call the API
         result = await acompletion_call(**kwargs)
-        if not self.get_stream():
+        if self.get_stream():
+            try:
+                # Try to peek at the first chunk to immediately catch any errors
+                # Store the original result (the stream)
+                original_stream = result
+
+                # Manually create and advance the iterator to check for errors
+                stream_iter = original_stream.__aiter__()
+                try:
+                    # This will raise an exception if the stream is invalid
+                    first_chunk = await anext(stream_iter)
+
+                    # If we reach here, the stream started successfully
+                    # Now recreate a fresh stream from the original API result
+                    # Otherwise, return a new stream that yields the first chunk
+                    # and remaining items
+                    async def combined_stream():  # type: ignore
+                        yield first_chunk
+                        async for chunk in stream_iter:
+                            yield chunk
+
+                    result = combined_stream()  # type: ignore
+                except StopAsyncIteration:
+                    # Empty stream is normal - nothing to do
+                    pass
+            except Exception as e:
+                # Any exception here should be raised to trigger the retry mechanism
+                raise e
+        else:
             self._cache_store(hashed_key, result.model_dump())
         return cached, hashed_key, result
 
```
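The async twin needs more ceremony because an `async def` helper containing a `yield` would defer the peek until first consumption; returning a fresh async generator from a coroutine keeps the error-surfacing eager. A standalone sketch using only builtins (`anext` requires Python 3.10+; `apeek_stream` is an illustrative name, not a langroid API):

```python
import asyncio


async def apeek_stream(stream):
    # Await the first chunk NOW, so invalid streams raise here,
    # inside whatever retry logic wraps this call.
    it = stream.__aiter__()
    try:
        first = await anext(it)
    except StopAsyncIteration:
        async def empty():
            return
            yield  # unreachable; makes this an (empty) async generator
        return empty()

    async def rest():
        yield first  # re-emit the chunk we consumed
        async for chunk in it:
            yield chunk

    return rest()


async def demo():
    async def gen():
        for c in ["hello ", "world"]:
            yield c

    stream = await apeek_stream(gen())
    print("".join([c async for c in stream]))  # -> hello world


asyncio.run(demo())
```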
langroid/mytypes.py CHANGED
```diff
@@ -65,6 +65,33 @@ class DocMetaData(BaseModel):
 
         return original_dict
 
+    def __str__(self) -> str:
+        title_str = (
+            ""
+            if "unknown" in self.title.lower() or self.title.strip() == ""
+            else f"Title: {self.title}"
+        )
+        date_str = ""
+        if (
+            "unknown" not in self.published_date.lower()
+            and self.published_date.strip() != ""
+        ):
+            try:
+                from dateutil import parser
+
+                # Try to parse the date string
+                date_obj = parser.parse(self.published_date)
+                # Format to include only the date part (year-month-day)
+                date_only = date_obj.strftime("%Y-%m-%d")
+                date_str = f"Date: {date_only}"
+            except (ValueError, ImportError, TypeError):
+                # If parsing fails, just use the original date
+                date_str = f"Date: {self.published_date}"
+        components = [self.source] + (
+            [] if title_str + date_str == "" else [title_str, date_str]
+        )
+        return ", ".join(components)
+
     class Config:
         extra = Extra.allow
 
```
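The effect of the new `__str__` is a compact, citation-ready rendering of a document's provenance. An illustrative round-trip (assuming `title` and `published_date` are set on the metadata object, and `python-dateutil` is installed so the date gets normalized):

```python
from langroid.mytypes import DocMetaData

meta = DocMetaData(
    source="https://example.com/post",
    title="Emergent Abilities",
    published_date="November 10, 2022",
)
# Unknown/empty titles and dates are dropped; parseable dates are
# normalized to YYYY-MM-DD before being appended after the source.
print(str(meta))
# -> https://example.com/post, Title: Emergent Abilities, Date: 2022-11-10
```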
```diff
@@ -93,7 +120,7 @@ class Document(BaseModel):
         return dedent(
             f"""
             CONTENT: {self.content}
-            SOURCE:{self.metadata
+            SOURCE:{str(self.metadata)}
             """
         )
 
```
langroid/parsing/document_parser.py CHANGED
```diff
@@ -380,9 +380,6 @@ class DocumentParser(Parser):
         Get document chunks from a pdf source,
         with page references in the document metadata.
 
-        Adapted from
-        https://github.com/whitead/paper-qa/blob/main/paperqa/readers.py
-
         Returns:
             List[Document]: a list of `Document` objects,
                 each containing a chunk of text
```
langroid/parsing/md_parser.py ADDED
```diff
@@ -0,0 +1,574 @@
+import re
+from typing import List
+
+from langroid.pydantic_v1 import BaseModel, Field
+
+HEADER_CONTEXT_SEP = "\n...\n"
+
+
+# Pydantic model definition for a node in the markdown hierarchy
+class Node(BaseModel):
+    content: str  # The text of the header or content block
+    path: List[str]  # List of header texts from root to this node
+    children: List["Node"] = Field(default_factory=list)
+    # Nested children nodes
+
+    def __repr__(self) -> str:
+        # for debug printing
+        return (
+            f"Node(content={self.content!r}, path={self.path!r}, "
+            f"children={len(self.children)})"
+        )
+
+# Pydantic v1 requires forward references for self-referencing models
+# Forward references will be resolved with the update_forward_refs call below.
+
+
+# Resolve forward references for Node (required for recursive models in Pydantic v1)
+Node.update_forward_refs()
+
+
+def _cleanup_text(text: str) -> str:
+    # 1) Convert alternative newline representations (any CRLF or CR) to a single '\n'
+    text = text.replace("\r\n", "\n").replace("\r", "\n")
+
+    # 2) Replace 3 or more consecutive newlines with exactly 2 newlines
+    text = re.sub(r"\n{3,}", "\n\n", text)
+
+    return text
+
+
+HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$")
+
+
+def parse_markdown_headings(md_text: str) -> List[Node]:
+    """
+    Parse `md_text` to extract a heading-based hierarchy, skipping lines
+    that look like headings inside fenced code blocks. Each heading node
+    will have a child node for the text that appears between this heading
+    and the next heading.
+
+    Returns a list of top-level Node objects.
+
+    Example structure:
+    Node(content='# Chapter 1', path=['# Chapter 1'], children=[
+        Node(content='Intro paragraph...', path=['# Chapter 1'], children=[]),
+        Node(content='## Section 1.1', path=['# Chapter 1', '## Section 1.1'],
+            children=[
+                Node(content='Some text in Section 1.1.', path=[...], children=[])
+            ]),
+        ...
+    ])
+    """
+    # If doc is empty or only whitespace, return []
+    if not md_text.strip():
+        return []
+
+    lines = md_text.splitlines(True)  # keep the newline characters
+
+    # We'll scan line-by-line, track code-fence status, collect headings
+    headings = []  # list of (level, heading_line, start_line_idx)
+    in_code_fence = False
+    fence_marker = None  # track which triple-backtick or ~~~ opened
+
+    for i, line in enumerate(lines):
+        # Check if we're toggling in/out of a fenced code block
+        # Typically triple backtick or triple tilde: ``` or ~~~
+        # We do a *loose* check: a line that starts with at least 3 backticks or tildes
+        # ignoring trailing text. You can refine as needed.
+        fence_match = re.match(r"^(```+|~~~+)", line.strip())
+        if fence_match:
+            # If we are not in a fence, we enter one;
+            # If we are in a fence, we exit if the marker matches
+            marker = fence_match.group(1)  # e.g. "```" or "~~~~"
+            if not in_code_fence:
+                in_code_fence = True
+                fence_marker = marker[:3]  # store triple backtick or triple tilde
+            else:
+                # only close if the fence_marker matches
+                # E.g. if we opened with ```, we close only on ```
+                if fence_marker and marker.startswith(fence_marker):
+                    in_code_fence = False
+                    fence_marker = None
+
+        if not in_code_fence:
+            # Check if the line is a heading
+            m = HEADING_RE.match(line)
+            if m:
+                hashes = m.group(1)  # e.g. "##"
+                heading_text = line.rstrip("\n")  # entire line, exact
+                level = len(hashes)
+                headings.append((level, heading_text, i))
+
+    # If no headings found, return a single root node with the entire text
+    if not headings:
+        return [Node(content=md_text.strip(), path=[], children=[])]
+
+    # Add a sentinel heading at the end-of-file, so we can slice the last block
+    # after the final real heading. We'll use level=0 so it doesn't form a real node.
+    headings.append((0, "", len(lines)))
+
+    # Now we build "heading blocks" with
+    # (level, heading_text, start_line, end_line, content)
+    heading_blocks = []
+    for idx in range(len(headings) - 1):
+        level, heading_line, start_i = headings[idx]
+        next_level, _, next_start_i = headings[idx + 1]
+
+        # Content is everything after the heading line until the next heading
+        # i.e. lines[start_i+1 : next_start_i]
+        block_content_lines = lines[start_i + 1 : next_start_i]
+        block_content = "".join(block_content_lines).rstrip("\n")
+
+        heading_blocks.append(
+            {"level": level, "heading_text": heading_line, "content": block_content}
+        )
+    # (We skip the sentinel heading in the final result.)
+
+    # We'll now convert heading_blocks into a tree using a stack-based approach
+    root_nodes: List[Node] = []
+    stack: List[Node] = []
+    header_path: List[str] = []
+
+    for hb in heading_blocks:
+        level = hb["level"]  # type: ignore
+        heading_txt = hb["heading_text"]
+        content_txt = hb["content"]
+
+        # --- Pop stack first! ---
+        while stack and len(stack[-1].path) >= level:
+            stack.pop()
+            header_path.pop()
+
+        # build new path, create a node for the heading
+        new_path = header_path + [heading_txt]
+        heading_node = Node(
+            content=heading_txt, path=new_path, children=[]  # type: ignore
+        )
+
+        # Possibly create a content child for whatever lines were below the heading
+        if content_txt.strip():  # type: ignore
+            content_node = Node(
+                content=content_txt, path=new_path, children=[]  # type: ignore
+            )
+            heading_node.children.append(content_node)
+
+        # Attach heading_node to the stack top or as a root
+        if stack:
+            stack[-1].children.append(heading_node)
+        else:
+            root_nodes.append(heading_node)
+
+        stack.append(heading_node)
+        header_path.append(heading_txt)  # type: ignore
+
+    return root_nodes
+
+
+# The Chunk model for the final enriched chunks.
+class Chunk(BaseModel):
+    text: str  # The chunk text (which includes header context)
+    path: List[str]  # The header path (list of header strings)
+    token_count: int
+
+
+# Configuration for chunking
+class MarkdownChunkConfig(BaseModel):
+    chunk_size: int = 200  # desired chunk size in tokens
+    overlap_tokens: int = 30  # number of tokens to overlap between chunks
+    variation_percent: float = 0.3  # allowed variation
+    rollup: bool = True  # whether to roll up chunks
+    header_context_sep: str = HEADER_CONTEXT_SEP  # separator for header context
+
+
+# A simple tokenizer that counts tokens as whitespace-separated words.
+def count_words(text: str) -> int:
+    return len(text.split())
+
+
+def recursive_chunk(text: str, config: MarkdownChunkConfig) -> List[str]:
+    """
+    Enhanced chunker that:
+    1. Splits by paragraph (top-level).
+    2. Splits paragraphs by sentences if needed (never mid-sentence unless huge).
+    3. Allows going over the upper bound rather than splitting a single sentence.
+    4. Overlaps only once between consecutive chunks.
+    5. Looks ahead to avoid a "dangling" final chunk below the lower bound.
+    6. Preserves \n\n (and other original spacing) as best as possible.
+    """
+
+    # -------------------------------------------------
+    # Helpers
+    # -------------------------------------------------
+    def count_words(text_block: str) -> int:
+        return len(text_block.split())
+
+    lower_bound = int(config.chunk_size * (1 - config.variation_percent))
+    upper_bound = int(config.chunk_size * (1 + config.variation_percent))
+
+    # Quick check: if the entire text is short enough, return as-is.
+    if count_words(text) <= upper_bound:
+        return [text.strip()]
+
+    # Split into paragraphs, preserving \n\n if it's there.
+    raw_paragraphs = text.split("\n\n")
+    paragraphs = []
+    for i, p in enumerate(raw_paragraphs):
+        if p.strip():
+            # Re-append the double-newline if not the last piece
+            if i < len(raw_paragraphs) - 1:
+                paragraphs.append(p + "\n\n")
+            else:
+                paragraphs.append(p)
+
+    # Split paragraphs into "segments": each segment is either
+    # a full short paragraph or (if too big) a list of sentences.
+    sentence_regex = r"(?<=[.!?])\s+"
+
+    def split_paragraph_into_sentences(paragraph: str) -> List[str]:
+        """
+        Return a list of sentence-sized segments. If a single sentence
+        is bigger than upper_bound, do a word-level fallback.
+        """
+        if count_words(paragraph) <= upper_bound:
+            return [paragraph]
+
+        sentences = re.split(sentence_regex, paragraph)
+        # Clean up stray whitespace
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        expanded = []
+        for s in sentences:
+            if count_words(s) > upper_bound:
+                expanded.extend(_fallback_word_split(s, config))
+            else:
+                expanded.append(s)
+        return expanded
+
+    def _fallback_word_split(long_text: str, cfg: MarkdownChunkConfig) -> List[str]:
+        """
+        As a last resort, split extremely large 'sentence' by words.
+        """
+        words = long_text.split()
+        pieces = []
+        start = 0
+        while start < len(words):
+            end = start + cfg.chunk_size
+            chunk_words = words[start:end]
+            pieces.append(" ".join(chunk_words))
+            start = end
+        return pieces
+
+    # Build a list of segments
+    segments = []
+    for para in paragraphs:
+        if count_words(para) > upper_bound:
+            # split into sentences
+            segs = split_paragraph_into_sentences(para)
+            segments.extend(segs)
+        else:
+            segments.append(para)
+
+    # -------------------------------------------------
+    # Accumulate segments into final chunks
+    # -------------------------------------------------
+    chunks = []
+    current_chunk = ""
+    current_count = 0
+
+    def flush_chunk() -> None:
+        nonlocal current_chunk, current_count
+        trimmed = current_chunk.strip()
+        if trimmed:
+            chunks.append(trimmed)
+        current_chunk = ""
+        current_count = 0
+
+    def remaining_tokens_in_future(all_segments: List[str], current_index: int) -> int:
+        """Sum of word counts from current_index onward."""
+        return sum(count_words(s) for s in all_segments[current_index:])
+
+    for i, seg in enumerate(segments):
+        seg_count = count_words(seg)
+
+        # If this single segment alone exceeds upper_bound, we accept it as a big chunk.
+        if seg_count > upper_bound:
+            # If we have something in the current chunk, flush it first
+            flush_chunk()
+            # Then store this large segment as its own chunk
+            chunks.append(seg.strip())
+            continue
+
+        # Attempt to add seg to the current chunk
+        if (current_count + seg_count) > upper_bound and (current_count >= lower_bound):
+            # We would normally flush here, but let's see if we are nearing the end:
+            # If the remaining tokens (including this one) is < lower_bound,
+            # we just add it anyway to avoid creating a tiny final chunk.
+            future_tokens = remaining_tokens_in_future(segments, i)
+            if future_tokens < lower_bound:
+                # Just add it (allowing to exceed upper bound)
+                if current_chunk:
+                    # Add space or preserve newline carefully
+                    # We'll do a basic approach here:
+                    if seg.startswith("\n\n"):
+                        current_chunk += seg  # preserve double new line
+                    else:
+                        current_chunk += " " + seg
+                    current_count = count_words(current_chunk)
+                else:
+                    current_chunk = seg
+                    current_count = seg_count
+            else:
+                # Normal flush
+                old_chunk = current_chunk
+                flush_chunk()
+                # Overlap from old_chunk
+                overlap_tokens_list = (
+                    old_chunk.split()[-config.overlap_tokens :] if old_chunk else []
+                )
+                overlap_str = (
+                    " ".join(overlap_tokens_list) if overlap_tokens_list else ""
+                )
+                if overlap_str:
+                    current_chunk = overlap_str + " " + seg
+                else:
+                    current_chunk = seg
+                current_count = count_words(current_chunk)
+        else:
+            # Just accumulate
+            if current_chunk:
+                if seg.startswith("\n\n"):
+                    current_chunk += seg
+                else:
+                    current_chunk += " " + seg
+            else:
+                current_chunk = seg
+            current_count = count_words(current_chunk)
+
+    # Flush leftover
+    flush_chunk()
+
+    # Return non-empty
+    return [c for c in chunks if c.strip()]
+
+
+# Function to process a Node and produce enriched chunks.
+def chunk_node(node: Node, config: MarkdownChunkConfig) -> List[Chunk]:
+    chunks: List[Chunk] = []
+
+    # Check if this is a header-only node.
+    is_header_only = node.path and node.content.strip() == node.path[-1]
+
+    # Only generate a chunk for the node if it has non-header content,
+    # or if it's header-only AND has no children (i.e., it's a leaf header).
+    if node.content.strip() and (not is_header_only or not node.children):
+        header_prefix = (
+            config.header_context_sep.join(node.path) + "\n\n" if node.path else ""
+        )
+        content_chunks = recursive_chunk(node.content, config)
+        for chunk_text in content_chunks:
+            full_text = header_prefix + chunk_text
+            chunks.append(
+                Chunk(
+                    text=full_text, path=node.path, token_count=count_words(full_text)
+                )
+            )
+
+    # Process children nodes recursively.
+    for child in node.children:
+        child_chunks = chunk_node(child, config)
+        chunks.extend(child_chunks)
+
+    return chunks
+
+
+# Function to process an entire tree of Nodes.
+def chunk_tree(root_nodes: List[Node], config: MarkdownChunkConfig) -> List[Chunk]:
+    all_chunks: List[Chunk] = []
+    for node in root_nodes:
+        all_chunks.extend(chunk_node(node, config))
+    return all_chunks
+
+
+def aggregate_content(node: Node) -> str:
+    """
+    Recursively aggregate the content from a node and all its descendants,
+    excluding header-only nodes to avoid duplication.
+    """
+    parts = []
+
+    # Skip header-only nodes in content aggregation
+    is_header_only = node.path and node.content.strip() == node.path[-1].strip()
+    if not is_header_only and node.content.strip():
+        parts.append(node.content.strip())
+
+    # Recurse on children
+    for child in node.children:
+        child_text = aggregate_content(child)
+        if child_text.strip():
+            parts.append(child_text.strip())
+
+    return "\n\n".join(parts)
+
+
+def flatten_tree(node: Node, level: int = 0) -> str:
+    """
+    Flatten a node and its children back into proper markdown text.
+
+    Args:
+        node: The node to flatten
+        level: The current heading level (depth in the tree)
+
+    Returns:
+        str: Properly formatted markdown text
+    """
+    result = ""
+
+    # Check if this is a header node (content matches last item in path)
+    is_header = node.path and node.content.strip().startswith("#")
+
+    # For header nodes, don't duplicate the hash marks
+    if is_header:
+        result = node.content.strip() + "\n\n"
+    elif node.content.strip():
+        result = node.content.strip() + "\n\n"
+
+    # Process all children
+    for child in node.children:
+        result += flatten_tree(child, level + 1)
+
+    return result
+
+
+def rollup_chunk_node(
+    node: Node, config: MarkdownChunkConfig, prefix: str = ""
+) -> List[Chunk]:
+    """
+    Recursively produce rollup chunks from `node`, passing down a `prefix`
+    (e.g., parent heading(s)).
+
+    - If a node is heading-only (content == last path item) and has children,
+      we skip creating a chunk for that node alone and instead add that heading
+      to the `prefix` for child nodes.
+    - If a node is NOT heading-only OR has no children, we try to fit all of its
+      flattened content into a single chunk. If it's too large, we chunk it.
+    - We pass the (possibly updated) prefix down to children, so each child's
+      chunk is enriched exactly once with all ancestor headings.
+    """
+
+    chunks: List[Chunk] = []
+
+    # Check if the node is "heading-only" and has children
+    # e.g. node.content=="# Chapter 1" and node.path[-1]=="# Chapter 1"
+    is_heading_only_with_children = (
+        node.path
+        and node.content.strip() == node.path[-1].strip()
+        and len(node.children) > 0
+    )
+
+    if is_heading_only_with_children:
+        # We do NOT create a chunk for this node alone.
+        # Instead, we add its heading to the prefix for child chunks.
+        new_prefix = prefix + node.content.strip()
+        for i, child in enumerate(node.children):
+            sep = "\n\n" if i == 0 else config.header_context_sep
+            chunks.extend(rollup_chunk_node(child, config, prefix=new_prefix + sep))
+        return chunks
+
+    # If not heading-only-with-children, we handle this node's own content:
+    # Flatten the entire node (including sub-children) in standard Markdown form.
+    flattened = flatten_tree(node, level=len(node.path))
+    flattened_with_prefix = prefix + flattened
+    total_tokens = count_words(flattened_with_prefix)
+
+    # Check if we can roll up everything (node + children) in a single chunk
+    if total_tokens <= config.chunk_size * (1 + config.variation_percent):
+        # One single chunk for the entire subtree
+        chunks.append(
+            Chunk(text=flattened_with_prefix, path=node.path, token_count=total_tokens)
+        )
+    else:
+        # It's too large overall. We'll chunk the node's own content first (if any),
+        # then recurse on children.
+        node_content = node.content.strip()
+
+        # If we have actual content that is not just a heading, chunk it with the prefix
+        # (like "preamble" text).
+        # Note: if this node is heading-only but has NO children,
+        # it will still land here
+        # (because is_heading_only_with_children was False due to zero children).
+        if node_content and (not node.path or node_content != node.path[-1].strip()):
+            # The node is actual content (not purely heading).
+            # We'll chunk it in paragraphs/sentences with the prefix.
+            content_chunks = recursive_chunk(node_content, config)
+            for text_block in content_chunks:
+                block_with_prefix = prefix + text_block
+                chunks.append(
+                    Chunk(
+                        text=block_with_prefix,
+                        path=node.path,
+                        token_count=count_words(block_with_prefix),
+                    )
+                )
+
+        # Now recurse on children, passing the same prefix so they get it too
+        for child in node.children:
+            chunks.extend(rollup_chunk_node(child, config, prefix=prefix))
+
+    return chunks
+
+
+def rollup_chunk_tree(
+    root_nodes: List[Node],
+    config: MarkdownChunkConfig,
+) -> List[Chunk]:
+    # Create a dummy root node that contains everything.
+    dummy_root = Node(content="", path=[], children=root_nodes)
+
+    # Now process just the dummy root node with an empty prefix.
+    chunks = rollup_chunk_node(dummy_root, config, prefix="")
+    return chunks
+
+
+def chunk_markdown(markdown_text: str, config: MarkdownChunkConfig) -> List[str]:
+    tree = parse_markdown_headings(markdown_text)
+    if len(tree) == 1 and len(tree[0].children) == 0:
+        # Pure text, no hierarchy, so just use recursive_chunk
+        text_chunks = recursive_chunk(markdown_text, config)
+        return [_cleanup_text(chunk) for chunk in text_chunks]
+    if config.rollup:
+        chunks = rollup_chunk_tree(tree, config)
+    else:
+        chunks = chunk_tree(tree, config)
+    return [_cleanup_text(chunk.text) for chunk in chunks]
+
+
+if __name__ == "__main__":
+    # Example usage:
+    markdown_text = """# Title
+Intro para. Hope this is not
+getting split.
+## SubTitle
+- Item1
+- Item2
+"""
+    # Set up chunking config with very large chunk size.
+    # (you can adjust chunk_size, overlap_tokens, variation_percent)
+    config = MarkdownChunkConfig(
+        chunk_size=200, overlap_tokens=5, variation_percent=0.2
+    )
+    chunks = chunk_markdown(markdown_text, config)
+
+    for idx, chunk in enumerate(chunks, 1):
+        print(f"--- Chunk {idx} --- ")
+        print(chunk)
+        print()
+
+    config.rollup = True
+    # with rollup_chunk_tree we get entire doc as 1 chunk
+    chunks = chunk_markdown(markdown_text, config)
+    assert len(chunks) == 1
+    for idx, chunk in enumerate(chunks, 1):
+        print(f"--- Chunk {idx} ---")
+        print(chunk)
+        print()
```
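For orientation on what the non-rollup path produces: `chunk_node` prefixes each content chunk with its full header path joined by `HEADER_CONTEXT_SEP` (`"\n...\n"`), so with `rollup=False` the demo document above would yield roughly these two chunks (illustrative output, derived from the code above):

```python
cfg = MarkdownChunkConfig(
    chunk_size=200, overlap_tokens=5, variation_percent=0.2, rollup=False
)
for c in chunk_markdown(markdown_text, cfg):
    print(repr(c))
# Expected shape:
#   '# Title\n\nIntro para. Hope this is not\ngetting split.'
#   '# Title\n...\n## SubTitle\n\n- Item1\n- Item2'
```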
langroid/parsing/parser.py CHANGED
```diff
@@ -6,6 +6,11 @@ from typing import Any, Dict, List, Literal, Optional
 import tiktoken
 
 from langroid.mytypes import Document
+from langroid.parsing.md_parser import (
+    MarkdownChunkConfig,
+    chunk_markdown,
+    count_words,
+)
 from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
 from langroid.pydantic_v1 import BaseSettings, root_validator
 from langroid.utils.object_registry import ObjectRegistry
@@ -18,6 +23,8 @@ class Splitter(str, Enum):
     TOKENS = "tokens"
     PARA_SENTENCE = "para_sentence"
     SIMPLE = "simple"
+    # "structure-aware" splitting with chunks enriched by header info
+    MARKDOWN = "markdown"
 
 
 class BaseParsingConfig(BaseSettings):
@@ -98,9 +105,10 @@ class MarkitdownXLSParsingConfig(BaseSettings):
 
 
 class ParsingConfig(BaseSettings):
-    splitter: str = Splitter.
+    splitter: str = Splitter.MARKDOWN
     chunk_by_page: bool = False  # split by page?
     chunk_size: int = 200  # aim for this many tokens per chunk
+    chunk_size_variation: float = 0.30  # max variation from chunk_size
     overlap: int = 50  # overlap between chunks
     max_chunks: int = 10_000
     # offset to subtract from page numbers:
@@ -130,6 +138,8 @@ class Parser:
         self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
 
     def num_tokens(self, text: str) -> int:
+        if self.config.splitter == Splitter.MARKDOWN:
+            return count_words(text)  # simple count based on whitespace-split
         tokens = self.tokenizer.encode(text, allowed_special={"<|endoftext|>"})
         return len(tokens)
 
@@ -254,7 +264,20 @@ class Parser:
     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
         final_docs = []
         for d in docs:
-            chunks = self.chunk_tokens(d.content)
+            if self.config.splitter == Splitter.MARKDOWN:
+                chunks = chunk_markdown(
+                    d.content,
+                    MarkdownChunkConfig(
+                        # apply rough adjustment factor to convert from tokens to words,
+                        # which is what the markdown chunker uses
+                        chunk_size=int(self.config.chunk_size * 0.75),
+                        overlap_tokens=int(self.config.overlap * 0.75),
+                        variation_percent=self.config.chunk_size_variation,
+                        rollup=True,
+                    ),
+                )
+            else:
+                chunks = self.chunk_tokens(d.content)
             # note we are ensuring we COPY the document metadata into each chunk,
             # which ensures all chunks of a given doc have same metadata
             # (and in particular same metadata.id, which is important later for
@@ -370,13 +393,14 @@ class Parser:
         big_docs = [d for d in docs if not d.metadata.is_chunk]
         if len(big_docs) == 0:
             return chunked_docs
-
-
-
-
-
-
-
-
+        match self.config.splitter:
+            case Splitter.MARKDOWN | Splitter.TOKENS:
+                big_doc_chunks = self.split_chunk_tokens(big_docs)
+            case Splitter.PARA_SENTENCE:
+                big_doc_chunks = self.split_para_sentence(big_docs)
+            case Splitter.SIMPLE:
+                big_doc_chunks = self.split_simple(big_docs)
+            case _:
+                raise ValueError(f"Unknown splitter: {self.config.splitter}")
 
         return chunked_docs + big_doc_chunks
```
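End to end, a `Parser` configured with the new splitter routes documents through `chunk_markdown`, converting its token budgets to the chunker's word budgets with the rough 0.75 factor above. A minimal sketch (it assumes the `Parser.split` entry point whose tail is shown in the last hunk, and made-up document content):

```python
from langroid.mytypes import Document, DocMetaData
from langroid.parsing.parser import Parser, ParsingConfig, Splitter

cfg = ParsingConfig(splitter=Splitter.MARKDOWN, chunk_size=1000, overlap=100)
parser = Parser(cfg)

doc = Document(
    content="# Title\n\nSome long markdown body...",
    metadata=DocMetaData(source="notes.md"),
)
# Each resulting chunk copies the source doc's metadata (same metadata.id),
# with text enriched by its markdown header path.
for chunk in parser.split([doc]):
    print(chunk.metadata.source, "->", chunk.content[:40])
```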
langroid/parsing/url_loader.py CHANGED
```diff
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
 from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
+import markdownify as md
 from dotenv import load_dotenv
 
 from langroid.exceptions import LangroidImportError
@@ -31,6 +32,7 @@ class TrafilaturaConfig(BaseCrawlerConfig):
     """Configuration for Trafilatura crawler."""
 
     threads: int = 4
+    format: str = "markdown"  # or "xml" or "txt"
 
 
 class FirecrawlConfig(BaseCrawlerConfig):
@@ -200,8 +202,16 @@ class TrafilaturaCrawler(BaseCrawler):
                 docs.extend(parsed_doc)
             else:
                 text = trafilatura.extract(
-                    result,
+                    result,
+                    no_fallback=False,
+                    favor_recall=True,
+                    include_formatting=True,
+                    output_format=self.config.format,
+                    with_metadata=True,  # Title, date, author... at start of text
                 )
+                if self.config.format in ["xml", "html"]:
+                    # heading_style="ATX" for markdown headings, i.e. #, ##, etc.
+                    text = md.markdownify(text, heading_style="ATX")
                 if text is None and result is not None and isinstance(result, str):
                     text = result
                 if text:
@@ -378,14 +388,21 @@ class ExaCrawler(BaseCrawler):
                     docs.extend(parsed_doc_chunks)
                     continue
                 else:
-                    results = exa.get_contents(
+                    results = exa.get_contents(
+                        [url],
+                        livecrawl="always",
+                        text={
+                            "include_html_tags": True,
+                        },
+                    )
                     result = results.results[0]
                     if result.text:
+                        md_text = md.markdownify(result.text, heading_style="ATX")
                         # append a NON-chunked document
                         # (metadata.is_chunk = False, so will be chunked downstream)
                         docs.append(
                             Document(
-                                content=
+                                content=md_text,
                                 metadata=DocMetaData(
                                     source=url,
                                     title=getattr(result, "title", "Unknown Title"),
```
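Both crawlers now funnel HTML through `markdownify` with ATX-style headings (`#`, `##`, ...), so document structure survives for the MARKDOWN splitter downstream. A quick illustration of the call used above (output shown roughly):

```python
import markdownify as md

html = "<h1>Title</h1><p>Intro</p><h2>Section</h2><ul><li>Item</li></ul>"
print(md.markdownify(html, heading_style="ATX"))
# Roughly:
#   # Title
#
#   Intro
#
#   ## Section
#
#   * Item
```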
langroid/utils/output/citations.py CHANGED
```diff
@@ -97,7 +97,7 @@ def format_cited_references(
     # source and content for each citation
     full_citations_str = "\n".join(
         [
-            f"[^{c}] {passages[c-1].metadata
+            f"[^{c}] {str(passages[c-1].metadata)}"
            f"\n{format_footnote_text(passages[c-1].content)}"
             for c in good_citations
         ]
@@ -105,6 +105,6 @@ def format_cited_references(
 
     # source for each citation
     citations_str = "\n".join(
-        [f"[^{c}] {passages[c-1].metadata
+        [f"[^{c}] {str(passages[c-1].metadata)}" for c in good_citations]
     )
     return full_citations_str, citations_str
```
{langroid-0.49.0.dist-info → langroid-0.50.0.dist-info}/METADATA CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.49.0
+Version: 0.50.0
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
@@ -27,6 +27,7 @@ Requires-Dist: halo<1.0.0,>=0.0.31
 Requires-Dist: jinja2<4.0.0,>=3.1.2
 Requires-Dist: json-repair<1.0.0,>=0.29.9
 Requires-Dist: lxml<5.0.0,>=4.9.3
+Requires-Dist: markdownify>=0.13.1
 Requires-Dist: nest-asyncio<2.0.0,>=1.6.0
 Requires-Dist: nltk<4.0.0,>=3.8.2
 Requires-Dist: onnxruntime<2.0.0,>=1.16.1
```
{langroid-0.49.0.dist-info → langroid-0.50.0.dist-info}/RECORD CHANGED
```diff
@@ -1,6 +1,6 @@
 langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
 langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
-langroid/mytypes.py,sha256=
+langroid/mytypes.py,sha256=HIcYAqGeA9OK0Hlscym2FI5Oax9QFljDZoVgRlomhRk,4014
 langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
 langroid/agent/base.py,sha256=U-UjdpxIFqkzRIB5-LYwHrhMSNI3sDbfnNRqIhrtsyI,79568
@@ -14,7 +14,7 @@ langroid/agent/xml_tool_message.py,sha256=6SshYZJKIfi4mkE-gIoSwjkEYekQ8GwcSiCv7a
 langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/callbacks/chainlit.py,sha256=UHB6P_J40vsVnssosqkpkOVWRf9NK4TOY0_G2g_Arsg,20900
 langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
-langroid/agent/special/doc_chat_agent.py,sha256=
+langroid/agent/special/doc_chat_agent.py,sha256=J_-yOWBci5_ChDXOVUxCag_3gRou5Xm8la3I37ePcwk,65233
 langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
 langroid/agent/special/lance_tools.py,sha256=qS8x4wi8mrqfbYV2ztFzrcxyhHQ0ZWOc-zkYiH7awj0,2105
 langroid/agent/special/relevance_extractor_agent.py,sha256=zIx8GUdVo1aGW6ASla0NPQjYYIpmriK_TYMijqAx3F8,4796
@@ -72,7 +72,7 @@ langroid/language_models/base.py,sha256=mDYmFCBCLdq8_Uvws4MiewwEgcOCP8Qb0e5yUXr3
 langroid/language_models/config.py,sha256=9Q8wk5a7RQr8LGMT_0WkpjY8S4ywK06SalVRjXlfCiI,378
 langroid/language_models/mock_lm.py,sha256=5BgHKDVRWFbUwDT_PFgTZXz9-k8wJSA2e3PZmyDgQ1k,4022
 langroid/language_models/model_info.py,sha256=tfBBxL0iUf2mVN6CjcvqflzFUVg2oZqOJZexZ8jHTYA,12216
-langroid/language_models/openai_gpt.py,sha256=
+langroid/language_models/openai_gpt.py,sha256=M_jp97Ksp5r3U-d0jCLPLjVmn7IK1mC8Ry4t7k6A5tc,82906
 langroid/language_models/utils.py,sha256=L4_CbihDMTGcsg0TOG1Yd5JFEto46--h7CX_14m89sQ,5016
 langroid/language_models/prompt_formatter/__init__.py,sha256=2-5cdE24XoFDhifOLl8yiscohil1ogbP1ECkYdBlBsk,372
 langroid/language_models/prompt_formatter/base.py,sha256=eDS1sgRNZVnoajwV_ZIha6cba5Dt8xjgzdRbPITwx3Q,1221
@@ -81,17 +81,18 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
 langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
 langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
-langroid/parsing/document_parser.py,sha256=
+langroid/parsing/document_parser.py,sha256=XihXwhp--Nxhb8xoh6wth_isJCGUROKiVr3rPDOJodU,54359
+langroid/parsing/md_parser.py,sha256=JUgsUpCaeAuBndmtDaJR9HMZaje1gmtXtaLXJHst3i8,21340
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=
+langroid/parsing/parser.py,sha256=YPE6X6efimz2bYbardrhHHKw7V1LZvq-vF0q5p5XzOk,15387
 langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
 langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
 langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
 langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
-langroid/parsing/url_loader.py,sha256=
+langroid/parsing/url_loader.py,sha256=NQuCxa-hTOuxLZDq4xKLvPfGVB4IWFzh2ItqWq297DI,15675
 langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
 langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
 langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
@@ -115,7 +116,7 @@ langroid/utils/types.py,sha256=-BvyIf_LmAJ5jR9NC7S4CSVNEr3XayAaxJ5o0TiIej0,2992
 langroid/utils/algorithms/__init__.py,sha256=WylYoZymA0fnzpB4vrsH_0n7WsoLhmuZq8qxsOCjUpM,41
 langroid/utils/algorithms/graph.py,sha256=JbdpPnUOhw4-D6O7ou101JLA3xPCD0Lr3qaPoFCaRfo,2866
 langroid/utils/output/__init__.py,sha256=7P0f--4IZneNsTxXY5fd6d6iW-CeVe-KSsl-87sbBPc,340
-langroid/utils/output/citations.py,sha256=
+langroid/utils/output/citations.py,sha256=cEiqSH7DJ5q4M2z_6eFjCj9Ohnf68i6sivjeRFuFAtk,3862
 langroid/utils/output/printing.py,sha256=yzPJZN-8_jyOJmI9N_oLwEDfjMwVgk3IDiwnZ4eK_AE,2962
 langroid/utils/output/status.py,sha256=rzbE7mDJcgNNvdtylCseQcPGCGghtJvVq3lB-OPJ49E,1049
 langroid/vector_store/__init__.py,sha256=8ktJUVsVUoc7FMmkUFpFBZu7VMWUqQY9zpm4kEJ8yTs,1537
@@ -127,7 +128,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
 langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
 langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
-langroid-0.49.0.dist-info/METADATA,sha256=
-langroid-0.49.0.dist-info/WHEEL,sha256=
-langroid-0.49.0.dist-info/licenses/LICENSE,sha256=
-langroid-0.49.0.dist-info/RECORD,,
+langroid-0.50.0.dist-info/METADATA,sha256=JlWk_AbUqBitgpOF_957BtX6ZhT4FImk313aidCnf1Y,63641
+langroid-0.50.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.50.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.50.0.dist-info/RECORD,,
```
{langroid-0.49.0.dist-info → langroid-0.50.0.dist-info}/WHEEL
File without changes
{langroid-0.49.0.dist-info → langroid-0.50.0.dist-info}/licenses/LICENSE
File without changes