langroid 0.49.1__py3-none-any.whl → 0.50.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langroid/agent/special/doc_chat_agent.py CHANGED
@@ -174,7 +174,7 @@ class DocChatAgentConfig(ChatAgentConfig):
      "https://ai.googleblog.com/2022/11/characterizing-emergent-phenomena-in.html",
  ]
  parsing: ParsingConfig = ParsingConfig( # modify as needed
-     splitter=Splitter.TOKENS,
+     splitter=Splitter.MARKDOWN,
      chunk_size=1000, # aim for this many tokens per chunk
      overlap=100, # overlap between chunks
      max_chunks=10_000,
langroid/mytypes.py CHANGED
@@ -87,7 +87,10 @@ class DocMetaData(BaseModel):
  except (ValueError, ImportError, TypeError):
      # If parsing fails, just use the original date
      date_str = f"Date: {self.published_date}"
- return f"{self.source} {title_str} {date_str}".strip()
+ components = [self.source] + (
+     [] if title_str + date_str == "" else [title_str, date_str]
+ )
+ return ", ".join(components)

  class Config:
      extra = Extra.allow
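The new return statement joins source, title, and date with commas and drops the title/date part when both are empty. A minimal illustrative sketch (not part of the diff; the example values for `title_str` and `date_str` are hypothetical, since their construction lies outside this hunk):

```python
# Hypothetical values; only the joining logic mirrors the added lines above.
def join_meta(source: str, title_str: str, date_str: str) -> str:
    components = [source] + (
        [] if title_str + date_str == "" else [title_str, date_str]
    )
    return ", ".join(components)

print(join_meta("docs/report.pdf", "Title: Q3 Report", "Date: 2024-09-30"))
# docs/report.pdf, Title: Q3 Report, Date: 2024-09-30
print(join_meta("docs/report.pdf", "", ""))
# docs/report.pdf
```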
langroid/parsing/document_parser.py CHANGED
@@ -380,9 +380,6 @@ class DocumentParser(Parser):
  Get document chunks from a pdf source,
  with page references in the document metadata.

- Adapted from
- https://github.com/whitead/paper-qa/blob/main/paperqa/readers.py
-
  Returns:
      List[Document]: a list of `Document` objects,
      each containing a chunk of text
langroid/parsing/md_parser.py ADDED
@@ -0,0 +1,574 @@
+ import re
+ from typing import List
+
+ from langroid.pydantic_v1 import BaseModel, Field
+
+ HEADER_CONTEXT_SEP = "\n...\n"
+
+
+ # Pydantic model definition for a node in the markdown hierarchy
+ class Node(BaseModel):
+     content: str # The text of the header or content block
+     path: List[str] # List of header texts from root to this node
+     children: List["Node"] = Field(default_factory=list)
+     # Nested children nodes
+
+     def __repr__(self) -> str:
+         # for debug printing
+         return (
+             f"Node(content={self.content!r}, path={self.path!r}, "
+             f"children={len(self.children)})"
+         )
+
+     # Pydantic v1 requires forward references for self-referencing models
+     # Forward references will be resolved with the update_forward_refs call below.
+
+
+ # Resolve forward references for Node (required for recursive models in Pydantic v1)
+ Node.update_forward_refs()
+
+
+ def _cleanup_text(text: str) -> str:
+     # 1) Convert alternative newline representations (any CRLF or CR) to a single '\n'
+     text = text.replace("\r\n", "\n").replace("\r", "\n")
+
+     # 2) Replace 3 or more consecutive newlines with exactly 2 newlines
+     text = re.sub(r"\n{3,}", "\n\n", text)
+
+     return text
+
+
+ HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$")
+
+
+ def parse_markdown_headings(md_text: str) -> List[Node]:
+     """
+     Parse `md_text` to extract a heading-based hierarchy, skipping lines
+     that look like headings inside fenced code blocks. Each heading node
+     will have a child node for the text that appears between this heading
+     and the next heading.
+
+     Returns a list of top-level Node objects.
+
+     Example structure:
+     Node(content='# Chapter 1', path=['# Chapter 1'], children=[
+         Node(content='Intro paragraph...', path=['# Chapter 1'], children=[]),
+         Node(content='## Section 1.1', path=['# Chapter 1', '## Section 1.1'],
+              children=[
+                  Node(content='Some text in Section 1.1.', path=[...], children=[])
+              ]),
+         ...
+     ])
+     """
+     # If doc is empty or only whitespace, return []
+     if not md_text.strip():
+         return []
+
+     lines = md_text.splitlines(True) # keep the newline characters
+
+     # We'll scan line-by-line, track code-fence status, collect headings
+     headings = [] # list of (level, heading_line, start_line_idx)
+     in_code_fence = False
+     fence_marker = None # track which triple-backtick or ~~~ opened
+
+     for i, line in enumerate(lines):
+         # Check if we're toggling in/out of a fenced code block
+         # Typically triple backtick or triple tilde: ``` or ~~~
+         # We do a *loose* check: a line that starts with at least 3 backticks or tildes
+         # ignoring trailing text. You can refine as needed.
+         fence_match = re.match(r"^(```+|~~~+)", line.strip())
+         if fence_match:
+             # If we are not in a fence, we enter one;
+             # If we are in a fence, we exit if the marker matches
+             marker = fence_match.group(1) # e.g. "```" or "~~~~"
+             if not in_code_fence:
+                 in_code_fence = True
+                 fence_marker = marker[:3] # store triple backtick or triple tilde
+             else:
+                 # only close if the fence_marker matches
+                 # E.g. if we opened with ```, we close only on ```
+                 if fence_marker and marker.startswith(fence_marker):
+                     in_code_fence = False
+                     fence_marker = None
+
+         if not in_code_fence:
+             # Check if the line is a heading
+             m = HEADING_RE.match(line)
+             if m:
+                 hashes = m.group(1) # e.g. "##"
+                 heading_text = line.rstrip("\n") # entire line, exact
+                 level = len(hashes)
+                 headings.append((level, heading_text, i))
+
+     # If no headings found, return a single root node with the entire text
+     if not headings:
+         return [Node(content=md_text.strip(), path=[], children=[])]
+
+     # Add a sentinel heading at the end-of-file, so we can slice the last block
+     # after the final real heading. We'll use level=0 so it doesn't form a real node.
+     headings.append((0, "", len(lines)))
+
+     # Now we build "heading blocks" with
+     # (level, heading_text, start_line, end_line, content)
+     heading_blocks = []
+     for idx in range(len(headings) - 1):
+         level, heading_line, start_i = headings[idx]
+         next_level, _, next_start_i = headings[idx + 1]
+
+         # Content is everything after the heading line until the next heading
+         # i.e. lines[start_i+1 : next_start_i]
+         block_content_lines = lines[start_i + 1 : next_start_i]
+         block_content = "".join(block_content_lines).rstrip("\n")
+
+         heading_blocks.append(
+             {"level": level, "heading_text": heading_line, "content": block_content}
+         )
+     # (We skip the sentinel heading in the final result.)
+
+     # We'll now convert heading_blocks into a tree using a stack-based approach
+     root_nodes: List[Node] = []
+     stack: List[Node] = []
+     header_path: List[str] = []
+
+     for hb in heading_blocks:
+         level = hb["level"] # type: ignore
+         heading_txt = hb["heading_text"]
+         content_txt = hb["content"]
+
+         # --- Pop stack first! ---
+         while stack and len(stack[-1].path) >= level:
+             stack.pop()
+             header_path.pop()
+
+         # build new path, create a node for the heading
+         new_path = header_path + [heading_txt]
+         heading_node = Node(
+             content=heading_txt, path=new_path, children=[] # type: ignore
+         )
+
+         # Possibly create a content child for whatever lines were below the heading
+         if content_txt.strip(): # type: ignore
+             content_node = Node(
+                 content=content_txt, path=new_path, children=[] # type: ignore
+             )
+             heading_node.children.append(content_node)
+
+         # Attach heading_node to the stack top or as a root
+         if stack:
+             stack[-1].children.append(heading_node)
+         else:
+             root_nodes.append(heading_node)
+
+         stack.append(heading_node)
+         header_path.append(heading_txt) # type: ignore
+
+     return root_nodes
+
+
+ # The Chunk model for the final enriched chunks.
+ class Chunk(BaseModel):
+     text: str # The chunk text (which includes header context)
+     path: List[str] # The header path (list of header strings)
+     token_count: int
+
+
+ # Configuration for chunking
+ class MarkdownChunkConfig(BaseModel):
+     chunk_size: int = 200 # desired chunk size in tokens
+     overlap_tokens: int = 30 # number of tokens to overlap between chunks
+     variation_percent: float = 0.3 # allowed variation
+     rollup: bool = True # whether to roll up chunks
+     header_context_sep: str = HEADER_CONTEXT_SEP # separator for header context
+
+
+ # A simple tokenizer that counts tokens as whitespace-separated words.
+ def count_words(text: str) -> int:
+     return len(text.split())
+
+
+ def recursive_chunk(text: str, config: MarkdownChunkConfig) -> List[str]:
+     """
+     Enhanced chunker that:
+     1. Splits by paragraph (top-level).
+     2. Splits paragraphs by sentences if needed (never mid-sentence unless huge).
+     3. Allows going over the upper bound rather than splitting a single sentence.
+     4. Overlaps only once between consecutive chunks.
+     5. Looks ahead to avoid a "dangling" final chunk below the lower bound.
+     6. Preserves \n\n (and other original spacing) as best as possible.
+     """
+
+     # -------------------------------------------------
+     # Helpers
+     # -------------------------------------------------
+     def count_words(text_block: str) -> int:
+         return len(text_block.split())
+
+     lower_bound = int(config.chunk_size * (1 - config.variation_percent))
+     upper_bound = int(config.chunk_size * (1 + config.variation_percent))
+
+     # Quick check: if the entire text is short enough, return as-is.
+     if count_words(text) <= upper_bound:
+         return [text.strip()]
+
+     # Split into paragraphs, preserving \n\n if it's there.
+     raw_paragraphs = text.split("\n\n")
+     paragraphs = []
+     for i, p in enumerate(raw_paragraphs):
+         if p.strip():
+             # Re-append the double-newline if not the last piece
+             if i < len(raw_paragraphs) - 1:
+                 paragraphs.append(p + "\n\n")
+             else:
+                 paragraphs.append(p)
+
+     # Split paragraphs into "segments": each segment is either
+     # a full short paragraph or (if too big) a list of sentences.
+     sentence_regex = r"(?<=[.!?])\s+"
+
+     def split_paragraph_into_sentences(paragraph: str) -> List[str]:
+         """
+         Return a list of sentence-sized segments. If a single sentence
+         is bigger than upper_bound, do a word-level fallback.
+         """
+         if count_words(paragraph) <= upper_bound:
+             return [paragraph]
+
+         sentences = re.split(sentence_regex, paragraph)
+         # Clean up stray whitespace
+         sentences = [s.strip() for s in sentences if s.strip()]
+
+         expanded = []
+         for s in sentences:
+             if count_words(s) > upper_bound:
+                 expanded.extend(_fallback_word_split(s, config))
+             else:
+                 expanded.append(s)
+         return expanded
+
+     def _fallback_word_split(long_text: str, cfg: MarkdownChunkConfig) -> List[str]:
+         """
+         As a last resort, split extremely large 'sentence' by words.
+         """
+         words = long_text.split()
+         pieces = []
+         start = 0
+         while start < len(words):
+             end = start + cfg.chunk_size
+             chunk_words = words[start:end]
+             pieces.append(" ".join(chunk_words))
+             start = end
+         return pieces
+
+     # Build a list of segments
+     segments = []
+     for para in paragraphs:
+         if count_words(para) > upper_bound:
+             # split into sentences
+             segs = split_paragraph_into_sentences(para)
+             segments.extend(segs)
+         else:
+             segments.append(para)
+
+     # -------------------------------------------------
+     # Accumulate segments into final chunks
+     # -------------------------------------------------
+     chunks = []
+     current_chunk = ""
+     current_count = 0
+
+     def flush_chunk() -> None:
+         nonlocal current_chunk, current_count
+         trimmed = current_chunk.strip()
+         if trimmed:
+             chunks.append(trimmed)
+         current_chunk = ""
+         current_count = 0
+
+     def remaining_tokens_in_future(all_segments: List[str], current_index: int) -> int:
+         """Sum of word counts from current_index onward."""
+         return sum(count_words(s) for s in all_segments[current_index:])
+
+     for i, seg in enumerate(segments):
+         seg_count = count_words(seg)
+
+         # If this single segment alone exceeds upper_bound, we accept it as a big chunk.
+         if seg_count > upper_bound:
+             # If we have something in the current chunk, flush it first
+             flush_chunk()
+             # Then store this large segment as its own chunk
+             chunks.append(seg.strip())
+             continue
+
+         # Attempt to add seg to the current chunk
+         if (current_count + seg_count) > upper_bound and (current_count >= lower_bound):
+             # We would normally flush here, but let's see if we are nearing the end:
+             # If the remaining tokens (including this one) is < lower_bound,
+             # we just add it anyway to avoid creating a tiny final chunk.
+             future_tokens = remaining_tokens_in_future(segments, i)
+             if future_tokens < lower_bound:
+                 # Just add it (allowing to exceed upper bound)
+                 if current_chunk:
+                     # Add space or preserve newline carefully
+                     # We'll do a basic approach here:
+                     if seg.startswith("\n\n"):
+                         current_chunk += seg # preserve double new line
+                     else:
+                         current_chunk += " " + seg
+                     current_count = count_words(current_chunk)
+                 else:
+                     current_chunk = seg
+                     current_count = seg_count
+             else:
+                 # Normal flush
+                 old_chunk = current_chunk
+                 flush_chunk()
+                 # Overlap from old_chunk
+                 overlap_tokens_list = (
+                     old_chunk.split()[-config.overlap_tokens :] if old_chunk else []
+                 )
+                 overlap_str = (
+                     " ".join(overlap_tokens_list) if overlap_tokens_list else ""
+                 )
+                 if overlap_str:
+                     current_chunk = overlap_str + " " + seg
+                 else:
+                     current_chunk = seg
+                 current_count = count_words(current_chunk)
+         else:
+             # Just accumulate
+             if current_chunk:
+                 if seg.startswith("\n\n"):
+                     current_chunk += seg
+                 else:
+                     current_chunk += " " + seg
+             else:
+                 current_chunk = seg
+             current_count = count_words(current_chunk)
+
+     # Flush leftover
+     flush_chunk()
+
+     # Return non-empty
+     return [c for c in chunks if c.strip()]
+
+
+ # Function to process a Node and produce enriched chunks.
+ def chunk_node(node: Node, config: MarkdownChunkConfig) -> List[Chunk]:
+     chunks: List[Chunk] = []
+
+     # Check if this is a header-only node.
+     is_header_only = node.path and node.content.strip() == node.path[-1]
+
+     # Only generate a chunk for the node if it has non-header content,
+     # or if it's header-only AND has no children (i.e., it's a leaf header).
+     if node.content.strip() and (not is_header_only or not node.children):
+         header_prefix = (
+             config.header_context_sep.join(node.path) + "\n\n" if node.path else ""
+         )
+         content_chunks = recursive_chunk(node.content, config)
+         for chunk_text in content_chunks:
+             full_text = header_prefix + chunk_text
+             chunks.append(
+                 Chunk(
+                     text=full_text, path=node.path, token_count=count_words(full_text)
+                 )
+             )
+
+     # Process children nodes recursively.
+     for child in node.children:
+         child_chunks = chunk_node(child, config)
+         chunks.extend(child_chunks)
+
+     return chunks
+
+
+ # Function to process an entire tree of Nodes.
+ def chunk_tree(root_nodes: List[Node], config: MarkdownChunkConfig) -> List[Chunk]:
+     all_chunks: List[Chunk] = []
+     for node in root_nodes:
+         all_chunks.extend(chunk_node(node, config))
+     return all_chunks
+
+
+ def aggregate_content(node: Node) -> str:
+     """
+     Recursively aggregate the content from a node and all its descendants,
+     excluding header-only nodes to avoid duplication.
+     """
+     parts = []
+
+     # Skip header-only nodes in content aggregation
+     is_header_only = node.path and node.content.strip() == node.path[-1].strip()
+     if not is_header_only and node.content.strip():
+         parts.append(node.content.strip())
+
+     # Recurse on children
+     for child in node.children:
+         child_text = aggregate_content(child)
+         if child_text.strip():
+             parts.append(child_text.strip())
+
+     return "\n\n".join(parts)
+
+
+ def flatten_tree(node: Node, level: int = 0) -> str:
+     """
+     Flatten a node and its children back into proper markdown text.
+
+     Args:
+         node: The node to flatten
+         level: The current heading level (depth in the tree)
+
+     Returns:
+         str: Properly formatted markdown text
+     """
+     result = ""
+
+     # Check if this is a header node (content matches last item in path)
+     is_header = node.path and node.content.strip().startswith("#")
+
+     # For header nodes, don't duplicate the hash marks
+     if is_header:
+         result = node.content.strip() + "\n\n"
+     elif node.content.strip():
+         result = node.content.strip() + "\n\n"
+
+     # Process all children
+     for child in node.children:
+         result += flatten_tree(child, level + 1)
+
+     return result
+
+
+ def rollup_chunk_node(
+     node: Node, config: MarkdownChunkConfig, prefix: str = ""
+ ) -> List[Chunk]:
+     """
+     Recursively produce rollup chunks from `node`, passing down a `prefix`
+     (e.g., parent heading(s)).
+
+     - If a node is heading-only (content == last path item) and has children,
+       we skip creating a chunk for that node alone and instead add that heading
+       to the `prefix` for child nodes.
+     - If a node is NOT heading-only OR has no children, we try to fit all of its
+       flattened content into a single chunk. If it's too large, we chunk it.
+     - We pass the (possibly updated) prefix down to children, so each child's
+       chunk is enriched exactly once with all ancestor headings.
+     """
+
+     chunks: List[Chunk] = []
+
+     # Check if the node is "heading-only" and has children
+     # e.g. node.content=="# Chapter 1" and node.path[-1]=="# Chapter 1"
+     is_heading_only_with_children = (
+         node.path
+         and node.content.strip() == node.path[-1].strip()
+         and len(node.children) > 0
+     )
+
+     if is_heading_only_with_children:
+         # We do NOT create a chunk for this node alone.
+         # Instead, we add its heading to the prefix for child chunks.
+         new_prefix = prefix + node.content.strip()
+         for i, child in enumerate(node.children):
+             sep = "\n\n" if i == 0 else config.header_context_sep
+             chunks.extend(rollup_chunk_node(child, config, prefix=new_prefix + sep))
+         return chunks
+
+     # If not heading-only-with-children, we handle this node's own content:
+     # Flatten the entire node (including sub-children) in standard Markdown form.
+     flattened = flatten_tree(node, level=len(node.path))
+     flattened_with_prefix = prefix + flattened
+     total_tokens = count_words(flattened_with_prefix)
+
+     # Check if we can roll up everything (node + children) in a single chunk
+     if total_tokens <= config.chunk_size * (1 + config.variation_percent):
+         # One single chunk for the entire subtree
+         chunks.append(
+             Chunk(text=flattened_with_prefix, path=node.path, token_count=total_tokens)
+         )
+     else:
+         # It's too large overall. We'll chunk the node's own content first (if any),
+         # then recurse on children.
+         node_content = node.content.strip()
+
+         # If we have actual content that is not just a heading, chunk it with the prefix
+         # (like "preamble" text).
+         # Note: if this node is heading-only but has NO children,
+         # it will still land here
+         # (because is_heading_only_with_children was False due to zero children).
+         if node_content and (not node.path or node_content != node.path[-1].strip()):
+             # The node is actual content (not purely heading).
+             # We'll chunk it in paragraphs/sentences with the prefix.
+             content_chunks = recursive_chunk(node_content, config)
+             for text_block in content_chunks:
+                 block_with_prefix = prefix + text_block
+                 chunks.append(
+                     Chunk(
+                         text=block_with_prefix,
+                         path=node.path,
+                         token_count=count_words(block_with_prefix),
+                     )
+                 )
+
+         # Now recurse on children, passing the same prefix so they get it too
+         for child in node.children:
+             chunks.extend(rollup_chunk_node(child, config, prefix=prefix))
+
+     return chunks
+
+
+ def rollup_chunk_tree(
+     root_nodes: List[Node],
+     config: MarkdownChunkConfig,
+ ) -> List[Chunk]:
+     # Create a dummy root node that contains everything.
+     dummy_root = Node(content="", path=[], children=root_nodes)
+
+     # Now process just the dummy root node with an empty prefix.
+     chunks = rollup_chunk_node(dummy_root, config, prefix="")
+     return chunks
+
+
+ def chunk_markdown(markdown_text: str, config: MarkdownChunkConfig) -> List[str]:
+     tree = parse_markdown_headings(markdown_text)
+     if len(tree) == 1 and len(tree[0].children) == 0:
+         # Pure text, no hierarchy, so just use recursive_chunk
+         text_chunks = recursive_chunk(markdown_text, config)
+         return [_cleanup_text(chunk) for chunk in text_chunks]
+     if config.rollup:
+         chunks = rollup_chunk_tree(tree, config)
+     else:
+         chunks = chunk_tree(tree, config)
+     return [_cleanup_text(chunk.text) for chunk in chunks]
+
+
+ if __name__ == "__main__":
+     # Example usage:
+     markdown_text = """# Title
+ Intro para. Hope this is not
+ getting split.
+ ## SubTitle
+ - Item1
+ - Item2
+     """
+     # Set up chunking config with very large chunk size.
+     # (you can adjust chunk_size, overlap_tokens, variation_percent)
+     config = MarkdownChunkConfig(
+         chunk_size=200, overlap_tokens=5, variation_percent=0.2
+     )
+     chunks = chunk_markdown(markdown_text, config)
+
+     for idx, chunk in enumerate(chunks, 1):
+         print(f"--- Chunk {idx} --- ")
+         print(chunk)
+         print()
+
+     config.rollup = True
+     # with rollup_chunk_tree we get entire doc as 1 chunk
+     chunks = chunk_markdown(markdown_text, config)
+     assert len(chunks) == 1
+     for idx, chunk in enumerate(chunks, 1):
+         print(f"--- Chunk {idx} ---")
+         print(chunk)
+         print()
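To make the intended use of the new module concrete, here is a brief illustrative sketch (not part of the diff) that calls `chunk_markdown` with `rollup=False`, so each section becomes its own header-prefixed chunk; the import path matches the one added to `parser.py` below, and the signatures match the code above.

```python
# Illustrative sketch only; signatures and import path are taken from this diff.
from langroid.parsing.md_parser import MarkdownChunkConfig, chunk_markdown

doc = "# Guide\nShort intro.\n\n## Install\nRun pip install langroid first.\n"
cfg = MarkdownChunkConfig(chunk_size=50, overlap_tokens=5, rollup=False)
for chunk in chunk_markdown(doc, cfg):
    # each chunk is prefixed with its header path,
    # e.g. "# Guide\n...\n## Install\n\n..."
    print(repr(chunk))
```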
langroid/parsing/parser.py CHANGED
@@ -6,6 +6,11 @@ from typing import Any, Dict, List, Literal, Optional
  import tiktoken

  from langroid.mytypes import Document
+ from langroid.parsing.md_parser import (
+     MarkdownChunkConfig,
+     chunk_markdown,
+     count_words,
+ )
  from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
  from langroid.pydantic_v1 import BaseSettings, root_validator
  from langroid.utils.object_registry import ObjectRegistry
@@ -18,6 +23,8 @@ class Splitter(str, Enum):
      TOKENS = "tokens"
      PARA_SENTENCE = "para_sentence"
      SIMPLE = "simple"
+     # "structure-aware" splitting with chunks enriched by header info
+     MARKDOWN = "markdown"


  class BaseParsingConfig(BaseSettings):
@@ -98,9 +105,10 @@ class MarkitdownXLSParsingConfig(BaseSettings):


  class ParsingConfig(BaseSettings):
-     splitter: str = Splitter.TOKENS
+     splitter: str = Splitter.MARKDOWN
      chunk_by_page: bool = False # split by page?
      chunk_size: int = 200 # aim for this many tokens per chunk
+     chunk_size_variation: float = 0.30 # max variation from chunk_size
      overlap: int = 50 # overlap between chunks
      max_chunks: int = 10_000
      # offset to subtract from page numbers:
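An illustrative configuration sketch (not from the diff) showing the new default splitter together with the new `chunk_size_variation` field; the field names and enum value come from the hunks above.

```python
# Illustrative only; names are taken from the hunks above.
from langroid.parsing.parser import ParsingConfig, Splitter

parsing = ParsingConfig(
    splitter=Splitter.MARKDOWN,    # now the default
    chunk_size=200,                # target tokens per chunk
    chunk_size_variation=0.30,     # allow ~30% deviation from the target
    overlap=50,
)
```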
@@ -130,6 +138,8 @@ class Parser:
      self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")

  def num_tokens(self, text: str) -> int:
+     if self.config.splitter == Splitter.MARKDOWN:
+         return count_words(text) # simple count based on whitespace-split
      tokens = self.tokenizer.encode(text, allowed_special={"<|endoftext|>"})
      return len(tokens)
@@ -254,7 +264,20 @@ class Parser:
  def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
      final_docs = []
      for d in docs:
-         chunks = self.chunk_tokens(d.content)
+         if self.config.splitter == Splitter.MARKDOWN:
+             chunks = chunk_markdown(
+                 d.content,
+                 MarkdownChunkConfig(
+                     # apply rough adjustment factor to convert from tokens to words,
+                     # which is what the markdown chunker uses
+                     chunk_size=int(self.config.chunk_size * 0.75),
+                     overlap_tokens=int(self.config.overlap * 0.75),
+                     variation_percent=self.config.chunk_size_variation,
+                     rollup=True,
+                 ),
+             )
+         else:
+             chunks = self.chunk_tokens(d.content)
          # note we are ensuring we COPY the document metadata into each chunk,
          # which ensures all chunks of a given doc have same metadata
          # (and in particular same metadata.id, which is important later for
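For concreteness, an illustrative calculation (not from the diff) of the 0.75 token-to-word adjustment above, using the chunk_size=1000 and overlap=100 values from the DocChatAgentConfig hunk at the top:

```python
# Illustrative arithmetic only; the 0.75 factor is the one used above.
chunk_size, overlap = 1000, 100           # configured in tokens
md_chunk_size = int(chunk_size * 0.75)    # 750 words per markdown chunk
md_overlap = int(overlap * 0.75)          # 75 words of overlap
```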
@@ -370,13 +393,14 @@ class Parser:
      big_docs = [d for d in docs if not d.metadata.is_chunk]
      if len(big_docs) == 0:
          return chunked_docs
-     if self.config.splitter == Splitter.PARA_SENTENCE:
-         big_doc_chunks = self.split_para_sentence(big_docs)
-     elif self.config.splitter == Splitter.TOKENS:
-         big_doc_chunks = self.split_chunk_tokens(big_docs)
-     elif self.config.splitter == Splitter.SIMPLE:
-         big_doc_chunks = self.split_simple(big_docs)
-     else:
-         raise ValueError(f"Unknown splitter: {self.config.splitter}")
+     match self.config.splitter:
+         case Splitter.MARKDOWN | Splitter.TOKENS:
+             big_doc_chunks = self.split_chunk_tokens(big_docs)
+         case Splitter.PARA_SENTENCE:
+             big_doc_chunks = self.split_para_sentence(big_docs)
+         case Splitter.SIMPLE:
+             big_doc_chunks = self.split_simple(big_docs)
+         case _:
+             raise ValueError(f"Unknown splitter: {self.config.splitter}")

      return chunked_docs + big_doc_chunks
langroid/parsing/url_loader.py CHANGED
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
  from tempfile import NamedTemporaryFile
  from typing import TYPE_CHECKING, Any, Dict, List, Optional

+ import markdownify as md
  from dotenv import load_dotenv

  from langroid.exceptions import LangroidImportError
@@ -31,6 +32,7 @@ class TrafilaturaConfig(BaseCrawlerConfig):
      """Configuration for Trafilatura crawler."""

      threads: int = 4
+     format: str = "markdown" # or "xml" or "txt"


  class FirecrawlConfig(BaseCrawlerConfig):
@@ -200,8 +202,16 @@ class TrafilaturaCrawler(BaseCrawler):
              docs.extend(parsed_doc)
          else:
              text = trafilatura.extract(
-                 result, no_fallback=False, favor_recall=True
+                 result,
+                 no_fallback=False,
+                 favor_recall=True,
+                 include_formatting=True,
+                 output_format=self.config.format,
+                 with_metadata=True, # Title, date, author... at start of text
              )
+             if self.config.format in ["xml", "html"]:
+                 # heading_style="ATX" for markdown headings, i.e. #, ##, etc.
+                 text = md.markdownify(text, heading_style="ATX")
              if text is None and result is not None and isinstance(result, str):
                  text = result
              if text:
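A small illustrative sketch (not part of the diff) of the markdownify conversion used above: when Trafilatura is asked for "xml" or "html" output, the extracted text is converted to Markdown with ATX-style (#, ##) headings.

```python
# Illustrative only; mirrors the md.markdownify(...) call in the hunk above.
import markdownify as md

html = "<h2>Installation</h2><p>Use <b>pip</b> to install the package.</p>"
print(md.markdownify(html, heading_style="ATX"))
# prints something like:
# ## Installation
#
# Use **pip** to install the package.
```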
@@ -378,14 +388,21 @@ class ExaCrawler(BaseCrawler):
              docs.extend(parsed_doc_chunks)
              continue
          else:
-             results = exa.get_contents([url], livecrawl="always", text=True)
+             results = exa.get_contents(
+                 [url],
+                 livecrawl="always",
+                 text={
+                     "include_html_tags": True,
+                 },
+             )
          result = results.results[0]
          if result.text:
+             md_text = md.markdownify(result.text, heading_style="ATX")
              # append a NON-chunked document
              # (metadata.is_chunk = False, so will be chunked downstream)
              docs.append(
                  Document(
-                     content=result.text,
+                     content=md_text,
                      metadata=DocMetaData(
                          source=url,
                          title=getattr(result, "title", "Unknown Title"),
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: langroid
- Version: 0.49.1
+ Version: 0.50.0
  Summary: Harness LLMs with Multi-Agent Programming
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
  License: MIT
@@ -27,6 +27,7 @@ Requires-Dist: halo<1.0.0,>=0.0.31
  Requires-Dist: jinja2<4.0.0,>=3.1.2
  Requires-Dist: json-repair<1.0.0,>=0.29.9
  Requires-Dist: lxml<5.0.0,>=4.9.3
+ Requires-Dist: markdownify>=0.13.1
  Requires-Dist: nest-asyncio<2.0.0,>=1.6.0
  Requires-Dist: nltk<4.0.0,>=3.8.2
  Requires-Dist: onnxruntime<2.0.0,>=1.16.1
@@ -1,6 +1,6 @@
  langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
  langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
- langroid/mytypes.py,sha256=ezj_6FFDkJZiVx1SS9eJvh23dH76Ti7mJbePi8ldkAI,3919
+ langroid/mytypes.py,sha256=HIcYAqGeA9OK0Hlscym2FI5Oax9QFljDZoVgRlomhRk,4014
  langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
  langroid/agent/base.py,sha256=U-UjdpxIFqkzRIB5-LYwHrhMSNI3sDbfnNRqIhrtsyI,79568
@@ -14,7 +14,7 @@ langroid/agent/xml_tool_message.py,sha256=6SshYZJKIfi4mkE-gIoSwjkEYekQ8GwcSiCv7a
  langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  langroid/agent/callbacks/chainlit.py,sha256=UHB6P_J40vsVnssosqkpkOVWRf9NK4TOY0_G2g_Arsg,20900
  langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
- langroid/agent/special/doc_chat_agent.py,sha256=SrotZ0qw51fKDXlDP2lwTho0PPTuqUogFAT4jjq0ne0,65231
+ langroid/agent/special/doc_chat_agent.py,sha256=J_-yOWBci5_ChDXOVUxCag_3gRou5Xm8la3I37ePcwk,65233
  langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
  langroid/agent/special/lance_tools.py,sha256=qS8x4wi8mrqfbYV2ztFzrcxyhHQ0ZWOc-zkYiH7awj0,2105
  langroid/agent/special/relevance_extractor_agent.py,sha256=zIx8GUdVo1aGW6ASla0NPQjYYIpmriK_TYMijqAx3F8,4796
@@ -81,17 +81,18 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
  langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
  langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
  langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
- langroid/parsing/document_parser.py,sha256=72g9EUuLlCAAXGD9-8UPe7_l7JnZ7vgc764g_17EPWA,54454
+ langroid/parsing/document_parser.py,sha256=XihXwhp--Nxhb8xoh6wth_isJCGUROKiVr3rPDOJodU,54359
+ langroid/parsing/md_parser.py,sha256=JUgsUpCaeAuBndmtDaJR9HMZaje1gmtXtaLXJHst3i8,21340
  langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
  langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
- langroid/parsing/parser.py,sha256=bxBXiyRnUBhS5Ng6s4OhAUpxqCSUXwNn4c7DaDSiWnE,14314
+ langroid/parsing/parser.py,sha256=YPE6X6efimz2bYbardrhHHKw7V1LZvq-vF0q5p5XzOk,15387
  langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
  langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
  langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
  langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
  langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
  langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
- langroid/parsing/url_loader.py,sha256=DvgkdCZ3gDlAajH0dIUjea4YyXkziK-g36WnaE1J_WI,14884
+ langroid/parsing/url_loader.py,sha256=NQuCxa-hTOuxLZDq4xKLvPfGVB4IWFzh2ItqWq297DI,15675
  langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
  langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
  langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
@@ -127,7 +128,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
  langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
  langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
  langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
- langroid-0.49.1.dist-info/METADATA,sha256=a2cArSN5YfRq4GRH37MkO6h-fvXbXEFkoo-qDMyVTzA,63606
- langroid-0.49.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- langroid-0.49.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
- langroid-0.49.1.dist-info/RECORD,,
+ langroid-0.50.0.dist-info/METADATA,sha256=JlWk_AbUqBitgpOF_957BtX6ZhT4FImk313aidCnf1Y,63641
+ langroid-0.50.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ langroid-0.50.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+ langroid-0.50.0.dist-info/RECORD,,