langroid 0.49.1__py3-none-any.whl → 0.50.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langroid/agent/special/doc_chat_agent.py CHANGED
@@ -174,7 +174,7 @@ class DocChatAgentConfig(ChatAgentConfig):
      "https://ai.googleblog.com/2022/11/characterizing-emergent-phenomena-in.html",
  ]
  parsing: ParsingConfig = ParsingConfig( # modify as needed
-     splitter=Splitter.TOKENS,
+     splitter=Splitter.MARKDOWN,
      chunk_size=1000, # aim for this many tokens per chunk
      overlap=100, # overlap between chunks
      max_chunks=10_000,
langroid/mytypes.py CHANGED
@@ -87,7 +87,10 @@ class DocMetaData(BaseModel):
  except (ValueError, ImportError, TypeError):
      # If parsing fails, just use the original date
      date_str = f"Date: {self.published_date}"
- return f"{self.source} {title_str} {date_str}".strip()
+ components = [self.source] + (
+     [] if title_str + date_str == "" else [title_str, date_str]
+ )
+ return ", ".join(components)

  class Config:
      extra = Extra.allow
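The new return statement joins source, title, and date with commas and drops the title/date part when both are empty. A minimal illustrative sketch (not part of the diff; the example values for `title_str` and `date_str` are hypothetical, since their construction lies outside this hunk):

```python
# Hypothetical values; only the joining logic mirrors the added lines above.
def join_meta(source: str, title_str: str, date_str: str) -> str:
    components = [source] + (
        [] if title_str + date_str == "" else [title_str, date_str]
    )
    return ", ".join(components)

print(join_meta("docs/report.pdf", "Title: Q3 Report", "Date: 2024-09-30"))
# docs/report.pdf, Title: Q3 Report, Date: 2024-09-30
print(join_meta("docs/report.pdf", "", ""))
# docs/report.pdf
```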
langroid/parsing/document_parser.py CHANGED
@@ -380,9 +380,6 @@ class DocumentParser(Parser):
  Get document chunks from a pdf source,
  with page references in the document metadata.

- Adapted from
- https://github.com/whitead/paper-qa/blob/main/paperqa/readers.py
-
  Returns:
      List[Document]: a list of `Document` objects,
      each containing a chunk of text
langroid/parsing/md_parser.py ADDED
@@ -0,0 +1,574 @@
+ import re
+ from typing import List
+
+ from langroid.pydantic_v1 import BaseModel, Field
+
+ HEADER_CONTEXT_SEP = "\n...\n"
+
+
+ # Pydantic model definition for a node in the markdown hierarchy
+ class Node(BaseModel):
+     content: str # The text of the header or content block
+     path: List[str] # List of header texts from root to this node
+     children: List["Node"] = Field(default_factory=list)
+     # Nested children nodes
+
+     def __repr__(self) -> str:
+         # for debug printing
+         return (
+             f"Node(content={self.content!r}, path={self.path!r}, "
+             f"children={len(self.children)})"
+         )
+
+     # Pydantic v1 requires forward references for self-referencing models
+     # Forward references will be resolved with the update_forward_refs call below.
+
+
+ # Resolve forward references for Node (required for recursive models in Pydantic v1)
+ Node.update_forward_refs()
+
+
+ def _cleanup_text(text: str) -> str:
+     # 1) Convert alternative newline representations (any CRLF or CR) to a single '\n'
+     text = text.replace("\r\n", "\n").replace("\r", "\n")
+
+     # 2) Replace 3 or more consecutive newlines with exactly 2 newlines
+     text = re.sub(r"\n{3,}", "\n\n", text)
+
+     return text
+
+
+ HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$")
+
+
+ def parse_markdown_headings(md_text: str) -> List[Node]:
+     """
+     Parse `md_text` to extract a heading-based hierarchy, skipping lines
+     that look like headings inside fenced code blocks. Each heading node
+     will have a child node for the text that appears between this heading
+     and the next heading.
+
+     Returns a list of top-level Node objects.
+
+     Example structure:
+     Node(content='# Chapter 1', path=['# Chapter 1'], children=[
+         Node(content='Intro paragraph...', path=['# Chapter 1'], children=[]),
+         Node(content='## Section 1.1', path=['# Chapter 1', '## Section 1.1'],
+              children=[
+                  Node(content='Some text in Section 1.1.', path=[...], children=[])
+              ]),
+         ...
+     ])
+     """
+     # If doc is empty or only whitespace, return []
+     if not md_text.strip():
+         return []
+
+     lines = md_text.splitlines(True) # keep the newline characters
+
+     # We'll scan line-by-line, track code-fence status, collect headings
+     headings = [] # list of (level, heading_line, start_line_idx)
+     in_code_fence = False
+     fence_marker = None # track which triple-backtick or ~~~ opened
+
+     for i, line in enumerate(lines):
+         # Check if we're toggling in/out of a fenced code block
+         # Typically triple backtick or triple tilde: ``` or ~~~
+         # We do a *loose* check: a line that starts with at least 3 backticks or tildes
+         # ignoring trailing text. You can refine as needed.
+         fence_match = re.match(r"^(```+|~~~+)", line.strip())
+         if fence_match:
+             # If we are not in a fence, we enter one;
+             # If we are in a fence, we exit if the marker matches
+             marker = fence_match.group(1) # e.g. "```" or "~~~~"
+             if not in_code_fence:
+                 in_code_fence = True
+                 fence_marker = marker[:3] # store triple backtick or triple tilde
+             else:
+                 # only close if the fence_marker matches
+                 # E.g. if we opened with ```, we close only on ```
+                 if fence_marker and marker.startswith(fence_marker):
+                     in_code_fence = False
+                     fence_marker = None
+
+         if not in_code_fence:
+             # Check if the line is a heading
+             m = HEADING_RE.match(line)
+             if m:
+                 hashes = m.group(1) # e.g. "##"
+                 heading_text = line.rstrip("\n") # entire line, exact
+                 level = len(hashes)
+                 headings.append((level, heading_text, i))
+
+     # If no headings found, return a single root node with the entire text
+     if not headings:
+         return [Node(content=md_text.strip(), path=[], children=[])]
+
+     # Add a sentinel heading at the end-of-file, so we can slice the last block
+     # after the final real heading. We'll use level=0 so it doesn't form a real node.
+     headings.append((0, "", len(lines)))
+
+     # Now we build "heading blocks" with
+     # (level, heading_text, start_line, end_line, content)
+     heading_blocks = []
+     for idx in range(len(headings) - 1):
+         level, heading_line, start_i = headings[idx]
+         next_level, _, next_start_i = headings[idx + 1]
+
+         # Content is everything after the heading line until the next heading
+         # i.e. lines[start_i+1 : next_start_i]
+         block_content_lines = lines[start_i + 1 : next_start_i]
+         block_content = "".join(block_content_lines).rstrip("\n")
+
+         heading_blocks.append(
+             {"level": level, "heading_text": heading_line, "content": block_content}
+         )
+     # (We skip the sentinel heading in the final result.)
+
+     # We'll now convert heading_blocks into a tree using a stack-based approach
+     root_nodes: List[Node] = []
+     stack: List[Node] = []
+     header_path: List[str] = []
+
+     for hb in heading_blocks:
+         level = hb["level"] # type: ignore
+         heading_txt = hb["heading_text"]
+         content_txt = hb["content"]
+
+         # --- Pop stack first! ---
+         while stack and len(stack[-1].path) >= level:
+             stack.pop()
+             header_path.pop()
+
+         # build new path, create a node for the heading
+         new_path = header_path + [heading_txt]
+         heading_node = Node(
+             content=heading_txt, path=new_path, children=[] # type: ignore
+         )
+
+         # Possibly create a content child for whatever lines were below the heading
+         if content_txt.strip(): # type: ignore
+             content_node = Node(
+                 content=content_txt, path=new_path, children=[] # type: ignore
+             )
+             heading_node.children.append(content_node)
+
+         # Attach heading_node to the stack top or as a root
+         if stack:
+             stack[-1].children.append(heading_node)
+         else:
+             root_nodes.append(heading_node)
+
+         stack.append(heading_node)
+         header_path.append(heading_txt) # type: ignore
+
+     return root_nodes
+
+
+ # The Chunk model for the final enriched chunks.
+ class Chunk(BaseModel):
+     text: str # The chunk text (which includes header context)
+     path: List[str] # The header path (list of header strings)
+     token_count: int
+
+
+ # Configuration for chunking
+ class MarkdownChunkConfig(BaseModel):
+     chunk_size: int = 200 # desired chunk size in tokens
+     overlap_tokens: int = 30 # number of tokens to overlap between chunks
+     variation_percent: float = 0.3 # allowed variation
+     rollup: bool = True # whether to roll up chunks
+     header_context_sep: str = HEADER_CONTEXT_SEP # separator for header context
+
+
+ # A simple tokenizer that counts tokens as whitespace-separated words.
+ def count_words(text: str) -> int:
+     return len(text.split())
+
+
+ def recursive_chunk(text: str, config: MarkdownChunkConfig) -> List[str]:
+     """
+     Enhanced chunker that:
+     1. Splits by paragraph (top-level).
+     2. Splits paragraphs by sentences if needed (never mid-sentence unless huge).
+     3. Allows going over the upper bound rather than splitting a single sentence.
+     4. Overlaps only once between consecutive chunks.
+     5. Looks ahead to avoid a "dangling" final chunk below the lower bound.
+     6. Preserves \n\n (and other original spacing) as best as possible.
+     """
+
+     # -------------------------------------------------
+     # Helpers
+     # -------------------------------------------------
+     def count_words(text_block: str) -> int:
+         return len(text_block.split())
+
+     lower_bound = int(config.chunk_size * (1 - config.variation_percent))
+     upper_bound = int(config.chunk_size * (1 + config.variation_percent))
+
+     # Quick check: if the entire text is short enough, return as-is.
+     if count_words(text) <= upper_bound:
+         return [text.strip()]
+
+     # Split into paragraphs, preserving \n\n if it's there.
+     raw_paragraphs = text.split("\n\n")
+     paragraphs = []
+     for i, p in enumerate(raw_paragraphs):
+         if p.strip():
+             # Re-append the double-newline if not the last piece
+             if i < len(raw_paragraphs) - 1:
+                 paragraphs.append(p + "\n\n")
+             else:
+                 paragraphs.append(p)
+
+     # Split paragraphs into "segments": each segment is either
+     # a full short paragraph or (if too big) a list of sentences.
+     sentence_regex = r"(?<=[.!?])\s+"
+
+     def split_paragraph_into_sentences(paragraph: str) -> List[str]:
+         """
+         Return a list of sentence-sized segments. If a single sentence
+         is bigger than upper_bound, do a word-level fallback.
+         """
+         if count_words(paragraph) <= upper_bound:
+             return [paragraph]
+
+         sentences = re.split(sentence_regex, paragraph)
+         # Clean up stray whitespace
+         sentences = [s.strip() for s in sentences if s.strip()]
+
+         expanded = []
+         for s in sentences:
+             if count_words(s) > upper_bound:
+                 expanded.extend(_fallback_word_split(s, config))
+             else:
+                 expanded.append(s)
+         return expanded
+
+     def _fallback_word_split(long_text: str, cfg: MarkdownChunkConfig) -> List[str]:
+         """
+         As a last resort, split extremely large 'sentence' by words.
+         """
+         words = long_text.split()
+         pieces = []
+         start = 0
+         while start < len(words):
+             end = start + cfg.chunk_size
+             chunk_words = words[start:end]
+             pieces.append(" ".join(chunk_words))
+             start = end
+         return pieces
+
+     # Build a list of segments
+     segments = []
+     for para in paragraphs:
+         if count_words(para) > upper_bound:
+             # split into sentences
+             segs = split_paragraph_into_sentences(para)
+             segments.extend(segs)
+         else:
+             segments.append(para)
+
+     # -------------------------------------------------
+     # Accumulate segments into final chunks
+     # -------------------------------------------------
+     chunks = []
+     current_chunk = ""
+     current_count = 0
+
+     def flush_chunk() -> None:
+         nonlocal current_chunk, current_count
+         trimmed = current_chunk.strip()
+         if trimmed:
+             chunks.append(trimmed)
+         current_chunk = ""
+         current_count = 0
+
+     def remaining_tokens_in_future(all_segments: List[str], current_index: int) -> int:
+         """Sum of word counts from current_index onward."""
+         return sum(count_words(s) for s in all_segments[current_index:])
+
+     for i, seg in enumerate(segments):
+         seg_count = count_words(seg)
+
+         # If this single segment alone exceeds upper_bound, we accept it as a big chunk.
+         if seg_count > upper_bound:
+             # If we have something in the current chunk, flush it first
+             flush_chunk()
+             # Then store this large segment as its own chunk
+             chunks.append(seg.strip())
+             continue
+
+         # Attempt to add seg to the current chunk
+         if (current_count + seg_count) > upper_bound and (current_count >= lower_bound):
+             # We would normally flush here, but let's see if we are nearing the end:
+             # If the remaining tokens (including this one) is < lower_bound,
+             # we just add it anyway to avoid creating a tiny final chunk.
+             future_tokens = remaining_tokens_in_future(segments, i)
+             if future_tokens < lower_bound:
+                 # Just add it (allowing to exceed upper bound)
+                 if current_chunk:
+                     # Add space or preserve newline carefully
+                     # We'll do a basic approach here:
+                     if seg.startswith("\n\n"):
+                         current_chunk += seg # preserve double new line
+                     else:
+                         current_chunk += " " + seg
+                     current_count = count_words(current_chunk)
+                 else:
+                     current_chunk = seg
+                     current_count = seg_count
+             else:
+                 # Normal flush
+                 old_chunk = current_chunk
+                 flush_chunk()
+                 # Overlap from old_chunk
+                 overlap_tokens_list = (
+                     old_chunk.split()[-config.overlap_tokens :] if old_chunk else []
+                 )
+                 overlap_str = (
+                     " ".join(overlap_tokens_list) if overlap_tokens_list else ""
+                 )
+                 if overlap_str:
+                     current_chunk = overlap_str + " " + seg
+                 else:
+                     current_chunk = seg
+                 current_count = count_words(current_chunk)
+         else:
+             # Just accumulate
+             if current_chunk:
+                 if seg.startswith("\n\n"):
+                     current_chunk += seg
+                 else:
+                     current_chunk += " " + seg
+             else:
+                 current_chunk = seg
+             current_count = count_words(current_chunk)
+
+     # Flush leftover
+     flush_chunk()
+
+     # Return non-empty
+     return [c for c in chunks if c.strip()]
+
+
+ # Function to process a Node and produce enriched chunks.
+ def chunk_node(node: Node, config: MarkdownChunkConfig) -> List[Chunk]:
+     chunks: List[Chunk] = []
+
+     # Check if this is a header-only node.
+     is_header_only = node.path and node.content.strip() == node.path[-1]
+
+     # Only generate a chunk for the node if it has non-header content,
+     # or if it's header-only AND has no children (i.e., it's a leaf header).
+     if node.content.strip() and (not is_header_only or not node.children):
+         header_prefix = (
+             config.header_context_sep.join(node.path) + "\n\n" if node.path else ""
+         )
+         content_chunks = recursive_chunk(node.content, config)
+         for chunk_text in content_chunks:
+             full_text = header_prefix + chunk_text
+             chunks.append(
+                 Chunk(
+                     text=full_text, path=node.path, token_count=count_words(full_text)
+                 )
+             )
+
+     # Process children nodes recursively.
+     for child in node.children:
+         child_chunks = chunk_node(child, config)
+         chunks.extend(child_chunks)
+
+     return chunks
+
+
+ # Function to process an entire tree of Nodes.
+ def chunk_tree(root_nodes: List[Node], config: MarkdownChunkConfig) -> List[Chunk]:
+     all_chunks: List[Chunk] = []
+     for node in root_nodes:
+         all_chunks.extend(chunk_node(node, config))
+     return all_chunks
+
+
+ def aggregate_content(node: Node) -> str:
+     """
+     Recursively aggregate the content from a node and all its descendants,
+     excluding header-only nodes to avoid duplication.
+     """
+     parts = []
+
+     # Skip header-only nodes in content aggregation
+     is_header_only = node.path and node.content.strip() == node.path[-1].strip()
+     if not is_header_only and node.content.strip():
+         parts.append(node.content.strip())
+
+     # Recurse on children
+     for child in node.children:
+         child_text = aggregate_content(child)
+         if child_text.strip():
+             parts.append(child_text.strip())
+
+     return "\n\n".join(parts)
+
+
+ def flatten_tree(node: Node, level: int = 0) -> str:
+     """
+     Flatten a node and its children back into proper markdown text.
+
+     Args:
+         node: The node to flatten
+         level: The current heading level (depth in the tree)
+
+     Returns:
+         str: Properly formatted markdown text
+     """
+     result = ""
+
+     # Check if this is a header node (content matches last item in path)
+     is_header = node.path and node.content.strip().startswith("#")
+
+     # For header nodes, don't duplicate the hash marks
+     if is_header:
+         result = node.content.strip() + "\n\n"
+     elif node.content.strip():
+         result = node.content.strip() + "\n\n"
+
+     # Process all children
+     for child in node.children:
+         result += flatten_tree(child, level + 1)
+
+     return result
+
+
+ def rollup_chunk_node(
+     node: Node, config: MarkdownChunkConfig, prefix: str = ""
+ ) -> List[Chunk]:
+     """
+     Recursively produce rollup chunks from `node`, passing down a `prefix`
+     (e.g., parent heading(s)).
+
+     - If a node is heading-only (content == last path item) and has children,
+       we skip creating a chunk for that node alone and instead add that heading
+       to the `prefix` for child nodes.
+     - If a node is NOT heading-only OR has no children, we try to fit all of its
+       flattened content into a single chunk. If it's too large, we chunk it.
+     - We pass the (possibly updated) prefix down to children, so each child's
+       chunk is enriched exactly once with all ancestor headings.
+     """
+
+     chunks: List[Chunk] = []
+
+     # Check if the node is "heading-only" and has children
+     # e.g. node.content=="# Chapter 1" and node.path[-1]=="# Chapter 1"
+     is_heading_only_with_children = (
+         node.path
+         and node.content.strip() == node.path[-1].strip()
+         and len(node.children) > 0
+     )
+
+     if is_heading_only_with_children:
+         # We do NOT create a chunk for this node alone.
+         # Instead, we add its heading to the prefix for child chunks.
+         new_prefix = prefix + node.content.strip()
+         for i, child in enumerate(node.children):
+             sep = "\n\n" if i == 0 else config.header_context_sep
+             chunks.extend(rollup_chunk_node(child, config, prefix=new_prefix + sep))
+         return chunks
+
+     # If not heading-only-with-children, we handle this node's own content:
+     # Flatten the entire node (including sub-children) in standard Markdown form.
+     flattened = flatten_tree(node, level=len(node.path))
+     flattened_with_prefix = prefix + flattened
+     total_tokens = count_words(flattened_with_prefix)
+
+     # Check if we can roll up everything (node + children) in a single chunk
+     if total_tokens <= config.chunk_size * (1 + config.variation_percent):
+         # One single chunk for the entire subtree
+         chunks.append(
+             Chunk(text=flattened_with_prefix, path=node.path, token_count=total_tokens)
+         )
+     else:
+         # It's too large overall. We'll chunk the node's own content first (if any),
+         # then recurse on children.
+         node_content = node.content.strip()
+
+         # If we have actual content that is not just a heading, chunk it with the prefix
+         # (like "preamble" text).
+         # Note: if this node is heading-only but has NO children,
+         # it will still land here
+         # (because is_heading_only_with_children was False due to zero children).
+         if node_content and (not node.path or node_content != node.path[-1].strip()):
+             # The node is actual content (not purely heading).
+             # We'll chunk it in paragraphs/sentences with the prefix.
+             content_chunks = recursive_chunk(node_content, config)
+             for text_block in content_chunks:
+                 block_with_prefix = prefix + text_block
+                 chunks.append(
+                     Chunk(
+                         text=block_with_prefix,
+                         path=node.path,
+                         token_count=count_words(block_with_prefix),
+                     )
+                 )
+
+         # Now recurse on children, passing the same prefix so they get it too
+         for child in node.children:
+             chunks.extend(rollup_chunk_node(child, config, prefix=prefix))
+
+     return chunks
+
+
+ def rollup_chunk_tree(
+     root_nodes: List[Node],
+     config: MarkdownChunkConfig,
+ ) -> List[Chunk]:
+     # Create a dummy root node that contains everything.
+     dummy_root = Node(content="", path=[], children=root_nodes)
+
+     # Now process just the dummy root node with an empty prefix.
+     chunks = rollup_chunk_node(dummy_root, config, prefix="")
+     return chunks
+
+
+ def chunk_markdown(markdown_text: str, config: MarkdownChunkConfig) -> List[str]:
+     tree = parse_markdown_headings(markdown_text)
+     if len(tree) == 1 and len(tree[0].children) == 0:
+         # Pure text, no hierarchy, so just use recursive_chunk
+         text_chunks = recursive_chunk(markdown_text, config)
+         return [_cleanup_text(chunk) for chunk in text_chunks]
+     if config.rollup:
+         chunks = rollup_chunk_tree(tree, config)
+     else:
+         chunks = chunk_tree(tree, config)
+     return [_cleanup_text(chunk.text) for chunk in chunks]
+
+
+ if __name__ == "__main__":
+     # Example usage:
+     markdown_text = """# Title
+ Intro para. Hope this is not
+ getting split.
+ ## SubTitle
+ - Item1
+ - Item2
+     """
+     # Set up chunking config with very large chunk size.
+     # (you can adjust chunk_size, overlap_tokens, variation_percent)
+     config = MarkdownChunkConfig(
+         chunk_size=200, overlap_tokens=5, variation_percent=0.2
+     )
+     chunks = chunk_markdown(markdown_text, config)
+
+     for idx, chunk in enumerate(chunks, 1):
+         print(f"--- Chunk {idx} --- ")
+         print(chunk)
+         print()
+
+     config.rollup = True
+     # with rollup_chunk_tree we get entire doc as 1 chunk
+     chunks = chunk_markdown(markdown_text, config)
+     assert len(chunks) == 1
+     for idx, chunk in enumerate(chunks, 1):
+         print(f"--- Chunk {idx} ---")
+         print(chunk)
+         print()
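To make the intended use of the new module concrete, here is a brief illustrative sketch (not part of the diff) that calls `chunk_markdown` with `rollup=False`, so each section becomes its own header-prefixed chunk; the import path matches the one added to `parser.py` below, and the signatures match the code above.

```python
# Illustrative sketch only; signatures and import path are taken from this diff.
from langroid.parsing.md_parser import MarkdownChunkConfig, chunk_markdown

doc = "# Guide\nShort intro.\n\n## Install\nRun pip install langroid first.\n"
cfg = MarkdownChunkConfig(chunk_size=50, overlap_tokens=5, rollup=False)
for chunk in chunk_markdown(doc, cfg):
    # each chunk is prefixed with its header path,
    # e.g. "# Guide\n...\n## Install\n\n..."
    print(repr(chunk))
```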
langroid/parsing/parser.py CHANGED
@@ -6,6 +6,11 @@ from typing import Any, Dict, List, Literal, Optional
  import tiktoken

  from langroid.mytypes import Document
+ from langroid.parsing.md_parser import (
+     MarkdownChunkConfig,
+     chunk_markdown,
+     count_words,
+ )
  from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
  from langroid.pydantic_v1 import BaseSettings, root_validator
  from langroid.utils.object_registry import ObjectRegistry
@@ -18,6 +23,8 @@ class Splitter(str, Enum):
      TOKENS = "tokens"
      PARA_SENTENCE = "para_sentence"
      SIMPLE = "simple"
+     # "structure-aware" splitting with chunks enriched by header info
+     MARKDOWN = "markdown"


  class BaseParsingConfig(BaseSettings):
@@ -98,9 +105,10 @@ class MarkitdownXLSParsingConfig(BaseSettings):


  class ParsingConfig(BaseSettings):
-     splitter: str = Splitter.TOKENS
+     splitter: str = Splitter.MARKDOWN
      chunk_by_page: bool = False # split by page?
      chunk_size: int = 200 # aim for this many tokens per chunk
+     chunk_size_variation: float = 0.30 # max variation from chunk_size
      overlap: int = 50 # overlap between chunks
      max_chunks: int = 10_000
      # offset to subtract from page numbers:
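An illustrative configuration sketch (not from the diff) showing the new default splitter together with the new `chunk_size_variation` field; the field names and enum value come from the hunks above.

```python
# Illustrative only; names are taken from the hunks above.
from langroid.parsing.parser import ParsingConfig, Splitter

parsing = ParsingConfig(
    splitter=Splitter.MARKDOWN,    # now the default
    chunk_size=200,                # target tokens per chunk
    chunk_size_variation=0.30,     # allow ~30% deviation from the target
    overlap=50,
)
```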
@@ -130,6 +138,8 @@ class Parser:
      self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")

  def num_tokens(self, text: str) -> int:
+     if self.config.splitter == Splitter.MARKDOWN:
+         return count_words(text) # simple count based on whitespace-split
      tokens = self.tokenizer.encode(text, allowed_special={"<|endoftext|>"})
      return len(tokens)
@@ -254,7 +264,20 @@ class Parser:
  def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
      final_docs = []
      for d in docs:
-         chunks = self.chunk_tokens(d.content)
+         if self.config.splitter == Splitter.MARKDOWN:
+             chunks = chunk_markdown(
+                 d.content,
+                 MarkdownChunkConfig(
+                     # apply rough adjustment factor to convert from tokens to words,
+                     # which is what the markdown chunker uses
+                     chunk_size=int(self.config.chunk_size * 0.75),
+                     overlap_tokens=int(self.config.overlap * 0.75),
+                     variation_percent=self.config.chunk_size_variation,
+                     rollup=True,
+                 ),
+             )
+         else:
+             chunks = self.chunk_tokens(d.content)
          # note we are ensuring we COPY the document metadata into each chunk,
          # which ensures all chunks of a given doc have same metadata
          # (and in particular same metadata.id, which is important later for
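For concreteness, an illustrative calculation (not from the diff) of the 0.75 token-to-word adjustment above, using the chunk_size=1000 and overlap=100 values from the DocChatAgentConfig hunk at the top:

```python
# Illustrative arithmetic only; the 0.75 factor is the one used above.
chunk_size, overlap = 1000, 100           # configured in tokens
md_chunk_size = int(chunk_size * 0.75)    # 750 words per markdown chunk
md_overlap = int(overlap * 0.75)          # 75 words of overlap
```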
@@ -370,13 +393,14 @@ class Parser:
      big_docs = [d for d in docs if not d.metadata.is_chunk]
      if len(big_docs) == 0:
          return chunked_docs
-     if self.config.splitter == Splitter.PARA_SENTENCE:
-         big_doc_chunks = self.split_para_sentence(big_docs)
-     elif self.config.splitter == Splitter.TOKENS:
-         big_doc_chunks = self.split_chunk_tokens(big_docs)
-     elif self.config.splitter == Splitter.SIMPLE:
-         big_doc_chunks = self.split_simple(big_docs)
-     else:
-         raise ValueError(f"Unknown splitter: {self.config.splitter}")
+     match self.config.splitter:
+         case Splitter.MARKDOWN | Splitter.TOKENS:
+             big_doc_chunks = self.split_chunk_tokens(big_docs)
+         case Splitter.PARA_SENTENCE:
+             big_doc_chunks = self.split_para_sentence(big_docs)
+         case Splitter.SIMPLE:
+             big_doc_chunks = self.split_simple(big_docs)
+         case _:
+             raise ValueError(f"Unknown splitter: {self.config.splitter}")

      return chunked_docs + big_doc_chunks
langroid/parsing/url_loader.py CHANGED
@@ -4,6 +4,7 @@ from abc import ABC, abstractmethod
  from tempfile import NamedTemporaryFile
  from typing import TYPE_CHECKING, Any, Dict, List, Optional

+ import markdownify as md
  from dotenv import load_dotenv

  from langroid.exceptions import LangroidImportError
@@ -31,6 +32,7 @@ class TrafilaturaConfig(BaseCrawlerConfig):
      """Configuration for Trafilatura crawler."""

      threads: int = 4
+     format: str = "markdown" # or "xml" or "txt"


  class FirecrawlConfig(BaseCrawlerConfig):
@@ -200,8 +202,16 @@ class TrafilaturaCrawler(BaseCrawler):
              docs.extend(parsed_doc)
          else:
              text = trafilatura.extract(
-                 result, no_fallback=False, favor_recall=True
+                 result,
+                 no_fallback=False,
+                 favor_recall=True,
+                 include_formatting=True,
+                 output_format=self.config.format,
+                 with_metadata=True, # Title, date, author... at start of text
              )
+             if self.config.format in ["xml", "html"]:
+                 # heading_style="ATX" for markdown headings, i.e. #, ##, etc.
+                 text = md.markdownify(text, heading_style="ATX")
              if text is None and result is not None and isinstance(result, str):
                  text = result
              if text:
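A small illustrative sketch (not part of the diff) of the markdownify conversion used above: when Trafilatura is asked for "xml" or "html" output, the extracted text is converted to Markdown with ATX-style (#, ##) headings.

```python
# Illustrative only; mirrors the md.markdownify(...) call in the hunk above.
import markdownify as md

html = "<h2>Installation</h2><p>Use <b>pip</b> to install the package.</p>"
print(md.markdownify(html, heading_style="ATX"))
# prints something like:
# ## Installation
#
# Use **pip** to install the package.
```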
@@ -378,14 +388,21 @@ class ExaCrawler(BaseCrawler):
              docs.extend(parsed_doc_chunks)
              continue
          else:
-             results = exa.get_contents([url], livecrawl="always", text=True)
+             results = exa.get_contents(
+                 [url],
+                 livecrawl="always",
+                 text={
+                     "include_html_tags": True,
+                 },
+             )
          result = results.results[0]
          if result.text:
+             md_text = md.markdownify(result.text, heading_style="ATX")
              # append a NON-chunked document
              # (metadata.is_chunk = False, so will be chunked downstream)
              docs.append(
                  Document(
-                     content=result.text,
+                     content=md_text,
                      metadata=DocMetaData(
                          source=url,
                          title=getattr(result, "title", "Unknown Title"),
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: langroid
- Version: 0.49.1
+ Version: 0.50.0
  Summary: Harness LLMs with Multi-Agent Programming
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
  License: MIT
@@ -27,6 +27,7 @@ Requires-Dist: halo<1.0.0,>=0.0.31
  Requires-Dist: jinja2<4.0.0,>=3.1.2
  Requires-Dist: json-repair<1.0.0,>=0.29.9
  Requires-Dist: lxml<5.0.0,>=4.9.3
+ Requires-Dist: markdownify>=0.13.1
  Requires-Dist: nest-asyncio<2.0.0,>=1.6.0
  Requires-Dist: nltk<4.0.0,>=3.8.2
  Requires-Dist: onnxruntime<2.0.0,>=1.16.1
@@ -1,6 +1,6 @@
  langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
  langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
- langroid/mytypes.py,sha256=ezj_6FFDkJZiVx1SS9eJvh23dH76Ti7mJbePi8ldkAI,3919
+ langroid/mytypes.py,sha256=HIcYAqGeA9OK0Hlscym2FI5Oax9QFljDZoVgRlomhRk,4014
  langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
  langroid/agent/base.py,sha256=U-UjdpxIFqkzRIB5-LYwHrhMSNI3sDbfnNRqIhrtsyI,79568
@@ -14,7 +14,7 @@ langroid/agent/xml_tool_message.py,sha256=6SshYZJKIfi4mkE-gIoSwjkEYekQ8GwcSiCv7a
  langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  langroid/agent/callbacks/chainlit.py,sha256=UHB6P_J40vsVnssosqkpkOVWRf9NK4TOY0_G2g_Arsg,20900
  langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
- langroid/agent/special/doc_chat_agent.py,sha256=SrotZ0qw51fKDXlDP2lwTho0PPTuqUogFAT4jjq0ne0,65231
+ langroid/agent/special/doc_chat_agent.py,sha256=J_-yOWBci5_ChDXOVUxCag_3gRou5Xm8la3I37ePcwk,65233
  langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
  langroid/agent/special/lance_tools.py,sha256=qS8x4wi8mrqfbYV2ztFzrcxyhHQ0ZWOc-zkYiH7awj0,2105
  langroid/agent/special/relevance_extractor_agent.py,sha256=zIx8GUdVo1aGW6ASla0NPQjYYIpmriK_TYMijqAx3F8,4796
@@ -81,17 +81,18 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
  langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
  langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
  langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
- langroid/parsing/document_parser.py,sha256=72g9EUuLlCAAXGD9-8UPe7_l7JnZ7vgc764g_17EPWA,54454
+ langroid/parsing/document_parser.py,sha256=XihXwhp--Nxhb8xoh6wth_isJCGUROKiVr3rPDOJodU,54359
+ langroid/parsing/md_parser.py,sha256=JUgsUpCaeAuBndmtDaJR9HMZaje1gmtXtaLXJHst3i8,21340
  langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
  langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
- langroid/parsing/parser.py,sha256=bxBXiyRnUBhS5Ng6s4OhAUpxqCSUXwNn4c7DaDSiWnE,14314
+ langroid/parsing/parser.py,sha256=YPE6X6efimz2bYbardrhHHKw7V1LZvq-vF0q5p5XzOk,15387
  langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
  langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
  langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
  langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
  langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
  langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
- langroid/parsing/url_loader.py,sha256=DvgkdCZ3gDlAajH0dIUjea4YyXkziK-g36WnaE1J_WI,14884
+ langroid/parsing/url_loader.py,sha256=NQuCxa-hTOuxLZDq4xKLvPfGVB4IWFzh2ItqWq297DI,15675
  langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
  langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
  langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
@@ -127,7 +128,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
  langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
  langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
  langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
- langroid-0.49.1.dist-info/METADATA,sha256=a2cArSN5YfRq4GRH37MkO6h-fvXbXEFkoo-qDMyVTzA,63606
- langroid-0.49.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- langroid-0.49.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
- langroid-0.49.1.dist-info/RECORD,,
+ langroid-0.50.0.dist-info/METADATA,sha256=JlWk_AbUqBitgpOF_957BtX6ZhT4FImk313aidCnf1Y,63641
+ langroid-0.50.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ langroid-0.50.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+ langroid-0.50.0.dist-info/RECORD,,