dataknobs-xization 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,302 @@
+ """Streaming processor for incremental markdown chunking.
+
+ This module provides functionality to process large markdown documents
+ incrementally, managing memory constraints while generating chunks.
+ """
+
+ from __future__ import annotations
+
+ from typing import Iterator, TextIO
+
+ from dataknobs_structures.tree import Tree
+
+ from dataknobs_xization.markdown.md_chunker import Chunk, ChunkFormat, HeadingInclusion, MarkdownChunker
+ from dataknobs_xization.markdown.md_parser import MarkdownNode, MarkdownParser
+
+
+ class StreamingMarkdownProcessor:
+     """Streaming processor for incremental markdown chunking.
+
+     Processes markdown documents line-by-line, building tree structure
+     incrementally and yielding chunks as they become available. Manages
+     memory by pruning processed sections of the tree.
+     """
+
+     def __init__(
+         self,
+         max_chunk_size: int = 1000,
+         chunk_overlap: int = 100,
+         max_line_length: int | None = None,
+         heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+         chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+         max_tree_depth: int = 100,
+         memory_limit_nodes: int | None = None,
+     ):
+         """Initialize the streaming processor.
+
+         Args:
+             max_chunk_size: Maximum size of chunk text in characters
+             chunk_overlap: Number of characters to overlap between chunks
+             max_line_length: Maximum length for individual lines
+             heading_inclusion: How to include headings in chunks
+             chunk_format: Output format for chunks
+             max_tree_depth: Maximum depth of tree to maintain
+             memory_limit_nodes: Maximum number of nodes to keep in memory
+                 (None for unlimited)
+         """
+         self.parser = MarkdownParser(
+             max_line_length=max_line_length,
+             preserve_empty_lines=False,
+         )
+         self.chunker = MarkdownChunker(
+             max_chunk_size=max_chunk_size,
+             chunk_overlap=chunk_overlap,
+             heading_inclusion=heading_inclusion,
+             chunk_format=chunk_format,
+             combine_under_heading=True,
+         )
+         self.max_tree_depth = max_tree_depth
+         self.memory_limit_nodes = memory_limit_nodes
+
+     def process_stream(
+         self,
+         source: str | TextIO | Iterator[str],
+     ) -> Iterator[Chunk]:
+         """Process markdown from a stream, yielding chunks incrementally.
+
+         Args:
+             source: Markdown content as string, file object, or line iterator
+
+         Yields:
+             Chunk objects as they become available
+         """
+         # For simplicity in v1, we'll use a batch processing approach
+         # that processes complete sections under headings
+         #
+         # Future enhancement: true streaming with incremental tree building
+
+         tree = self.parser.parse(source)
+
+         # Generate chunks
+         yield from self.chunker.chunk(tree)
+
+     def process_file(self, file_path: str) -> Iterator[Chunk]:
+         """Process a markdown file, yielding chunks incrementally.
+
+         Args:
+             file_path: Path to markdown file
+
+         Yields:
+             Chunk objects
+         """
+         with open(file_path, encoding='utf-8') as f:
+             yield from self.process_stream(f)
+
+     def process_string(self, content: str) -> Iterator[Chunk]:
+         """Process markdown from a string, yielding chunks.
+
+         Args:
+             content: Markdown content string
+
+         Yields:
+             Chunk objects
+         """
+         yield from self.process_stream(content)
+
+
+ class AdaptiveStreamingProcessor(StreamingMarkdownProcessor):
+     """Streaming processor that adapts to memory constraints.
+
+     This processor monitors tree size and adaptively chunks sections
+     when memory limits are approached, preventing memory overflow on
+     large documents.
+     """
+
+     def __init__(
+         self,
+         max_chunk_size: int = 1000,
+         chunk_overlap: int = 100,
+         max_line_length: int | None = None,
+         heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+         chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+         max_tree_depth: int = 100,
+         memory_limit_nodes: int = 10000,
+         adaptive_threshold: float = 0.8,
+     ):
+         """Initialize the adaptive streaming processor.
+
+         Args:
+             max_chunk_size: Maximum size of chunk text in characters
+             chunk_overlap: Number of characters to overlap between chunks
+             max_line_length: Maximum length for individual lines
+             heading_inclusion: How to include headings in chunks
+             chunk_format: Output format for chunks
+             max_tree_depth: Maximum depth of tree to maintain
+             memory_limit_nodes: Maximum number of nodes to keep in memory
+             adaptive_threshold: Fraction of memory_limit at which to trigger
+                 adaptive chunking (0.0-1.0)
+         """
+         super().__init__(
+             max_chunk_size=max_chunk_size,
+             chunk_overlap=chunk_overlap,
+             max_line_length=max_line_length,
+             heading_inclusion=heading_inclusion,
+             chunk_format=chunk_format,
+             max_tree_depth=max_tree_depth,
+             memory_limit_nodes=memory_limit_nodes,
+         )
+         self.adaptive_threshold = adaptive_threshold
+
+     def process_stream(self, source: str | TextIO | Iterator[str]) -> Iterator[Chunk]:
+         """Process stream with adaptive memory management.
+
+         Args:
+             source: Markdown content source
+
+         Yields:
+             Chunk objects
+         """
+         # Build tree incrementally with memory monitoring
+         root = Tree(MarkdownNode(text="ROOT", level=0, node_type="root", line_number=0))
+         current_parent = root
+         line_number = 0
+
+         lines = self.parser._get_line_iterator(source)
+
+         pending_nodes = []  # Nodes waiting to be chunked
+
+         for line in lines:
+             line_number += 1
+
+             if not line.strip():
+                 continue
+
+             # Check if line is a heading
+             heading_match = self.parser.HEADING_PATTERN.match(line)
+
+             if heading_match:
+                 # Before adding new heading, check if we should chunk pending nodes
+                 if self.memory_limit_nodes:
+                     node_count = len(root.find_nodes(lambda _: True))
+                     if node_count >= self.memory_limit_nodes * self.adaptive_threshold:
+                         # Chunk and yield accumulated body text
+                         if pending_nodes:
+                             yield from self._chunk_nodes(pending_nodes)
+                             pending_nodes = []
+                         # Prune processed subtrees to free memory
+                         self._prune_processed_nodes(root)
+
+                 # Process heading
+                 level = len(heading_match.group(1))
+                 text = heading_match.group(2).strip()
+
+                 node_data = MarkdownNode(
+                     text=text,
+                     level=level,
+                     node_type="heading",
+                     line_number=line_number,
+                 )
+
+                 current_parent, _ = self.parser._find_heading_parent(
+                     root, current_parent, level
+                 )
+
+                 heading_node = current_parent.add_child(node_data)
+                 current_parent = heading_node
+
+             else:
+                 # Body text
+                 node_data = MarkdownNode(
+                     text=line.rstrip('\n'),
+                     level=0,
+                     node_type="body",
+                     line_number=line_number,
+                 )
+                 body_node = current_parent.add_child(node_data)
+                 pending_nodes.append(body_node)
+
+         # Process any remaining pending nodes
+         if pending_nodes:
+             yield from self._chunk_nodes(pending_nodes)
+
+     def _chunk_nodes(self, nodes: list[Tree]) -> Iterator[Chunk]:
+         """Chunk a list of body text nodes.
+
+         Args:
+             nodes: List of body text tree nodes
+
+         Yields:
+             Chunk objects
+         """
+         yield from self.chunker._chunk_by_heading(nodes)
+
+     def _prune_processed_nodes(self, root: Tree) -> None:
+         """Prune processed leaf nodes to free memory.
+
+         Args:
+             root: Root of tree to prune
+         """
+         # Find terminal nodes that have been processed
+         # For now, we'll keep the tree structure but could optimize further
+         # by removing fully processed subtrees
+         pass
+
+
+ def stream_markdown_file(
+     file_path: str,
+     max_chunk_size: int = 1000,
+     chunk_overlap: int = 100,
+     heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+     chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+ ) -> Iterator[Chunk]:
+     """Stream chunks from a markdown file.
+
+     Convenience function for processing a file with default settings.
+
+     Args:
+         file_path: Path to markdown file
+         max_chunk_size: Maximum size of chunk text in characters
+         chunk_overlap: Number of characters to overlap between chunks
+         heading_inclusion: How to include headings in chunks
+         chunk_format: Output format for chunks
+
+     Yields:
+         Chunk objects
+     """
+     processor = StreamingMarkdownProcessor(
+         max_chunk_size=max_chunk_size,
+         chunk_overlap=chunk_overlap,
+         heading_inclusion=heading_inclusion,
+         chunk_format=chunk_format,
+     )
+     yield from processor.process_file(file_path)
+
+
+ def stream_markdown_string(
+     content: str,
+     max_chunk_size: int = 1000,
+     chunk_overlap: int = 100,
+     heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+     chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+ ) -> Iterator[Chunk]:
+     """Stream chunks from a markdown string.
+
+     Convenience function for processing a string with default settings.
+
+     Args:
+         content: Markdown content string
+         max_chunk_size: Maximum size of chunk text in characters
+         chunk_overlap: Number of characters to overlap between chunks
+         heading_inclusion: How to include headings in chunks
+         chunk_format: Output format for chunks
+
+     Yields:
+         Chunk objects
+     """
+     processor = StreamingMarkdownProcessor(
+         max_chunk_size=max_chunk_size,
+         chunk_overlap=chunk_overlap,
+         heading_inclusion=heading_inclusion,
+         chunk_format=chunk_format,
+     )
+     yield from processor.process_string(content)
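
The module above exposes both a class-based processor and module-level convenience functions. Below is a minimal usage sketch based only on the signatures shown in this hunk; the import path follows the `dataknobs_xization/markdown/md_streaming.py` entry in the RECORD further down, and the file name `guide.md` is an illustrative placeholder, not something shipped with the package.

```python
# Sketch: consuming chunks from the streaming API added in 1.1.0.
# "guide.md" is a placeholder path used only for illustration.
from dataknobs_xization.markdown.md_streaming import (
    StreamingMarkdownProcessor,
    stream_markdown_file,
)

# Convenience function with default settings.
for chunk in stream_markdown_file("guide.md", max_chunk_size=500):
    print(chunk.text)

# Or configure a processor once and reuse it for files and strings.
processor = StreamingMarkdownProcessor(max_chunk_size=500, chunk_overlap=50)
for chunk in processor.process_string("# Title\n\nSome body text."):
    print(chunk.text)
```
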
@@ -84,7 +84,7 @@ class CharacterFeatures(ABC):
          :return: A list of token instances
          """
          token = self.build_first_token(normalize_fn)
-         tokens = list()
+         tokens = []
          while token is not None:
              tokens.append(token)
              token = token.next_token
@@ -113,7 +113,7 @@ def get_hyphen_slash_expansions_fn(
          if do_split:
              # add each word separately
              tokens = set(hyphen_slash_re.split(text))
-             if not max(map(lambda t: len(t) < min_split_token_len, tokens)):
+             if not max(len(t) < min_split_token_len for t in tokens):
                  variations.update(tokens)
          return variations

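
The rewrite above swaps a `map`/`lambda` for a generator expression; the behavior should be unchanged, since `max` over booleans is true exactly when any element is true. A small equivalence sketch (the token values and `min_split_token_len` below are made up for illustration):

```python
# Equivalence sketch: max() over booleans behaves like any().
min_split_token_len = 3
tokens = {"pre", "award", "a"}

old_style = max(map(lambda t: len(t) < min_split_token_len, tokens))
new_style = max(len(t) < min_split_token_len for t in tokens)

assert old_style == new_style == any(len(t) < min_split_token_len for t in tokens)
```
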
@@ -348,7 +348,7 @@ def year_variations_fn(
          variations.update(zero_pad_variations(remainder, 2, 3))

          if century > 0:
-             remainder_texts = list()
+             remainder_texts = []
              if remainder > 0:
                  if remainder < 10:
                      if not numeric_only:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataknobs-xization
- Version: 1.0.1
+ Version: 1.1.0
  Summary: Text normalization and tokenization tools
  Author-email: Spence Koehler <KoehlerSB747@gmail.com>
  Requires-Python: >=3.10
@@ -22,6 +22,11 @@ pip install dataknobs-xization

  ## Features

+ - **Markdown Chunking**: Parse and chunk markdown documents for RAG applications
+   - Preserves heading hierarchy and semantic structure
+   - Supports code blocks, tables, lists, and other markdown constructs
+   - Streaming support for large documents
+   - Flexible configuration for chunk size, overlap, and heading inclusion
  - **Text Normalization**: Standardize text for consistent processing
  - **Masking Tokenizer**: Advanced tokenization with masking capabilities
  - **Annotations**: Text annotation system
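
The configuration options named in the new feature list above map onto the parameters of the streaming module shown earlier in this diff. A brief sketch of passing them explicitly, assuming the `HeadingInclusion` and `ChunkFormat` enums are importable from `dataknobs_xization.markdown.md_chunker` as in that hunk (only the `BOTH` and `MARKDOWN` members appear in this diff; the sample markdown string is illustrative):

```python
# Sketch: explicit chunking configuration using the enums from md_chunker.
from dataknobs_xization.markdown.md_chunker import ChunkFormat, HeadingInclusion
from dataknobs_xization.markdown.md_streaming import stream_markdown_string

md = "# Guide\n\n## Setup\n\nInstall with pip.\n"

chunks = stream_markdown_string(
    md,
    max_chunk_size=256,   # characters per chunk
    chunk_overlap=32,     # overlap between consecutive chunks
    heading_inclusion=HeadingInclusion.BOTH,
    chunk_format=ChunkFormat.MARKDOWN,
)
for chunk in chunks:
    print(chunk.text)
```
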
@@ -30,6 +35,32 @@ pip install dataknobs-xization

  ## Usage

+ ### Markdown Chunking
+
+ ```python
+ from dataknobs_xization import parse_markdown, chunk_markdown_tree
+
+ # Parse markdown into tree structure
+ markdown_text = """
+ # User Guide
+ ## Installation
+ Install the package using pip.
+ """
+
+ tree = parse_markdown(markdown_text)
+
+ # Generate chunks for RAG
+ chunks = chunk_markdown_tree(tree, max_chunk_size=500)
+
+ for chunk in chunks:
+     print(f"Headings: {chunk.metadata.get_heading_path()}")
+     print(f"Text: {chunk.text}\n")
+ ```
+
+ For more details, see the [Markdown Chunking documentation](docs/markdown/MARKDOWN_CHUNKING.md).
+
+ ### Text Normalization and Tokenization
+
  ```python
  from dataknobs_xization import normalize, MaskingTokenizer

@@ -0,0 +1,14 @@
+ dataknobs_xization/0.readme.txt,sha256=Q46suHOARkjQLY580eOfSCeUyIgQx-e6DLmtEhcuODE,2878
+ dataknobs_xization/__init__.py,sha256=CNpO8lBEz46jFS50XGjiubmO7srtsvx7W_dHxzYLSVQ,1202
+ dataknobs_xization/annotations.py,sha256=cbdcmnExrRFLGVsC1ULV-_7dUxoseHN6OYs2MMLh_-g,45183
+ dataknobs_xization/authorities.py,sha256=S2pfHejKOT8JUlxWnxBRuOHzZxjDlajHERvYVYOwMRs,30737
+ dataknobs_xization/lexicon.py,sha256=aws0JnDWoKnXmkU09T5S4vq-hDPFBsXERxKAyMuHmw0,23701
+ dataknobs_xization/masking_tokenizer.py,sha256=uJYsi4o4brhFzTi4V06muRFUWAOHkjCiVCONEqVk218,26032
+ dataknobs_xization/normalize.py,sha256=ufnvdceCf3zPQ0njhp-qY1JQTl2IKM6ALQ05b-iAREg,14013
+ dataknobs_xization/markdown/__init__.py,sha256=ubVUGZBZeaOqvIzQkOixW39JniK4y4O2TWgTMTCSzIU,1039
+ dataknobs_xization/markdown/md_chunker.py,sha256=Lf0gqVoBlF7IZ6gorEuhxP1NV_InRv2AGn2a1zsFWXc,14569
+ dataknobs_xization/markdown/md_parser.py,sha256=U1KYZjGD_G6Bwy-Eo073kUJz597Ff5UsWwdw_y1dYc8,20394
+ dataknobs_xization/markdown/md_streaming.py,sha256=4zyyBeVt7G42Mqr4Hprugq0LVaZ-WTps13jOm_i4rNA,10469
+ dataknobs_xization-1.1.0.dist-info/METADATA,sha256=CCXAdqzC5jfSxlNh-aXTcS-Az5qQc2jd00NK_NZ89v0,2319
+ dataknobs_xization-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ dataknobs_xization-1.1.0.dist-info/RECORD,,
@@ -1,10 +0,0 @@
- dataknobs_xization/0.readme.txt,sha256=Q46suHOARkjQLY580eOfSCeUyIgQx-e6DLmtEhcuODE,2878
- dataknobs_xization/__init__.py,sha256=ixsRSYr86q1T4LqQTRzP9Z_ihcOVN6r8SQNurhmHWmY,404
- dataknobs_xization/annotations.py,sha256=qiH_QzzIs5mjvO2Yr4jiLBMIxIiPbzzfd_iublS8HTI,45143
- dataknobs_xization/authorities.py,sha256=69nAlExbh_U7NKav1q3IujXb8lBq14QJhHHy5IZ0PZE,30745
- dataknobs_xization/lexicon.py,sha256=NMo3lAXUVzFVRy246Y90TZtm-27qR5g0z8Ef9u2E2LA,23722
- dataknobs_xization/masking_tokenizer.py,sha256=65RkHdU83l1Tf0f9bXwNrLDuFsN-xegMQNJGON7Z8WY,26036
- dataknobs_xization/normalize.py,sha256=kpT8y1jEmeiKiNC8pruurFjasmREhr4rAQ3W_yB2v4U,14024
- dataknobs_xization-1.0.1.dist-info/METADATA,sha256=RDT8c1JeCzLd7F57WkifjZlwMinbSpHBVJoM8ZU3uQE,1393
- dataknobs_xization-1.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- dataknobs_xization-1.0.1.dist-info/RECORD,,