dataknobs-xization 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataknobs_xization/__init__.py +40 -1
- dataknobs_xization/annotations.py +17 -16
- dataknobs_xization/authorities.py +2 -2
- dataknobs_xization/lexicon.py +7 -7
- dataknobs_xization/markdown/__init__.py +44 -0
- dataknobs_xization/markdown/md_chunker.py +429 -0
- dataknobs_xization/markdown/md_parser.py +605 -0
- dataknobs_xization/markdown/md_streaming.py +302 -0
- dataknobs_xization/masking_tokenizer.py +1 -1
- dataknobs_xization/normalize.py +2 -2
- {dataknobs_xization-1.0.1.dist-info → dataknobs_xization-1.1.0.dist-info}/METADATA +32 -1
- dataknobs_xization-1.1.0.dist-info/RECORD +14 -0
- dataknobs_xization-1.0.1.dist-info/RECORD +0 -10
- {dataknobs_xization-1.0.1.dist-info → dataknobs_xization-1.1.0.dist-info}/WHEEL +0 -0
dataknobs_xization/__init__.py
CHANGED
```diff
@@ -1,11 +1,34 @@
 """Text normalization and tokenization tools."""
 
-from dataknobs_xization import
+from dataknobs_xization import (
+    annotations,
+    authorities,
+    lexicon,
+    markdown,
+    masking_tokenizer,
+    normalize,
+)
+from dataknobs_xization.markdown import (
+    AdaptiveStreamingProcessor,
+    Chunk,
+    ChunkFormat,
+    ChunkMetadata,
+    HeadingInclusion,
+    MarkdownChunker,
+    MarkdownNode,
+    MarkdownParser,
+    StreamingMarkdownProcessor,
+    chunk_markdown_tree,
+    parse_markdown,
+    stream_markdown_file,
+    stream_markdown_string,
+)
 from dataknobs_xization.masking_tokenizer import CharacterFeatures, TextFeatures
 
 __version__ = "1.0.0"
 
 __all__ = [
+    # Existing exports
     "CharacterFeatures",
     "TextFeatures",
     "annotations",
@@ -13,4 +36,20 @@ __all__ = [
     "lexicon",
     "masking_tokenizer",
    "normalize",
+    # Markdown module
+    "markdown",
+    # Markdown chunking classes and functions
+    "AdaptiveStreamingProcessor",
+    "Chunk",
+    "ChunkFormat",
+    "ChunkMetadata",
+    "HeadingInclusion",
+    "MarkdownChunker",
+    "MarkdownNode",
+    "MarkdownParser",
+    "StreamingMarkdownProcessor",
+    "chunk_markdown_tree",
+    "parse_markdown",
+    "stream_markdown_file",
+    "stream_markdown_string",
 ]
```
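The re-exports above make the new chunking API importable straight from the package root. A minimal sketch of the intended flow, assuming `parse_markdown` takes a markdown string and returns the `Tree` that `chunk_markdown_tree` consumes (only the names and the `chunk_markdown_tree` signature are confirmed by this diff):

```python
# Hypothetical usage of the new top-level exports; parse_markdown's
# signature is assumed, not shown in this diff.
from dataknobs_xization import chunk_markdown_tree, parse_markdown

doc = "# Guide\n\n## Setup\n\nInstall the package, then import the utilities.\n"

tree = parse_markdown(doc)  # assumed: markdown text -> Tree
for chunk in chunk_markdown_tree(tree, max_chunk_size=500):
    print(chunk.metadata.get_heading_path(), "->", len(chunk.text))
```

Note also that `__version__` is left at "1.0.0" in this hunk even though the wheel is versioned 1.1.0.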
dataknobs_xization/annotations.py
CHANGED
```diff
@@ -237,7 +237,7 @@ class Annotations:
         if self._df is not None:
             alist = self._df.to_dict(orient="records")
             self._df = None
-        return alist if alist is not None else
+        return alist if alist is not None else []
 
     def _build_df(self) -> pd.DataFrame:
         """Get the annotations as a df."""
@@ -303,7 +303,7 @@ class AnnotationsBuilder:
         :param key_fields: The dictionary of key fields
         :param kwargs: Any extra fields to add
         """
-        result =
+        result = {}
         result.update(key_fields)
         if self.data_defaults is not None:
             # Add data_defaults
@@ -392,7 +392,7 @@ class AnnotationsGroup:
         :param autolock: True to automatically lock this group when (1) at
             least one row has been added and (2) a row is rejected.
         """
-        self.rows =
+        self.rows = []  # List[RowData]
         self.row_accessor = row_accessor
         self.field_col_type = field_col_type
         self.accept_fn = accept_fn
@@ -732,17 +732,17 @@ class AnnotationsGroupList:
     def __init__(
         self,
         groups: List[AnnotationsGroup] = None,
-        accept_fn: Callable[["AnnotationsGroupList", AnnotationsGroup], bool] = lambda
+        accept_fn: Callable[["AnnotationsGroupList", AnnotationsGroup], bool] = lambda lst, g: lst.size
         == 0
-        or not g.is_subset_of_any(
+        or not g.is_subset_of_any(lst.groups),
     ):
         """:param groups: The initial groups for this list
-        :param accept_fn: A fn(
-        into this list,
+        :param accept_fn: A fn(lst, g) that returns True to accept the group, g,
+            into this list, lst, or False to reject the group. If None, then all
             groups are always accepted. The default function will reject any
             group that is a subset of any existing group in the list.
         """
-        self.groups = groups if groups is not None else
+        self.groups = groups if groups is not None else []
         self.accept_fn = accept_fn
         self._coverage = None
 
@@ -838,7 +838,7 @@ class AnnotatedText(dk_doc.Text):
     def bookmarks(self) -> Dict[str, pd.DataFrame]:
         """Get this object's bookmarks"""
         if self._bookmarks is None:
-            self._bookmarks =
+            self._bookmarks = {}
         return self._bookmarks
 
     def get_text(
@@ -1134,13 +1134,14 @@ class EntityAnnotator(BasicAnnotator):
         :param largest_only: True to only mark largest records.
         :return: The annotations added to the text object
         """
-        annot2mask = (
-            None
-            if annot_mask_cols is None
-            else {
-                col: self.mask_char for col in annot_mask_cols
-            }
-        )
+        # TODO: Use annot_mask_cols to mask annotations
+        # annot2mask = (
+        #     None
+        #     if annot_mask_cols is None
+        #     else {
+        #         col: self.mask_char for col in annot_mask_cols
+        #     }
+        # )
 
         annots = self.annotate_text(text_obj.text)
         if annots is None:
```
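The repaired default `accept_fn` reads: accept a group when the list is empty or when the group is not a subset of any existing group. Callers can swap in their own policy with the same `fn(lst, g)` contract; a small sketch under the constructor signature shown in this hunk (the accept-all policy is illustrative, not part of the library):

```python
# Illustrative only: accept every group regardless of subset overlap,
# using the fn(lst, g) contract from the corrected docstring.
group_list = AnnotationsGroupList(accept_fn=lambda lst, g: True)
```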
dataknobs_xization/authorities.py
CHANGED
```diff
@@ -653,7 +653,7 @@ class RegexAuthority(Authority):
         :return: The added Annotations
         """
         for match in re.finditer(self.regex, text_obj.text):
-            ann_dicts =
+            ann_dicts = []
             if match.lastindex is not None:
                 if len(self.regex.groupindex) > 0:  # we have named groups
                     for group_name, group_num in self.regex.groupindex.items():
@@ -735,7 +735,7 @@ class AuthoritiesBundle(Authority):
             anns_validator=anns_validator,
             parent_auth=parent_auth,
         )
-        self.auths = auths.copy() if auths is not None else
+        self.auths = auths.copy() if auths is not None else []
 
     def add(self, auth: Authority):
         """Add the authority to this bundle
```
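`RegexAuthority.annotate_text` walks `self.regex.groupindex` to emit one annotation per named capture group. A self-contained sketch of that standard-library mechanism, independent of the dataknobs classes (the dict keys here are illustrative, not the library's actual annotation schema):

```python
import re

# Two named groups; groupindex maps group name -> group number.
pattern = re.compile(r"(?P<area>\d{3})-(?P<line>\d{4})")

for match in re.finditer(pattern, "call 555-0100 or 555-0199"):
    ann_dicts = []
    if len(pattern.groupindex) > 0:  # we have named groups
        for group_name, group_num in pattern.groupindex.items():
            ann_dicts.append({
                "label": group_name,
                "text": match.group(group_num),
                "start": match.start(group_num),
                "end": match.end(group_num),
            })
    print(ann_dicts)
```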
dataknobs_xization/lexicon.py
CHANGED
```diff
@@ -56,7 +56,7 @@ class LexicalExpander:
         variations = {self.normalize_fn(v) for v in variations}
         # Add a mapping from each variation to its original term
         if variations is not None and len(variations) > 0:
-            more_itertools.consume(
+            more_itertools.consume(self.v2t[v].add(term) for v in variations)
         return variations
 
     def normalize(self, input_term: str) -> str:
@@ -92,7 +92,7 @@ class TokenMatch:
 
         self.varparts = var.split()
         self.matches = True
-        self.tokens =
+        self.tokens = []
         t = token
         for v in self.varparts:
             if t is not None and v == t.norm_text:
@@ -133,7 +133,7 @@ class TokenAligner:
     def __init__(self, first_token: dk_tok.Token, authority: dk_auth.LexicalAuthority):
         self.first_token = first_token
         self.auth = authority
-        self.annotations =
+        self.annotations = []  # List[Dict[str, Any]]
         self._processed_idx = set()
         self._process(self.first_token)
 
@@ -147,7 +147,7 @@ class TokenAligner:
         self._process(token.next_token)
 
     def _get_token_matches(self, token):
-        token_matches =
+        token_matches = []
         vs = self.auth.find_variations(token.norm_text, starts_with=True)
         if len(vs) > 0:
             for val_idx, var in vs.items():
@@ -169,7 +169,7 @@ class DataframeAuthority(dk_auth.LexicalAuthority):
         authdata: dk_auth.AuthorityData,
         auth_anns_builder: dk_auth.AuthorityAnnotationsBuilder = None,
         field_groups: dk_auth.DerivedFieldGroups = None,
-        anns_validator: Callable[[
+        anns_validator: Callable[[dk_auth.Authority, Dict[str, Any]], bool] = None,
         parent_auth: dk_auth.Authority = None,
     ):
         """Initialize with the name, values, and associated ids of the authority;
@@ -351,7 +351,7 @@ class CorrelatedAuthorityData(dk_auth.AuthorityData):
 
     def __init__(self, df: pd.DataFrame, name: str):
         super().__init__(df, name)
-        self._authority_data =
+        self._authority_data = {}
 
     def sub_authority_names(self) -> List[str]:
         """Get the "sub" authority names."""
@@ -406,7 +406,7 @@ class MultiAuthorityData(CorrelatedAuthorityData):
 
     def __init__(self, df: pd.DataFrame, name: str):
         super().__init__(df, name)
-        self._authority_data =
+        self._authority_data = {}
 
     @abstractmethod
     def build_authority_data(self, name: str) -> dk_auth.AuthorityData:
```
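The first lexicon fix restores a common `more_itertools.consume` idiom: exhaust a generator purely for its side effects, here `set.add` calls that index each variation back to its source term. A self-contained sketch with a stand-in `v2t` mapping (the real one lives on `LexicalExpander`):

```python
from collections import defaultdict

import more_itertools

# Stand-in for LexicalExpander.v2t: variation -> set of original terms.
v2t: dict[str, set[str]] = defaultdict(set)
term = "colour"
variations = {"color", "colour"}

# consume() drains the generator; each set.add registers one mapping.
more_itertools.consume(v2t[v].add(term) for v in variations)

assert v2t["color"] == {"colour"}
```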
dataknobs_xization/markdown/__init__.py
ADDED
```diff
@@ -0,0 +1,44 @@
+"""Markdown chunking utilities for RAG applications.
+
+This module provides comprehensive utilities for parsing and chunking markdown
+documents while preserving semantic structure and heading hierarchy.
+"""
+
+from dataknobs_xization.markdown.md_chunker import (
+    Chunk,
+    ChunkFormat,
+    ChunkMetadata,
+    HeadingInclusion,
+    MarkdownChunker,
+    chunk_markdown_tree,
+)
+from dataknobs_xization.markdown.md_parser import (
+    MarkdownNode,
+    MarkdownParser,
+    parse_markdown,
+)
+from dataknobs_xization.markdown.md_streaming import (
+    AdaptiveStreamingProcessor,
+    StreamingMarkdownProcessor,
+    stream_markdown_file,
+    stream_markdown_string,
+)
+
+__all__ = [
+    # Parser
+    "MarkdownNode",
+    "MarkdownParser",
+    "parse_markdown",
+    # Chunker
+    "Chunk",
+    "ChunkFormat",
+    "ChunkMetadata",
+    "HeadingInclusion",
+    "MarkdownChunker",
+    "chunk_markdown_tree",
+    # Streaming
+    "AdaptiveStreamingProcessor",
+    "StreamingMarkdownProcessor",
+    "stream_markdown_file",
+    "stream_markdown_string",
+]
```
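Since the package `__init__.py` (diffed above) re-exports everything listed here, both import paths resolve to the same objects:

```python
import dataknobs_xization as dx
from dataknobs_xization.markdown import MarkdownChunker

# Top-level re-export and subpackage attribute are the same class.
assert dx.MarkdownChunker is MarkdownChunker
```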
dataknobs_xization/markdown/md_chunker.py
ADDED
```diff
@@ -0,0 +1,429 @@
+"""Markdown chunker for generating RAG-optimized chunks from tree structures.
+
+This module provides functionality to traverse markdown tree structures and
+generate chunks suitable for RAG (Retrieval-Augmented Generation) applications.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Iterator
+
+from dataknobs_structures.tree import Tree
+
+from dataknobs_xization.markdown.md_parser import MarkdownNode
+
+
+class ChunkFormat(Enum):
+    """Output format for chunk text."""
+
+    MARKDOWN = "markdown"  # Include headings as markdown
+    PLAIN = "plain"  # Plain text without markdown formatting
+    DICT = "dict"  # Return as dictionary
+
+
+class HeadingInclusion(Enum):
+    """Strategy for including headings in chunks."""
+
+    IN_TEXT = "in_text"  # Include headings in chunk text
+    IN_METADATA = "in_metadata"  # Include headings only in metadata
+    BOTH = "both"  # Include in both text and metadata
+    NONE = "none"  # Don't include headings
+
+
+@dataclass
+class ChunkMetadata:
+    """Metadata for a document chunk.
+
+    Attributes:
+        headings: List of heading texts from root to chunk
+        heading_levels: List of heading levels corresponding to headings
+        line_number: Starting line number in source document
+        chunk_index: Index of this chunk in the sequence
+        chunk_size: Size of chunk text in characters
+        custom: Additional custom metadata
+    """
+
+    headings: list[str] = field(default_factory=list)
+    heading_levels: list[int] = field(default_factory=list)
+    line_number: int = 0
+    chunk_index: int = 0
+    chunk_size: int = 0
+    custom: dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert metadata to dictionary."""
+        return {
+            "headings": self.headings,
+            "heading_levels": self.heading_levels,
+            "line_number": self.line_number,
+            "chunk_index": self.chunk_index,
+            "chunk_size": self.chunk_size,
+            **self.custom,
+        }
+
+    def get_heading_path(self, separator: str = " > ") -> str:
+        """Get heading hierarchy as a single string.
+
+        Args:
+            separator: String to use between headings
+
+        Returns:
+            Formatted heading path
+        """
+        return separator.join(self.headings)
+
+
+@dataclass
+class Chunk:
+    """A chunk of text with associated metadata.
+
+    Attributes:
+        text: The chunk text content
+        metadata: Metadata for this chunk
+    """
+
+    text: str
+    metadata: ChunkMetadata
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert chunk to dictionary representation."""
+        return {
+            "text": self.text,
+            "metadata": self.metadata.to_dict(),
+        }
+
+    def to_markdown(self, include_headings: bool = True) -> str:
+        """Convert chunk to markdown format.
+
+        Args:
+            include_headings: Whether to include heading hierarchy
+
+        Returns:
+            Markdown-formatted string
+        """
+        if not include_headings or not self.metadata.headings:
+            return self.text
+
+        # Build heading hierarchy
+        lines = []
+        for heading, level in zip(
+            self.metadata.headings, self.metadata.heading_levels
+        ):
+            lines.append(f"{'#' * level} {heading}")
+
+        # Add body text
+        if self.text:
+            lines.append("")
+            lines.append(self.text)
+
+        return "\n".join(lines)
+
+
+class MarkdownChunker:
+    """Chunker for generating chunks from markdown tree structures.
+
+    Traverses a Tree built from markdown and generates chunks with
+    configurable size, heading inclusion, and output format.
+    """
+
+    def __init__(
+        self,
+        max_chunk_size: int = 1000,
+        chunk_overlap: int = 100,
+        heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+        chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+        combine_under_heading: bool = True,
+    ):
+        """Initialize the markdown chunker.
+
+        Args:
+            max_chunk_size: Maximum size of chunk text in characters
+            chunk_overlap: Number of characters to overlap between chunks
+            heading_inclusion: How to include headings in chunks
+            chunk_format: Output format for chunks
+            combine_under_heading: Whether to combine body text under same heading
+        """
+        self.max_chunk_size = max_chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.heading_inclusion = heading_inclusion
+        self.chunk_format = chunk_format
+        self.combine_under_heading = combine_under_heading
+        self._chunk_index = 0
+
+    def chunk(self, tree: Tree) -> Iterator[Chunk]:
+        """Generate chunks from a markdown tree.
+
+        Args:
+            tree: Tree structure built from markdown
+
+        Yields:
+            Chunk objects with text and metadata
+        """
+        self._chunk_index = 0
+
+        # Get all terminal (leaf) nodes - not headings or root
+        terminal_nodes = tree.collect_terminal_nodes(
+            accept_node_fn=lambda n: (
+                isinstance(n.data, MarkdownNode)
+                and not n.data.is_heading()
+                and n.data.node_type != "root"
+            )
+        )
+
+        if self.combine_under_heading:
+            # Group terminal nodes by their parent heading
+            yield from self._chunk_by_heading(terminal_nodes)
+        else:
+            # Process each terminal node individually
+            yield from self._chunk_individually(terminal_nodes)
+
+    def _chunk_by_heading(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
+        """Group nodes under same heading and chunk them.
+
+        Args:
+            terminal_nodes: List of terminal tree nodes
+
+        Yields:
+            Chunk objects
+        """
+        # Group nodes by their immediate parent
+        parent_groups: dict[Tree, list[Tree]] = {}
+        for node in terminal_nodes:
+            parent = node.parent
+            if parent not in parent_groups:
+                parent_groups[parent] = []
+            parent_groups[parent].append(node)
+
+        # Process each group
+        for parent, nodes in parent_groups.items():
+            # Get heading path for this group
+            headings, levels = self._get_heading_path(parent)
+
+            # Separate atomic constructs from regular body text
+            atomic_nodes = [n for n in nodes if n.data.is_atomic()]
+            body_nodes = [n for n in nodes if not n.data.is_atomic()]
+
+            # Process body text nodes (can be combined and split)
+            if body_nodes:
+                combined_text = "\n".join(
+                    node.data.text for node in body_nodes if node.data.text.strip()
+                )
+
+                if combined_text.strip():
+                    for chunk_text in self._split_text(combined_text):
+                        yield self._create_chunk(
+                            text=chunk_text,
+                            headings=headings,
+                            heading_levels=levels,
+                            line_number=body_nodes[0].data.line_number if body_nodes else 0,
+                        )
+
+            # Process atomic constructs (keep as complete units)
+            for atomic_node in atomic_nodes:
+                # Don't split atomic constructs, even if they exceed max_chunk_size
+                yield self._create_chunk(
+                    text=atomic_node.data.text,
+                    headings=headings,
+                    heading_levels=levels,
+                    line_number=atomic_node.data.line_number,
+                    metadata=atomic_node.data.metadata,
+                    node_type=atomic_node.data.node_type,
+                )
+
+    def _chunk_individually(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
+        """Process each terminal node individually.
+
+        Args:
+            terminal_nodes: List of terminal tree nodes
+
+        Yields:
+            Chunk objects
+        """
+        for node in terminal_nodes:
+            if not node.data.text.strip():
+                continue
+
+            headings, levels = self._get_heading_path(node.parent)
+
+            # Atomic constructs are kept whole
+            if node.data.is_atomic():
+                yield self._create_chunk(
+                    text=node.data.text,
+                    headings=headings,
+                    heading_levels=levels,
+                    line_number=node.data.line_number,
+                    metadata=node.data.metadata,
+                    node_type=node.data.node_type,
+                )
+            else:
+                # Regular body text can be split
+                for chunk_text in self._split_text(node.data.text):
+                    yield self._create_chunk(
+                        text=chunk_text,
+                        headings=headings,
+                        heading_levels=levels,
+                        line_number=node.data.line_number,
+                    )
+
+    def _get_heading_path(self, node: Tree | None) -> tuple[list[str], list[int]]:
+        """Get the heading path from root to this node.
+
+        Args:
+            node: Tree node to get path for
+
+        Returns:
+            Tuple of (heading_texts, heading_levels)
+        """
+        headings = []
+        levels = []
+
+        current = node
+        while current is not None:
+            if isinstance(current.data, MarkdownNode):
+                if current.data.is_heading():
+                    headings.insert(0, current.data.text)
+                    levels.insert(0, current.data.level)
+            current = current.parent
+
+        return headings, levels
+
+    def _split_text(self, text: str) -> list[str]:
+        """Split text into chunks respecting max_chunk_size.
+
+        Args:
+            text: Text to split
+
+        Returns:
+            List of text chunks
+        """
+        if len(text) <= self.max_chunk_size:
+            return [text]
+
+        chunks = []
+        start = 0
+
+        while start < len(text):
+            end = start + self.max_chunk_size
+
+            # If not at the end, try to break at a good boundary
+            if end < len(text):
+                # Try to break at paragraph boundary (double newline)
+                break_pos = text.rfind("\n\n", start, end)
+                if break_pos > start:
+                    end = break_pos + 2
+                else:
+                    # Try to break at sentence boundary
+                    for punct in [". ", "! ", "? ", ".\n", "!\n", "?\n"]:
+                        break_pos = text.rfind(punct, start, end)
+                        if break_pos > start:
+                            end = break_pos + len(punct)
+                            break
+                    else:
+                        # Try to break at word boundary
+                        break_pos = text.rfind(" ", start, end)
+                        if break_pos > start:
+                            end = break_pos + 1
+
+            chunks.append(text[start:end].strip())
+
+            # Move start position, accounting for overlap
+            start = max(start + 1, end - self.chunk_overlap)
+
+        return [c for c in chunks if c]  # Filter out empty chunks
+
+    def _create_chunk(
+        self,
+        text: str,
+        headings: list[str],
+        heading_levels: list[int],
+        line_number: int,
+        metadata: dict[str, Any] | None = None,
+        node_type: str = "body",
+    ) -> Chunk:
+        """Create a chunk with appropriate format and metadata.
+
+        Args:
+            text: Body text for chunk
+            headings: List of heading texts
+            heading_levels: List of heading levels
+            line_number: Source line number
+            metadata: Optional metadata from the source node
+            node_type: Type of node ('body', 'code', 'list', 'table', etc.)
+
+        Returns:
+            Formatted Chunk object
+        """
+        # Build chunk text based on heading inclusion setting
+        chunk_text = text
+
+        if self.heading_inclusion in (HeadingInclusion.IN_TEXT, HeadingInclusion.BOTH):
+            # Prepend headings to text
+            heading_lines = []
+            for heading, level in zip(headings, heading_levels):
+                if self.chunk_format == ChunkFormat.MARKDOWN:
+                    heading_lines.append(f"{'#' * level} {heading}")
+                else:
+                    heading_lines.append(heading)
+
+            if heading_lines:
+                chunk_text = "\n".join(heading_lines) + "\n\n" + text
+
+        # Create custom metadata dict with node type and additional metadata
+        custom_metadata = {"node_type": node_type}
+        if metadata:
+            custom_metadata.update(metadata)
+
+        # Create chunk metadata
+        chunk_metadata = ChunkMetadata(
+            headings=headings if self.heading_inclusion in (
+                HeadingInclusion.IN_METADATA,
+                HeadingInclusion.BOTH,
+            ) else [],
+            heading_levels=heading_levels if self.heading_inclusion in (
+                HeadingInclusion.IN_METADATA,
+                HeadingInclusion.BOTH,
+            ) else [],
+            line_number=line_number,
+            chunk_index=self._chunk_index,
+            chunk_size=len(chunk_text),
+            custom=custom_metadata,
+        )
+
+        self._chunk_index += 1
+
+        return Chunk(text=chunk_text, metadata=chunk_metadata)
+
+
+def chunk_markdown_tree(
+    tree: Tree,
+    max_chunk_size: int = 1000,
+    chunk_overlap: int = 100,
+    heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+    chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+    combine_under_heading: bool = True,
+) -> list[Chunk]:
+    """Generate chunks from a markdown tree.
+
+    Convenience function for creating and using a MarkdownChunker.
+
+    Args:
+        tree: Tree structure built from markdown
+        max_chunk_size: Maximum size of chunk text in characters
+        chunk_overlap: Number of characters to overlap between chunks
+        heading_inclusion: How to include headings in chunks
+        chunk_format: Output format for chunks
+        combine_under_heading: Whether to combine body text under same heading
+
+    Returns:
+        List of Chunk objects
+    """
+    chunker = MarkdownChunker(
+        max_chunk_size=max_chunk_size,
+        chunk_overlap=chunk_overlap,
+        heading_inclusion=heading_inclusion,
+        chunk_format=chunk_format,
+        combine_under_heading=combine_under_heading,
+    )
+    return list(chunker.chunk(tree))
```
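The chunker's public surface is fully visible above; only `parse_markdown`, from the sibling md_parser module whose diff is not shown here, is assumed to take markdown text and return the `Tree` that `chunk()` consumes. A sketch of driving `MarkdownChunker` directly:

```python
from dataknobs_xization.markdown import (
    ChunkFormat,
    HeadingInclusion,
    MarkdownChunker,
    parse_markdown,
)

tree = parse_markdown("# Title\n\nSome body text.\n")  # assumed signature

chunker = MarkdownChunker(
    max_chunk_size=800,
    chunk_overlap=80,
    heading_inclusion=HeadingInclusion.IN_METADATA,  # headings only in metadata
    chunk_format=ChunkFormat.PLAIN,
)
for chunk in chunker.chunk(tree):
    record = chunk.to_dict()  # {"text": ..., "metadata": {...}}
    print(record["metadata"]["headings"], record["metadata"]["chunk_size"])
```

One design note on `_split_text`: the advance `start = max(start + 1, end - self.chunk_overlap)` guarantees forward progress even when `chunk_overlap` is larger than the slice just emitted, so the loop cannot stall.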