dataknobs-xization 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataknobs_xization/__init__.py +40 -1
- dataknobs_xization/annotations.py +17 -16
- dataknobs_xization/authorities.py +2 -2
- dataknobs_xization/lexicon.py +7 -7
- dataknobs_xization/markdown/__init__.py +44 -0
- dataknobs_xization/markdown/md_chunker.py +429 -0
- dataknobs_xization/markdown/md_parser.py +605 -0
- dataknobs_xization/markdown/md_streaming.py +302 -0
- dataknobs_xization/masking_tokenizer.py +1 -1
- dataknobs_xization/normalize.py +2 -2
- {dataknobs_xization-1.0.1.dist-info → dataknobs_xization-1.1.0.dist-info}/METADATA +32 -1
- dataknobs_xization-1.1.0.dist-info/RECORD +14 -0
- dataknobs_xization-1.0.1.dist-info/RECORD +0 -10
- {dataknobs_xization-1.0.1.dist-info → dataknobs_xization-1.1.0.dist-info}/WHEEL +0 -0
dataknobs_xization/__init__.py
CHANGED
```diff
@@ -1,11 +1,34 @@
 """Text normalization and tokenization tools."""
 
-from dataknobs_xization import
+from dataknobs_xization import (
+    annotations,
+    authorities,
+    lexicon,
+    markdown,
+    masking_tokenizer,
+    normalize,
+)
+from dataknobs_xization.markdown import (
+    AdaptiveStreamingProcessor,
+    Chunk,
+    ChunkFormat,
+    ChunkMetadata,
+    HeadingInclusion,
+    MarkdownChunker,
+    MarkdownNode,
+    MarkdownParser,
+    StreamingMarkdownProcessor,
+    chunk_markdown_tree,
+    parse_markdown,
+    stream_markdown_file,
+    stream_markdown_string,
+)
 from dataknobs_xization.masking_tokenizer import CharacterFeatures, TextFeatures
 
 __version__ = "1.0.0"
 
 __all__ = [
+    # Existing exports
     "CharacterFeatures",
     "TextFeatures",
     "annotations",
@@ -13,4 +36,20 @@ __all__ = [
     "lexicon",
     "masking_tokenizer",
    "normalize",
+    # Markdown module
+    "markdown",
+    # Markdown chunking classes and functions
+    "AdaptiveStreamingProcessor",
+    "Chunk",
+    "ChunkFormat",
+    "ChunkMetadata",
+    "HeadingInclusion",
+    "MarkdownChunker",
+    "MarkdownNode",
+    "MarkdownParser",
+    "StreamingMarkdownProcessor",
+    "chunk_markdown_tree",
+    "parse_markdown",
+    "stream_markdown_file",
+    "stream_markdown_string",
 ]
```
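The re-exports above make the new chunking API importable straight from the package root. A minimal sketch of the intended flow, assuming `parse_markdown` takes a markdown string and returns the `Tree` that `chunk_markdown_tree` consumes (only the names and the `chunk_markdown_tree` signature are confirmed by this diff):

```python
# Hypothetical usage of the new top-level exports; parse_markdown's
# signature is assumed, not shown in this diff.
from dataknobs_xization import chunk_markdown_tree, parse_markdown

doc = "# Guide\n\n## Setup\n\nInstall the package, then import the utilities.\n"

tree = parse_markdown(doc)  # assumed: markdown text -> Tree
for chunk in chunk_markdown_tree(tree, max_chunk_size=500):
    print(chunk.metadata.get_heading_path(), "->", len(chunk.text))
```

Note also that `__version__` is left at "1.0.0" in this hunk even though the wheel is versioned 1.1.0.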
dataknobs_xization/annotations.py
CHANGED
```diff
@@ -237,7 +237,7 @@ class Annotations:
         if self._df is not None:
             alist = self._df.to_dict(orient="records")
             self._df = None
-        return alist if alist is not None else
+        return alist if alist is not None else []
 
     def _build_df(self) -> pd.DataFrame:
         """Get the annotations as a df."""
@@ -303,7 +303,7 @@ class AnnotationsBuilder:
         :param key_fields: The dictionary of key fields
         :param kwargs: Any extra fields to add
         """
-        result =
+        result = {}
         result.update(key_fields)
         if self.data_defaults is not None:
             # Add data_defaults
@@ -392,7 +392,7 @@ class AnnotationsGroup:
         :param autolock: True to automatically lock this group when (1) at
             least one row has been added and (2) a row is rejected.
         """
-        self.rows =
+        self.rows = []  # List[RowData]
         self.row_accessor = row_accessor
         self.field_col_type = field_col_type
         self.accept_fn = accept_fn
@@ -732,17 +732,17 @@ class AnnotationsGroupList:
     def __init__(
         self,
         groups: List[AnnotationsGroup] = None,
-        accept_fn: Callable[["AnnotationsGroupList", AnnotationsGroup], bool] = lambda
+        accept_fn: Callable[["AnnotationsGroupList", AnnotationsGroup], bool] = lambda lst, g: lst.size
         == 0
-        or not g.is_subset_of_any(
+        or not g.is_subset_of_any(lst.groups),
     ):
         """:param groups: The initial groups for this list
-        :param accept_fn: A fn(
-        into this list,
+        :param accept_fn: A fn(lst, g) that returns True to accept the group, g,
+            into this list, lst, or False to reject the group. If None, then all
             groups are always accepted. The default function will reject any
             group that is a subset of any existing group in the list.
         """
-        self.groups = groups if groups is not None else
+        self.groups = groups if groups is not None else []
         self.accept_fn = accept_fn
         self._coverage = None
 
@@ -838,7 +838,7 @@ class AnnotatedText(dk_doc.Text):
     def bookmarks(self) -> Dict[str, pd.DataFrame]:
         """Get this object's bookmarks"""
         if self._bookmarks is None:
-            self._bookmarks =
+            self._bookmarks = {}
         return self._bookmarks
 
     def get_text(
@@ -1134,13 +1134,14 @@ class EntityAnnotator(BasicAnnotator):
         :param largest_only: True to only mark largest records.
         :return: The annotations added to the text object
         """
-        annot2mask = (
-            None
-            if annot_mask_cols is None
-            else {
-                col: self.mask_char for col in annot_mask_cols
-            }
-        )
+        # TODO: Use annot_mask_cols to mask annotations
+        # annot2mask = (
+        #     None
+        #     if annot_mask_cols is None
+        #     else {
+        #         col: self.mask_char for col in annot_mask_cols
+        #     }
+        # )
 
         annots = self.annotate_text(text_obj.text)
         if annots is None:
```
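The repaired default `accept_fn` reads: accept a group when the list is empty or when the group is not a subset of any existing group. Callers can swap in their own policy with the same `fn(lst, g)` contract; a small sketch under the constructor signature shown in this hunk (the accept-all policy is illustrative, not part of the library):

```python
# Illustrative only: accept every group regardless of subset overlap,
# using the fn(lst, g) contract from the corrected docstring.
group_list = AnnotationsGroupList(accept_fn=lambda lst, g: True)
```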
dataknobs_xization/authorities.py
CHANGED
```diff
@@ -653,7 +653,7 @@ class RegexAuthority(Authority):
         :return: The added Annotations
         """
         for match in re.finditer(self.regex, text_obj.text):
-            ann_dicts =
+            ann_dicts = []
             if match.lastindex is not None:
                 if len(self.regex.groupindex) > 0:  # we have named groups
                     for group_name, group_num in self.regex.groupindex.items():
@@ -735,7 +735,7 @@ class AuthoritiesBundle(Authority):
             anns_validator=anns_validator,
             parent_auth=parent_auth,
         )
-        self.auths = auths.copy() if auths is not None else
+        self.auths = auths.copy() if auths is not None else []
 
     def add(self, auth: Authority):
         """Add the authority to this bundle
```
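`RegexAuthority.annotate_text` walks `self.regex.groupindex` to emit one annotation per named capture group. A self-contained sketch of that standard-library mechanism, independent of the dataknobs classes (the dict keys here are illustrative, not the library's actual annotation schema):

```python
import re

# Two named groups; groupindex maps group name -> group number.
pattern = re.compile(r"(?P<area>\d{3})-(?P<line>\d{4})")

for match in re.finditer(pattern, "call 555-0100 or 555-0199"):
    ann_dicts = []
    if len(pattern.groupindex) > 0:  # we have named groups
        for group_name, group_num in pattern.groupindex.items():
            ann_dicts.append({
                "label": group_name,
                "text": match.group(group_num),
                "start": match.start(group_num),
                "end": match.end(group_num),
            })
    print(ann_dicts)
```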
dataknobs_xization/lexicon.py
CHANGED
```diff
@@ -56,7 +56,7 @@ class LexicalExpander:
         variations = {self.normalize_fn(v) for v in variations}
         # Add a mapping from each variation to its original term
         if variations is not None and len(variations) > 0:
-            more_itertools.consume(
+            more_itertools.consume(self.v2t[v].add(term) for v in variations)
         return variations
 
     def normalize(self, input_term: str) -> str:
@@ -92,7 +92,7 @@ class TokenMatch:
 
         self.varparts = var.split()
         self.matches = True
-        self.tokens =
+        self.tokens = []
         t = token
         for v in self.varparts:
             if t is not None and v == t.norm_text:
@@ -133,7 +133,7 @@ class TokenAligner:
     def __init__(self, first_token: dk_tok.Token, authority: dk_auth.LexicalAuthority):
         self.first_token = first_token
         self.auth = authority
-        self.annotations =
+        self.annotations = []  # List[Dict[str, Any]]
         self._processed_idx = set()
         self._process(self.first_token)
 
@@ -147,7 +147,7 @@ class TokenAligner:
         self._process(token.next_token)
 
     def _get_token_matches(self, token):
-        token_matches =
+        token_matches = []
         vs = self.auth.find_variations(token.norm_text, starts_with=True)
         if len(vs) > 0:
             for val_idx, var in vs.items():
@@ -169,7 +169,7 @@ class DataframeAuthority(dk_auth.LexicalAuthority):
         authdata: dk_auth.AuthorityData,
         auth_anns_builder: dk_auth.AuthorityAnnotationsBuilder = None,
         field_groups: dk_auth.DerivedFieldGroups = None,
-        anns_validator: Callable[[
+        anns_validator: Callable[[dk_auth.Authority, Dict[str, Any]], bool] = None,
         parent_auth: dk_auth.Authority = None,
     ):
         """Initialize with the name, values, and associated ids of the authority;
@@ -351,7 +351,7 @@ class CorrelatedAuthorityData(dk_auth.AuthorityData):
 
     def __init__(self, df: pd.DataFrame, name: str):
         super().__init__(df, name)
-        self._authority_data =
+        self._authority_data = {}
 
     def sub_authority_names(self) -> List[str]:
         """Get the "sub" authority names."""
@@ -406,7 +406,7 @@ class MultiAuthorityData(CorrelatedAuthorityData):
 
     def __init__(self, df: pd.DataFrame, name: str):
         super().__init__(df, name)
-        self._authority_data =
+        self._authority_data = {}
 
     @abstractmethod
     def build_authority_data(self, name: str) -> dk_auth.AuthorityData:
```
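The first lexicon fix restores a common `more_itertools.consume` idiom: exhaust a generator purely for its side effects, here `set.add` calls that index each variation back to its source term. A self-contained sketch with a stand-in `v2t` mapping (the real one lives on `LexicalExpander`):

```python
from collections import defaultdict

import more_itertools

# Stand-in for LexicalExpander.v2t: variation -> set of original terms.
v2t: dict[str, set[str]] = defaultdict(set)
term = "colour"
variations = {"color", "colour"}

# consume() drains the generator; each set.add registers one mapping.
more_itertools.consume(v2t[v].add(term) for v in variations)

assert v2t["color"] == {"colour"}
```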
dataknobs_xization/markdown/__init__.py
ADDED
```diff
@@ -0,0 +1,44 @@
+"""Markdown chunking utilities for RAG applications.
+
+This module provides comprehensive utilities for parsing and chunking markdown
+documents while preserving semantic structure and heading hierarchy.
+"""
+
+from dataknobs_xization.markdown.md_chunker import (
+    Chunk,
+    ChunkFormat,
+    ChunkMetadata,
+    HeadingInclusion,
+    MarkdownChunker,
+    chunk_markdown_tree,
+)
+from dataknobs_xization.markdown.md_parser import (
+    MarkdownNode,
+    MarkdownParser,
+    parse_markdown,
+)
+from dataknobs_xization.markdown.md_streaming import (
+    AdaptiveStreamingProcessor,
+    StreamingMarkdownProcessor,
+    stream_markdown_file,
+    stream_markdown_string,
+)
+
+__all__ = [
+    # Parser
+    "MarkdownNode",
+    "MarkdownParser",
+    "parse_markdown",
+    # Chunker
+    "Chunk",
+    "ChunkFormat",
+    "ChunkMetadata",
+    "HeadingInclusion",
+    "MarkdownChunker",
+    "chunk_markdown_tree",
+    # Streaming
+    "AdaptiveStreamingProcessor",
+    "StreamingMarkdownProcessor",
+    "stream_markdown_file",
+    "stream_markdown_string",
+]
```
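Since the package `__init__.py` (diffed above) re-exports everything listed here, both import paths resolve to the same objects:

```python
import dataknobs_xization as dx
from dataknobs_xization.markdown import MarkdownChunker

# Top-level re-export and subpackage attribute are the same class.
assert dx.MarkdownChunker is MarkdownChunker
```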
dataknobs_xization/markdown/md_chunker.py
ADDED
```diff
@@ -0,0 +1,429 @@
+"""Markdown chunker for generating RAG-optimized chunks from tree structures.
+
+This module provides functionality to traverse markdown tree structures and
+generate chunks suitable for RAG (Retrieval-Augmented Generation) applications.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Iterator
+
+from dataknobs_structures.tree import Tree
+
+from dataknobs_xization.markdown.md_parser import MarkdownNode
+
+
+class ChunkFormat(Enum):
+    """Output format for chunk text."""
+
+    MARKDOWN = "markdown"  # Include headings as markdown
+    PLAIN = "plain"  # Plain text without markdown formatting
+    DICT = "dict"  # Return as dictionary
+
+
+class HeadingInclusion(Enum):
+    """Strategy for including headings in chunks."""
+
+    IN_TEXT = "in_text"  # Include headings in chunk text
+    IN_METADATA = "in_metadata"  # Include headings only in metadata
+    BOTH = "both"  # Include in both text and metadata
+    NONE = "none"  # Don't include headings
+
+
+@dataclass
+class ChunkMetadata:
+    """Metadata for a document chunk.
+
+    Attributes:
+        headings: List of heading texts from root to chunk
+        heading_levels: List of heading levels corresponding to headings
+        line_number: Starting line number in source document
+        chunk_index: Index of this chunk in the sequence
+        chunk_size: Size of chunk text in characters
+        custom: Additional custom metadata
+    """
+
+    headings: list[str] = field(default_factory=list)
+    heading_levels: list[int] = field(default_factory=list)
+    line_number: int = 0
+    chunk_index: int = 0
+    chunk_size: int = 0
+    custom: dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert metadata to dictionary."""
+        return {
+            "headings": self.headings,
+            "heading_levels": self.heading_levels,
+            "line_number": self.line_number,
+            "chunk_index": self.chunk_index,
+            "chunk_size": self.chunk_size,
+            **self.custom,
+        }
+
+    def get_heading_path(self, separator: str = " > ") -> str:
+        """Get heading hierarchy as a single string.
+
+        Args:
+            separator: String to use between headings
+
+        Returns:
+            Formatted heading path
+        """
+        return separator.join(self.headings)
+
+
+@dataclass
+class Chunk:
+    """A chunk of text with associated metadata.
+
+    Attributes:
+        text: The chunk text content
+        metadata: Metadata for this chunk
+    """
+
+    text: str
+    metadata: ChunkMetadata
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert chunk to dictionary representation."""
+        return {
+            "text": self.text,
+            "metadata": self.metadata.to_dict(),
+        }
+
+    def to_markdown(self, include_headings: bool = True) -> str:
+        """Convert chunk to markdown format.
+
+        Args:
+            include_headings: Whether to include heading hierarchy
+
+        Returns:
+            Markdown-formatted string
+        """
+        if not include_headings or not self.metadata.headings:
+            return self.text
+
+        # Build heading hierarchy
+        lines = []
+        for heading, level in zip(
+            self.metadata.headings, self.metadata.heading_levels
+        ):
+            lines.append(f"{'#' * level} {heading}")
+
+        # Add body text
+        if self.text:
+            lines.append("")
+            lines.append(self.text)
+
+        return "\n".join(lines)
+
+
+class MarkdownChunker:
+    """Chunker for generating chunks from markdown tree structures.
+
+    Traverses a Tree built from markdown and generates chunks with
+    configurable size, heading inclusion, and output format.
+    """
+
+    def __init__(
+        self,
+        max_chunk_size: int = 1000,
+        chunk_overlap: int = 100,
+        heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+        chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+        combine_under_heading: bool = True,
+    ):
+        """Initialize the markdown chunker.
+
+        Args:
+            max_chunk_size: Maximum size of chunk text in characters
+            chunk_overlap: Number of characters to overlap between chunks
+            heading_inclusion: How to include headings in chunks
+            chunk_format: Output format for chunks
+            combine_under_heading: Whether to combine body text under same heading
+        """
+        self.max_chunk_size = max_chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.heading_inclusion = heading_inclusion
+        self.chunk_format = chunk_format
+        self.combine_under_heading = combine_under_heading
+        self._chunk_index = 0
+
+    def chunk(self, tree: Tree) -> Iterator[Chunk]:
+        """Generate chunks from a markdown tree.
+
+        Args:
+            tree: Tree structure built from markdown
+
+        Yields:
+            Chunk objects with text and metadata
+        """
+        self._chunk_index = 0
+
+        # Get all terminal (leaf) nodes - not headings or root
+        terminal_nodes = tree.collect_terminal_nodes(
+            accept_node_fn=lambda n: (
+                isinstance(n.data, MarkdownNode)
+                and not n.data.is_heading()
+                and n.data.node_type != "root"
+            )
+        )
+
+        if self.combine_under_heading:
+            # Group terminal nodes by their parent heading
+            yield from self._chunk_by_heading(terminal_nodes)
+        else:
+            # Process each terminal node individually
+            yield from self._chunk_individually(terminal_nodes)
+
+    def _chunk_by_heading(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
+        """Group nodes under same heading and chunk them.
+
+        Args:
+            terminal_nodes: List of terminal tree nodes
+
+        Yields:
+            Chunk objects
+        """
+        # Group nodes by their immediate parent
+        parent_groups: dict[Tree, list[Tree]] = {}
+        for node in terminal_nodes:
+            parent = node.parent
+            if parent not in parent_groups:
+                parent_groups[parent] = []
+            parent_groups[parent].append(node)
+
+        # Process each group
+        for parent, nodes in parent_groups.items():
+            # Get heading path for this group
+            headings, levels = self._get_heading_path(parent)
+
+            # Separate atomic constructs from regular body text
+            atomic_nodes = [n for n in nodes if n.data.is_atomic()]
+            body_nodes = [n for n in nodes if not n.data.is_atomic()]
+
+            # Process body text nodes (can be combined and split)
+            if body_nodes:
+                combined_text = "\n".join(
+                    node.data.text for node in body_nodes if node.data.text.strip()
+                )
+
+                if combined_text.strip():
+                    for chunk_text in self._split_text(combined_text):
+                        yield self._create_chunk(
+                            text=chunk_text,
+                            headings=headings,
+                            heading_levels=levels,
+                            line_number=body_nodes[0].data.line_number if body_nodes else 0,
+                        )
+
+            # Process atomic constructs (keep as complete units)
+            for atomic_node in atomic_nodes:
+                # Don't split atomic constructs, even if they exceed max_chunk_size
+                yield self._create_chunk(
+                    text=atomic_node.data.text,
+                    headings=headings,
+                    heading_levels=levels,
+                    line_number=atomic_node.data.line_number,
+                    metadata=atomic_node.data.metadata,
+                    node_type=atomic_node.data.node_type,
+                )
+
+    def _chunk_individually(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
+        """Process each terminal node individually.
+
+        Args:
+            terminal_nodes: List of terminal tree nodes
+
+        Yields:
+            Chunk objects
+        """
+        for node in terminal_nodes:
+            if not node.data.text.strip():
+                continue
+
+            headings, levels = self._get_heading_path(node.parent)
+
+            # Atomic constructs are kept whole
+            if node.data.is_atomic():
+                yield self._create_chunk(
+                    text=node.data.text,
+                    headings=headings,
+                    heading_levels=levels,
+                    line_number=node.data.line_number,
+                    metadata=node.data.metadata,
+                    node_type=node.data.node_type,
+                )
+            else:
+                # Regular body text can be split
+                for chunk_text in self._split_text(node.data.text):
+                    yield self._create_chunk(
+                        text=chunk_text,
+                        headings=headings,
+                        heading_levels=levels,
+                        line_number=node.data.line_number,
+                    )
+
+    def _get_heading_path(self, node: Tree | None) -> tuple[list[str], list[int]]:
+        """Get the heading path from root to this node.
+
+        Args:
+            node: Tree node to get path for
+
+        Returns:
+            Tuple of (heading_texts, heading_levels)
+        """
+        headings = []
+        levels = []
+
+        current = node
+        while current is not None:
+            if isinstance(current.data, MarkdownNode):
+                if current.data.is_heading():
+                    headings.insert(0, current.data.text)
+                    levels.insert(0, current.data.level)
+            current = current.parent
+
+        return headings, levels
+
+    def _split_text(self, text: str) -> list[str]:
+        """Split text into chunks respecting max_chunk_size.
+
+        Args:
+            text: Text to split
+
+        Returns:
+            List of text chunks
+        """
+        if len(text) <= self.max_chunk_size:
+            return [text]
+
+        chunks = []
+        start = 0
+
+        while start < len(text):
+            end = start + self.max_chunk_size
+
+            # If not at the end, try to break at a good boundary
+            if end < len(text):
+                # Try to break at paragraph boundary (double newline)
+                break_pos = text.rfind("\n\n", start, end)
+                if break_pos > start:
+                    end = break_pos + 2
+                else:
+                    # Try to break at sentence boundary
+                    for punct in [". ", "! ", "? ", ".\n", "!\n", "?\n"]:
+                        break_pos = text.rfind(punct, start, end)
+                        if break_pos > start:
+                            end = break_pos + len(punct)
+                            break
+                    else:
+                        # Try to break at word boundary
+                        break_pos = text.rfind(" ", start, end)
+                        if break_pos > start:
+                            end = break_pos + 1
+
+            chunks.append(text[start:end].strip())
+
+            # Move start position, accounting for overlap
+            start = max(start + 1, end - self.chunk_overlap)
+
+        return [c for c in chunks if c]  # Filter out empty chunks
+
+    def _create_chunk(
+        self,
+        text: str,
+        headings: list[str],
+        heading_levels: list[int],
+        line_number: int,
+        metadata: dict[str, Any] | None = None,
+        node_type: str = "body",
+    ) -> Chunk:
+        """Create a chunk with appropriate format and metadata.
+
+        Args:
+            text: Body text for chunk
+            headings: List of heading texts
+            heading_levels: List of heading levels
+            line_number: Source line number
+            metadata: Optional metadata from the source node
+            node_type: Type of node ('body', 'code', 'list', 'table', etc.)
+
+        Returns:
+            Formatted Chunk object
+        """
+        # Build chunk text based on heading inclusion setting
+        chunk_text = text
+
+        if self.heading_inclusion in (HeadingInclusion.IN_TEXT, HeadingInclusion.BOTH):
+            # Prepend headings to text
+            heading_lines = []
+            for heading, level in zip(headings, heading_levels):
+                if self.chunk_format == ChunkFormat.MARKDOWN:
+                    heading_lines.append(f"{'#' * level} {heading}")
+                else:
+                    heading_lines.append(heading)
+
+            if heading_lines:
+                chunk_text = "\n".join(heading_lines) + "\n\n" + text
+
+        # Create custom metadata dict with node type and additional metadata
+        custom_metadata = {"node_type": node_type}
+        if metadata:
+            custom_metadata.update(metadata)
+
+        # Create chunk metadata
+        chunk_metadata = ChunkMetadata(
+            headings=headings if self.heading_inclusion in (
+                HeadingInclusion.IN_METADATA,
+                HeadingInclusion.BOTH,
+            ) else [],
+            heading_levels=heading_levels if self.heading_inclusion in (
+                HeadingInclusion.IN_METADATA,
+                HeadingInclusion.BOTH,
+            ) else [],
+            line_number=line_number,
+            chunk_index=self._chunk_index,
+            chunk_size=len(chunk_text),
+            custom=custom_metadata,
+        )
+
+        self._chunk_index += 1
+
+        return Chunk(text=chunk_text, metadata=chunk_metadata)
+
+
+def chunk_markdown_tree(
+    tree: Tree,
+    max_chunk_size: int = 1000,
+    chunk_overlap: int = 100,
+    heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+    chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+    combine_under_heading: bool = True,
+) -> list[Chunk]:
+    """Generate chunks from a markdown tree.
+
+    Convenience function for creating and using a MarkdownChunker.
+
+    Args:
+        tree: Tree structure built from markdown
+        max_chunk_size: Maximum size of chunk text in characters
+        chunk_overlap: Number of characters to overlap between chunks
+        heading_inclusion: How to include headings in chunks
+        chunk_format: Output format for chunks
+        combine_under_heading: Whether to combine body text under same heading
+
+    Returns:
+        List of Chunk objects
+    """
+    chunker = MarkdownChunker(
+        max_chunk_size=max_chunk_size,
+        chunk_overlap=chunk_overlap,
+        heading_inclusion=heading_inclusion,
+        chunk_format=chunk_format,
+        combine_under_heading=combine_under_heading,
+    )
+    return list(chunker.chunk(tree))
```
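The chunker's public surface is fully visible above; only `parse_markdown`, from the sibling md_parser module whose diff is not shown here, is assumed to take markdown text and return the `Tree` that `chunk()` consumes. A sketch of driving `MarkdownChunker` directly:

```python
from dataknobs_xization.markdown import (
    ChunkFormat,
    HeadingInclusion,
    MarkdownChunker,
    parse_markdown,
)

tree = parse_markdown("# Title\n\nSome body text.\n")  # assumed signature

chunker = MarkdownChunker(
    max_chunk_size=800,
    chunk_overlap=80,
    heading_inclusion=HeadingInclusion.IN_METADATA,  # headings only in metadata
    chunk_format=ChunkFormat.PLAIN,
)
for chunk in chunker.chunk(tree):
    record = chunk.to_dict()  # {"text": ..., "metadata": {...}}
    print(record["metadata"]["headings"], record["metadata"]["chunk_size"])
```

One design note on `_split_text`: the advance `start = max(start + 1, end - self.chunk_overlap)` guarantees forward progress even when `chunk_overlap` is larger than the slice just emitted, so the loop cannot stall.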