dataknobs-xization 1.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataknobs_xization/0.readme.txt +66 -0
- dataknobs_xization/__init__.py +110 -0
- dataknobs_xization/annotations.py +1476 -0
- dataknobs_xization/authorities.py +860 -0
- dataknobs_xization/content_transformer.py +570 -0
- dataknobs_xization/ingestion/__init__.py +27 -0
- dataknobs_xization/ingestion/config.py +352 -0
- dataknobs_xization/ingestion/processor.py +367 -0
- dataknobs_xization/json/__init__.py +17 -0
- dataknobs_xization/json/json_chunker.py +591 -0
- dataknobs_xization/lexicon.py +723 -0
- dataknobs_xization/markdown/__init__.py +72 -0
- dataknobs_xization/markdown/enrichment.py +260 -0
- dataknobs_xization/markdown/filters.py +236 -0
- dataknobs_xization/markdown/md_chunker.py +478 -0
- dataknobs_xization/markdown/md_parser.py +605 -0
- dataknobs_xization/markdown/md_streaming.py +302 -0
- dataknobs_xization/masking_tokenizer.py +768 -0
- dataknobs_xization/normalize.py +520 -0
- dataknobs_xization/py.typed +0 -0
- dataknobs_xization-1.2.3.dist-info/METADATA +170 -0
- dataknobs_xization-1.2.3.dist-info/RECORD +23 -0
- dataknobs_xization-1.2.3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
"""Streaming processor for incremental markdown chunking.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to process large markdown documents
|
|
4
|
+
incrementally, managing memory constraints while generating chunks.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Iterator, TextIO
|
|
10
|
+
|
|
11
|
+
from dataknobs_structures.tree import Tree
|
|
12
|
+
|
|
13
|
+
from dataknobs_xization.markdown.md_chunker import Chunk, ChunkFormat, HeadingInclusion, MarkdownChunker
|
|
14
|
+
from dataknobs_xization.markdown.md_parser import MarkdownNode, MarkdownParser
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class StreamingMarkdownProcessor:
    """Chunk markdown documents in a streaming fashion.

    Consumes a markdown source, builds its heading tree, and emits
    ``Chunk`` objects one at a time so callers never need all chunks
    materialized at once. Memory is managed by pruning sections of the
    tree once they have been processed.
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        chunk_overlap: int = 100,
        max_line_length: int | None = None,
        heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
        chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
        max_tree_depth: int = 100,
        memory_limit_nodes: int | None = None,
    ):
        """Set up the parser and chunker used for streaming.

        Args:
            max_chunk_size: Upper bound on chunk text length in characters.
            chunk_overlap: Characters shared between consecutive chunks.
            max_line_length: Optional cap on the length of a single line.
            heading_inclusion: Strategy for embedding headings in chunks.
            chunk_format: Desired output format of emitted chunks.
            max_tree_depth: Deepest tree level that will be maintained.
            memory_limit_nodes: Cap on in-memory tree nodes, or ``None``
                for no limit.
        """
        self.max_tree_depth = max_tree_depth
        self.memory_limit_nodes = memory_limit_nodes
        self.parser = MarkdownParser(
            max_line_length=max_line_length,
            preserve_empty_lines=False,
        )
        self.chunker = MarkdownChunker(
            max_chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            heading_inclusion=heading_inclusion,
            chunk_format=chunk_format,
            combine_under_heading=True,
        )

    def process_stream(
        self,
        source: str | TextIO | Iterator[str],
    ) -> Iterator[Chunk]:
        """Yield chunks for markdown read from *source*.

        Args:
            source: Markdown as a string, open file object, or line iterator.

        Yields:
            Chunk objects as they become available.
        """
        # v1 strategy: parse the full document into a tree, then chunk the
        # complete sections under each heading. True incremental tree
        # building is a future enhancement.
        document_tree = self.parser.parse(source)
        yield from self.chunker.chunk(document_tree)

    def process_file(self, file_path: str) -> Iterator[Chunk]:
        """Yield chunks for the markdown file at *file_path*.

        Args:
            file_path: Path to a markdown file on disk.

        Yields:
            Chunk objects.
        """
        with open(file_path, encoding='utf-8') as handle:
            yield from self.process_stream(handle)

    def process_string(self, content: str) -> Iterator[Chunk]:
        """Yield chunks for markdown held entirely in a string.

        Args:
            content: Markdown text.

        Yields:
            Chunk objects.
        """
        yield from self.process_stream(content)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class AdaptiveStreamingProcessor(StreamingMarkdownProcessor):
    """Streaming processor that reacts to memory pressure.

    Watches the size of the in-progress tree and, once the configured
    node budget is approached, flushes accumulated body text into chunks
    so very large documents cannot exhaust memory.
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        chunk_overlap: int = 100,
        max_line_length: int | None = None,
        heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
        chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
        max_tree_depth: int = 100,
        memory_limit_nodes: int = 10000,
        adaptive_threshold: float = 0.8,
    ):
        """Configure the adaptive streaming processor.

        Args:
            max_chunk_size: Upper bound on chunk text length in characters.
            chunk_overlap: Characters shared between consecutive chunks.
            max_line_length: Optional cap on the length of a single line.
            heading_inclusion: Strategy for embedding headings in chunks.
            chunk_format: Desired output format of emitted chunks.
            max_tree_depth: Deepest tree level that will be maintained.
            memory_limit_nodes: Cap on in-memory tree nodes.
            adaptive_threshold: Fraction (0.0-1.0) of ``memory_limit_nodes``
                at which adaptive chunking kicks in.
        """
        super().__init__(
            max_chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            max_line_length=max_line_length,
            heading_inclusion=heading_inclusion,
            chunk_format=chunk_format,
            max_tree_depth=max_tree_depth,
            memory_limit_nodes=memory_limit_nodes,
        )
        self.adaptive_threshold = adaptive_threshold

    def process_stream(self, source: str | TextIO | Iterator[str]) -> Iterator[Chunk]:
        """Process *source* while keeping the tree within the node budget.

        Args:
            source: Markdown content source.

        Yields:
            Chunk objects.
        """
        # Build the tree line by line, monitoring its size as we go.
        root = Tree(MarkdownNode(text="ROOT", level=0, node_type="root", line_number=0))
        attach_point = root
        buffered = []  # body nodes awaiting chunking
        line_no = 0

        for raw_line in self.parser._get_line_iterator(source):
            line_no += 1

            if not raw_line.strip():
                continue

            match = self.parser.HEADING_PATTERN.match(raw_line)

            if match is None:
                # Plain body text: hang it under the current heading and
                # remember it for the next flush.
                body_node = attach_point.add_child(
                    MarkdownNode(
                        text=raw_line.rstrip('\n'),
                        level=0,
                        node_type="body",
                        line_number=line_no,
                    )
                )
                buffered.append(body_node)
                continue

            # New heading. Before attaching it, check whether the tree has
            # grown near the memory budget and flush if so.
            if self.memory_limit_nodes:
                node_total = len(root.find_nodes(lambda _: True))
                if node_total >= self.memory_limit_nodes * self.adaptive_threshold:
                    if buffered:
                        yield from self._chunk_nodes(buffered)
                        buffered = []
                    # Release memory held by already-processed subtrees.
                    self._prune_processed_nodes(root)

            depth = len(match.group(1))
            heading_text = match.group(2).strip()

            attach_point, _ = self.parser._find_heading_parent(
                root, attach_point, depth
            )
            attach_point = attach_point.add_child(
                MarkdownNode(
                    text=heading_text,
                    level=depth,
                    node_type="heading",
                    line_number=line_no,
                )
            )

        # Flush whatever body text remains after the last line.
        if buffered:
            yield from self._chunk_nodes(buffered)

    def _chunk_nodes(self, nodes: list[Tree]) -> Iterator[Chunk]:
        """Turn a batch of body-text nodes into chunks.

        Args:
            nodes: Body-text tree nodes to chunk.

        Yields:
            Chunk objects.
        """
        yield from self.chunker._chunk_by_heading(nodes)

    def _prune_processed_nodes(self, root: Tree) -> None:
        """Drop already-processed leaves from the tree to free memory.

        Args:
            root: Root of the tree to prune.
        """
        # Currently a no-op: the tree structure is retained. A future
        # optimization could remove fully processed subtrees here.
        pass
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def stream_markdown_file(
    file_path: str,
    max_chunk_size: int = 1000,
    chunk_overlap: int = 100,
    heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
    chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
) -> Iterator[Chunk]:
    """Stream chunks from a markdown file on disk.

    Convenience wrapper around :class:`StreamingMarkdownProcessor` with
    sensible defaults.

    Args:
        file_path: Path to a markdown file.
        max_chunk_size: Upper bound on chunk text length in characters.
        chunk_overlap: Characters shared between consecutive chunks.
        heading_inclusion: Strategy for embedding headings in chunks.
        chunk_format: Desired output format of emitted chunks.

    Yields:
        Chunk objects.
    """
    yield from StreamingMarkdownProcessor(
        max_chunk_size=max_chunk_size,
        chunk_overlap=chunk_overlap,
        heading_inclusion=heading_inclusion,
        chunk_format=chunk_format,
    ).process_file(file_path)
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def stream_markdown_string(
    content: str,
    max_chunk_size: int = 1000,
    chunk_overlap: int = 100,
    heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
    chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
) -> Iterator[Chunk]:
    """Stream chunks from in-memory markdown text.

    Convenience wrapper around :class:`StreamingMarkdownProcessor` with
    sensible defaults.

    Args:
        content: Markdown content string.
        max_chunk_size: Upper bound on chunk text length in characters.
        chunk_overlap: Characters shared between consecutive chunks.
        heading_inclusion: Strategy for embedding headings in chunks.
        chunk_format: Desired output format of emitted chunks.

    Yields:
        Chunk objects.
    """
    yield from StreamingMarkdownProcessor(
        max_chunk_size=max_chunk_size,
        chunk_overlap=chunk_overlap,
        heading_inclusion=heading_inclusion,
        chunk_format=chunk_format,
    ).process_string(content)
|