dataknobs-xization 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,302 @@
+ """Streaming processor for incremental markdown chunking.
+
+ This module provides functionality to process large markdown documents
+ incrementally, managing memory constraints while generating chunks.
+ """
+
+ from __future__ import annotations
+
+ from typing import Iterator, TextIO
+
+ from dataknobs_structures.tree import Tree
+
+ from dataknobs_xization.markdown.md_chunker import Chunk, ChunkFormat, HeadingInclusion, MarkdownChunker
+ from dataknobs_xization.markdown.md_parser import MarkdownNode, MarkdownParser
+
+
+ class StreamingMarkdownProcessor:
+     """Streaming processor for incremental markdown chunking.
+
+     Processes markdown documents line-by-line, building tree structure
+     incrementally and yielding chunks as they become available. Manages
+     memory by pruning processed sections of the tree.
+     """
+
+     def __init__(
+         self,
+         max_chunk_size: int = 1000,
+         chunk_overlap: int = 100,
+         max_line_length: int | None = None,
+         heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+         chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+         max_tree_depth: int = 100,
+         memory_limit_nodes: int | None = None,
+     ):
+         """Initialize the streaming processor.
+
+         Args:
+             max_chunk_size: Maximum size of chunk text in characters
+             chunk_overlap: Number of characters to overlap between chunks
+             max_line_length: Maximum length for individual lines
+             heading_inclusion: How to include headings in chunks
+             chunk_format: Output format for chunks
+             max_tree_depth: Maximum depth of tree to maintain
+             memory_limit_nodes: Maximum number of nodes to keep in memory
+                 (None for unlimited)
+         """
+         self.parser = MarkdownParser(
+             max_line_length=max_line_length,
+             preserve_empty_lines=False,
+         )
+         self.chunker = MarkdownChunker(
+             max_chunk_size=max_chunk_size,
+             chunk_overlap=chunk_overlap,
+             heading_inclusion=heading_inclusion,
+             chunk_format=chunk_format,
+             combine_under_heading=True,
+         )
+         self.max_tree_depth = max_tree_depth
+         self.memory_limit_nodes = memory_limit_nodes
+
+     def process_stream(
+         self,
+         source: str | TextIO | Iterator[str],
+     ) -> Iterator[Chunk]:
+         """Process markdown from a stream, yielding chunks incrementally.
+
+         Args:
+             source: Markdown content as string, file object, or line iterator
+
+         Yields:
+             Chunk objects as they become available
+         """
+         # For simplicity in v1, we'll use a batch processing approach
+         # that processes complete sections under headings
+         #
+         # Future enhancement: true streaming with incremental tree building
+
+         tree = self.parser.parse(source)
+
+         # Generate chunks
+         yield from self.chunker.chunk(tree)
+
+     def process_file(self, file_path: str) -> Iterator[Chunk]:
+         """Process a markdown file, yielding chunks incrementally.
+
+         Args:
+             file_path: Path to markdown file
+
+         Yields:
+             Chunk objects
+         """
+         with open(file_path, encoding='utf-8') as f:
+             yield from self.process_stream(f)
+
+     def process_string(self, content: str) -> Iterator[Chunk]:
+         """Process markdown from a string, yielding chunks.
+
+         Args:
+             content: Markdown content string
+
+         Yields:
+             Chunk objects
+         """
+         yield from self.process_stream(content)
+
+
+ class AdaptiveStreamingProcessor(StreamingMarkdownProcessor):
+     """Streaming processor that adapts to memory constraints.
+
+     This processor monitors tree size and adaptively chunks sections
+     when memory limits are approached, preventing memory overflow on
+     large documents.
+     """
+
+     def __init__(
+         self,
+         max_chunk_size: int = 1000,
+         chunk_overlap: int = 100,
+         max_line_length: int | None = None,
+         heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+         chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+         max_tree_depth: int = 100,
+         memory_limit_nodes: int = 10000,
+         adaptive_threshold: float = 0.8,
+     ):
+         """Initialize the adaptive streaming processor.
+
+         Args:
+             max_chunk_size: Maximum size of chunk text in characters
+             chunk_overlap: Number of characters to overlap between chunks
+             max_line_length: Maximum length for individual lines
+             heading_inclusion: How to include headings in chunks
+             chunk_format: Output format for chunks
+             max_tree_depth: Maximum depth of tree to maintain
+             memory_limit_nodes: Maximum number of nodes to keep in memory
+             adaptive_threshold: Fraction of memory_limit at which to trigger
+                 adaptive chunking (0.0-1.0)
+         """
+         super().__init__(
+             max_chunk_size=max_chunk_size,
+             chunk_overlap=chunk_overlap,
+             max_line_length=max_line_length,
+             heading_inclusion=heading_inclusion,
+             chunk_format=chunk_format,
+             max_tree_depth=max_tree_depth,
+             memory_limit_nodes=memory_limit_nodes,
+         )
+         self.adaptive_threshold = adaptive_threshold
+
+     def process_stream(self, source: str | TextIO | Iterator[str]) -> Iterator[Chunk]:
+         """Process stream with adaptive memory management.
+
+         Args:
+             source: Markdown content source
+
+         Yields:
+             Chunk objects
+         """
+         # Build tree incrementally with memory monitoring
+         root = Tree(MarkdownNode(text="ROOT", level=0, node_type="root", line_number=0))
+         current_parent = root
+         line_number = 0
+
+         lines = self.parser._get_line_iterator(source)
+
+         pending_nodes = []  # Nodes waiting to be chunked
+
+         for line in lines:
+             line_number += 1
+
+             if not line.strip():
+                 continue
+
+             # Check if line is a heading
+             heading_match = self.parser.HEADING_PATTERN.match(line)
+
+             if heading_match:
+                 # Before adding new heading, check if we should chunk pending nodes
+                 if self.memory_limit_nodes:
+                     node_count = len(root.find_nodes(lambda _: True))
+                     if node_count >= self.memory_limit_nodes * self.adaptive_threshold:
+                         # Chunk and yield accumulated body text
+                         if pending_nodes:
+                             yield from self._chunk_nodes(pending_nodes)
+                             pending_nodes = []
+                         # Prune processed subtrees to free memory
+                         self._prune_processed_nodes(root)
+
+                 # Process heading
+                 level = len(heading_match.group(1))
+                 text = heading_match.group(2).strip()
+
+                 node_data = MarkdownNode(
+                     text=text,
+                     level=level,
+                     node_type="heading",
+                     line_number=line_number,
+                 )
+
+                 current_parent, _ = self.parser._find_heading_parent(
+                     root, current_parent, level
+                 )
+
+                 heading_node = current_parent.add_child(node_data)
+                 current_parent = heading_node
+
+             else:
+                 # Body text
+                 node_data = MarkdownNode(
+                     text=line.rstrip('\n'),
+                     level=0,
+                     node_type="body",
+                     line_number=line_number,
+                 )
+                 body_node = current_parent.add_child(node_data)
+                 pending_nodes.append(body_node)
+
+         # Process any remaining pending nodes
+         if pending_nodes:
+             yield from self._chunk_nodes(pending_nodes)
+
+     def _chunk_nodes(self, nodes: list[Tree]) -> Iterator[Chunk]:
+         """Chunk a list of body text nodes.
+
+         Args:
+             nodes: List of body text tree nodes
+
+         Yields:
+             Chunk objects
+         """
+         yield from self.chunker._chunk_by_heading(nodes)
+
+     def _prune_processed_nodes(self, root: Tree) -> None:
+         """Prune processed leaf nodes to free memory.
+
+         Args:
+             root: Root of tree to prune
+         """
+         # Find terminal nodes that have been processed
+         # For now, we'll keep the tree structure but could optimize further
+         # by removing fully processed subtrees
+         pass
+
+
+ def stream_markdown_file(
+     file_path: str,
+     max_chunk_size: int = 1000,
+     chunk_overlap: int = 100,
+     heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+     chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+ ) -> Iterator[Chunk]:
+     """Stream chunks from a markdown file.
+
+     Convenience function for processing a file with default settings.
+
+     Args:
+         file_path: Path to markdown file
+         max_chunk_size: Maximum size of chunk text in characters
+         chunk_overlap: Number of characters to overlap between chunks
+         heading_inclusion: How to include headings in chunks
+         chunk_format: Output format for chunks
+
+     Yields:
+         Chunk objects
+     """
+     processor = StreamingMarkdownProcessor(
+         max_chunk_size=max_chunk_size,
+         chunk_overlap=chunk_overlap,
+         heading_inclusion=heading_inclusion,
+         chunk_format=chunk_format,
+     )
+     yield from processor.process_file(file_path)
+
+
+ def stream_markdown_string(
+     content: str,
+     max_chunk_size: int = 1000,
+     chunk_overlap: int = 100,
+     heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
+     chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
+ ) -> Iterator[Chunk]:
+     """Stream chunks from a markdown string.
+
+     Convenience function for processing a string with default settings.
+
+     Args:
+         content: Markdown content string
+         max_chunk_size: Maximum size of chunk text in characters
+         chunk_overlap: Number of characters to overlap between chunks
+         heading_inclusion: How to include headings in chunks
+         chunk_format: Output format for chunks
+
+     Yields:
+         Chunk objects
+     """
+     processor = StreamingMarkdownProcessor(
+         max_chunk_size=max_chunk_size,
+         chunk_overlap=chunk_overlap,
+         heading_inclusion=heading_inclusion,
+         chunk_format=chunk_format,
+     )
+     yield from processor.process_string(content)
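
The module above exposes both a class-based processor and module-level convenience functions. Below is a minimal usage sketch based only on the signatures shown in this hunk; the import path follows the `dataknobs_xization/markdown/md_streaming.py` entry in the RECORD further down, and the file name `guide.md` is an illustrative placeholder, not something shipped with the package.

```python
# Sketch: consuming chunks from the streaming API added in 1.1.0.
# "guide.md" is a placeholder path used only for illustration.
from dataknobs_xization.markdown.md_streaming import (
    StreamingMarkdownProcessor,
    stream_markdown_file,
)

# Convenience function with default settings.
for chunk in stream_markdown_file("guide.md", max_chunk_size=500):
    print(chunk.text)

# Or configure a processor once and reuse it for files and strings.
processor = StreamingMarkdownProcessor(max_chunk_size=500, chunk_overlap=50)
for chunk in processor.process_string("# Title\n\nSome body text."):
    print(chunk.text)
```
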
@@ -84,7 +84,7 @@ class CharacterFeatures(ABC):
          :return: A list of token instances
          """
          token = self.build_first_token(normalize_fn)
-         tokens = list()
+         tokens = []
          while token is not None:
              tokens.append(token)
              token = token.next_token
@@ -113,7 +113,7 @@ def get_hyphen_slash_expansions_fn(
          if do_split:
              # add each word separately
              tokens = set(hyphen_slash_re.split(text))
-             if not max(map(lambda t: len(t) < min_split_token_len, tokens)):
+             if not max(len(t) < min_split_token_len for t in tokens):
                  variations.update(tokens)
          return variations

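
The rewrite above swaps a `map`/`lambda` for a generator expression; the behavior should be unchanged, since `max` over booleans is true exactly when any element is true. A small equivalence sketch (the token values and `min_split_token_len` below are made up for illustration):

```python
# Equivalence sketch: max() over booleans behaves like any().
min_split_token_len = 3
tokens = {"pre", "award", "a"}

old_style = max(map(lambda t: len(t) < min_split_token_len, tokens))
new_style = max(len(t) < min_split_token_len for t in tokens)

assert old_style == new_style == any(len(t) < min_split_token_len for t in tokens)
```
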
@@ -348,7 +348,7 @@ def year_variations_fn(
          variations.update(zero_pad_variations(remainder, 2, 3))

          if century > 0:
-             remainder_texts = list()
+             remainder_texts = []
              if remainder > 0:
                  if remainder < 10:
                      if not numeric_only:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataknobs-xization
- Version: 1.0.1
+ Version: 1.1.0
  Summary: Text normalization and tokenization tools
  Author-email: Spence Koehler <KoehlerSB747@gmail.com>
  Requires-Python: >=3.10
@@ -22,6 +22,11 @@ pip install dataknobs-xization

  ## Features

+ - **Markdown Chunking**: Parse and chunk markdown documents for RAG applications
+   - Preserves heading hierarchy and semantic structure
+   - Supports code blocks, tables, lists, and other markdown constructs
+   - Streaming support for large documents
+   - Flexible configuration for chunk size, overlap, and heading inclusion
  - **Text Normalization**: Standardize text for consistent processing
  - **Masking Tokenizer**: Advanced tokenization with masking capabilities
  - **Annotations**: Text annotation system
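
The configuration options named in the new feature list above map onto the parameters of the streaming module shown earlier in this diff. A brief sketch of passing them explicitly, assuming the `HeadingInclusion` and `ChunkFormat` enums are importable from `dataknobs_xization.markdown.md_chunker` as in that hunk (only the `BOTH` and `MARKDOWN` members appear in this diff; the sample markdown string is illustrative):

```python
# Sketch: explicit chunking configuration using the enums from md_chunker.
from dataknobs_xization.markdown.md_chunker import ChunkFormat, HeadingInclusion
from dataknobs_xization.markdown.md_streaming import stream_markdown_string

md = "# Guide\n\n## Setup\n\nInstall with pip.\n"

chunks = stream_markdown_string(
    md,
    max_chunk_size=256,   # characters per chunk
    chunk_overlap=32,     # overlap between consecutive chunks
    heading_inclusion=HeadingInclusion.BOTH,
    chunk_format=ChunkFormat.MARKDOWN,
)
for chunk in chunks:
    print(chunk.text)
```
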
@@ -30,6 +35,32 @@ pip install dataknobs-xization

  ## Usage

+ ### Markdown Chunking
+
+ ```python
+ from dataknobs_xization import parse_markdown, chunk_markdown_tree
+
+ # Parse markdown into tree structure
+ markdown_text = """
+ # User Guide
+ ## Installation
+ Install the package using pip.
+ """
+
+ tree = parse_markdown(markdown_text)
+
+ # Generate chunks for RAG
+ chunks = chunk_markdown_tree(tree, max_chunk_size=500)
+
+ for chunk in chunks:
+     print(f"Headings: {chunk.metadata.get_heading_path()}")
+     print(f"Text: {chunk.text}\n")
+ ```
+
+ For more details, see the [Markdown Chunking documentation](docs/markdown/MARKDOWN_CHUNKING.md).
+
+ ### Text Normalization and Tokenization
+
  ```python
  from dataknobs_xization import normalize, MaskingTokenizer

@@ -0,0 +1,14 @@
+ dataknobs_xization/0.readme.txt,sha256=Q46suHOARkjQLY580eOfSCeUyIgQx-e6DLmtEhcuODE,2878
+ dataknobs_xization/__init__.py,sha256=CNpO8lBEz46jFS50XGjiubmO7srtsvx7W_dHxzYLSVQ,1202
+ dataknobs_xization/annotations.py,sha256=cbdcmnExrRFLGVsC1ULV-_7dUxoseHN6OYs2MMLh_-g,45183
+ dataknobs_xization/authorities.py,sha256=S2pfHejKOT8JUlxWnxBRuOHzZxjDlajHERvYVYOwMRs,30737
+ dataknobs_xization/lexicon.py,sha256=aws0JnDWoKnXmkU09T5S4vq-hDPFBsXERxKAyMuHmw0,23701
+ dataknobs_xization/masking_tokenizer.py,sha256=uJYsi4o4brhFzTi4V06muRFUWAOHkjCiVCONEqVk218,26032
+ dataknobs_xization/normalize.py,sha256=ufnvdceCf3zPQ0njhp-qY1JQTl2IKM6ALQ05b-iAREg,14013
+ dataknobs_xization/markdown/__init__.py,sha256=ubVUGZBZeaOqvIzQkOixW39JniK4y4O2TWgTMTCSzIU,1039
+ dataknobs_xization/markdown/md_chunker.py,sha256=Lf0gqVoBlF7IZ6gorEuhxP1NV_InRv2AGn2a1zsFWXc,14569
+ dataknobs_xization/markdown/md_parser.py,sha256=U1KYZjGD_G6Bwy-Eo073kUJz597Ff5UsWwdw_y1dYc8,20394
+ dataknobs_xization/markdown/md_streaming.py,sha256=4zyyBeVt7G42Mqr4Hprugq0LVaZ-WTps13jOm_i4rNA,10469
+ dataknobs_xization-1.1.0.dist-info/METADATA,sha256=CCXAdqzC5jfSxlNh-aXTcS-Az5qQc2jd00NK_NZ89v0,2319
+ dataknobs_xization-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ dataknobs_xization-1.1.0.dist-info/RECORD,,
@@ -1,10 +0,0 @@
- dataknobs_xization/0.readme.txt,sha256=Q46suHOARkjQLY580eOfSCeUyIgQx-e6DLmtEhcuODE,2878
- dataknobs_xization/__init__.py,sha256=ixsRSYr86q1T4LqQTRzP9Z_ihcOVN6r8SQNurhmHWmY,404
- dataknobs_xization/annotations.py,sha256=qiH_QzzIs5mjvO2Yr4jiLBMIxIiPbzzfd_iublS8HTI,45143
- dataknobs_xization/authorities.py,sha256=69nAlExbh_U7NKav1q3IujXb8lBq14QJhHHy5IZ0PZE,30745
- dataknobs_xization/lexicon.py,sha256=NMo3lAXUVzFVRy246Y90TZtm-27qR5g0z8Ef9u2E2LA,23722
- dataknobs_xization/masking_tokenizer.py,sha256=65RkHdU83l1Tf0f9bXwNrLDuFsN-xegMQNJGON7Z8WY,26036
- dataknobs_xization/normalize.py,sha256=kpT8y1jEmeiKiNC8pruurFjasmREhr4rAQ3W_yB2v4U,14024
- dataknobs_xization-1.0.1.dist-info/METADATA,sha256=RDT8c1JeCzLd7F57WkifjZlwMinbSpHBVJoM8ZU3uQE,1393
- dataknobs_xization-1.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- dataknobs_xization-1.0.1.dist-info/RECORD,,