dataknobs_xization-1.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,520 @@
1
+ """Text normalization utilities and regular expressions.
2
+
3
+ Provides functions and regex patterns for normalizing text including
4
+ whitespace handling, camelCase splitting, and symbol processing.
5
+ """
6
+
7
+ import math
8
+ import re
9
+ from itertools import product
10
+ from typing import List, Set
11
+
12
+ # squash whitespace: to collapse consecutive whitespace to a single space by
13
+ # x.sub(' ', text)
14
+ SQUASH_WS_RE = re.compile(r"\s+")
15
+
16
+
17
+ # to identify strings with any symbols by
18
+ # x.search(text)
19
+ ALL_SYMBOLS_RE = re.compile(r"[^\w\s]+")
20
+
21
+
22
+ # camelcase LU: to split between consecutive lower and upper chars by
23
+ # x.sub(r'\1 \2', text)
24
+ CAMELCASE_LU_RE = re.compile(r"([a-z]+)([A-Z])")
25
+
26
+
27
+ # camelcase UL: to split between consecutive upper and upper-lower chars by
28
+ # x.sub(r'\1 \2', text)
29
+ CAMELCASE_UL_RE = re.compile(r"([A-Z]+)([A-Z][a-z])")
30
+
31
+
32
+ # non-embedded symbols: those without a word char on both sides by
33
+ # x.sub('', text)
34
+ NON_EMBEDDED_WORD_SYMS_RE = re.compile(r"((?<!\w)[^\w\s]+)|([^\w\s]+(?!\w))")
35
+
36
+
37
+ # embedded symbols: to drop embedded symbols by
38
+ # x.sub('', text)
39
+ EMBEDDED_SYMS_RE = re.compile(r"(?<=\w)[^\w\s]+(?=\w)")
40
+
41
+
42
+ # hyphen-slash: to split between an embedded hyphen and/or slash by
43
+ # x.split(text)
44
+ HYPHEN_SLASH_RE = re.compile(r"(?<=\w)[\-\/ ](?=\w)")
45
+
46
+
47
+ # hyphen-only: to split between an embedded hyphen by
48
+ # x.split(text)
49
+ HYPHEN_ONLY_RE = re.compile(r"(?<=\w)[\- ](?=\w)")
50
+
51
+
52
+ # slash-only: to split between an embedded slash by
53
+ # x.split(text)
54
+ SLASH_ONLY_RE = re.compile(r"(?<=\w)\/(?=\w)")
55
+
56
+
57
+ # parenthetical expressions: to drop parenthetical expressions by
58
+ # x.sub('', text)
59
+ PARENTHETICAL_RE = re.compile(r"\(.*\)")
60
+
61
+
62
+ # ampersand: to replace an ampersand with " and " by
63
+ # x.sub(' and ', text)
64
+ AMPERSAND_RE = re.compile(r"\s*\&\s*")
65
+
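+ # Example applications of the patterns above (illustrative only):
+ #   SQUASH_WS_RE.sub(" ", "a \t b")                   -> "a b"
+ #   CAMELCASE_LU_RE.sub(r"\1 \2", "camelCase")        -> "camel Case"
+ #   NON_EMBEDDED_WORD_SYMS_RE.sub("", "(hello!) a-b") -> "hello a-b"
+ #   HYPHEN_SLASH_RE.split("baby-sitter")              -> ["baby", "sitter"]
+ #   PARENTHETICAL_RE.sub("", "milk (2%) jug")         -> "milk  jug"
+ #   AMPERSAND_RE.sub(" and ", "salt & pepper")        -> "salt and pepper"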
66
+
67
+ def expand_camelcase_fn(text: str) -> str:
68
+ """Expand both "lU" and "UUl" camelcasing to "l U" and "U Ul" """
69
+ text = CAMELCASE_LU_RE.sub(r"\1 \2", text)
70
+ return CAMELCASE_UL_RE.sub(r"\1 \2", text)
71
+
72
+
73
+ def drop_non_embedded_symbols_fn(text: str, repl: str = "") -> str:
74
+ """Drop symbols not embedded within word characters"""
75
+ return NON_EMBEDDED_WORD_SYMS_RE.sub(repl, text)
76
+
77
+
78
+ def drop_embedded_symbols_fn(text: str, repl: str = "") -> str:
79
+ """Drop symbols embedded within word characters"""
80
+ return EMBEDDED_SYMS_RE.sub(repl, text)
81
+
82
+
83
+ def get_hyphen_slash_expansions_fn(
84
+ text: str,
85
+ subs: List[str] = ("-", " ", ""),
86
+ add_self: bool = True,
87
+ do_split: bool = True,
88
+ min_split_token_len: int = 2,
89
+ hyphen_slash_re: re.Pattern[str] = HYPHEN_SLASH_RE,
90
+ ) -> Set[str]:
91
+ """Given text with words that may or may not appear as hyphenated or with a
92
+ slash, return the set of potential variations:
93
+ - the text as-is (add_self)
94
+ - with a hyphen between all words (if '-' in subs)
95
+ - with a space between all words (if ' ' in subs)
96
+ - with all words squashed together (empty string between if '' in subs)
97
+ - with each word separately (do_split as long as min_split_token_len is
98
+ met for all tokens)
99
+
100
+ Note:
101
+ * To add a variation with a slash, add '/' to subs.
102
+ * To not add any variations with symbols, leave them out of subs
103
+ and don't add self.
104
+
105
+ Args:
106
+ text: The hyphen-worthy snippet of text, either already
107
+ hyphenated or with a slash or space delimited.
108
+ subs: A string of characters or list of strings to insert between
109
+ tokens.
110
+ add_self: True to include the text itself in the result.
111
+ do_split: True to add split tokens separately.
112
+ min_split_token_len: If any of the split tokens fail
113
+ to meet the min token length, don't add any of the splits.
114
+ hyphen_slash_re: The regex to identify hyphen/slash to expand.
115
+
116
+ Returns:
117
+ The set of text variations.
118
+ """
119
+ variations = {text} if add_self else set()
120
+ if subs is not None and len(subs) > 0:
121
+ # create variant with all <s>'s
122
+ for s in subs:
123
+ variations.add(hyphen_slash_re.sub(s, text))  # honor the caller-supplied pattern
124
+ if do_split:
125
+ # add each word separately
126
+ tokens = set(hyphen_slash_re.split(text))
127
+ if all(len(t) >= min_split_token_len for t in tokens):
128
+ variations.update(tokens)
129
+ return variations
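+ # Example (illustrative): with the default subs ("-", " ", ""),
+ #   get_hyphen_slash_expansions_fn("baby-sitter")
+ #   -> {"baby-sitter", "baby sitter", "babysitter", "baby", "sitter"}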
130
+
131
+
132
+ def drop_parentheticals_fn(text: str) -> str:
133
+ """Drop parenthetical expressions from the text."""
134
+ return PARENTHETICAL_RE.sub("", text)
135
+
136
+
137
+ def expand_ampersand_fn(text: str) -> str:
138
+ """Replace '&' with ' and '."""
139
+ return AMPERSAND_RE.sub(" and ", text)
140
+
141
+
142
+ def get_lexical_variations(
143
+ text: str,
144
+ include_self: bool = True,
145
+ expand_camelcase: bool = True,
146
+ drop_non_embedded_symbols: bool = True,
147
+ drop_embedded_symbols: bool = True,
148
+ spacify_embedded_symbols: bool = False,
149
+ do_hyphen_expansion: bool = True,
150
+ hyphen_subs: List[str] = (" ", ""),
151
+ do_hyphen_split: bool = True,
152
+ min_hyphen_split_token_len: int = 2,
153
+ do_slash_expansion: bool = True,
154
+ slash_subs: List[str] = (" ", " or "),
155
+ do_slash_split: bool = True,
156
+ min_slash_split_token_len: int = 1,
157
+ drop_parentheticals: bool = True,
158
+ expand_ampersands: bool = True,
159
+ add_eng_plurals: bool = True,
160
+ ) -> Set[str]:
161
+ """Get all variations for the text (including the text itself).
162
+
163
+ Args:
164
+ text: The text to generate variations for.
165
+ include_self: True to include the original text in the result.
166
+ expand_camelcase: True to expand camelCase text.
167
+ drop_non_embedded_symbols: True to drop symbols not embedded in words.
168
+ drop_embedded_symbols: True to drop symbols embedded in words.
169
+ spacify_embedded_symbols: True to replace embedded symbols with spaces.
170
+ do_hyphen_expansion: True to expand hyphenated text.
171
+ hyphen_subs: List of strings to substitute for hyphens.
172
+ do_hyphen_split: True to split on hyphens.
173
+ min_hyphen_split_token_len: Minimum token length for hyphen splits.
174
+ do_slash_expansion: True to expand slashes.
175
+ slash_subs: List of strings to substitute for slashes.
176
+ do_slash_split: True to split on slashes.
177
+ min_slash_split_token_len: Minimum token length for slash splits.
178
+ drop_parentheticals: True to drop parenthetical expressions.
179
+ expand_ampersands: True to expand ampersands to ' and '.
180
+ add_eng_plurals: True to add English plural forms.
181
+
182
+ Returns:
183
+ The set of all text variations.
184
+ """
185
+ variations = {text} if include_self else set()
186
+ if expand_camelcase:
187
+ variations.add(expand_camelcase_fn(text))
188
+ if drop_non_embedded_symbols:
189
+ variations.add(drop_non_embedded_symbols_fn(text))
190
+ if drop_embedded_symbols:
191
+ variations.add(drop_embedded_symbols_fn(text))
192
+ if spacify_embedded_symbols:
193
+ variations.add(drop_embedded_symbols_fn(text, " "))
194
+ if (
195
+ do_hyphen_expansion and hyphen_subs is not None and len(hyphen_subs) > 0
196
+ ) or do_hyphen_split:
197
+ variations.update(
198
+ get_hyphen_slash_expansions_fn(
199
+ text,
200
+ subs=hyphen_subs,
201
+ add_self=False,
202
+ do_split=do_hyphen_split,
203
+ min_split_token_len=min_hyphen_split_token_len,
204
+ )
205
+ )
206
+ if (do_slash_expansion and slash_subs is not None and len(slash_subs) > 0) or do_slash_split:
207
+ variations.update(
208
+ get_hyphen_slash_expansions_fn(
209
+ text,
210
+ subs=slash_subs,
211
+ add_self=False,
212
+ do_split=do_slash_split,
213
+ min_split_token_len=min_slash_split_token_len,
214
+ )
215
+ )
216
+ if drop_parentheticals:
217
+ variations.add(drop_parentheticals_fn(text))
218
+ if expand_ampersands:
219
+ variations.add(expand_ampersand_fn(text))
220
+ if add_eng_plurals:
221
+ # TODO: Use a better pluralizer
222
+ plurals = {f"{v}s" for v in variations}
223
+ variations.update(plurals)
224
+ return variations
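+ # Example (illustrative): with the defaults,
+ #   get_lexical_variations("AC/DC", add_eng_plurals=False)
+ #   -> {"AC/DC", "ACDC", "AC DC", "AC or DC", "AC", "DC"}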
225
+
226
+
227
+ def int_to_en(num: int) -> str:
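+ """Spell out an integer in English words, e.g. 744 -> "seven hundred and forty four"."""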
228
+ d = {
229
+ 0: "zero",
230
+ 1: "one",
231
+ 2: "two",
232
+ 3: "three",
233
+ 4: "four",
234
+ 5: "five",
235
+ 6: "six",
236
+ 7: "seven",
237
+ 8: "eight",
238
+ 9: "nine",
239
+ 10: "ten",
240
+ 11: "eleven",
241
+ 12: "twelve",
242
+ 13: "thirteen",
243
+ 14: "fourteen",
244
+ 15: "fifteen",
245
+ 16: "sixteen",
246
+ 17: "seventeen",
247
+ 18: "eighteen",
248
+ 19: "nineteen",
249
+ 20: "twenty",
250
+ 30: "thirty",
251
+ 40: "forty",
252
+ 50: "fifty",
253
+ 60: "sixty",
254
+ 70: "seventy",
255
+ 80: "eighty",
256
+ 90: "ninety",
257
+ }
258
+ k = 1000
259
+ m = k * 1000
260
+ b = m * 1000
261
+ t = b * 1000
262
+
263
+ if not isinstance(num, int):
264
+ return str(num)
265
+
266
+ if num < 0:
267
+ return "negative " + int_to_en(abs(num))
268
+
269
+ if num < 20:
270
+ return d[num]
271
+
272
+ if num < 100:
273
+ if num % 10 == 0:
274
+ return d[num]
275
+ else:
276
+ return d[num // 10 * 10] + " " + d[num % 10]
277
+
278
+ if num < k:
279
+ if num % 100 == 0:
280
+ return d[num // 100] + " hundred"
281
+ else:
282
+ return d[num // 100] + " hundred and " + int_to_en(num % 100)
283
+
284
+ if num < m:
285
+ if num % k == 0:
286
+ return int_to_en(num // k) + " thousand"
287
+ else:
288
+ return int_to_en(num // k) + " thousand " + int_to_en(num % k)
289
+
290
+ if num < b:
291
+ if (num % m) == 0:
292
+ return int_to_en(num // m) + " million"
293
+ else:
294
+ return int_to_en(num // m) + " million " + int_to_en(num % m)
295
+
296
+ if num < t:
297
+ if (num % b) == 0:
298
+ return int_to_en(num // b) + " billion"
299
+ else:
300
+ return int_to_en(num // b) + " billion " + int_to_en(num % b)
301
+
302
+ if num % t == 0:
303
+ return int_to_en(num // t) + " trillion"
304
+ else:
305
+ return int_to_en(num // t) + " trillion " + int_to_en(num % t)
306
+
307
+ # num is too large
308
+ return str(num)
309
+
310
+
311
+ def zero_pad_variations(
312
+ val: int,
313
+ min_zpad_len: int,
314
+ max_zpad_len: int,
315
+ ) -> Set[str]:
316
+ """Get (only) zero-padded variations of the given value from min (inclusive)
317
+ to max (exclusive) zero-pad lengths.
318
+
319
+ Examples:
320
+ >>> from dataknobs_xization.normalize import zero_pad_variations
321
+ >>> zero_pad_variations(9, 2, 4)
322
+ {'09', '009'}
323
+ >>> zero_pad_variations(90, 2, 4)
324
+ {'090'}
325
+ >>> zero_pad_variations(90, 2, 3)
326
+ set()
327
+ >>> zero_pad_variations(3, 0, 5)
328
+ {'03', '003', '0003'}
329
+
330
+ Args:
331
+ val: The integer value to zero-pad.
332
+ min_zpad_len: The minimum zero-padded string length (inclusive).
333
+ max_zpad_len: The maximum zero-padded string length (exclusive).
334
+
335
+ Returns:
336
+ The set of all requested zero-padded number strings.
337
+ """
338
+ return {
339
+ f"{val:0{zpad}d}"
340
+ for zpad in range(
341
+ max(min_zpad_len, len(str(val)) + 1 if val > 0 else 1), max_zpad_len  # pad to at least one more digit than val has
342
+ )
343
+ }
344
+
345
+
346
+ def month_day_variations_fn(
347
+ month_or_day: int,
348
+ do_int_to_en: bool = False,
349
+ ) -> Set[str]:
350
+ """Get the variations for a month or day number, including the number
351
+ itself as a string, a 2-digit zero-padded form of the number, and
352
+ (optionally) the English word for the number.
353
+
354
+ Args:
355
+ month_or_day: The month or day for which to get variations.
356
+ do_int_to_en: Optionally include the english word for the number.
357
+
358
+ Returns:
359
+ The set of variations for the value.
360
+ """
361
+ result = zero_pad_variations(month_or_day, 2, 3)
362
+ result.add(str(month_or_day))
363
+ if do_int_to_en:
364
+ result.add(int_to_en(month_or_day))
365
+ return result
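+ # Example (illustrative):
+ #   month_day_variations_fn(7, do_int_to_en=True)  -> {"7", "07", "seven"}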
366
+
367
+
368
+ def year_variations_fn(
369
+ year: int,
370
+ min_year: int = 0,
371
+ max_year: int = 9999,
372
+ do_int_to_en_below_100: bool = False,
373
+ numeric_only: bool = False,
374
+ ) -> Set[str]:
375
+ """Convert a year to various text representations.
376
+
377
+ Generates variations including:
378
+ * "1999" (numeric)
379
+ * Long text: "one thousand, nine hundred and ninety nine"
380
+ * Short text: "nineteen [hundred and] ninety nine"
381
+
382
+ Args:
383
+ year: The year value to convert.
384
+ min_year: Minimum year to process (inclusive).
385
+ max_year: Maximum year to process (inclusive).
386
+ do_int_to_en_below_100: True to convert years below 100 to English text.
387
+ numeric_only: True to return only numeric variations.
388
+
389
+ Returns:
390
+ The set of year variations.
391
+ """
392
+ variations = {str(year)}
393
+
394
+ if year < min_year or year > max_year:
395
+ return variations
396
+
397
+ # one thousand, nine hundred and ninety nine
398
+ if not numeric_only and (do_int_to_en_below_100 or year >= 100):
399
+ variations.add(int_to_en(year))
400
+
401
+ # nineteen ninety five
402
+ century = year // 100
403
+ remainder = year % 100
404
+ remainder_text = int_to_en(remainder)
405
+
406
+ variations.update(zero_pad_variations(remainder, 2, 3))
407
+
408
+ if century > 0:
409
+ remainder_texts = []
410
+ if remainder > 0:
411
+ if remainder < 10:
412
+ if not numeric_only:
413
+ remainder_texts.append(f" oh {remainder_text}")
414
+ remainder_texts.append(f" 0{remainder}")
415
+ else:
416
+ if not numeric_only:
417
+ remainder_texts.append(f" {remainder_text}")
418
+ remainder_texts.append(f" {remainder}")
419
+ if not numeric_only:
420
+ remainder_texts.append(f" and {remainder_text}")
421
+
422
+ century_text = int_to_en(century)
423
+ scales = ["", century_text]
424
+ if century % 10 == 0:
425
+ mil_text = int_to_en(century // 10)
426
+ scales.append(f"{mil_text} thousand")
427
+ else:
428
+ scales.append(f"{century_text} hundred")
429
+
430
+ def clean_up(s):
431
+ s = s.strip()
432
+ if s.startswith("and "):
433
+ s = s[4:]
434
+ return s
435
+
436
+ variations.update({clean_up("".join(v)) for v in product(scales, remainder_texts)})
437
+
438
+ return variations
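+ # Example (illustrative): year_variations_fn(1999) includes "1999",
+ # "nineteen ninety nine", "nineteen hundred and ninety nine", "ninety nine",
+ # and "one thousand nine hundred and ninety nine", among others.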
439
+
440
+
441
+ def replace_smart_quotes_fn(text: str) -> str:
442
+ """Replace "smart" quotes with their ascii version."""
443
+ return (
444
+ text.replace(
445
+ "\u201c",
446
+ '"', # left double quote U+201C
447
+ )
448
+ .replace(
449
+ "\u201d",
450
+ '"', # right double quote U+201D
451
+ )
452
+ .replace(
453
+ "\u2018",
454
+ "'", # left single quote U+2018
455
+ )
456
+ .replace(
457
+ "\u2019",
458
+ "'", # right single quote U+2019
459
+ )
460
+ )
461
+
462
+
463
+ def basic_normalization_fn(
464
+ text: str,
465
+ lowercase: bool = True,
466
+ expand_camelcase: bool = True,
467
+ simplify_quote_chars: bool = True,
468
+ drop_non_embedded_symbols: bool = False,
469
+ spacify_embedded_symbols: bool = False,
470
+ drop_embedded_symbols: bool = False,
471
+ squash_whitespace: bool = False,
472
+ do_all: bool = False,
473
+ ) -> str:
474
+ """Basic normalization functions include:
475
+ * lowercasing [default]
476
+ * expanding camelcase [default]
477
+ * replacing "smart" quotes and apostrophes with ascii versions [default]
478
+ * dropping non_embedded symbols [optional]
479
+ * replacing embedded symbols with a space [takes precedence over dropping unless do_all]
480
+ * or dropping embedded symbols [optional]
481
+ * collapsing multiple spaces and stripping spaces from ends [optional]
482
+
483
+ Args:
484
+ text: The text to normalize.
485
+ lowercase: True to convert to lowercase.
486
+ expand_camelcase: True to expand camelCase text.
487
+ simplify_quote_chars: True to replace smart quotes with ASCII quotes.
488
+ drop_non_embedded_symbols: True to drop symbols not embedded in words.
489
+ spacify_embedded_symbols: True to replace embedded symbols with spaces.
490
+ drop_embedded_symbols: True to drop embedded symbols.
491
+ squash_whitespace: True to collapse whitespace and strip ends.
492
+ do_all: True to apply all normalization steps.
493
+
494
+ Returns:
495
+ The normalized text.
496
+ """
497
+ # NOTE: do this before changing case
498
+ if expand_camelcase or do_all:
499
+ text = expand_camelcase_fn(text)
500
+
501
+ if lowercase or do_all:
502
+ text = text.lower()
503
+ if (drop_non_embedded_symbols and drop_embedded_symbols) or do_all:
504
+ text = re.sub(r"[^\w\s]+", "", text)
505
+ elif drop_non_embedded_symbols:
506
+ text = drop_non_embedded_symbols_fn(text)
507
+ elif spacify_embedded_symbols:
508
+ text = drop_embedded_symbols_fn(text, " ")
509
+ elif drop_embedded_symbols:
510
+ text = drop_embedded_symbols_fn(text)
511
+
512
+ # NOTE: do this after dropping (only some) symbols
513
+ if simplify_quote_chars and (not drop_non_embedded_symbols or not drop_embedded_symbols):
514
+ # NOTE: It only makes sense to do this if we're keeping symbols
515
+ text = replace_smart_quotes_fn(text)
516
+
517
+ # NOTE: do this last
518
+ if squash_whitespace or do_all:
519
+ text = re.sub(r"\s+", " ", text).strip()
520
+ return text
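+ # Example (illustrative): with the defaults,
+ #   basic_normalization_fn("DataKnobs")       -> "data knobs"
+ #   basic_normalization_fn("It’s “quoted”")   -> it's "quoted"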
File without changes
@@ -0,0 +1,170 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataknobs-xization
3
+ Version: 1.2.3
4
+ Summary: Text normalization and tokenization tools
5
+ Author-email: Spence Koehler <KoehlerSB747@gmail.com>
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: dataknobs-common>=1.0.0
8
+ Requires-Dist: dataknobs-structures>=1.0.0
9
+ Requires-Dist: dataknobs-utils>=1.0.0
10
+ Requires-Dist: nltk>=3.9.1
11
+ Description-Content-Type: text/markdown
12
+
13
+ # dataknobs-xization
14
+
15
+ Text normalization and tokenization tools.
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install dataknobs-xization
21
+ ```
22
+
23
+ ## Features
24
+
25
+ - **Markdown Chunking**: Parse and chunk markdown documents for RAG applications
26
+ - Preserves heading hierarchy and semantic structure
27
+ - Supports code blocks, tables, lists, and other markdown constructs
28
+ - Streaming support for large documents
29
+ - Flexible configuration for chunk size, overlap, and heading inclusion
30
+ - **Content Transformation**: Convert JSON, YAML, and CSV to markdown for RAG ingestion
31
+ - Generic conversion that preserves structure through headings
32
+ - Custom schemas for specialized formatting
33
+ - Configurable formatting options
34
+ - **Text Normalization**: Standardize text for consistent processing
35
+ - **Masking Tokenizer**: Advanced tokenization with masking capabilities
36
+ - **Annotations**: Text annotation system
37
+ - **Authorities**: Authority management for text processing
38
+ - **Lexicon**: Lexicon-based text analysis
39
+
40
+ ## Usage
41
+
42
+ ### Markdown Chunking
43
+
44
+ ```python
45
+ from dataknobs_xization import parse_markdown, chunk_markdown_tree
46
+
47
+ # Parse markdown into tree structure
48
+ markdown_text = """
49
+ # User Guide
50
+ ## Installation
51
+ Install the package using pip.
52
+ """
53
+
54
+ tree = parse_markdown(markdown_text)
55
+
56
+ # Generate chunks for RAG
57
+ chunks = chunk_markdown_tree(tree, max_chunk_size=500)
58
+
59
+ for chunk in chunks:
60
+ print(f"Headings: {chunk.metadata.get_heading_path()}")
61
+ print(f"Text: {chunk.text}\n")
62
+ ```
63
+
64
+ For more details, see the [Markdown Chunking documentation](docs/markdown/MARKDOWN_CHUNKING.md).
65
+
66
+ ### Content Transformation
67
+
68
+ Convert structured data (JSON, YAML, CSV) to well-formatted markdown for RAG ingestion:
69
+
70
+ ```python
71
+ from dataknobs_xization import ContentTransformer, json_to_markdown
72
+
73
+ # Quick conversion
74
+ data = [
75
+ {"name": "Chain of Thought", "description": "Step by step reasoning"},
76
+ {"name": "Few-Shot", "description": "Learning from examples"}
77
+ ]
78
+ markdown = json_to_markdown(data, title="Prompt Patterns")
79
+
80
+ # Or use the transformer class for more control
81
+ transformer = ContentTransformer(
82
+ base_heading_level=2,
83
+ include_field_labels=True,
84
+ code_block_fields=["example", "code"],
85
+ list_fields=["steps", "items"]
86
+ )
87
+
88
+ # Transform JSON
89
+ result = transformer.transform_json(data)
90
+
91
+ # Transform YAML
92
+ result = transformer.transform_yaml("config.yaml")
93
+
94
+ # Transform CSV
95
+ result = transformer.transform_csv("data.csv", title_field="name")
96
+ ```
97
+
98
+ #### Custom Schemas
99
+
100
+ Register schemas for specialized formatting of known data structures:
101
+
102
+ ```python
103
+ transformer = ContentTransformer()
104
+
105
+ # Register a schema for prompt patterns
106
+ transformer.register_schema("pattern", {
107
+ "title_field": "name",
108
+ "description_field": "description",
109
+ "sections": [
110
+ {"field": "use_case", "heading": "When to Use"},
111
+ {"field": "example", "heading": "Example", "format": "code", "language": "python"},
112
+ {"field": "variations", "heading": "Variations", "format": "list"}
113
+ ],
114
+ "metadata_fields": ["category", "difficulty"]
115
+ })
116
+
117
+ # Use the schema
118
+ patterns = [
119
+ {
120
+ "name": "Chain of Thought",
121
+ "description": "Prompting technique for complex reasoning",
122
+ "use_case": "Multi-step problems requiring logical reasoning",
123
+ "example": "Let's think step by step...",
124
+ "category": "reasoning",
125
+ "difficulty": "intermediate"
126
+ }
127
+ ]
128
+
129
+ markdown = transformer.transform_json(patterns, schema="pattern")
130
+ ```
131
+
132
+ #### Convenience Functions
133
+
134
+ ```python
135
+ from dataknobs_xization import json_to_markdown, yaml_to_markdown, csv_to_markdown
136
+
137
+ # Quick conversions
138
+ md = json_to_markdown(data, title="My Data")
139
+ md = yaml_to_markdown("config.yaml", title="Config")
140
+ md = csv_to_markdown("data.csv", title_field="name")
141
+ ```
142
+
143
+ ### Text Normalization and Tokenization
144
+
145
+ ```python
146
+ from dataknobs_xization import normalize, MaskingTokenizer
147
+
148
+ # Text normalization
149
+ normalized = normalize.normalize_text("Hello, World!")
150
+
151
+ # Tokenization with masking
152
+ tokenizer = MaskingTokenizer()
153
+ tokens = tokenizer.tokenize("This is a sample text.")
154
+
155
+ # Working with annotations
156
+ from dataknobs_xization import annotations
157
+ doc = annotations.create_document("Sample text", {"metadata": "value"})
158
+ ```
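+ 
+ The `normalize` module shipped in this wheel also exposes lower-level helpers.
+ A brief sketch (function names taken from the module source above; outputs
+ abbreviated and illustrative, not official documentation):
+ 
+ ```python
+ from dataknobs_xization.normalize import basic_normalization_fn, get_lexical_variations, zero_pad_variations
+ 
+ basic_normalization_fn("DataKnobs")          # -> "data knobs"
+ zero_pad_variations(9, 2, 4)                 # -> {"09", "009"}
+ get_lexical_variations("AC/DC", add_eng_plurals=False)
+ # -> variants such as "AC DC", "ACDC", and "AC or DC"
+ ```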
159
+
160
+ ## Dependencies
161
+
162
+ This package depends on:
163
+ - `dataknobs-common`
164
+ - `dataknobs-structures`
165
+ - `dataknobs-utils`
166
+ - nltk
167
+
168
+ ## License
169
+
170
+ See LICENSE file in the root repository.