dataknobs_xization-1.2.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataknobs_xization/0.readme.txt +66 -0
- dataknobs_xization/__init__.py +110 -0
- dataknobs_xization/annotations.py +1476 -0
- dataknobs_xization/authorities.py +860 -0
- dataknobs_xization/content_transformer.py +570 -0
- dataknobs_xization/ingestion/__init__.py +27 -0
- dataknobs_xization/ingestion/config.py +352 -0
- dataknobs_xization/ingestion/processor.py +367 -0
- dataknobs_xization/json/__init__.py +17 -0
- dataknobs_xization/json/json_chunker.py +591 -0
- dataknobs_xization/lexicon.py +723 -0
- dataknobs_xization/markdown/__init__.py +72 -0
- dataknobs_xization/markdown/enrichment.py +260 -0
- dataknobs_xization/markdown/filters.py +236 -0
- dataknobs_xization/markdown/md_chunker.py +478 -0
- dataknobs_xization/markdown/md_parser.py +605 -0
- dataknobs_xization/markdown/md_streaming.py +302 -0
- dataknobs_xization/masking_tokenizer.py +768 -0
- dataknobs_xization/normalize.py +520 -0
- dataknobs_xization/py.typed +0 -0
- dataknobs_xization-1.2.3.dist-info/METADATA +170 -0
- dataknobs_xization-1.2.3.dist-info/RECORD +23 -0
- dataknobs_xization-1.2.3.dist-info/WHEEL +4 -0

dataknobs_xization/normalize.py

@@ -0,0 +1,520 @@
+"""Text normalization utilities and regular expressions.
+
+Provides functions and regex patterns for normalizing text including
+whitespace handling, camelCase splitting, and symbol processing.
+"""
+
+import math
+import re
+from itertools import product
+from typing import List, Set
+
+# squash whitespace: to collapse consecutive whitespace to a single space by
+# x.sub(' ', text)
+SQUASH_WS_RE = re.compile(r"\s+")
+
+
+# to identify strings with any symbols by
+# x.search(text)
+ALL_SYMBOLS_RE = re.compile(r"[^\w\s]+")
+
+
+# camelcase LU: to split between consecutive lower and upper chars by
+# x.sub(r'\1 \2', text)
+CAMELCASE_LU_RE = re.compile(r"([a-z]+)([A-Z])")
+
+
+# camelcase UL: to split between consecutive upper and upper-lower chars by
+# x.sub(r'\1 \2', text)
+CAMELCASE_UL_RE = re.compile(r"([A-Z]+)([A-Z][a-z])")
+
+
+# non-embedded symbols: those without a word char on both sides by
+# x.sub('', text)
+NON_EMBEDDED_WORD_SYMS_RE = re.compile(r"((?<!\w)[^\w\s]+)|([^\w\s]+(?!\w))")
+
+
+# embedded symbols: to drop embedded symbols by
+# x.sub('', text)
+EMBEDDED_SYMS_RE = re.compile(r"(?<=\w)[^\w\s]+(?=\w)")
+
+
+# hyphen-slash: to split between an embedded hyphen and/or slash by
+# x.split(text)
+HYPHEN_SLASH_RE = re.compile(r"(?<=\w)[\-\/ ](?=\w)")
+
+
+# hyphen-only: to split between an embedded hyphen by
+# x.split(text)
+HYPHEN_ONLY_RE = re.compile(r"(?<=\w)[\- ](?=\w)")
+
+
+# slash-only: to split between an embedded slash by
+# x.split(text)
+SLASH_ONLY_RE = re.compile(r"(?<=\w)\/(?=\w)")
+
+
+# parenthetical expressions: to drop parenthetical expressions by
+# x.sub('', text)
+PARENTHETICAL_RE = re.compile(r"\(.*\)")
+
+
+# ampersand: to replace an ampersand with " and " by
+# x.sub(' and ', text)
+AMPERSAND_RE = re.compile(r"\s*\&\s*")
+
+
+def expand_camelcase_fn(text: str) -> str:
+    """Expand both "lU" and "UUl" camelcasing to "l U" and "U Ul" """
+    text = CAMELCASE_LU_RE.sub(r"\1 \2", text)
+    return CAMELCASE_UL_RE.sub(r"\1 \2", text)
+
+
+def drop_non_embedded_symbols_fn(text: str, repl: str = "") -> str:
+    """Drop symbols not embedded within word characters"""
+    return NON_EMBEDDED_WORD_SYMS_RE.sub(repl, text)
+
+
+def drop_embedded_symbols_fn(text: str, repl: str = "") -> str:
+    """Drop symbols embedded within word characters"""
+    return EMBEDDED_SYMS_RE.sub(repl, text)
+
+
+def get_hyphen_slash_expansions_fn(
+    text: str,
+    subs: List[str] = ("-", " ", ""),
+    add_self: bool = True,
+    do_split: bool = True,
+    min_split_token_len: int = 2,
+    hyphen_slash_re: re.Pattern[str] = HYPHEN_SLASH_RE,
+) -> Set[str]:
+    """Given text with words that may or may not appear as hyphenated or with a
+    slash, return the set of potential variations:
+      - the text as-is (add_self)
+      - with a hyphen between all words (if '-' in subs)
+      - with a space between all words (if ' ' in subs)
+      - with all words squashed together (empty string between if '' in subs)
+      - with each word separately (do_split as long as min_split_token_len is
+        met for all tokens)
+
+    Note:
+      * To add a variation with a slash, add '/' to subs.
+      * To not add any variations with symbols, leave them out of subs
+        and don't add self.
+
+    Args:
+        text: The hyphen-worthy snippet of text, either already
+            hyphenated or with a slash or space delimited.
+        subs: A string of characters or list of strings to insert between
+            tokens.
+        add_self: True to include the text itself in the result.
+        do_split: True to add split tokens separately.
+        min_split_token_len: If any of the split tokens fail
+            to meet the min token length, don't add any of the splits.
+        hyphen_slash_re: The regex to identify hyphen/slash to expand.
+
+    Returns:
+        The set of text variations.
+    """
+    variations = {text} if add_self else set()
+    if subs is not None and len(subs) > 0:
+        # create variant with all <s>'s
+        for s in subs:
+            variations.add(HYPHEN_SLASH_RE.sub(s, text))
+    if do_split:
+        # add each word separately
+        tokens = set(hyphen_slash_re.split(text))
+        if not max(len(t) < min_split_token_len for t in tokens):
+            variations.update(tokens)
+    return variations
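
For orientation, a brief sketch (illustrative, not part of the packaged module) of how the expansion helper above behaves with its defaults; the set in the comment is what the docstring and HYPHEN_SLASH_RE imply:

```python
from dataknobs_xization.normalize import get_hyphen_slash_expansions_fn

# Defaults: subs=("-", " ", ""), add_self=True, do_split=True, min_split_token_len=2.
# Per the docstring above, the joined and split forms are expected to be:
#   {"mother-in-law", "mother in law", "motherinlaw", "mother", "in", "law"}
variations = get_hyphen_slash_expansions_fn("mother-in-law")
```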
+
+
+def drop_parentheticals_fn(text: str) -> str:
+    """Drop parenthetical expressions from the text."""
+    return PARENTHETICAL_RE.sub("", text)
+
+
+def expand_ampersand_fn(text: str) -> str:
+    """Replace '&' with ' and '."""
+    return AMPERSAND_RE.sub(" and ", text)
+
+
+def get_lexical_variations(
+    text: str,
+    include_self: bool = True,
+    expand_camelcase: bool = True,
+    drop_non_embedded_symbols: bool = True,
+    drop_embedded_symbols: bool = True,
+    spacify_embedded_symbols: bool = False,
+    do_hyphen_expansion: bool = True,
+    hyphen_subs: List[str] = (" ", ""),
+    do_hyphen_split: bool = True,
+    min_hyphen_split_token_len: int = 2,
+    do_slash_expansion: bool = True,
+    slash_subs: List[str] = (" ", " or "),
+    do_slash_split: bool = True,
+    min_slash_split_token_len: int = 1,
+    drop_parentheticals: bool = True,
+    expand_ampersands: bool = True,
+    add_eng_plurals: bool = True,
+) -> Set[str]:
+    """Get all variations for the text (including the text itself).
+
+    Args:
+        text: The text to generate variations for.
+        include_self: True to include the original text in the result.
+        expand_camelcase: True to expand camelCase text.
+        drop_non_embedded_symbols: True to drop symbols not embedded in words.
+        drop_embedded_symbols: True to drop symbols embedded in words.
+        spacify_embedded_symbols: True to replace embedded symbols with spaces.
+        do_hyphen_expansion: True to expand hyphenated text.
+        hyphen_subs: List of strings to substitute for hyphens.
+        do_hyphen_split: True to split on hyphens.
+        min_hyphen_split_token_len: Minimum token length for hyphen splits.
+        do_slash_expansion: True to expand slashes.
+        slash_subs: List of strings to substitute for slashes.
+        do_slash_split: True to split on slashes.
+        min_slash_split_token_len: Minimum token length for slash splits.
+        drop_parentheticals: True to drop parenthetical expressions.
+        expand_ampersands: True to expand ampersands to ' and '.
+        add_eng_plurals: True to add English plural forms.
+
+    Returns:
+        The set of all text variations.
+    """
+    variations = {text} if include_self else set()
+    if expand_camelcase:
+        variations.add(expand_camelcase_fn(text))
+    if drop_non_embedded_symbols:
+        variations.add(drop_non_embedded_symbols_fn(text))
+    if drop_embedded_symbols:
+        variations.add(drop_embedded_symbols_fn(text))
+    if spacify_embedded_symbols:
+        variations.add(drop_embedded_symbols_fn(text, " "))
+    if (
+        do_hyphen_expansion and hyphen_subs is not None and len(hyphen_subs) > 0
+    ) or do_hyphen_split:
+        variations.update(
+            get_hyphen_slash_expansions_fn(
+                text,
+                subs=hyphen_subs,
+                add_self=False,
+                do_split=do_hyphen_split,
+                min_split_token_len=min_hyphen_split_token_len,
+            )
+        )
+    if (do_slash_expansion and slash_subs is not None and len(slash_subs) > 0) or do_slash_split:
+        variations.update(
+            get_hyphen_slash_expansions_fn(
+                text,
+                subs=slash_subs,
+                add_self=False,
+                do_split=do_slash_split,
+                min_split_token_len=min_slash_split_token_len,
+            )
+        )
+    if drop_parentheticals:
+        variations.add(drop_parentheticals_fn(text))
+    if expand_ampersands:
+        variations.add(expand_ampersand_fn(text))
+    if add_eng_plurals:
+        # TODO: Use a better pluralizer
+        plurals = {f"{v}s" for v in variations}
+        variations.update(plurals)
+    return variations
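
A similar sketch (illustrative, not part of the packaged module) for the aggregate helper; the members noted in the comment follow from the flags and regexes defined above:

```python
from dataknobs_xization.normalize import get_lexical_variations

variations = get_lexical_variations("de-identification")
# Among the expected members, per the branches above:
#   "de-identification"      (include_self)
#   "deidentification"       (embedded hyphen dropped)
#   "de identification"      (hyphen expanded to a space)
#   "de", "identification"   (split tokens)
# ...plus naive "+s" plurals from add_eng_plurals.
```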
+
+
+def int_to_en(num: int) -> str:
+    d = {
+        0: "zero",
+        1: "one",
+        2: "two",
+        3: "three",
+        4: "four",
+        5: "five",
+        6: "six",
+        7: "seven",
+        8: "eight",
+        9: "nine",
+        10: "ten",
+        11: "eleven",
+        12: "twelve",
+        13: "thirteen",
+        14: "fourteen",
+        15: "fifteen",
+        16: "sixteen",
+        17: "seventeen",
+        18: "eighteen",
+        19: "nineteen",
+        20: "twenty",
+        30: "thirty",
+        40: "forty",
+        50: "fifty",
+        60: "sixty",
+        70: "seventy",
+        80: "eighty",
+        90: "ninety",
+    }
+    k = 1000
+    m = k * 1000
+    b = m * 1000
+    t = b * 1000
+
+    if not isinstance(num, int):
+        return num
+
+    if num < 0:
+        return "negative " + int_to_en(abs(num))
+
+    if num < 20:
+        return d[num]
+
+    if num < 100:
+        if num % 10 == 0:
+            return d[num]
+        else:
+            return d[num // 10 * 10] + " " + d[num % 10]
+
+    if num < k:
+        if num % 100 == 0:
+            return d[num // 100] + " hundred"
+        else:
+            return d[num // 100] + " hundred and " + int_to_en(num % 100)
+
+    if num < m:
+        if num % k == 0:
+            return int_to_en(num // k) + " thousand"
+        else:
+            return int_to_en(num // k) + " thousand " + int_to_en(num % k)
+
+    if num < b:
+        if (num % m) == 0:
+            return int_to_en(num // m) + " million"
+        else:
+            return int_to_en(num // m) + " million " + int_to_en(num % m)
+
+    if num < t:
+        if (num % b) == 0:
+            return int_to_en(num // b) + " billion"
+        else:
+            return int_to_en(num // b) + " billion " + int_to_en(num % b)
+
+    if num % t == 0:
+        return int_to_en(num // t) + " trillion"
+    else:
+        return int_to_en(num // t) + " trillion " + int_to_en(num % t)
+
+    # num is too large
+    return str(num)
+
+
+def zero_pad_variations(
+    val: int,
+    min_zpad_len: int,
+    max_zpad_len: int,
+) -> Set[str]:
+    """Get (only) zero-padded variations of the given value from min (inclusive)
+    to max (exclusive) zero-pad lengths.
+
+    Examples:
+        >>> from dataknobs_xization.normalize import zero_pad_variations
+        >>> zero_pad_variations(9, 2, 4)
+        {'09', '009'}
+        >>> zero_pad_variations(90, 2, 4)
+        {'090'}
+        >>> zero_pad_variations(90, 2, 3)
+        set()
+        >>> zero_pad_variations(3, 0, 5)
+        {'03', '003', '0003'}
+
+    Args:
+        val: The integer value to zero-pad.
+        min_zpad_len: The minimum zero-padded string length (inclusive).
+        max_zpad_len: The maximum zero-padded string length (exclusive).
+
+    Returns:
+        The set of all requested zero-padded number strings.
+    """
+    return {
+        f"{val:0{zpad}d}"
+        for zpad in range(
+            max(min_zpad_len, math.ceil(math.log10(val)) + 1 if val > 0 else 1), max_zpad_len
+        )
+    }
+
+
+def month_day_variations_fn(
+    month_or_day: int,
+    do_int_to_en: bool = False,
+) -> Set[str]:
+    """Get the variations for a month or day number, including the number
+    itself as a string, a 2-digit zero-padded form of the number, and
+    (optionally) the English word for the number.
+
+    Args:
+        month_or_day: The month or day for which to get variations.
+        do_int_to_en: Optionally include the English word for the number.
+
+    Returns:
+        The set of variations for the value.
+    """
+    result = zero_pad_variations(month_or_day, 2, 3)
+    result.add(str(month_or_day))
+    if do_int_to_en:
+        result.add(int_to_en(month_or_day))
+    return result
+
+
+def year_variations_fn(
+    year: int,
+    min_year: int = 0,
+    max_year: int = 9999,
+    do_int_to_en_below_100: bool = False,
+    numeric_only: bool = False,
+) -> Set[str]:
+    """Convert a year to various text representations.
+
+    Generates variations including:
+      * "1999" (numeric)
+      * Long text: "one thousand, nine hundred and ninety nine"
+      * Short text: "nineteen [hundred and] ninety nine"
+
+    Args:
+        year: The year value to convert.
+        min_year: Minimum year to process (inclusive).
+        max_year: Maximum year to process (inclusive).
+        do_int_to_en_below_100: True to convert years below 100 to English text.
+        numeric_only: True to return only numeric variations.
+
+    Returns:
+        The set of year variations.
+    """
+    variations = {str(year)}
+
+    if year < min_year or year > max_year:
+        return variations
+
+    # one thousand, nine hundred and ninety nine
+    if not numeric_only and (do_int_to_en_below_100 or year >= 100):
+        variations.add(int_to_en(year))
+
+    # nineteen ninety five
+    century = year // 100
+    remainder = year % 100
+    remainder_text = int_to_en(remainder)
+
+    variations.update(zero_pad_variations(remainder, 2, 3))
+
+    if century > 0:
+        remainder_texts = []
+        if remainder > 0:
+            if remainder < 10:
+                if not numeric_only:
+                    remainder_texts.append(f" oh {remainder_text}")
+                remainder_texts.append(f" 0{remainder}")
+            else:
+                if not numeric_only:
+                    remainder_texts.append(f" {remainder_text}")
+                remainder_texts.append(f" {remainder}")
+            if not numeric_only:
+                remainder_texts.append(f" and {remainder_text}")
+
+        century_text = int_to_en(century)
+        scales = ["", century_text]
+        if century % 10 == 0:
+            mil_text = int_to_en(century // 10)
+            scales.append(f"{mil_text} thousand")
+        else:
+            scales.append(f"{century_text} hundred")
+
+        def clean_up(s):
+            s = s.strip()
+            if s.startswith("and "):
+                s = s[4:]
+            return s
+
+        variations.update({clean_up("".join(v)) for v in product(scales, remainder_texts)})
+
+    return variations
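
A quick look at the number and year helpers above (illustrative, not part of the packaged module); the spelled-out strings in the comments follow from int_to_en and the scale/remainder combinations built in year_variations_fn:

```python
from dataknobs_xization.normalize import int_to_en, year_variations_fn

print(int_to_en(1995))
# Expected: "one thousand nine hundred and ninety five"

variations = year_variations_fn(1995)
# Among the expected members:
#   "1995", "95", "ninety five", "nineteen ninety five",
#   "nineteen hundred and ninety five",
#   "one thousand nine hundred and ninety five"
```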
+
+
+def replace_smart_quotes_fn(text: str) -> str:
+    """Replace "smart" quotes with their ascii version."""
+    return (
+        text.replace(
+            "\u201c",
+            '"',  # left double quote U+201C
+        )
+        .replace(
+            "\u201d",
+            '"',  # right double quote U+201D
+        )
+        .replace(
+            "\u2018",
+            "'",  # left single quote U+2018
+        )
+        .replace(
+            "\u2019",
+            "'",  # right single quote U+2019
+        )
+    )
+
+
+def basic_normalization_fn(
+    text: str,
+    lowercase: bool = True,
+    expand_camelcase: bool = True,
+    simplify_quote_chars: bool = True,
+    drop_non_embedded_symbols: bool = False,
+    spacify_embedded_symbols: bool = False,
+    drop_embedded_symbols: bool = False,
+    squash_whitespace: bool = False,
+    do_all: bool = False,
+) -> str:
+    """Basic normalization functions include:
+      * lowercasing [default]
+      * expanding camelcase [default]
+      * replacing "smart" quotes and apostrophes with ascii versions [default]
+      * dropping non-embedded symbols [optional]
+      * replacing embedded symbols with a space [takes precedence over dropping unless do_all]
+      * or dropping embedded symbols [optional]
+      * collapsing multiple spaces and stripping spaces from ends [optional]
+
+    Args:
+        text: The text to normalize.
+        lowercase: True to convert to lowercase.
+        expand_camelcase: True to expand camelCase text.
+        simplify_quote_chars: True to replace smart quotes with ASCII quotes.
+        drop_non_embedded_symbols: True to drop symbols not embedded in words.
+        spacify_embedded_symbols: True to replace embedded symbols with spaces.
+        drop_embedded_symbols: True to drop embedded symbols.
+        squash_whitespace: True to collapse whitespace and strip ends.
+        do_all: True to apply all normalization steps.
+
+    Returns:
+        The normalized text.
+    """
+    # NOTE: do this before changing case
+    if expand_camelcase or do_all:
+        text = expand_camelcase_fn(text)
+
+    if lowercase or do_all:
+        text = text.lower()
+    if (drop_non_embedded_symbols and drop_embedded_symbols) or do_all:
+        text = re.sub(r"[^\w\s]+", "", text)
+    elif drop_non_embedded_symbols:
+        text = drop_non_embedded_symbols_fn(text)
+    elif spacify_embedded_symbols:
+        text = drop_embedded_symbols_fn(text, " ")
+    elif drop_embedded_symbols:
+        text = drop_embedded_symbols_fn(text)
+
+    # NOTE: do this after dropping (only some) symbols
+    if simplify_quote_chars and (not drop_non_embedded_symbols or not drop_embedded_symbols):
+        # NOTE: It only makes sense to do this if we're keeping symbols
+        text = replace_smart_quotes_fn(text)
+
+    # NOTE: do this last
+    if squash_whitespace or do_all:
+        text = re.sub(r"\s+", " ", text).strip()
+    return text
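
And a sketch of the top-level entry point above (illustrative, not part of the packaged module), tracing the camelCase, lowercase, smart-quote, and whitespace steps:

```python
from dataknobs_xization.normalize import basic_normalization_fn

text = "The \u201cQuick\u201d camelCaseExample  works!"

print(basic_normalization_fn(text, squash_whitespace=True))
# Expected: 'the "quick" camel case example works!'

print(basic_normalization_fn(text, do_all=True))
# Expected: "the quick camel case example works"
```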
dataknobs_xization/py.typed (file without changes)

dataknobs_xization-1.2.3.dist-info/METADATA

@@ -0,0 +1,170 @@
+Metadata-Version: 2.4
+Name: dataknobs-xization
+Version: 1.2.3
+Summary: Text normalization and tokenization tools
+Author-email: Spence Koehler <KoehlerSB747@gmail.com>
+Requires-Python: >=3.10
+Requires-Dist: dataknobs-common>=1.0.0
+Requires-Dist: dataknobs-structures>=1.0.0
+Requires-Dist: dataknobs-utils>=1.0.0
+Requires-Dist: nltk>=3.9.1
+Description-Content-Type: text/markdown
+
+# dataknobs-xization
+
+Text normalization and tokenization tools.
+
+## Installation
+
+```bash
+pip install dataknobs-xization
+```
+
+## Features
+
+- **Markdown Chunking**: Parse and chunk markdown documents for RAG applications
+  - Preserves heading hierarchy and semantic structure
+  - Supports code blocks, tables, lists, and other markdown constructs
+  - Streaming support for large documents
+  - Flexible configuration for chunk size, overlap, and heading inclusion
+- **Content Transformation**: Convert JSON, YAML, and CSV to markdown for RAG ingestion
+  - Generic conversion that preserves structure through headings
+  - Custom schemas for specialized formatting
+  - Configurable formatting options
+- **Text Normalization**: Standardize text for consistent processing
+- **Masking Tokenizer**: Advanced tokenization with masking capabilities
+- **Annotations**: Text annotation system
+- **Authorities**: Authority management for text processing
+- **Lexicon**: Lexicon-based text analysis
+
+## Usage
+
+### Markdown Chunking
+
+```python
+from dataknobs_xization import parse_markdown, chunk_markdown_tree
+
+# Parse markdown into tree structure
+markdown_text = """
+# User Guide
+## Installation
+Install the package using pip.
+"""
+
+tree = parse_markdown(markdown_text)
+
+# Generate chunks for RAG
+chunks = chunk_markdown_tree(tree, max_chunk_size=500)
+
+for chunk in chunks:
+    print(f"Headings: {chunk.metadata.get_heading_path()}")
+    print(f"Text: {chunk.text}\n")
+```
+
+For more details, see the [Markdown Chunking documentation](docs/markdown/MARKDOWN_CHUNKING.md).
+
+### Content Transformation
+
+Convert structured data (JSON, YAML, CSV) to well-formatted markdown for RAG ingestion:
+
+```python
+from dataknobs_xization import ContentTransformer, json_to_markdown
+
+# Quick conversion
+data = [
+    {"name": "Chain of Thought", "description": "Step by step reasoning"},
+    {"name": "Few-Shot", "description": "Learning from examples"}
+]
+markdown = json_to_markdown(data, title="Prompt Patterns")
+
+# Or use the transformer class for more control
+transformer = ContentTransformer(
+    base_heading_level=2,
+    include_field_labels=True,
+    code_block_fields=["example", "code"],
+    list_fields=["steps", "items"]
+)
+
+# Transform JSON
+result = transformer.transform_json(data)
+
+# Transform YAML
+result = transformer.transform_yaml("config.yaml")
+
+# Transform CSV
+result = transformer.transform_csv("data.csv", title_field="name")
+```
+
+#### Custom Schemas
+
+Register schemas for specialized formatting of known data structures:
+
+```python
+transformer = ContentTransformer()
+
+# Register a schema for prompt patterns
+transformer.register_schema("pattern", {
+    "title_field": "name",
+    "description_field": "description",
+    "sections": [
+        {"field": "use_case", "heading": "When to Use"},
+        {"field": "example", "heading": "Example", "format": "code", "language": "python"},
+        {"field": "variations", "heading": "Variations", "format": "list"}
+    ],
+    "metadata_fields": ["category", "difficulty"]
+})
+
+# Use the schema
+patterns = [
+    {
+        "name": "Chain of Thought",
+        "description": "Prompting technique for complex reasoning",
+        "use_case": "Multi-step problems requiring logical reasoning",
+        "example": "Let's think step by step...",
+        "category": "reasoning",
+        "difficulty": "intermediate"
+    }
+]
+
+markdown = transformer.transform_json(patterns, schema="pattern")
+```
+
+#### Convenience Functions
+
+```python
+from dataknobs_xization import json_to_markdown, yaml_to_markdown, csv_to_markdown
+
+# Quick conversions
+md = json_to_markdown(data, title="My Data")
+md = yaml_to_markdown("config.yaml", title="Config")
+md = csv_to_markdown("data.csv", title_field="name")
+```
+
+### Text Normalization and Tokenization
+
+```python
+from dataknobs_xization import normalize, MaskingTokenizer
+
+# Text normalization
+normalized = normalize.normalize_text("Hello, World!")
+
+# Tokenization with masking
+tokenizer = MaskingTokenizer()
+tokens = tokenizer.tokenize("This is a sample text.")
+
+# Working with annotations
+from dataknobs_xization import annotations
+doc = annotations.create_document("Sample text", {"metadata": "value"})
+```
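
Beyond the high-level calls shown in the packaged README above, the normalize module in this wheel defines the helpers listed in normalize.py earlier in this diff; a minimal sketch using those names directly:

```python
from dataknobs_xization.normalize import basic_normalization_fn, get_lexical_variations

# Lowercase, expand camelCase, and simplify smart quotes (module defaults)
clean = basic_normalization_fn("SmartQuotes \u201chere\u201d", squash_whitespace=True)

# Generate lookup variations (hyphen/slash expansions, naive plurals, etc.)
variants = get_lexical_variations("e-mail")
```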
+
+## Dependencies
+
+This package depends on:
+- `dataknobs-common`
+- `dataknobs-structures`
+- `dataknobs-utils`
+- nltk
+
+## License
+
+See LICENSE file in the root repository.