dataknobs_xization-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,448 @@
+ import math
+ import re
+ from itertools import product
+ from typing import Sequence, Set
+
+ # squash whitespace: to collapse consecutive whitespace to a single space by
+ # x.sub(' ', text)
+ SQUASH_WS_RE = re.compile(r"\s+")
+
+
+ # to identify strings containing any symbols by
+ # x.search(text)
+ ALL_SYMBOLS_RE = re.compile(r"[^\w\s]+")
+
+
+ # camelcase LU: to split between consecutive lower and upper chars by
+ # x.sub(r'\1 \2', text)
+ CAMELCASE_LU_RE = re.compile(r"([a-z]+)([A-Z])")
+
+
+ # camelcase UL: to split between a run of upper chars and an upper+lower pair by
+ # x.sub(r'\1 \2', text)
+ CAMELCASE_UL_RE = re.compile(r"([A-Z]+)([A-Z][a-z])")
+
+
+ # non-embedded symbols: those without a word char on both sides, dropped by
+ # x.sub('', text)
+ NON_EMBEDDED_WORD_SYMS_RE = re.compile(r"((?<!\w)[^\w\s]+)|([^\w\s]+(?!\w))")
+
+
+ # embedded symbols: to drop embedded symbols by
+ # x.sub('', text)
+ EMBEDDED_SYMS_RE = re.compile(r"(?<=\w)[^\w\s]+(?=\w)")
+
+
+ # hyphen-slash: to split on an embedded hyphen, slash, or space by
+ # x.split(text)
+ HYPHEN_SLASH_RE = re.compile(r"(?<=\w)[\-\/ ](?=\w)")
+
+
+ # hyphen-only: to split on an embedded hyphen or space by
+ # x.split(text)
+ HYPHEN_ONLY_RE = re.compile(r"(?<=\w)[\- ](?=\w)")
+
+
+ # slash-only: to split on an embedded slash by
+ # x.split(text)
+ SLASH_ONLY_RE = re.compile(r"(?<=\w)\/(?=\w)")
+
+
+ # parenthetical expressions: to drop parenthetical expressions by
+ # x.sub('', text)
+ PARENTHETICAL_RE = re.compile(r"\(.*\)")
+
+
+ # ampersand: to replace an ampersand (and surrounding space) with " and " by
+ # x.sub(' and ', text)
+ AMPERSAND_RE = re.compile(r"\s*\&\s*")
+
+
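As a quick sanity check, the intended use of these patterns looks like this (an illustrative sketch, not part of the wheel; it imports the `normalize` module listed in the RECORD below):

```python
from dataknobs_xization import normalize

# collapse runs of whitespace to a single space
assert normalize.SQUASH_WS_RE.sub(" ", "a \t b\n  c") == "a b c"

# detect whether a string contains any symbol characters
assert normalize.ALL_SYMBOLS_RE.search("plain words") is None
assert normalize.ALL_SYMBOLS_RE.search("a+b") is not None

# zero-width split points between word chars around '-', '/', or ' '
assert normalize.HYPHEN_SLASH_RE.split("mother-in-law") == ["mother", "in", "law"]

# drop a parenthetical, leaving the surrounding text (note the trailing space)
assert normalize.PARENTHETICAL_RE.sub("", "acme (inc.)") == "acme "

# normalize '&' plus any surrounding space to ' and '
assert normalize.AMPERSAND_RE.sub(" and ", "R&D") == "R and D"
```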
+ def expand_camelcase_fn(text: str) -> str:
+     """Expand both "lU" and "UUl" camelcasing to "l U" and "U Ul"."""
+     text = CAMELCASE_LU_RE.sub(r"\1 \2", text)
+     return CAMELCASE_UL_RE.sub(r"\1 \2", text)
+
+
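For example (an illustrative sketch using the function as defined above):

```python
from dataknobs_xization import normalize

assert normalize.expand_camelcase_fn("dataKnobs") == "data Knobs"
# the UL pattern keeps acronym runs together, splitting before the final word
assert normalize.expand_camelcase_fn("XMLParser") == "XML Parser"
```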
+ def drop_non_embedded_symbols_fn(text: str, repl: str = "") -> str:
+     """Drop symbols not embedded within word characters."""
+     return NON_EMBEDDED_WORD_SYMS_RE.sub(repl, text)
+
+
+ def drop_embedded_symbols_fn(text: str, repl: str = "") -> str:
+     """Drop symbols embedded within word characters."""
+     return EMBEDDED_SYMS_RE.sub(repl, text)
+
+
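The embedded/non-embedded distinction in practice (illustrative sketch):

```python
from dataknobs_xization import normalize

# the '!' has no word char on its right, so it is non-embedded and dropped;
# the apostrophe sits between word chars, so it survives
assert normalize.drop_non_embedded_symbols_fn("don't stop!") == "don't stop"

# the embedded apostrophe is dropped, or replaced via repl
assert normalize.drop_embedded_symbols_fn("don't") == "dont"
assert normalize.drop_embedded_symbols_fn("don't", repl=" ") == "don t"
```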
+ def get_hyphen_slash_expansions_fn(
+     text: str,
+     subs: Sequence[str] = ("-", " ", ""),
+     add_self: bool = True,
+     do_split: bool = True,
+     min_split_token_len: int = 2,
+     hyphen_slash_re=HYPHEN_SLASH_RE,
+ ) -> Set[str]:
+     """Given text with words that may or may not appear as hyphenated or with a
+     slash, return the set of potential variations:
+     - the text as-is (add_self)
+     - with a hyphen between all words (if '-' in subs)
+     - with a space between all words (if ' ' in subs)
+     - with all words squashed together (empty string between if '' in subs)
+     - with each word separately (do_split, as long as min_split_token_len is
+       met for all tokens)
+
+     :param text: The hyphen-worthy snippet of text, either already
+         hyphenated, with a slash, or space delimited.
+     :param subs: A string of characters or list of strings to insert between
+         tokens
+     :param add_self: True to include the text itself in the result
+     :param do_split: True to also add each token separately
+     :param min_split_token_len: If any of the split tokens fail
+         to meet the min token length, don't add any of the splits.
+     :param hyphen_slash_re: The regex to identify the hyphen/slash to expand.
+
+     Notes:
+       * To add a variation with a slash, add '/' to subs.
+       * To not add any variations with symbols, leave them out of subs
+         and don't add self.
+     """
+     variations = {text} if add_self else set()
+     if subs is not None and len(subs) > 0:
+         # create a variant with each <s> between all tokens
+         for s in subs:
+             variations.add(hyphen_slash_re.sub(s, text))
+     if do_split:
+         # add each token separately, but only if all tokens are long enough
+         tokens = set(hyphen_slash_re.split(text))
+         if not any(len(t) < min_split_token_len for t in tokens):
+             variations.update(tokens)
+     return variations
+
+
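A usage sketch with the default subs (the expected values follow from HYPHEN_SLASH_RE above):

```python
from dataknobs_xization import normalize

assert normalize.get_hyphen_slash_expansions_fn("mother-in-law") == {
    "mother-in-law",        # add_self (also the '-' sub)
    "mother in law",        # ' ' sub
    "motherinlaw",          # '' sub
    "mother", "in", "law",  # do_split (all tokens >= 2 chars)
}

# "t-shirt" splits into "t" and "shirt"; "t" fails min_split_token_len,
# so no split tokens are added
assert normalize.get_hyphen_slash_expansions_fn("t-shirt") == {
    "t-shirt", "t shirt", "tshirt"
}
```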
+ def drop_parentheticals_fn(text: str) -> str:
+     """Drop parenthetical expressions from the text."""
+     return PARENTHETICAL_RE.sub("", text)
+
+
+ def expand_ampersand_fn(text: str) -> str:
+     """Replace '&' with ' and '."""
+     return AMPERSAND_RE.sub(" and ", text)
+
+
+ def get_lexical_variations(
+     text: str,
+     include_self: bool = True,
+     expand_camelcase: bool = True,
+     drop_non_embedded_symbols: bool = True,
+     drop_embedded_symbols: bool = True,
+     spacify_embedded_symbols: bool = False,
+     do_hyphen_expansion: bool = True,
+     hyphen_subs: Sequence[str] = (" ", ""),
+     do_hyphen_split: bool = True,
+     min_hyphen_split_token_len: int = 2,
+     do_slash_expansion: bool = True,
+     slash_subs: Sequence[str] = (" ", " or "),
+     do_slash_split: bool = True,
+     min_slash_split_token_len: int = 1,
+     drop_parentheticals: bool = True,
+     expand_ampersands: bool = True,
+     add_eng_plurals: bool = True,
+ ) -> Set[str]:
+     """Get all variations for the text (including the text itself)."""
+     variations = {text} if include_self else set()
+     if expand_camelcase:
+         variations.add(expand_camelcase_fn(text))
+     if drop_non_embedded_symbols:
+         variations.add(drop_non_embedded_symbols_fn(text))
+     if drop_embedded_symbols:
+         variations.add(drop_embedded_symbols_fn(text))
+     if spacify_embedded_symbols:
+         variations.add(drop_embedded_symbols_fn(text, " "))
+     if (
+         do_hyphen_expansion and hyphen_subs is not None and len(hyphen_subs) > 0
+     ) or do_hyphen_split:
+         variations.update(
+             get_hyphen_slash_expansions_fn(
+                 text,
+                 subs=hyphen_subs,
+                 add_self=False,
+                 do_split=do_hyphen_split,
+                 min_split_token_len=min_hyphen_split_token_len,
+                 hyphen_slash_re=HYPHEN_ONLY_RE,  # hyphen (or space) only
+             )
+         )
+     if (do_slash_expansion and slash_subs is not None and len(slash_subs) > 0) or do_slash_split:
+         variations.update(
+             get_hyphen_slash_expansions_fn(
+                 text,
+                 subs=slash_subs,
+                 add_self=False,
+                 do_split=do_slash_split,
+                 min_split_token_len=min_slash_split_token_len,
+                 hyphen_slash_re=SLASH_ONLY_RE,  # slash only
+             )
+         )
+     if drop_parentheticals:
+         variations.add(drop_parentheticals_fn(text))
+     if expand_ampersands:
+         variations.add(expand_ampersand_fn(text))
+     if add_eng_plurals:
+         # TODO: Use a better pluralizer
+         plurals = {f"{v}s" for v in variations}
+         variations.update(plurals)
+     return variations
+
+
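A usage sketch with the defaults (a subset check, since the full result also includes naive plurals of every variation):

```python
from dataknobs_xization import normalize

variations = normalize.get_lexical_variations("ice-cream")
# hyphen expansion, squashing, splitting, and naive plurals all contribute
assert {"ice-cream", "ice cream", "icecream", "ice", "cream",
        "ice creams", "icecreams"} <= variations
```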
+ def int_to_en(num: int) -> str:
+     """Convert an integer to its english-word spelling."""
+     d = {
+         0: "zero",
+         1: "one",
+         2: "two",
+         3: "three",
+         4: "four",
+         5: "five",
+         6: "six",
+         7: "seven",
+         8: "eight",
+         9: "nine",
+         10: "ten",
+         11: "eleven",
+         12: "twelve",
+         13: "thirteen",
+         14: "fourteen",
+         15: "fifteen",
+         16: "sixteen",
+         17: "seventeen",
+         18: "eighteen",
+         19: "nineteen",
+         20: "twenty",
+         30: "thirty",
+         40: "forty",
+         50: "fifty",
+         60: "sixty",
+         70: "seventy",
+         80: "eighty",
+         90: "ninety",
+     }
+     k = 1000
+     m = k * 1000
+     b = m * 1000
+     t = b * 1000
+
+     if not isinstance(num, int):
+         return str(num)
+
+     if num < 0:
+         return "negative " + int_to_en(abs(num))
+
+     if num < 20:
+         return d[num]
+
+     if num < 100:
+         if num % 10 == 0:
+             return d[num]
+         else:
+             return d[num // 10 * 10] + " " + d[num % 10]
+
+     if num < k:
+         if num % 100 == 0:
+             return d[num // 100] + " hundred"
+         else:
+             return d[num // 100] + " hundred and " + int_to_en(num % 100)
+
+     if num < m:
+         if num % k == 0:
+             return int_to_en(num // k) + " thousand"
+         else:
+             return int_to_en(num // k) + " thousand " + int_to_en(num % k)
+
+     if num < b:
+         if num % m == 0:
+             return int_to_en(num // m) + " million"
+         else:
+             return int_to_en(num // m) + " million " + int_to_en(num % m)
+
+     if num < t:
+         if num % b == 0:
+             return int_to_en(num // b) + " billion"
+         else:
+             return int_to_en(num // b) + " billion " + int_to_en(num % b)
+
+     if num < k * t:
+         if num % t == 0:
+             return int_to_en(num // t) + " trillion"
+         else:
+             return int_to_en(num // t) + " trillion " + int_to_en(num % t)
+
+     # num is too large
+     return str(num)
+
+
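Sample outputs (illustrative sketch):

```python
from dataknobs_xization import normalize

assert normalize.int_to_en(42) == "forty two"
assert normalize.int_to_en(305) == "three hundred and five"
assert normalize.int_to_en(1999) == "one thousand nine hundred and ninety nine"
assert normalize.int_to_en(-7) == "negative seven"
```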
+ def zero_pad_variations(
+     val: int,
+     min_zpad_len: int,
+     max_zpad_len: int,
+ ) -> Set[str]:
+     """Get (only) zero-padded variations of the given value from min (inclusive)
+     to max (exclusive) zero-pad lengths.
+
+     Examples:
+       * zero_pad_variations(9, 2, 4) == {'09', '009'}
+       * zero_pad_variations(90, 2, 4) == {'090'}
+       * zero_pad_variations(90, 2, 3) == set()
+       * zero_pad_variations(3, 0, 5) == {'03', '003', '0003'}
+
+     :param val: The value to zero-pad
+     :param min_zpad_len: The minimum zero-padded string length (inclusive)
+     :param max_zpad_len: The maximum zero-padded string length (exclusive)
+     :return: The set of all requested zero-padded number strings
+     """
+     # start one past the unpadded width so every result is truly zero-padded
+     min_padded_len = len(str(val)) + 1
+     return {
+         f"{val:0{zpad}d}"
+         for zpad in range(max(min_zpad_len, min_padded_len), max_zpad_len)
+     }
+
+
+ def month_day_variations_fn(
+     month_or_day: int,
+     do_int_to_en: bool = False,
+ ) -> Set[str]:
+     """Get the variations for a month or day number, including the number
+     itself as a string, a 2-digit zero-padded form of the number, and
+     (optionally) the english word for the number.
+
+     :param month_or_day: The month or day for which to get variations
+     :param do_int_to_en: Optionally include the english word for the number
+     :return: The set of variations for the value
+     """
+     result = zero_pad_variations(month_or_day, 2, 3)
+     result.add(str(month_or_day))
+     if do_int_to_en:
+         result.add(int_to_en(month_or_day))
+     return result
+
+
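For example (illustrative sketch):

```python
from dataknobs_xization import normalize

assert normalize.month_day_variations_fn(7) == {"7", "07"}
assert normalize.month_day_variations_fn(7, do_int_to_en=True) == {"7", "07", "seven"}
```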
+ def year_variations_fn(
+     year: int,
+     min_year: int = 0,
+     max_year: int = 9999,
+     do_int_to_en_below_100: bool = False,
+     numeric_only: bool = False,
+ ) -> Set[str]:
+     """Get the variations for a year, e.g., 1999:
+     * the year as digits: "1999"
+     * the long english form: "one thousand nine hundred and ninety nine"
+     * short english forms: "nineteen ninety nine",
+       "nineteen hundred and ninety nine"
+     """
+     variations = {str(year)}
+
+     if year < min_year or year > max_year:
+         return variations
+
+     # one thousand nine hundred and ninety nine
+     if not numeric_only and (do_int_to_en_below_100 or year >= 100):
+         variations.add(int_to_en(year))
+
+     # nineteen ninety nine
+     century = year // 100
+     remainder = year % 100
+     remainder_text = int_to_en(remainder)
+
+     variations.update(zero_pad_variations(remainder, 2, 3))
+
+     if century > 0:
+         remainder_texts = []
+         if remainder > 0:
+             if remainder < 10:
+                 if not numeric_only:
+                     remainder_texts.append(f" oh {remainder_text}")
+                 remainder_texts.append(f" 0{remainder}")
+             else:
+                 if not numeric_only:
+                     remainder_texts.append(f" {remainder_text}")
+                 remainder_texts.append(f" {remainder}")
+                 if not numeric_only:
+                     remainder_texts.append(f" and {remainder_text}")
+
+         century_text = int_to_en(century)
+         scales = ["", century_text]
+         if century % 10 == 0:
+             mil_text = int_to_en(century // 10)
+             scales.append(f"{mil_text} thousand")
+         else:
+             scales.append(f"{century_text} hundred")
+
+         def clean_up(s):
+             s = s.strip()
+             if s.startswith("and "):
+                 s = s[4:]
+             return s
+
+         variations.update({clean_up("".join(v)) for v in product(scales, remainder_texts)})
+
+     return variations
+
+
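A usage sketch (a subset check, since the scale/remainder product also yields less idiomatic combinations):

```python
from dataknobs_xization import normalize

variations = normalize.year_variations_fn(1999)
assert {"1999",
        "nineteen ninety nine",
        "nineteen hundred and ninety nine",
        "one thousand nine hundred and ninety nine"} <= variations
```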
+ def replace_smart_quotes_fn(text: str) -> str:
+     """Replace "smart" quotes with their ascii versions."""
+     return (
+         text.replace("\u201c", '"')  # left double quote U+201C
+         .replace("\u201d", '"')      # right double quote U+201D
+         .replace("\u2018", "'")      # left single quote U+2018
+         .replace("\u2019", "'")      # right single quote U+2019
+     )
+
+
+ def basic_normalization_fn(
+     text: str,
+     lowercase: bool = True,
+     expand_camelcase: bool = True,
+     simplify_quote_chars: bool = True,
+     drop_non_embedded_symbols: bool = False,
+     spacify_embedded_symbols: bool = False,
+     drop_embedded_symbols: bool = False,
+     squash_whitespace: bool = False,
+     do_all: bool = False,
+ ) -> str:
+     """Basic normalization functions include:
+     * lowercasing [default]
+     * expanding camelcase [default]
+     * replacing "smart" quotes and apostrophes with ascii versions [default]
+     * dropping non-embedded symbols [optional]
+     * replacing embedded symbols with a space [takes precedence over dropping
+       unless do_all] or dropping embedded symbols [optional]
+     * collapsing multiple spaces and stripping spaces from the ends [optional]
+     """
+     # NOTE: do this before changing case
+     if expand_camelcase or do_all:
+         text = expand_camelcase_fn(text)
+
+     if lowercase or do_all:
+         text = text.lower()
+     if (drop_non_embedded_symbols and drop_embedded_symbols) or do_all:
+         text = ALL_SYMBOLS_RE.sub("", text)
+     elif drop_non_embedded_symbols:
+         text = drop_non_embedded_symbols_fn(text)
+     elif spacify_embedded_symbols:
+         text = drop_embedded_symbols_fn(text, " ")
+     elif drop_embedded_symbols:
+         text = drop_embedded_symbols_fn(text)
+
+     # NOTE: do this after dropping (only some) symbols
+     if simplify_quote_chars and (not drop_non_embedded_symbols or not drop_embedded_symbols):
+         # NOTE: It only makes sense to do this if we're keeping symbols
+         text = replace_smart_quotes_fn(text)
+
+     # NOTE: do this last
+     if squash_whitespace or do_all:
+         text = SQUASH_WS_RE.sub(" ", text).strip()
+     return text
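End-to-end behavior of the defaults versus `do_all` (illustrative sketch):

```python
from dataknobs_xization import normalize

# defaults: camelcase expansion, lowercasing, smart-quote simplification
assert normalize.basic_normalization_fn("It\u2019s DataKnobs") == "it's data knobs"

# do_all additionally strips all symbols and squashes whitespace
assert normalize.basic_normalization_fn("It\u2019s  DataKnobs!", do_all=True) == "its data knobs"
```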
@@ -0,0 +1,58 @@
+ Metadata-Version: 2.4
+ Name: dataknobs-xization
+ Version: 1.0.0
+ Summary: Text normalization and tokenization tools
+ Author-email: Spence Koehler <KoehlerSB747@gmail.com>
+ Requires-Python: >=3.10
+ Requires-Dist: dataknobs-common>=1.0.0
+ Requires-Dist: dataknobs-structures>=1.0.0
+ Requires-Dist: dataknobs-utils>=1.0.0
+ Requires-Dist: nltk>=3.9.1
+ Description-Content-Type: text/markdown
+
+ # dataknobs-xization
+
+ Text normalization and tokenization tools.
+
+ ## Installation
+
+ ```bash
+ pip install dataknobs-xization
+ ```
+
+ ## Features
+
+ - **Text Normalization**: Standardize text for consistent processing
+ - **Masking Tokenizer**: Advanced tokenization with masking capabilities
+ - **Annotations**: Text annotation system
+ - **Authorities**: Authority management for text processing
+ - **Lexicon**: Lexicon-based text analysis
+
+ ## Usage
+
+ ```python
+ from dataknobs_xization import normalize, MaskingTokenizer
+
+ # Text normalization
+ normalized = normalize.normalize_text("Hello, World!")
+
+ # Tokenization with masking
+ tokenizer = MaskingTokenizer()
+ tokens = tokenizer.tokenize("This is a sample text.")
+
+ # Working with annotations
+ from dataknobs_xization import annotations
+ doc = annotations.create_document("Sample text", {"metadata": "value"})
+ ```
+
+ ## Dependencies
+
+ This package depends on:
+ - `dataknobs-common`
+ - `dataknobs-structures`
+ - `dataknobs-utils`
+ - `nltk`
+
+ ## License
+
+ See LICENSE file in the root repository.
@@ -0,0 +1,10 @@
+ dataknobs_xization/0.readme.txt,sha256=Q46suHOARkjQLY580eOfSCeUyIgQx-e6DLmtEhcuODE,2878
+ dataknobs_xization/__init__.py,sha256=ixsRSYr86q1T4LqQTRzP9Z_ihcOVN6r8SQNurhmHWmY,404
+ dataknobs_xization/annotations.py,sha256=qiH_QzzIs5mjvO2Yr4jiLBMIxIiPbzzfd_iublS8HTI,45143
+ dataknobs_xization/authorities.py,sha256=69nAlExbh_U7NKav1q3IujXb8lBq14QJhHHy5IZ0PZE,30745
+ dataknobs_xization/lexicon.py,sha256=NMo3lAXUVzFVRy246Y90TZtm-27qR5g0z8Ef9u2E2LA,23722
+ dataknobs_xization/masking_tokenizer.py,sha256=65RkHdU83l1Tf0f9bXwNrLDuFsN-xegMQNJGON7Z8WY,26036
+ dataknobs_xization/normalize.py,sha256=kpT8y1jEmeiKiNC8pruurFjasmREhr4rAQ3W_yB2v4U,14024
+ dataknobs_xization-1.0.0.dist-info/METADATA,sha256=xa_zmCme9PWkU1NJO6w8G2nQvubCrpSKvTIBwHWAv-4,1393
+ dataknobs_xization-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ dataknobs_xization-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any