dataknobs_xization-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,697 @@
1
+ from abc import ABC, abstractmethod
2
+ from collections.abc import Callable
3
+ from typing import Any, List, Tuple, Union
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ import dataknobs_structures.document as dk_doc
9
+ from dataknobs_utils import emoji_utils
10
+
11
+
12
+ class CharacterFeatures(ABC):
13
+ """Class representing features of text as a dataframe with each character
14
+ as a row and columns representing character features.
15
+ """
16
+
17
+ def __init__(self, doctext: Union[dk_doc.Text, str], roll_padding: int = 0):
18
+ """:param doctext: The text to tokenize (or dk_doc.Text with its metadata)
19
+ :param roll_padding: The number of pad characters added to each end of
20
+ the text.
21
+ """
22
+ self._doctext = doctext
23
+ self._roll_padding = roll_padding
24
+ self._padded_text = None
25
+
26
+ @property
27
+ def cdf(self) -> pd.DataFrame:
28
+ """The character dataframe with each padded text character as a row."""
29
+ raise NotImplementedError
30
+
31
+ @property
32
+ def doctext(self) -> dk_doc.Text:
33
+ if isinstance(self._doctext, str):
34
+ self._doctext = dk_doc.Text(self._doctext, None)
35
+ return self._doctext
36
+
37
+ @property
38
+ def text_col(self) -> str:
39
+ """The name of the cdf column holding the text characters."""
40
+ return self.doctext.text_label
41
+
42
+ @property
43
+ def text(self) -> str:
44
+ """The text string."""
45
+ return self.doctext.text
46
+
47
+ @property
48
+ def text_id(self) -> Any:
49
+ """The ID of the text."""
50
+ return self.doctext.text_id
51
+
52
+ @abstractmethod
53
+ def build_first_token(
54
+ self,
55
+ normalize_fn: Callable[[str], str],
56
+ ) -> "Token":
57
+ """Build the first token as the start of tokenization.
58
+
59
+ :param normalize_fn: A function to normalize a raw text term or any
60
+ of its variations. If None, then the identity function is used.
61
+ :return: The first text token
62
+ """
63
+ raise NotImplementedError
64
+
65
+ @property
66
+ def roll_padding(self) -> int:
67
+ """The number of pad characters added to each end of the text."""
68
+ return self._roll_padding
69
+
70
+ @property
71
+ def padded_text(self) -> str:
72
+ """The text with padding included."""
73
+ if self._padded_text is None:
74
+ padding = " " * self.roll_padding
75
+ self._padded_text = padding + self.text + padding
76
+ return self._padded_text
77
+
78
+ def get_tokens(
79
+ self,
80
+ normalize_fn: Callable[[str], str] = lambda x: x,
81
+ ) -> List["Token"]:
82
+ """Get all token instances using the given normalize function.
83
+ :param normalize_fn: The normalization function (default=identity fn)
84
+ :return: A list of token instances
85
+ """
86
+ token = self.build_first_token(normalize_fn)
87
+ tokens = list()
88
+ while token is not None:
89
+ tokens.append(token)
90
+ token = token.next_token
91
+ return tokens
92
+
93
+
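A minimal usage sketch of the flow above (the import path is hypothetical; it assumes this module and the dataknobs_structures dependency are installed):

from dataknobs_xization.tokenization import TextFeatures  # hypothetical module path

feats = TextFeatures("Hello, camelCase text!")       # a concrete CharacterFeatures
tokens = feats.get_tokens(normalize_fn=str.lower)    # walks the lazy token chain
print([t.norm_text for t in tokens])                 # expected: ['hello', 'camel', 'case', 'text']
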
94
+ class TextFeatures(CharacterFeatures):
95
+ def __init__(
96
+ self,
97
+ doctext: Union[dk_doc.Text, str],
98
+ split_camelcase: bool = True,
99
+ mark_alpha: bool = False,
100
+ mark_digit: bool = False,
101
+ mark_upper: bool = False,
102
+ mark_lower: bool = False,
103
+ emoji_data: emoji_utils.EmojiData = None,
104
+ ):
105
+ """:param doctext: The text to tokenize with its metadata
106
+ :param split_camelcase: True to mark camel-case features
107
+ :param mark_alpha: True to mark alpha features (separate from alnum)
108
+ :param mark_digit: True to mark digit features (separate from alnum)
109
+ :param mark_upper: True to mark upper features (auto-included for
110
+ camel-case)
111
+ :param mark_lower: True to mark lower features (auto-included for
112
+ camel-case)
113
+ :param emoji_data: An EmojiData instance to mark emoji BIO features
114
+
115
+ Notes:
116
+ * If emoji_data is non-null, emojis are treated as text
117
+ (instead of as non-text). In that case:
118
+ * If split_camelcase is True,
119
+ each emoji will be in its own token;
120
+ * otherwise, each sequence of (adjacent) emojis will be treated
121
+ as a single token.
122
+ """
123
+ # NOTE: roll_padding is determined by "roll" feature needs. Currently 1.
124
+ super().__init__(doctext, roll_padding=1)
125
+ self.split_camelcase = split_camelcase
126
+ self._cdf = self._build_character_dataframe(
127
+ split_camelcase,
128
+ mark_alpha,
129
+ mark_digit,
130
+ mark_upper,
131
+ mark_lower,
132
+ emoji_data,
133
+ )
134
+
135
+ @property
136
+ def cdf(self) -> pd.DataFrame:
137
+ """The character dataframe with each padded text character as a row."""
138
+ return self._cdf
139
+
140
+ def build_first_token(
141
+ self,
142
+ normalize_fn: Callable[[str], str],
143
+ ) -> "Token":
144
+ """Build the first token as the start of tokenization.
145
+
146
+ :param normalize_fn: A function to normalize a raw text term or any
147
+ of its variations. If None, then the identity function is used.
148
+ :return: The first text token
149
+ """
150
+ token_mask = (
151
+ DualTokenMask(
152
+ self,
153
+ self.cdf["tok_start"],
154
+ self.cdf["tok_end"],
155
+ )
156
+ if self.split_camelcase
157
+ else SimpleTokenMask(self, self.cdf["alnum"])
158
+ )
159
+ token = Token(token_mask, normalize_fn=normalize_fn)
160
+ return token
161
+
162
+ def _build_character_dataframe(
163
+ self,
164
+ split_camelcase,
165
+ mark_alpha,
166
+ mark_digit,
167
+ mark_upper,
168
+ mark_lower,
169
+ emoji_data,
170
+ ):
171
+ if split_camelcase:
172
+ mark_upper = True
173
+ mark_lower = True
174
+ cdf = pd.DataFrame({self.text_col: list(self.padded_text)})
175
+ if mark_alpha:
176
+ cdf["alpha"] = cdf[self.text_col].str.isalpha()
177
+ if mark_digit:
178
+ cdf["digit"] = cdf[self.text_col].str.isdigit()
179
+ cdf["alnum"] = cdf[self.text_col].str.isalnum()
180
+ cdf["space"] = cdf[self.text_col].str.isspace()
181
+ if mark_upper:
182
+ cdf["upper"] = cdf[self.text_col].str.isupper()
183
+ if mark_lower:
184
+ cdf["lower"] = cdf[self.text_col].str.islower()
185
+ cdf["sym"] = ~(cdf["alnum"] | cdf["space"])
186
+ if split_camelcase:
187
+ cdf["cc1"] = np.roll(cdf["lower"], 1) & cdf["upper"]  # Mark the U of lU
188
+ cdf["cc2"] = ( # Mark 2nd U of UUl
189
+ np.roll(cdf["upper"], 1) & cdf["upper"] & np.roll(cdf["lower"], -1)
190
+ )
191
+ # NOTE: tok_start and tok_end are both INCLUSIVE
192
+ cdf["tok_start"] = ( # mark a char following a non-char
193
+ cdf["alnum"] & ~np.roll(cdf["alnum"], 1)
194
+ )
195
+ cdf["tok_end"] = ( # mark a char followed by a non-char
196
+ cdf["alnum"] & ~np.roll(cdf["alnum"], -1)
197
+ )
198
+ if split_camelcase:
199
+ cdf["tok_start"] = cdf["tok_start"] | cdf["cc1"] | cdf["cc2"]
200
+ cdf["tok_end"] = cdf["tok_end"] | np.roll(cdf["cc1"] | cdf["cc2"], -1)
201
+ if emoji_data is not None:
202
+ cdf["emoji"] = pd.Series(list(emoji_data.emoji_bio(self.padded_text)))
203
+ if split_camelcase:
204
+ # Splitting camelcase includes splitting distinct emojis
205
+ cdf["tok_start"] |= cdf["emoji"] == "B"
206
+ cdf["tok_end"] |= ( # mark an 'I' followed by not 'I'
207
+ (cdf["emoji"] == "I") & np.roll(cdf["emoji"] != "I", -1)
208
+ )
209
+ cdf["tok_end"] |= ( # mark a 'B' followed by not 'I'
210
+ (cdf["emoji"] == "B") & np.roll(cdf["emoji"] != "I", -1)
211
+ )
212
+ else:
213
+ # Not splitting camelcase keeps consecutive emojis together
214
+ cdf["alnum"] |= cdf["emoji"] != "O"
215
+ return cdf
216
+
217
+
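As a self-contained illustration of the np.roll boundary features built above (only numpy and pandas are needed; the column names mirror _build_character_dataframe):

import numpy as np
import pandas as pd

text = " FooBar42 "  # one space of roll padding on each end
cdf = pd.DataFrame({"text": list(text)})
cdf["alnum"] = cdf["text"].str.isalnum()
cdf["upper"] = cdf["text"].str.isupper()
cdf["lower"] = cdf["text"].str.islower()
# cc1 marks an upper-case char preceded by a lower-case char (the camel-case split point)
cdf["cc1"] = np.roll(cdf["lower"], 1) & cdf["upper"]
# a token starts at an alnum char preceded by a non-alnum char, or at a camel-case split
cdf["tok_start"] = (cdf["alnum"] & ~np.roll(cdf["alnum"], 1)) | cdf["cc1"]
print(cdf.index[cdf["tok_start"]].tolist())  # [1, 4] -> tokens "Foo" and "Bar42"
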
218
+ class CharacterInputFeatures(CharacterFeatures):
219
+ """A wrapper that starts with a pre-built character features dataframe."""
220
+
221
+ def __init__(
222
+ self,
223
+ cdf: pd.DataFrame,
224
+ token_mask: "TokenMask",
225
+ doctext: Union[dk_doc.Text, str],
226
+ roll_padding: int = 0,
227
+ ):
228
+ super().__init__(doctext, roll_padding=roll_padding)
229
+ self._cdf = cdf
230
+ self._token_mask = token_mask
231
+
232
+ @property
233
+ def cdf(self) -> pd.DataFrame:
234
+ """The character dataframe with each padded text character as a row."""
235
+ return self._cdf
236
+
237
+ def build_first_token(
238
+ self,
239
+ normalize_fn: Callable[[str], str] = None,
240
+ ) -> "Token":
241
+ """Build the first token as the start of tokenization.
242
+
243
+ :param normalize_fn: A function to normalize a raw text term or any
244
+ of its variations. If None, then the identity function is used.
245
+ :return: The first text token
246
+ """
247
+ token = Token(self._token_mask, normalize_fn=normalize_fn)
248
+ return token
249
+
250
+
251
+ class TokenLoc:
252
+ """Simple structure holding information about a token's location."""
253
+
254
+ def __init__(
255
+ self,
256
+ start_loc: int,
257
+ end_loc: int,
258
+ token_num: int = None,
259
+ start_incl: bool = True,
260
+ end_incl: bool = False,
261
+ ):
262
+ """Initialize with the available information.
263
+
264
+ :param start_loc: The starting location of the token
265
+ :param end_loc: The ending location of the token
266
+ :param token_num: The position of the token within its text string
267
+ :param start_incl: True if start_loc is part of the token; otherwise
268
+ start_loc+1 is part of the token.
269
+ :param end_incl: True if end_loc is part of the token; otherwise
270
+ end_loc-1 is part of the token.
271
+ """
272
+ self._start_loc = start_loc
273
+ self._end_loc = end_loc
274
+ self._token_num = token_num
275
+ self._start_incl = int(start_incl)
276
+ self._end_incl = int(end_incl)
277
+
278
+ def __repr__(self) -> str:
279
+ token_num = f"#{self.token_num}" if self.token_num >= 0 else ""
280
+
281
+ def inclc(incl, left):
282
+ if incl:
283
+ return "[" if left else "]"
284
+ else:
285
+ return "(" if left else ")"
286
+
287
+ return f"{token_num}{inclc(self._start_incl, True)}{self._start_loc}:{self._end_loc}{inclc(self._end_incl, False)}"
288
+
289
+ def _incl_offset(self, wanted_incl: bool, current_incl: int) -> int:
290
+ """Get the inclusivity offset based on what is wanted versus what is."""
291
+ return int(wanted_incl) - current_incl
292
+
293
+ @property
294
+ def len(self) -> int:
295
+ """Get the length of the token at this location."""
296
+ return self.end_loc_excl - self.start_loc_incl
297
+
298
+ @property
299
+ def start_loc_incl(self) -> int:
300
+ """Get the inclusive start location."""
301
+ return self._start_loc + self._incl_offset(True, self._start_incl)
302
+
303
+ @property
304
+ def start_loc_excl(self) -> int:
305
+ """Get the exclusive start location."""
306
+ return self._start_loc + self._incl_offset(False, self._start_incl)
307
+
308
+ @property
309
+ def end_loc_incl(self) -> int:
310
+ """Get the inclusive end location."""
311
+ return self._end_loc - self._incl_offset(True, self._end_incl)
312
+
313
+ @property
314
+ def end_loc_excl(self) -> int:
315
+ """Get the exclusive end location."""
316
+ return self._end_loc - self._incl_offset(False, self._end_incl)
317
+
318
+ @property
319
+ def token_num(self) -> int:
320
+ """Get the token's position within its text string, or -1 if unknown."""
321
+ return self._token_num if self._token_num is not None else -1
322
+
323
+
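A small sketch of TokenLoc's inclusive/exclusive accessors, using the token "world" in "hello world" as an end-exclusive span (the import path is hypothetical):

from dataknobs_xization.tokenization import TokenLoc  # hypothetical module path

loc = TokenLoc(start_loc=6, end_loc=11, token_num=1)  # defaults: start inclusive, end exclusive
print(loc.start_loc_incl, loc.end_loc_incl, loc.end_loc_excl, loc.len)  # 6 10 11 5
print(loc)  # #1[6:11)
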
324
+ class TokenMask(ABC):
325
+ """A class for accessing text characters through feature masks."""
326
+
327
+ def __init__(self, text_features: CharacterFeatures):
328
+ self.text_features = text_features
329
+ self.pad = self.text_features.roll_padding
330
+ self.max_ploc = max(self.text_features.cdf.index)
331
+
332
+ def _get_next_start(self, ref_ploc: int, token_mask: pd.Series) -> int:
333
+ """Given the end of a prior token or possible start of the next, get
334
+ the next token's starting ploc. If there is no subsequent
335
+ token, then return None.
336
+ :param ref_ploc: The end ploc of the prior token or start of string
337
+ :param token_mask: The token mask to use
338
+ :return: The ploc of the start of the next token or None.
339
+ """
340
+ # if not past the end of the string and not already at a token start, increment
341
+ if ref_ploc > self.max_ploc:
342
+ ref_ploc = None # At end of string
343
+ elif not token_mask.loc[ref_ploc]:
344
+ next_ploc = increment(ref_ploc, token_mask)
345
+ ref_ploc = next_ploc if next_ploc > ref_ploc else None
346
+ return ref_ploc
347
+
348
+ def get_padded_text(self, start_loc_incl: int, end_loc_excl: int) -> str:
349
+ return self.text_features.padded_text[start_loc_incl:end_loc_excl]
350
+
351
+ def get_text(self, token_loc: TokenLoc) -> str:
352
+ """Get the text at the (padded) token location.
353
+ :param token_loc: The token location
354
+ :return: The token text
355
+ """
356
+ return self.get_padded_text(token_loc.start_loc_incl, token_loc.end_loc_excl)
357
+
358
+ @abstractmethod
359
+ def get_next_token_loc(self, ref_ploc: int, token_num: int = -1) -> TokenLoc:
360
+ """Given the end of a prior token or possible start of the next, get
361
+ the "next" token's location.
362
+ If there is no subsequent token, then return None.
363
+ :param ref_ploc: The end ploc of the prior token or start of string
364
+ :param token_num: The token position within its text string
365
+ :return: The TokenLoc of the next token or None.
366
+ """
367
+ raise NotImplementedError
368
+
369
+ @abstractmethod
370
+ def get_prev_token_loc(self, from_token_loc: TokenLoc) -> TokenLoc:
371
+ """Get the location of the token preceding the given token location.
372
+ If there is no prior token, then return None.
373
+ :param from_token_loc: The location of the token that follows the result
374
+ :return: The TokenLoc of the prior token or None.
375
+ """
376
+ raise NotImplementedError
377
+
378
+
379
+ def increment(start_loc: int, mask: pd.Series) -> int:
380
+ """Find the first index location at or after start_loc where the mask
381
+ value flips relative to its value at start_loc.
382
+
383
+ If the mask value at start_loc is False, find the first index location
384
+ where the mask is True, so that the mask values from start_loc
385
+ (inclusive) to end_loc (exclusive) are all False; and vice-versa if
386
+ the mask value at start_loc is True.
387
+
388
+ :param start_loc: The start index location
389
+ :param mask: The boolean feature mask
390
+ :return: end_loc, where the mask value is opposite that at start_loc.
391
+ If unable to increment (e.g., at the end of the mask or no flips),
392
+ then end_loc will equal start_loc.
393
+ """
394
+ end_loc = start_loc
395
+ if start_loc in mask.index:
396
+ m = mask.loc[start_loc:]
397
+ end_iloc = m.argmin() if m.iloc[0] else m.argmax()
398
+ if end_iloc > 0:
399
+ end_loc = m.index[end_iloc]
400
+ return end_loc
401
+
402
+
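To make increment's flip-finding behavior concrete, a tiny self-contained example (only pandas plus the function above; the import path is hypothetical):

import pandas as pd
from dataknobs_xization.tokenization import increment  # hypothetical module path

mask = pd.Series([False, True, True, False, True])
print(increment(0, mask))  # 1: first True at or after loc 0
print(increment(1, mask))  # 3: first False after the run of Trues starting at loc 1
print(increment(4, mask))  # 4: no flip after the final True, so start_loc is returned
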
403
+ class SimpleTokenMask(TokenMask):
404
+ """A mask where token ("in") characters are True and non-token ("out") characters are False."""
405
+
406
+ def __init__(self, text_features: CharacterFeatures, token_mask: pd.Series):
407
+ """Initialize with the text_features and token mask.
408
+
409
+ :param text_features: The text features to tokenize.
410
+ :param token_mask: The token mask identifying token characters as True
411
+ and characters between tokens as False.
412
+ """
413
+ super().__init__(text_features)
414
+ self.token_mask = token_mask
415
+ self.revmask = token_mask[::-1]
416
+
417
+ def get_next_token_loc(self, ref_ploc: int, token_num: int = -1) -> TokenLoc:
418
+ """Given the end of a prior token or possible start of the next, get
419
+ the "next" token's location.
420
+ If there is no subsequent token, then return None.
421
+ :param ref_ploc: The end ploc of the prior token or start of string
422
+ :param token_num: The token position within its text string
423
+ :return: The TokenLoc of the next token or None.
424
+ """
425
+ result = None
426
+ start_ploc = self._get_next_start(ref_ploc, self.token_mask)
427
+ if start_ploc is not None:
428
+ end_ploc = increment(start_ploc, self.token_mask)
429
+ result = TokenLoc(start_ploc, end_ploc, token_num=token_num)
430
+ return result
431
+
432
+ def get_prev_token_loc(self, from_token_loc: TokenLoc) -> TokenLoc:
433
+ """Get the location of the token preceding the given token location.
434
+ If there is no prior token, then return None.
435
+ :param from_token_loc: The location of the token that follows the result
436
+ :return: The TokenLoc of the prior token or None.
437
+ """
438
+ result = None
439
+
440
+ from_loc = from_token_loc.start_loc_excl
441
+ start_loc = increment(increment(from_loc, self.revmask), self.revmask)
442
+ if start_loc != from_loc:
443
+ start_loc += 1
444
+ end_loc = increment(start_loc, self.token_mask)
445
+ result = TokenLoc(start_loc, end_loc, token_num=from_token_loc.token_num - 1)
446
+ return result
447
+
448
+
449
+ class DualTokenMask(TokenMask):
450
+ """A mask comprised of a mask for token starts and a mask for token ends."""
451
+
452
+ def __init__(
453
+ self,
454
+ text_features: CharacterFeatures,
455
+ start_mask: pd.Series,
456
+ end_mask: pd.Series,
457
+ ):
458
+ super().__init__(text_features)
459
+ self.start_mask = start_mask
460
+ self.end_mask = end_mask
461
+ # self.tok_starts = start_mask.index[start_mask]
462
+ # self.tok_ends = end_mask.index[end_mask]
463
+ self.tok_starts = start_mask
464
+ self.tok_ends = end_mask
465
+ self.rev_starts = self.tok_starts[::-1]
466
+ self.rev_ends = self.tok_ends[::-1]
467
+
468
+ def _get_token_end(self, start_ploc: int) -> int:
469
+ return self._get_next_start(start_ploc, self.tok_ends) + 1
470
+
471
+ def get_next_token_loc(self, ref_ploc: int, token_num: int = -1) -> TokenLoc:
472
+ """Given the end of a prior token or possible start of the next, get
473
+ the "next" token's location.
474
+ If there is no subsequent token, then return None.
475
+ :param ref_ploc: The end ploc of the prior token or start of string
476
+ :param token_num: The token position within its text string
477
+ :return: The TokenLoc of the next token or None.
478
+ """
479
+ result = None
480
+ start_ploc = self._get_next_start(ref_ploc, self.tok_starts)
481
+ if start_ploc is not None:
482
+ end_ploc = self._get_token_end(start_ploc)
483
+ result = TokenLoc(start_ploc, end_ploc, token_num=token_num)
484
+ return result
485
+
486
+ def get_prev_token_loc(self, from_token_loc: TokenLoc) -> TokenLoc:
487
+ """Get the location of the token preceding the given token location.
488
+ If there is no prior token, then return None.
489
+ :param from_token_loc: The location of the token that follows the result
490
+ :return: The TokenLoc of the prior token or None.
491
+ """
492
+ result = None
493
+ from_loc = from_token_loc.start_loc_excl
494
+ if from_loc > self.pad:
495
+ start_loc = increment(from_loc, self.rev_starts)
496
+ result = TokenLoc(
497
+ start_loc, self._get_token_end(start_loc), token_num=from_token_loc.token_num - 1
498
+ )
499
+ return result
500
+
501
+
502
+ class Token:
503
+ """A structure identifying the token start (inclusive) and end (exclusive)
504
+ index positions according to the text features mask.
505
+
506
+ NOTE: The masks in CharacterFeatures include padding, which displaces indices
507
+ relative to positions in the original text. In this class, padded indices
508
+ are referred to with a "p".
509
+ """
510
+
511
+ def __init__(
512
+ self,
513
+ token_mask: TokenMask,
514
+ token_loc: TokenLoc = None,
515
+ start_ploc: int = 0,
516
+ prev_token: "Token" = None,
517
+ next_token: "Token" = None,
518
+ normalize_fn: Callable[[str], str] = None,
519
+ ):
520
+ """Initialize the token pointer with text features and the token_mask.
521
+
522
+ :param token_mask: The token mask to use.
523
+ :param token_loc: The (padded) token location, if known or None.
524
+ If token_loc is None and start_ploc is 0, then this will be the
525
+ first token of the text.
526
+ :param start_ploc: The padded character index for the start of this
527
+ token as an alternate to specifying token_loc. If start_ploc is not
528
+ at a token character according to the token mask, then it will be
529
+ auto-incremented to the next token.
530
+ :param prev_token: The token prior to this token.
531
+ :param next_token: The token following this token.
532
+ :param normalize_fn: A function to normalize token text.
533
+ """
534
+ self.token_mask = token_mask
535
+ self._next = next_token
536
+ self._prev = prev_token
537
+ self.normalize_fn = normalize_fn
538
+ self._text = None
539
+ self._norm_text = None
540
+ self._pre_delims = None
541
+ self._post_delims = None
542
+ if token_loc is not None:
543
+ self.token_loc = token_loc
544
+ else:
545
+ self.token_loc = self.token_mask.get_next_token_loc(
546
+ max(start_ploc, token_mask.pad),
547
+ token_num=0,
548
+ )
549
+ # If token_loc is still None, the text contains no tokens
550
+ if self.token_loc is None:
551
+ self._text = ""
552
+ self.token_loc = TokenLoc(
553
+ self.token_mask.max_ploc + 1,
554
+ self.token_mask.max_ploc + 1,
555
+ token_num=0,
556
+ )
557
+ self._pre_delims = ""
558
+ self._post_delims = ""
559
+
560
+ def __repr__(self) -> str:
561
+ return f"Token({self.token_text}){self.token_loc}"
562
+
563
+ @property
564
+ def doctext(self) -> dk_doc.Text:
565
+ """Get the text object with metadata."""
566
+ return self.token_mask.text_features.doctext
567
+
568
+ @property
569
+ def full_text(self) -> str:
570
+ """Get the full original text of which this token is a part."""
571
+ return self.token_mask.text_features.text
572
+
573
+ @property
574
+ def text_id(self) -> Any:
575
+ """Get the full text ID."""
576
+ return self.token_mask.text_features.text_id
577
+
578
+ @property
579
+ def token_num(self) -> int:
580
+ """Get the position of this token within its text string."""
581
+ return self.token_loc.token_num
582
+
583
+ @property
584
+ def len(self) -> int:
585
+ """Get the length of this token."""
586
+ return self.token_loc.len
587
+
588
+ @property
589
+ def token_text(self) -> str:
590
+ """Get this token's original text."""
591
+ if self._text is None:
592
+ self._text = self.token_mask.get_text(self.token_loc)
593
+ return self._text
594
+
595
+ @property
596
+ def norm_text(self) -> str:
597
+ """Get this token's normalized text."""
598
+ if self._norm_text is None:
599
+ self._norm_text = (
600
+ self.normalize_fn(self.token_text)
601
+ if self.normalize_fn is not None
602
+ else self.token_text
603
+ )
604
+ return self._norm_text
605
+
606
+ @property
607
+ def start_pos(self) -> int:
608
+ """Get this token's start (incl) position in the original text."""
609
+ return self.token_loc.start_loc_incl - self.token_mask.pad
610
+
611
+ @property
612
+ def end_pos(self) -> int:
613
+ """Get this token's end (excl) position in the original text."""
614
+ return self.token_loc.end_loc_excl - self.token_mask.pad
615
+
616
+ @property
617
+ def token_pos(self) -> Tuple[int, int]:
618
+ """Get the token start (incl) and end (excl) indexes in the original text."""
619
+ return (self.start_pos, self.end_pos)
620
+
621
+ @property
622
+ def pre_delims(self) -> str:
623
+ if self._pre_delims is None:
624
+ delims = ""
625
+ prev_loc = self.token_mask.get_prev_token_loc(self.token_loc)
626
+ if prev_loc is not None:
627
+ delims = self.token_mask.get_padded_text(
628
+ prev_loc.end_loc_excl, self.token_loc.start_loc_incl
629
+ )
630
+ self._pre_delims = delims
631
+ return self._pre_delims
632
+
633
+ @property
634
+ def post_delims(self) -> str:
635
+ if self._post_delims is None:
636
+ delims = ""
637
+ next_loc = self.token_mask.get_next_token_loc(
638
+ self.token_loc.end_loc_excl,
639
+ )
640
+ if next_loc is not None:
641
+ delims = self.token_mask.get_padded_text(
642
+ self.token_loc.end_loc_excl, next_loc.start_loc_incl
643
+ )
644
+ else:
645
+ # There isn't a next token. Get remainder of text after tok.
646
+ delims = self.token_mask.get_padded_text(
647
+ self.token_loc.end_loc_excl,
648
+ self.token_mask.max_ploc,
649
+ )
650
+
651
+ self._post_delims = delims
652
+ return self._post_delims
653
+
654
+ @property
655
+ def next_token(self) -> "Token":
656
+ if self._next is None:
657
+ next_token_loc = self.token_mask.get_next_token_loc(
658
+ self.token_loc.end_loc_excl,
659
+ token_num=self.token_loc.token_num + 1,
660
+ )
661
+ if next_token_loc is not None:
662
+ self._next = Token(
663
+ self.token_mask,
664
+ token_loc=next_token_loc,
665
+ prev_token=self,
666
+ normalize_fn=self.normalize_fn,
667
+ )
668
+ return self._next
669
+
670
+ @property
671
+ def prev_token(self) -> "Token":
672
+ if self._prev is None:
673
+ prev_token_loc = self.token_mask.get_prev_token_loc(self.token_loc)
674
+ if prev_token_loc is not None:
675
+ self._prev = Token(
676
+ self.token_mask,
677
+ token_loc=prev_token_loc,
678
+ next_token=self,
679
+ normalize_fn=self.normalize_fn,
680
+ )
681
+ return self._prev
682
+
683
+ @property
684
+ def first_token(self) -> "Token":
685
+ """Get the first token for this token's input."""
686
+ first = self
687
+ while first.prev_token is not None:
688
+ first = first.prev_token
689
+ return first
690
+
691
+ @property
692
+ def last_token(self) -> "Token":
693
+ """Get the last token for this token's input."""
694
+ last = self
695
+ while last.next_token is not None:
696
+ last = last.next_token
697
+ return last
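
A closing sketch of walking the lazily built, doubly-linked token chain via next_token, together with the delimiter accessors (import path hypothetical; the commented values are the expected results for the masks defined above):

from dataknobs_xization.tokenization import TextFeatures  # hypothetical module path

feats = TextFeatures("Hello, wonderfulWorld!")
tok = feats.build_first_token(normalize_fn=str.lower)
while tok is not None:
    print(tok.token_num, tok.norm_text, tok.token_pos, repr(tok.post_delims))
    tok = tok.next_token
# Expected, with camel-case splitting on:
#   0 hello (0, 5) ', '
#   1 wonderful (7, 16) ''
#   2 world (16, 21) '!'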