dataknobs-xization 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of dataknobs-xization has been flagged as potentially problematic.

@@ -0,0 +1,596 @@
+ from abc import abstractmethod
+ from collections import defaultdict
+ from collections.abc import Callable
+ from typing import Any, Dict, List, Set, Union
+
+ import more_itertools
+ import numpy as np
+ import pandas as pd
+
+ import dataknobs_structures.document as dk_doc
+ import dataknobs_xization.annotations as dk_anns
+ import dataknobs_xization.authorities as dk_auth
+ import dataknobs_xization.masking_tokenizer as dk_tok
+ from dataknobs_utils import emoji_utils
+
+
+ class LexicalExpander:
+     """A class to expand and/or normalize original lexical input terms, to
+     keep back-references from generated data to the corresponding original
+     input, and to build consistent tokens for lexical matching.
+     """
+
+     def __init__(
+         self,
+         variations_fn: Callable[[str], Set[str]],
+         normalize_fn: Callable[[str], str],
+         split_input_camelcase: bool = True,
+         detect_emojis: bool = False,
+     ):
+         """Initialize with the given functions.
+         :param variations_fn: A function, f(t), to expand a raw input term to
+             all of its variations (including itself if desired). If None, the
+             default is to expand each term to itself.
+         :param normalize_fn: A function to normalize a raw input term or any
+             of its variations. If None, then the identity function is used.
+         :param split_input_camelcase: True to split input camelcase tokens
+         :param detect_emojis: True to detect emojis. If split_input_camelcase,
+             then adjacent emojis will also be split; otherwise, adjacent
+             emojis will appear as a single token.
+         """
+         self.variations_fn = variations_fn if variations_fn else lambda x: {x}
+         self.normalize_fn = normalize_fn if normalize_fn else lambda x: x
+         self.split_input_camelcase = split_input_camelcase
+         self.emoji_data = emoji_utils.load_emoji_data() if detect_emojis else None
+         self.v2t = defaultdict(set)  # maps each variation back to its source term(s)
+
+     def __call__(self, term: Any, normalize=True) -> Set[str]:
+         """Get all variations of the original term.
+
+         :param term: The term whose variations to compute.
+         :param normalize: True to normalize the resulting variations.
+         :return: All variations
+         """
+         variations = self.variations_fn(term)
+         if normalize:
+             variations = {self.normalize_fn(v) for v in variations}
+         # Add a mapping from each variation back to its original term
+         if variations is not None and len(variations) > 0:
+             more_itertools.consume(map(lambda v: self.v2t[v].add(term), variations))
+         return variations
+
+     def normalize(self, input_term: str) -> str:
+         """Normalize the given input term or variation.
+         :param input_term: An input term to normalize
+         :return: The normalized string of the input_term.
+         """
+         return self.normalize_fn(input_term)
+
+     def get_terms(self, variation: str) -> Set[Any]:
+         """Get the term ids for which the given variation was generated.
+         :param variation: A variation whose reference term(s) to retrieve
+         :return: The set of term ids for the variation, or an empty set.
+         """
+         return self.v2t.get(variation, set())
+
+     def build_first_token(
+         self,
+         doctext: Union[dk_doc.Text, str],
+     ) -> dk_tok.Token:
+         """Build the first token of the given text, applying this expander's
+         camelcase, emoji, and normalization settings.
+         :param doctext: The text to tokenize
+         :return: The first token of the tokenized text
+         """
+         inputf = dk_tok.TextFeatures(
+             doctext, split_camelcase=self.split_input_camelcase, emoji_data=self.emoji_data
+         )
+         return inputf.build_first_token(normalize_fn=self.normalize_fn)
+
+
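A minimal usage sketch (illustrative only; the variation and normalization functions below are hypothetical stand-ins, not part of the package):

# Hypothetical expander: each term varies over its singular/plural forms.
expander = LexicalExpander(
    variations_fn=lambda t: {t, t + "s"},
    normalize_fn=str.lower,
)
expander("Knob")             # -> {"knob", "knobs"}
expander.get_terms("knobs")  # -> {"Knob"} (back-reference to the original term)
expander.normalize("KNOB")   # -> "knob"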
+ class TokenMatch:
+     """The result of aligning a (possibly multi-word) authority variation
+     against the chain of tokens starting at a given token.
+     """
+
+     def __init__(self, auth: dk_auth.LexicalAuthority, val_idx: int, var: str, token: dk_tok.Token):
+         self.auth = auth
+         self.val_idx = val_idx
+         self.var = var
+         self.token = token
+
+         # Walk the token chain, consuming one token per whitespace-delimited
+         # part of the variation; any mismatch aborts the alignment.
+         self.varparts = var.split()
+         self.matches = True
+         self.tokens = list()
+         t = token
+         for v in self.varparts:
+             if t is not None and v == t.norm_text:
+                 self.tokens.append(t)
+                 t = t.next_token
+             else:
+                 self.matches = False
+                 break
+
+     def __repr__(self):
+         ttext = " ".join(t.token_text for t in self.tokens)
+         return (
+             f"Match_{self.tokens[0].token_num}-{self.tokens[-1].token_num}({ttext})[{self.val_idx}]"
+         )
+
+     @property
+     def next_token(self):
+         next_token = None
+         if self.matches:
+             next_token = self.tokens[-1].next_token
+         return next_token
+
+     @property
+     def matched_text(self):
+         """Get the matched original text."""
+         return self.token.input_text[self.tokens[0].start_pos : self.tokens[-1].end_pos]
+
+     def build_annotation(self):
+         return self.auth.build_annotation(
+             start_pos=self.tokens[0].start_pos,
+             end_pos=self.tokens[-1].end_pos,
+             entity_text=self.matched_text,
+             auth_value_id=self.val_idx,
+         )
+
+
+ class TokenAligner:
+     def __init__(self, first_token: dk_tok.Token, authority: dk_auth.LexicalAuthority):
+         self.first_token = first_token
+         self.auth = authority
+         self.annotations = list()  # List[Dict[str, Any]]
+         self._processed_idx = set()
+         self._process(self.first_token)
+
+     def _process(self, token):
+         # Depth-first walk of the token chain: annotate each match starting
+         # at this token, continue from the token after each match, and then
+         # advance to the immediately following token.
+         if token is not None:
+             if token.token_num not in self._processed_idx:
+                 token_matches = self._get_token_matches(token)
+                 for token_match in token_matches:
+                     self.annotations.append(token_match.build_annotation())
+                     self._process(token_match.next_token)
+             self._process(token.next_token)
+
+     def _get_token_matches(self, token):
+         token_matches = list()
+         vs = self.auth.find_variations(token.norm_text, starts_with=True)
+         if len(vs) > 0:
+             for val_idx, var in vs.items():
+                 token_match = TokenMatch(self.auth, val_idx, var, token)
+                 if token_match.matches:
+                     # mark token position(s) as matched
+                     self._processed_idx.update({t.token_num for t in token_match.tokens})
+                     token_matches.append(token_match)
+         return token_matches
+
+
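TokenMatch and TokenAligner together implement a simple forward alignment: for each token, every authority variation that starts with the token's normalized text is extended token by token, and complete matches become annotations. A self-contained sketch of the core idea over plain word lists, using none of the dataknobs types (all names below are illustrative):

def align(words, variations):
    """Find spans of `words` matching any (multi-word) variation."""
    spans = []
    for start in range(len(words)):
        for var in variations:
            parts = var.split()
            if words[start:start + len(parts)] == parts:
                spans.append((start, start + len(parts), var))
    return spans

align("new york city limits".split(), ["new york city", "york"])
# -> [(0, 3, 'new york city'), (1, 2, 'york')]

Unlike this sketch, TokenAligner also records matched token positions in _processed_idx, so "york" would not be re-matched inside an already annotated "new york city".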
+ class DataframeAuthority(dk_auth.LexicalAuthority):
+     """A pandas dataframe-based lexical authority."""
+
+     def __init__(
+         self,
+         name: str,
+         lexical_expander: LexicalExpander,
+         authdata: dk_auth.AuthorityData,
+         auth_anns_builder: dk_auth.AuthorityAnnotationsBuilder = None,
+         field_groups: dk_auth.DerivedFieldGroups = None,
+         anns_validator: Callable[["Authority", Dict[str, Any]], bool] = None,
+         parent_auth: dk_auth.Authority = None,
+     ):
+         """Initialize with the name, values, and associated ids of the
+         authority, and with the lexical expander for authoritative values.
+
+         :param name: The authority name; if None, authdata.df.columns[0] is used
+         :param lexical_expander: The lexical expander for the values.
+         :param authdata: The data for this authority
+         :param auth_anns_builder: The authority annotations row builder to use
+             for building annotation rows.
+         :param field_groups: The derived field groups to use
+         :param anns_validator: fn(auth, anns_dict_list) that returns True if
+             the list of annotation row dicts is valid to be added as
+             annotations for a single match or "entity".
+         :param parent_auth: This authority's parent authority (if any)
+         """
+         super().__init__(
+             name if name else authdata.df.columns[0],
+             auth_anns_builder=auth_anns_builder,
+             authdata=authdata,
+             field_groups=field_groups,
+             anns_validator=anns_validator,
+             parent_auth=parent_auth,
+         )
+         self.lexical_expander = lexical_expander
+         self._variations = None
+         self._prev_aligner = None
+
+     @property
+     def prev_aligner(self) -> TokenAligner:
+         """Get the token aligner created in the latest call to annotate_text."""
+         return self._prev_aligner
+
+     @property
+     def variations(self) -> pd.Series:
+         """Get all lexical variations as a series whose index holds the
+         associated value IDs.
+         :return: A pandas series with index-identified variations
+         """
+         if self._variations is None:
+             self._variations = (
+                 self.authdata.df[self.name].apply(self.lexical_expander).explode().dropna()
+             )
+         return self._variations
+
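Because the lexical expander returns a set of variations per value, apply followed by explode yields one row per (value id, variation) pair, with the source dataframe's index repeated for each variation. A standalone pandas illustration (the values and stand-in expander are hypothetical):

import pandas as pd

values = pd.Series(["New York", "York"], index=[10, 11])
expand = lambda v: {v.lower(), v.lower() + " city"}  # stand-in expander
variations = values.apply(expand).explode().dropna()
# index 10 -> "new york", "new york city"; index 11 -> "york", "york city"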
+     def get_id_by_variation(self, variation: str) -> Set[str]:
+         """Get the IDs of the value(s) associated with the given variation.
+         :param variation: Variation text
+         :return: The possibly empty set of associated value IDs.
+         """
+         ids = set()
+         for value in self.lexical_expander.get_terms(variation):
+             ids.update(self.get_value_ids(value))
+         return ids
+
+     def get_variations(self, value: Any, normalize: bool = True) -> Set[Any]:
+         """Convenience method to compute variations for the value.
+         :param value: The authority value, or term, whose variations to compute
+         :param normalize: True to normalize the variations
+         :return: The set of variations for the value.
+         """
+         return self.lexical_expander(value, normalize=normalize)
+
+     def has_value(self, value: Any) -> bool:
+         """Determine whether the given value is in this authority.
+         :param value: A possible authority value
+         :return: True if the value is a valid entity value.
+         """
+         return np.any(self.authdata.df[self.name] == value)
+
+     def get_value_ids(self, value: Any) -> Set[Any]:
+         """Get all IDs associated with the given value. Note that typically
+         there is a single ID for any value, but this allows for inherent
+         ambiguities in the authority.
+         :param value: An authority value
+         :return: The associated IDs or an empty set if the value is not valid.
+         """
+         return set(self.authdata.lookup_values(value).index.tolist())
+
+     def get_values_by_id(self, value_id: Any) -> Set[Any]:
+         """Get all values for the associated value ID. Note that typically
+         there is a single value for an ID, but this allows for inherent
+         ambiguities in the authority.
+
+         :param value_id: An authority value ID
+         :return: The associated values or an empty set if the ID is not valid.
+         """
+         return set(self.authdata.lookup_values(value_id, is_id=True)[self.name].tolist())
+
+     def find_variations(
+         self,
+         variation: str,
+         starts_with: bool = False,
+         ends_with: bool = False,
+         scope: str = "fullmatch",
+     ) -> pd.Series:
+         """Find all matches to the given variation.
+         :param variation: The text to find; treated as a regular expression
+             unless either starts_with or ends_with is True.
+         :param starts_with: When True, find all terms that start with the
+             variation text.
+         :param ends_with: When True, find all terms that end with the
+             variation text.
+         :param scope: 'fullmatch' (default), 'match', or 'contains' for
+             strict, less strict, and least strict matching
+         :return: The matching variations as a pd.Series
+
+         Note that only the first of starts_with and ends_with that is True
+         applies; when neither is True, the variation is treated as a regular
+         expression and matched according to scope (a full match by default).
+         """
+         vs = self.variations
+         if starts_with:
+             vs = vs[vs.str.startswith(variation)]
+         elif ends_with:
+             vs = vs[vs.str.endswith(variation)]
+         else:
+             if scope == "fullmatch":
+                 hits = vs.str.fullmatch(variation)
+             elif scope == "match":
+                 hits = vs.str.match(variation)
+             else:
+                 hits = vs.str.contains(variation)
+             vs = vs[hits]
+         vs = vs.drop_duplicates()
+         return vs
+
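The three scopes map directly onto pandas string matching: fullmatch anchors the pattern at both ends, match anchors only at the start, and contains matches anywhere. For example (standalone):

import pandas as pd

vs = pd.Series(["york", "new york", "yorkshire"])
vs[vs.str.fullmatch("york")]  # -> ["york"]
vs[vs.str.match("york")]      # -> ["york", "yorkshire"]
vs[vs.str.contains("york")]   # -> ["york", "new york", "yorkshire"]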
+     def get_variations_df(
+         self,
+         variations: pd.Series,
+         variations_colname: str = "variation",
+         ids_colname: str = None,
+         lookup_values: bool = False,
+     ) -> pd.DataFrame:
+         """Create a DataFrame including associated ids for each variation.
+         :param variations: The variations to include in the dataframe
+         :param variations_colname: The name of the variations column
+         :param ids_colname: The column name for value ids; defaults to
+             "<name>_id"
+         :param lookup_values: When True, include a self.name column
+             with associated values
+         """
+         if ids_colname is None:
+             ids_colname = f"{self.name}_id"
+         df = pd.DataFrame(
+             {
+                 variations_colname: variations,
+                 ids_colname: variations.apply(self.get_id_by_variation),
+             }
+         ).explode(ids_colname)
+         if lookup_values:
+             df[self.name] = df[ids_colname].apply(self.get_values_by_id)
+             df = df.explode(self.name)
+         return df
+
+     def add_annotations(
+         self,
+         doctext: dk_doc.Text,
+         annotations: dk_anns.Annotations,
+     ) -> dk_anns.Annotations:
+         """Method to do the work of finding, validating, and adding annotations.
+         :param doctext: The text to process.
+         :param annotations: The annotations object to add annotations to
+         :return: The given or a new Annotations instance
+         """
+         # Pass the doc Text itself; build_first_token accepts Text or str
+         # and has no input_id parameter.
+         first_token = self.lexical_expander.build_first_token(doctext)
+         token_aligner = TokenAligner(first_token, self)
+         self._prev_aligner = token_aligner
+         if self.validate_ann_dicts(token_aligner.annotations):
+             annotations.add_dicts(token_aligner.annotations)
+         return annotations
+
+
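A hedged end-to-end sketch of how the pieces fit together. The column data, the AuthorityData construction, and the annotate_text entry point (mentioned in the prev_aligner docstring) are assumptions drawn from the signatures above, not verified API:

import pandas as pd

# Hypothetical one-column authority of city names.
df = pd.DataFrame({"city": ["New York", "Springfield"]})
expander = LexicalExpander(None, str.lower)  # identity variations, lowercase norm
auth = DataframeAuthority("city", expander, dk_auth.AuthorityData(df, "city"))
anns = auth.annotate_text(dk_doc.Text("moving to new york"))  # assumed entry point
# auth.prev_aligner.annotations then holds the raw annotation row dicts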
+ class CorrelatedAuthorityData(dk_auth.AuthorityData):
+     """Container for authoritative data containing correlated data for multiple
+     "sub" authorities.
+     """
+
+     def __init__(self, df: pd.DataFrame, name: str):
+         super().__init__(df, name)
+         self._authority_data = dict()
+
+     def sub_authority_names(self) -> List[str]:
+         """Get the "sub" authority names (None here; subclasses override)."""
+         return None
+
+     @abstractmethod
+     def auth_values_mask(self, name: str, value_id: int) -> pd.Series:
+         """Identify full-authority data corresponding to this sub-value.
+         :param name: The sub-authority name.
+         :param value_id: The sub-authority value_id
+         :return: A series representing relevant full-authority data.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def auth_records_mask(
+         self,
+         record_value_ids: Dict[str, int],
+         filter_mask: pd.Series = None,
+     ) -> pd.Series:
+         """Get a series identifying records in the full authority matching
+         the given records of the form {<sub-name>: <sub-value-id>}.
+         :param record_value_ids: The dict of field names to value_ids
+         :param filter_mask: A pre-filter limiting the records to consider,
+             e.g., when building records incrementally
+         :return: A series identifying where all fields exist
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def get_auth_records(self, records_mask: pd.Series) -> pd.DataFrame:
+         """Get the authority records identified by the mask.
+         :param records_mask: A series identifying records in the full data
+         :return: The records for which the mask is True.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def combine_masks(self, mask1: pd.Series, mask2: pd.Series) -> pd.Series:
+         """Combine the masks if possible, returning the valid combination or None.
+         :param mask1: An auth_records_mask consistent with this data
+         :param mask2: Another data auth_records_mask
+         :return: The combined consistent records_mask or None
+         """
+         raise NotImplementedError
+
+
+ class MultiAuthorityData(CorrelatedAuthorityData):
+     """Container for authoritative data containing correlated data for multiple
+     "sub" authorities composed of explicit data for each component.
+     """
+
+     def __init__(self, df: pd.DataFrame, name: str):
+         super().__init__(df, name)
+         self._authority_data = dict()
+
+     @abstractmethod
+     def build_authority_data(self, name: str) -> dk_auth.AuthorityData:
+         """Build an authority for the named sub-authority.
+
+         :param name: The "sub" authority name
+         :return: The "sub" authority data.
+         """
+         raise NotImplementedError
+
+     def authority_data(self, name: str) -> dk_auth.AuthorityData:
+         """Retrieve the named authority data without building it, or None.
+         (Not a property: it takes the sub-authority name as an argument.)
+         """
+         return self._authority_data.get(name, None)
+
+     def get_authority_data(self, name: str) -> dk_auth.AuthorityData:
+         """Get AuthorityData for the named "sub" authority, building if needed.
+
+         :param name: The "sub" authority name
+         :return: The "sub" authority data.
+         """
+         if name not in self._authority_data:
+             self._authority_data[name] = self.build_authority_data(name)
+         return self._authority_data[name]
+
+     @staticmethod
+     def get_unique_vals_df(col: pd.Series, name: str) -> pd.DataFrame:
+         """Get a dataframe with the unique values from the column under the
+         given column name.
+         """
+         data = np.sort(pd.unique(col.dropna()))
+         if np.issubdtype(col.dtype, np.integer):
+             # IDs for an integer column are the integers themselves
+             col_df = pd.DataFrame({name: data}, index=data)
+         else:
+             # IDs for other columns are auto-generated from 0 to n-1
+             col_df = pd.DataFrame({name: data})
+         return col_df
+
+     def lookup_subauth_values(self, name: str, value: int, is_id: bool = False) -> pd.DataFrame:
+         """Lookup "sub" authority data for the named "sub" authority value.
+         :param name: The sub-authority name
+         :param value: The value for the sub-authority to lookup
+         :param is_id: True if value is an ID
+         :return: The applicable authority dataframe rows.
+         """
+         values_df = None
+         authdata = self._authority_data.get(name, None)
+         if authdata is not None:
+             values_df = authdata.lookup_values(value, is_id=is_id)
+         return values_df
+
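The index of the frame returned by get_unique_vals_df doubles as the value-ID space: an integer column keeps its own values as IDs, while any other dtype gets positional IDs 0..n-1. Standalone illustration:

import pandas as pd

years = pd.Series([2021, 2019, 2021])
MultiAuthorityData.get_unique_vals_df(years, "year")  # index [2019, 2021] == IDs
names = pd.Series(["b", "a", "b"])
MultiAuthorityData.get_unique_vals_df(names, "name")  # index [0, 1], values ["a", "b"]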
+     def lookup_auth_values(
+         self,
+         name: str,
+         value: str,
+     ) -> pd.DataFrame:
+         """Lookup original authority data for the named "sub" authority value.
+         :param name: The sub-authority name
+         :param value: The sub-authority value
+         :return: The original authority dataframe rows.
+         """
+         return self.df[self.df[name] == value]
+
+     def auth_values_mask(self, name: str, value_id: int) -> pd.Series:
+         """Identify the rows in the full authority corresponding to this sub-value.
+         :param name: The sub-authority name.
+         :param value_id: The sub-authority value_id
+         :return: A boolean series where the field exists.
+         """
+         field_values = self.lookup_subauth_values(name, value_id, is_id=True)
+         return self.df[name].isin(field_values[name].tolist())
+
+     def auth_records_mask(
+         self,
+         record_value_ids: Dict[str, int],
+         filter_mask: pd.Series = None,
+     ) -> pd.Series:
+         """Get a boolean series identifying records in the full authority matching
+         the given records of the form {<sub-name>: <sub-value-id>}.
+         :param record_value_ids: The dict of field names to value_ids
+         :param filter_mask: A pre-filter limiting the records to consider,
+             e.g., when building records incrementally
+         :return: A boolean series where all fields exist, or None
+         """
+         has_fields = filter_mask
+         for name, value_id in record_value_ids.items():
+             has_field = self.auth_values_mask(name, value_id)
+             if has_fields is None:
+                 has_fields = has_field
+             else:
+                 has_fields &= has_field
+         return has_fields
+
+     def get_auth_records(self, records_mask: pd.Series) -> pd.DataFrame:
+         """Get the authority records identified by the mask.
+         :param records_mask: A boolean series identifying records in the full df
+         :return: The records/rows for which the mask is True.
+         """
+         return self.df[records_mask]
+
+     def combine_masks(self, mask1: pd.Series, mask2: pd.Series) -> pd.Series:
+         """Combine the masks if possible, returning the valid combination or None.
+         :param mask1: An auth_records_mask consistent with this data
+         :param mask2: Another data auth_records_mask
+         :return: The combined consistent records_mask or None
+         """
+         result = None
+         if mask1 is not None and mask2 is not None:
+             result = mask1 & mask2
+         elif mask1 is not None:
+             result = mask1
+         elif mask2 is not None:
+             result = mask2
+         # np.any(None) is False, so a missing or all-False result yields None
+         return result if np.any(result) else None
+
+
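auth_records_mask ANDs one boolean membership mask per field, so a record survives only when every named sub-value matches; combine_masks applies the same conjunction across separately built masks and returns None when nothing survives. The same narrowing in plain pandas:

import pandas as pd

df = pd.DataFrame({"city": ["Springfield", "Springfield", "Portland"],
                   "state": ["IL", "MA", "OR"]})
mask = (df["city"] == "Springfield") & (df["state"] == "MA")
df[mask]  # only the Springfield, MA row remains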
+ class SimpleMultiAuthorityData(MultiAuthorityData):
+     """Data class for pulling a single column from the multi-authority data
+     as a "sub" authority.
+     """
+
+     def build_authority_data(self, name: str) -> dk_auth.AuthorityData:
+         """Build an authority for the named column holding authority data.
+
+         Note that only unique values are kept and the full dataframe's index
+         will not be preserved.
+
+         :param name: The "sub" authority (and column) name
+         :return: The "sub" authority data.
+         """
+         col = self.df[name]
+         col_df = self.get_unique_vals_df(col, name)
+         return dk_auth.AuthorityData(col_df, name)
+
+
+ class MultiAuthorityFactory(dk_auth.AuthorityFactory):
+     """A factory for building a "sub" authority directly or indirectly
+     from MultiAuthorityData.
+     """
+
+     def __init__(
+         self,
+         auth_name: str,
+         lexical_expander: LexicalExpander = None,
+     ):
+         """:param auth_name: The name of the dataframe authority to build
+         :param lexical_expander: The lexical expander to use (default=identity)
+         """
+         self.auth_name = auth_name
+         self._lexical_expander = lexical_expander
+
+     def get_lexical_expander(self, name: str) -> LexicalExpander:
+         """Get the lexical expander for the named (column) data.
+         :param name: The name of the column to expand
+         :return: The appropriate lexical_expander
+         """
+         if self._lexical_expander is None:
+             self._lexical_expander = LexicalExpander(None, None)
+         return self._lexical_expander
+
+     def build_authority(
+         self,
+         name: str,
+         auth_anns_builder: dk_auth.AuthorityAnnotationsBuilder,
+         multiauthdata: MultiAuthorityData,
+         parent_auth: dk_auth.Authority = None,
+     ) -> DataframeAuthority:
+         """Build a DataframeAuthority.
+
+         :param name: The name of the authority to build
+         :param auth_anns_builder: The authority annotations row builder to use
+             for building annotation rows.
+         :param multiauthdata: The multi-authority source data
+         :param parent_auth: The parent authority
+         """
+         authdata = multiauthdata.get_authority_data(name)
+         field_groups = None  # TODO: get from instance var set on construction?
+         anns_validator = None  # TODO: get from multiauthdata?
+         return DataframeAuthority(
+             name,
+             self.get_lexical_expander(name),
+             authdata,
+             auth_anns_builder=auth_anns_builder,  # pass through (previously unused)
+             field_groups=field_groups,
+             anns_validator=anns_validator,
+             parent_auth=parent_auth,
+         )
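A hedged sketch of building a sub-authority through the factory; the dataframe contents and the None annotations builder are illustrative assumptions:

import pandas as pd

df = pd.DataFrame({"city": ["New York", "Portland"], "state": ["NY", "OR"]})
multidata = SimpleMultiAuthorityData(df, "places")
factory = MultiAuthorityFactory("city")
city_auth = factory.build_authority("city", None, multidata)
city_auth.has_value("Portland")  # -> True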