reflexive 1.2.7-py3-none-any.whl → 2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
reflexive/analyse.py DELETED
@@ -1,430 +0,0 @@
-
- import json
- import pandas as pd
- import re
-
- from reflexive import util
- from reflexive import cfg
- from reflexive import session
-
- import logging
- #logging.basicConfig(level=logging.DEBUG)
- logger = logging.getLogger(__name__)
-
-
- class Nlp:
-     aws:session.AWS = None
-     config:cfg.Config = None
-
-     top_ngrams = {}
-
-
-     def __init__(self,aws):
-         self.aws = aws
-         self.config = self.aws.config
-
-     ### GENERAL ANALYSIS FUNCTIONS ######
-
-     #checked
-     def text_length(self,df,text_col_name='text'):
-         self.config.text_col_name = text_col_name
-         custom_df = df.copy()
-         custom_df["text_length"] = df[text_col_name].apply(lambda x: len(x))
-         if (len(custom_df)>1):
-             custom_df["text_scaled"] = util.scale_min_max(custom_df[['text_length']])
-         else:
-             custom_df["text_scaled"] = 1
-         return custom_df
-
-     #checked
-     def remove_IQR_outliers(self,df):
-         fence = util.outlier_fence(df.text_length)
-         logger.debug("Fence: %s",repr(fence))
-         if fence['LOWER']==fence['UPPER']:
-             logger.info("No fence, returning original df")
-             return df
-         else:
-             tempdf = df.copy()
-             # Check change with removed outliers
-             checkdf = tempdf[tempdf.text_length<fence['UPPER']]
-             checkdf.reset_index(drop=True,inplace=True)
-             logger.debug("Original: %s",len(tempdf))
-             logger.debug(tempdf.describe())
-             logger.info("Outliers: %s",repr(len(tempdf)-len(checkdf)))
-             logger.debug("No outliers: %s",len(checkdf))
-             logger.debug(checkdf.describe())
-             return checkdf
-
-     #checked
-     #Add domain terms to config
-     def add_domain_terms(self,domain_terms):
-         self.config.domain_terms = domain_terms
-
-     #checked
-     # Parse text for domain terms
-     def parse_text_domain_terms(self,text):
-         matched_terms = {}
-         for dtk,dtv in self.config.domain_terms.items():
-             temp_matches = []
-             for term in dtv:
-                 if term[0]=='_': #acronym - treat as whole word
-                     regex = r"\b{}\b".format(term[1:])
-                     matches = re.findall(regex,str.lower(text))
-                     if len(matches)>0:
-                         temp_matches.append((term[1:],len(matches)))
-                 else:
-                     count = str.lower(text).count(term)
-                     if count > 0:
-                         temp_matches.append((term,count))
-             matched_terms[dtk] = dict(temp_matches)
-         return dict(matched_terms)
-
-     #checked
-     def match_domain_terms(self,df):
-         custom_df = df.copy()
-         custom_df["domain_terms"] = df[self.config.text_col_name].apply(lambda t: self.parse_text_domain_terms(t))
-         custom_df["domain_counts"] = custom_df["domain_terms"].apply(lambda d: self.__count_domain_terms(d))
-         return custom_df
-
-     #checked
-     # Count domain terms
-     def __count_domain_terms(self,domain_terms):
-         domain_counts = {}
-         for domain,terms in domain_terms.items():
-             domain_counts[domain] = sum(terms.values())
-         return domain_counts
-
-     #checked
-     def get_top_ngrams(self,text_series,min_val=3):
-         ngrams = {}
-         for text in text_series:
-             self.__ngrams345(text,ngrams)
-         #print("Generated 3,4,5 ngrams:", len(ngrams))
-         f_ngrams = util.filter_dict_by_value(ngrams,min_val)
-         self.top_ngrams = util.sort_dict_by_value(f_ngrams)
-         return self.top_ngrams
-
-
-     #checked
-     def match_top_ngrams(self,df):
-         custom_df = df.copy()
-         custom_df["top_ngrams"] = df[self.config.text_col_name].apply(lambda t: self.parse_text_top_ngrams(t))
-         custom_df["top_ngrams_count"] = custom_df["top_ngrams"].apply(lambda n: self.__ngram_counts(n))
-         return custom_df
-
-     #checked
-     def parse_text_top_ngrams(self,text):
-         ngrams = self.__ngrams345(text,{})
-         return {key: ngrams[key] for key in self.top_ngrams.keys() if key in ngrams}
-
-     #checked
-     def __ngram_counts(self,ref_top_ngrams):
-         return sum(ref_top_ngrams.values())
-
-     #checked
-     # Given text and number of terms, create ngrams from the text
-     def __make_ngrams(self,text, n=1):
-         # Replace all non-alphanumeric characters with spaces
-         s = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
-         tokens = [token for token in s.split(" ") if token != ""]
-         ngrams = zip(*[tokens[i:] for i in range(n)])
-         return [" ".join(ngram) for ngram in ngrams]
-
-     #checked
-     # Generate 3,4,5 -grams
-     def __ngrams345(self,text,ngrams):
-         ngrams3 = self.__make_ngrams(text,3)
-         for n in ngrams3:
-             ngrams[n] = ngrams.get(n,0)+1
-         ngrams4 = self.__make_ngrams(text,4)
-         for n in ngrams4:
-             ngrams[n] = ngrams.get(n,0)+1
-         ngrams5 = self.__make_ngrams(text,5)
-         for n in ngrams5:
-             ngrams[n] = ngrams.get(n,0)+1
-         return ngrams
-
-
-     #### COMPREHEND ANALYSIS
-
-     #checked
-     def comprehend_analysis(self,comprehend,df):
-         self.analysis_types = self.config.analysis_types
-         #print(type(df.text))
-         # chunk the text for batch analysis
-         chunked_text = util.series_to_chunked_list(series=df[self.config.text_col_name])
-         print("Number of chunks:",len(chunked_text))
-         # start batch analysis
-         chunked_results = comprehend.get_multiple_batch_analysis(chunked_text)
-         print("Finished Analysis.")
-         # write to file
-         print("Writing data to file...")
-         with open(f"{self.config.local_path}{self.config.prefix}analysis_chunks{self.config.postfix}.json", "w") as fp:
-             json.dump(chunked_results,fp)
-         print("DONE!")
-         # unchunk
-         final_results = {}
-         for key in chunked_results.keys():
-             final_results[key] = comprehend.unbatch_results(self.analysis_types[key],chunked_results[key])
-             print("Finished Unbatching",key," - Writing data to file...")
-             filename = f"{self.config.local_path}{self.config.prefix}{key}{self.config.postfix}.json"
-             with open(filename, "w") as fp:
-                 json.dump(final_results[key],fp)
-             print("DONE!")
-         # Save final_results for reload if necessary
-         with open(f"{self.config.local_path}{self.config.prefix}final_results{self.config.postfix}.json", "w") as fp:
-             json.dump(final_results,fp)
-         return final_results
-
-     #checked
-     def check_results(self,results):
-         print("Checking for errors...")
-         for key in results.keys():
-             errors = results[key]['errors']
-             print(f"Errors for {key}: {errors}")
-         print()
-         print("Checking that we have results for all docs")
-         for key in results.keys():
-             num_results= len(results[key]['results'])
-             print(f"Number of results for {key}: {num_results}")
-         return errors
-
-     #checked
-     def add_results_to_df(self,results,df):
-         for key in results.keys():
-             rs = results[key]['results']
-             newresults = {}
-             for oldkey in rs.keys():
-                 newresults[int(oldkey)] = rs[oldkey] # Need to change keys to int to properly add to dataframe
-             df[key] = pd.Series(newresults)
-         return df
-
-     #checked
-     def comprehend_analytics(self,df):
-         temp_df = df.copy()
-         temp_df = self.keyphrase_analytics(temp_df)
-         temp_df = self.named_entity_analytics(temp_df)
-         temp_df = self.targeted_sentiment_analytics(temp_df)
-         temp_df = self.syntax_analytics(temp_df)
-         return temp_df
-
-     #checked
-     def keyphrase_analytics(self,df):
-         df["key_phrases"] = df.KeyPhraseResults.apply(self.parse_keyPhraseResults)
-         df["key_phrase_counts"] = df.key_phrases.apply(util.count_keys)
-         df["key_phrases_total"] = df.key_phrase_counts.apply(util.tuple_values_total)
-         if (len(df)>1):
-             df["key_phrases_scaled"] = util.scale_min_max(df[['key_phrases_total']])
-         else:
-             df["key_phrases_scaled"] = 1
-         # Normalise based on text_scaled
-         df['key_phrases_norm'] = util.normalise_scaled(df,'key_phrases_scaled')
-         return df
-
-     #checked
-     def named_entity_analytics(self,df):
-         df["named_entities"] = df.TargetedSentimentResults.apply(self.parse_namedEntities)
-         df['named_entity_counts'] = df.named_entities.apply(util.count_entities)
-         df["named_entity_ratios"] = df.named_entity_counts.apply(util.ratios)
-         df["named_entities_total"] = df.named_entity_counts.apply(util.tuple_values_total)
-         if (len(df)>1):
-             df["named_entities_scaled"] = util.scale_min_max(df[['named_entities_total']])
-         else:
-             df["named_entities_scaled"] = 1
-         df['named_entities_norm'] = util.normalise_scaled(df,'named_entities_scaled')
-         return df
-
-     #checked
-     def targeted_sentiment_analytics(self,df):
-         df["targeted_sentiment"] = df.TargetedSentimentResults.apply(self.parse_targetedSentimentResults)
-         df['targeted_sentiment_counts'] = df.targeted_sentiment.apply(util.count_entities)
-         df["targeted_sentiment_ratios"] = df.targeted_sentiment_counts.apply(util.ratios)
-         df["targeted_sentiment_total"] = df.targeted_sentiment_counts.apply(util.tuple_values_total)
-         if (len(df)>1):
-             df["targeted_sentiment_scaled"] = util.scale_min_max(df[['targeted_sentiment_total']])
-         else:
-             df["targeted_sentiment_scaled"] = 1
-         df['targeted_sentiment_norm'] = util.normalise_scaled(df,'targeted_sentiment_scaled')
-         return df
-
-     #checked
-     def syntax_analytics(self,df):
-         df["pos_tags"] = df.SyntaxResults.apply(self.parse_syntaxResults)
-         df['pos_tag_counts'] = df.pos_tags.apply(util.count_labels)
-         df["pos_tag_ratios"] = df.pos_tag_counts.apply(util.ratios)
-         df["pos_tags_total"] = df.pos_tag_counts.apply(util.tuple_values_total)
-         if (len(df)>1):
-             df["pos_tags_scaled"] = util.scale_min_max(df[['pos_tags_total']])
-         else:
-             df["pos_tags_scaled"] = 1
-         df['pos_tags_norm'] = util.normalise_scaled(df,'pos_tags_scaled')
-         return df
-
-     #checked
-     # Parse key_phrases results - include all above threshold
-     def parse_keyPhraseResults(self,keyPhraseResults,threshold=0.95,min_count=1):
-         phrases = {}
-         filtered = [str.lower(r['Text']) for r in keyPhraseResults if r['Score'] > threshold]
-         for phrase in filtered:
-             phrases[phrase] = phrases.get(phrase,0)+1
-
-         filtered_phrases = {k:v for k,v in phrases.items() if v >= min_count}
-         return util.sort_dict_by_value(filtered_phrases)
-
-     #checked
-     # Parse syntax results - include specific postags
-     def parse_syntaxResults(self,syntax_results,postags_keep = ['ADV','VERB','AUX','ADJ','NOUN','PRON','PROPN']):
-         sequence = list()
-         for token in syntax_results:
-             tag = token['PartOfSpeech']['Tag']
-             if tag in postags_keep:
-                 sequence.append((str.lower(token['Text']),tag))
-         return sequence
-
-     #checked
-     # Parse targeted sentiment results - keep non-neutral above threshold
-     def parse_targetedSentimentResults(self,targetedSentiment_results,threshold = 0.4):
-         sents = dict()
-         for grp in targetedSentiment_results:
-             for mention in grp["Mentions"]:
-                 if mention['Score'] >= threshold:
-                     if "NEUTRAL" not in mention['MentionSentiment']['Sentiment']:
-                         k = mention['MentionSentiment']['Sentiment']
-                         text = str.lower(mention['Text'])
-                         sents.setdefault(k,{text}).add(text)
-         for k,v in sents.items():
-             sents[k] = list(v) # change set to list
-         return sents
-
-     #checked
-     # Parse targeted sentiment results for named entities
-     def parse_namedEntities(self,targetedSentimentResults,threshold = 0.1):
-         ents = dict()
-         for grp in targetedSentimentResults:
-             for mention in grp["Mentions"]:
-                 if mention['Score'] >= threshold:
-                     k = mention['Type']
-                     text = str.lower(mention['Text'])
-                     ents.setdefault(k,{text}).add(text)
-         for k,v in ents.items():
-             ents[k] = list(v) # change set to list
-         return ents
-
-     #--
-     # Ratio between action POS and object POS
-     # def action_object_ratio(self,pos_ratios,action_pos = ['VERB'],object_pos = ['NOUN','PROPN']):
-     #     ap = [s[1] for s in pos_ratios if s[0] in action_pos]
-     #     if ap:
-     #         aps = sum(ap)
-     #     else:
-     #         aps = 0
-     #     op = [s[1] for s in pos_ratios if s[0] in object_pos]
-     #     if op:
-     #         ops = sum(op)
-     #     else:
-     #         ops = 1 #avoid divide zero error - only happens with aps of 1
-     #     #print("aps",aps,"ops",ops)
-     #     return aps/ops
-
-     ######## REFLEXIVE EXPRESSION ANALYSIS FUNCTIONS
-
-     #checked
-     def analyse_reflexive_expressions(self,df,s3:session.S3,comprehend):
-         #self.__bucket_name = s3_bucket_name
-         text_series = df.text.str.replace('\r\n','\n') # Comprehend treats \r\n as one character
-         # Upload reflections to S3 for analysis
-         s3.upload_docs(text_series)
-
-         # Save a copy of reflections locally for offline analysis
-         self.save_docs(text_series)
-
-         # Submit the job
-         return comprehend.submit_custom_entity_job("reflexive_expressions_analysis") #submitReflexiveExpressionsJob(access_role_arn, entity_recogniser_arn)
-
-     #checked
-     def save_docs(self,text_series):
-         logger.info(f"Saving {len(text_series)} docs to {self.config.local_path}...")
-         for idx in text_series.index:
-             file_name = f"{self.config.prefix}{idx}.txt"
-             file_body = text_series.iloc[idx]
-             logger.info(f"Saving {file_name}")
-             with open(f"{self.config.local_path}{file_name}",'w') as fp:
-                 fp.write(file_body)
-         logger.info("Finished saving reflections locally.")
-
-
-
-
-     #checked
-     def extractAnalysisFromResults(self,results):
-         analysis_output = dict()
-         jresults = json.loads(results)
-         for result in jresults:
-             j = json.loads(result)
-             #print(j)
-             idx = j["File"].split('_')[-1].split('.')[0]
-             analysis_output[int(idx)] = j["Entities"]
-         return analysis_output
-
-     #checked
-     def add_to_dataframe(self,df,results):
-         # Extract analysis from raw results
-         analysis_output = self.extractAnalysisFromResults(results)
-         # Add results to dataframe
-         results_df = df.copy()
-         results_df['ReflexiveResults'] = pd.Series(analysis_output)
-         return results_df
-
-     #--
-     def reflexive_analytics(self,df):
-         #util = Util()
-         custom_df = df.copy()
-         # custom_df["text_length"] = df.text.apply(lambda x: len(x))
-         # if (len(custom_df)>1):
-         #     custom_df["text_scaled"] = util.scale_min_max(custom_df[['text_length']])
-         # else:
-         #     custom_df["text_scaled"] = 1
-         #custom_df["reflexive_results"] = df.reflexiveResults
-         # The expressions and their reflexive expression labels
-         custom_df["reflexive_expressions"] = df.ReflexiveResults.apply(self.parse_reflexiveResults)
-         # The counts for each label
-         custom_df["reflexive_counts"] = custom_df.reflexive_expressions.apply(util.count_labels)
-         # Ratios between reflexive expressions
-         custom_df["reflexive_ratio"] = custom_df.reflexive_counts.apply(util.ratios)
-         # Ratio vector
-         custom_df['ratio_vector'] = custom_df.reflexive_ratio.apply(self.make_ratio_vector)
-         # Get the diversity of reflexive types - out of 8 possible types
-         custom_df["reflexive_type_diversity"] = custom_df.reflexive_counts.apply(lambda x: len(x)/8)
-         # A total of all labels
-         custom_df["reflexive_total"] = custom_df.reflexive_counts.apply(util.tuple_values_total)
-         # MinMax scale the reflexive_total
-         if (len(custom_df)>1):
-             custom_df["reflexive_scaled"] = util.scale_min_max(custom_df[['reflexive_total']])
-         else:
-             custom_df["reflexive_scaled"] = 1
-         # Normalise based on text_scaled
-         custom_df['reflexive_norm'] = util.normalise_scaled(custom_df,'reflexive_scaled')
-         return custom_df
-
-     #checked
-     # Parse reflexive results - include all above threshold
-     def parse_reflexiveResults(self,reflexiveResults,threshold=0.5):
-         final_refs = list()
-         #rr = json.loads(reflexiveResults)
-         for ref in reflexiveResults:
-             if ref['Score'] > threshold:
-                 final_refs.append((str.lower(ref['Text']),ref['Type']))
-         return final_refs
-
-     #--
-     # Function for creating a vector out of reflexive ratio - could be used for others
-     def make_ratio_vector(self,ratio_list,ref_codes = ['RR','ER','VR','AR','EP','AF','CN','EV']):
-         ratio_dict = dict(ratio_list)
-         vec = []
-         for rc in ref_codes:
-             if rc in ratio_dict.keys():
-                 vec.append(ratio_dict[rc])
-             else:
-                 vec.append(0)
-         return vec
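
For orientation, here is a minimal sketch of how the Nlp class removed above was typically driven in 1.2.x, based only on the method signatures visible in this diff. The aws and comprehend objects are assumed to come from reflexive.session with a configured Config attached; the import path, sample data and variable names are illustrative, not taken from the package's documentation.

# Sketch only (not from the package docs): a plausible 1.2.x call sequence for Nlp.
import pandas as pd
from reflexive.analyse import Nlp   # module deleted in 2.0.0

df = pd.DataFrame({"text": ["I realised my approach had to change.",
                            "We reviewed the results together."]})

nlp = Nlp(aws)                                   # aws: a session.AWS instance holding a Config (assumed)
df = nlp.text_length(df, text_col_name="text")   # adds text_length / text_scaled
df = nlp.remove_IQR_outliers(df)                 # drops rows above the IQR upper fence
nlp.add_domain_terms({"ml": ["model", "_ai"]})   # leading '_' marks an acronym (whole-word match)
df = nlp.match_domain_terms(df)                  # adds domain_terms / domain_counts
nlp.get_top_ngrams(df["text"], min_val=3)        # collects 3/4/5-grams seen at least min_val times
df = nlp.match_top_ngrams(df)                    # adds top_ngrams / top_ngrams_count

# AWS Comprehend round trip (requires credentials and a comprehend wrapper from reflexive.session):
results = nlp.comprehend_analysis(comprehend, df)
nlp.check_results(results)
df = nlp.add_results_to_df(results, df)
df = nlp.comprehend_analytics(df)                # key phrase, entity, sentiment and syntax analytics
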
reflexive/cfg.py DELETED
@@ -1,116 +0,0 @@
- import logging
- import os
- from datetime import datetime
-
- #logging.basicConfig(level=logging.DEBUG)
- logger = logging.getLogger(__name__)
-
- class Config:
-
-     aws_profile = None
-     aws_region = None
-     aws_account_number = None
-     aws_access_key = None
-
-     local_path = None
-     date_string = None
-     analysis_types = None
-     prefix = None
-     postfix = None
-
-     s3_access_point = None
-     s3_bucket_name = None
-     s3_accesspoint_arn = None
-
-     comprehend_service_role_name = None
-     comprehend_access_role_arn = None
-
-     s3_files_folder = None
-     s3_results_folder = None
-     s3_input_uri = None
-     s3_output_uri = None
-     reflexive_entity_name = None
-     reflexive_entity_version = None
-     reflexive_entity_arn = None
-
-     text_col_name = 'text'
-     domain_terms = {}
-
-     display_priority_tags = None
-     display_colours = None
-     display_options = None
-
-     def __init__(self,profile="default"):
-         self.aws_profile = profile
-
-     def get_parameters(self):
-         return self.__dict__
-
-     def set_parameters(self,name_prefix="rfx",local_path=None,date_string=None):
-         working_dir = os.getcwd()
-         self.local_path = local_path
-         self.date_string = date_string
-         self.analysis_types = {
-             "KeyPhraseResults":"KeyPhrases",
-             "SentimentResults":"Sentiment",
-             "TargetedSentimentResults":"Entities",
-             "SyntaxResults":"SyntaxTokens"
-         }
-
-         # General parameters
-
-         if not local_path:
-             logger.warning("No path supplied, creating a data directory...")
-             #print(f"WD: {working_dir}")
-             data_dir = working_dir+"/data/"
-             if not os.path.exists(data_dir):
-                 os.makedirs(data_dir)
-                 logger.info("Created:%s",repr(data_dir))
-             self.local_path = data_dir
-         else:
-             data_dir = local_path
-             if not os.path.exists(data_dir):
-                 logger.warning("Path does not exist, creating directory")
-                 os.makedirs(data_dir)
-                 logger.info("Created %s",repr(data_dir))
-             self.local_path = local_path
-         if not date_string:
-             date_string = datetime.today().strftime('%Y%m%d')
-             logger.warning(f"No date_string supplied, using today: {date_string}")
-             self.date_string = date_string
-         self.prefix = f"{name_prefix}_"
-         self.postfix = f"-{date_string}"
-         return self.get_parameters()
-
-
-     def set_s3_parameters(self,s3_access_point,s3_bucket_name):
-         self.s3_access_point = s3_access_point
-         self.s3_bucket_name = s3_bucket_name
-         self.s3_accesspoint_arn = f"arn:aws:s3:{self.aws_region}:{self.aws_account_number}:accesspoint/{s3_access_point}"
-         return self.get_parameters()
-
-     def set_comprehend_parameters(self,comprehend_service_role_name):
-         self.comprehend_service_role_name = comprehend_service_role_name
-         self.comprehend_access_role_arn = f"arn:aws:iam::{self.aws_account_number}:role/service-role/{comprehend_service_role_name}"
-         return self.get_parameters()
-
-     def set_comprehend_custom_entity_parameters(self,reflexive_entity_name,reflexive_entity_version):
-         #Comprehend requires S3 parameters
-         self.s3_files_folder = f"{self.prefix}files{self.postfix}"
-         self.s3_results_folder = f"{self.prefix}results{self.postfix}"
-         self.s3_input_uri = f"s3://{self.s3_bucket_name}/{self.s3_files_folder}/{self.prefix}"
-         self.s3_output_uri = f"s3://{self.s3_bucket_name}/{self.s3_results_folder}/"
-         self.reflexive_entity_name = reflexive_entity_name
-         self.reflexive_entity_version = reflexive_entity_version
-         self.reflexive_entity_arn = f"arn:aws:comprehend:{self.aws_region}:{self.aws_account_number}:entity-recognizer/{self.reflexive_entity_name}/version/{self.reflexive_entity_version}"
-         return self.get_parameters()
-
-     def set_display_parameters(self,priority_tags,display_colours,display_options):
-         self.display_priority_tags = priority_tags
-         self.display_colours = display_colours
-         self.display_options = display_options
-         return self.get_parameters()
-
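
Similarly, a minimal sketch of how the removed Config class was populated in 1.2.x, inferred from the setters above. The region, account number, bucket, access point, role and recogniser names are placeholders; aws_region and aws_account_number are set first because the ARN-building setters interpolate them directly.

# Sketch only: placeholder values, not real AWS resources.
from reflexive import cfg

config = cfg.Config(profile="default")
config.aws_region = "ap-southeast-2"            # placeholder; set before the ARN helpers below
config.aws_account_number = "123456789012"      # placeholder

config.set_parameters(name_prefix="rfx")        # creates ./data/ and a dated file prefix/postfix
config.set_s3_parameters("my-access-point", "my-bucket")
config.set_comprehend_parameters("MyComprehendServiceRole")
config.set_comprehend_custom_entity_parameters("reflexive-recogniser", "v1")
print(config.get_parameters())                  # all settings as a plain dict
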