reflexive 0.1.9-py3-none-any.whl → 1.0.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
reflexive/session.py ADDED
@@ -0,0 +1,265 @@
+
+ import boto3
+ import time
+ import tarfile
+ import json
+
+ from reflexive import cfg
+ import reflexive as rfx
+
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+ logger = logging.getLogger(__name__)
+
+ class AWS:
+
+     config = None
+     aws_session = None
+
+     def __init__(self,config:cfg.Config):
+         # On initialisation, create a new session with the provided profile (or the default profile)
+         #logger.error(config.get_parameters())
+         if config is None:
+             config = cfg.Config()
+         self.config = config
+         self.new_session()
+
+     def get_parameters(self):
+         return self.__dict__
+
+     def new_session(self):
+         logger.info("In new_session")
+         try:
+             self.aws_session = boto3.Session(profile_name=self.config.aws_profile)
+             self.config.aws_region = self.aws_session.region_name
+             self.config.aws_access_key = self.aws_session.get_credentials().access_key
+             logger.info("Created new AWS session in region %s for profile: %s",self.config.aws_region,self.config.aws_profile)
+
+         except Exception as e:
+             logger.error("Unable to create an AWS session: %s",repr(e))
+
+         try:
+             self.config.aws_account_number = self.aws_session.client('sts').get_caller_identity().get('Account')
+             logger.info("Retrieved account number from AWS")
+         except Exception as e:
+             logger.error("Unable to retrieve account number from AWS: %s",repr(e))
+
+         return self.aws_session
+
+
+ class S3:
+
+     aws = None
+     config = None
+     __s3_client = None
+
+     def __init__(self,aws:AWS):
+         self.aws = aws
+         self.config = self.aws.config
+
+         # create client
+         try:
+             logger.debug(f"Region:{self.aws.aws_session.region_name}")
+             self.__s3_client = aws.aws_session.client(service_name='s3')
+         except Exception as err:
+             logger.error("Unable to create S3 client: %s",err)
+
+     # Return the S3 client
+     def client(self):
+         return self.__s3_client
+
+     # Function to upload reflections to S3
+     def upload_docs(self,text_series):
+
+         files_folder = f"{self.config.prefix}files{self.config.postfix}"
+
+         s3 = self.__s3_client
+         s3ap = self.config.s3_accesspoint_arn
+         logger.debug(f"ACCESS POINT: {s3ap}")
+
+         logger.info(f"Uploading {len(text_series)} reflections to S3 ({files_folder})...")
+         logger.debug(f"({s3ap}/{files_folder})")
+         for idx in text_series.index:
+             file_name = f"{self.config.prefix}{idx}.txt"
+             file_body = text_series.iloc[idx]
+             logger.info(f"Uploading {file_name}")
+             #print(file_body)
+             response = s3.put_object(Body=file_body,Bucket=s3ap,Key=f"{files_folder}/{file_name}")
+             if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+                 logger.error("------------------------------------------------------------")
+                 logger.error(f"ERROR: There was a problem with {file_name}")
+                 logger.error(response)
+                 logger.error("------------------------------------------------------------")
+             else:
+                 logger.info('Success')
+         logger.info("Finished uploading reflections to S3.")
+         return response
+
+     # download and save results
+     def results_download_save_extract(self,s3Uri,local_file_path):
+         s3 = self.__s3_client
+         output_key = s3Uri.split(self.config.s3_bucket_name)[1]
+         # download from S3 to local path
+         with open(f"{local_file_path}.tar.gz",'wb') as output_data:
+             s3.download_fileobj(self.config.s3_bucket_name,output_key[1:],output_data)
+
+         # extract the files from tar archive
+         files = list()
+         with tarfile.open(f"{local_file_path}.tar.gz", "r:gz") as tf:
+             for member in tf.getmembers():
+                 f = tf.extractfile(member)
+                 if f is not None:
+                     content = f.read()
+                     files.append(content)
+         #print("Number of files:",len(files))
+         # extract results and save and return
+         raw_results = files[0].decode("utf-8").split('\n')
+         raw_results.pop() # pop last item off as empty entry due to final \n
+         json_results = json.dumps(raw_results)
+         with open(f"{local_file_path}.json","w") as fp:
+             fp.write(json_results)
+         return json_results
+
+
+ class Comprehend:
+
+     aws = None
+     config = None
+     __comp_client = None
+
+     def __init__(self,aws:AWS):
+         self.aws = aws
+         self.config = self.aws.config
+
+         # create client
+         try:
+             logger.debug(f"Region:{self.aws.aws_session.region_name}")
+             self.__comp_client = self.aws.aws_session.client(service_name='comprehend')
+         except Exception as err:
+             logger.error("Unable to create Comprehend client: %s",err)
+
+     def client(self):
+         return self.__comp_client
+
+     # Use AWS Comprehend to run key phrase, sentiment, targeted sentiment, and syntax analysis on a single batch of chunked text
+     def get_single_batch_analysis(self,index,chunk):
+         comp_client = self.client()
+         results = {}
+         print("Analysing chunk",index)
+         print(" . key_phrase")
+         kpresult = comp_client.batch_detect_key_phrases(TextList=chunk,LanguageCode='en')
+         results['KeyPhraseResults'] = kpresult
+         #key_phrase_results.append(kpresult)
+         time.sleep(2)
+         print(" . sentiment")
+         senresult = comp_client.batch_detect_sentiment(TextList=chunk,LanguageCode='en')
+         results['SentimentResults'] = senresult
+         #sentiment_results.append(senresult)
+         time.sleep(2)
+         print(" . targeted_sentiment")
+         tsenresult = comp_client.batch_detect_targeted_sentiment(TextList=chunk,LanguageCode='en')
+         results['TargetedSentimentResults'] = tsenresult
+         #target_sent_results.append(tsenresult)
+         time.sleep(2)
+         print(" . syntax")
+         synresult = comp_client.batch_detect_syntax(TextList=chunk,LanguageCode='en')
+         results['SyntaxResults'] = synresult
+         #syntax_results.append(synresult)
+         time.sleep(2)
+         return results
+
+
+     # Use AWS Comprehend to analyse multiple batches of chunked text
+     def get_multiple_batch_analysis(self,chunked_text):
+         chunk_results = {}
+         for key in self.config.analysis_types.keys():
+             chunk_results[key] = []
+
+         for idx,chunk in enumerate(chunked_text):
+             if any(len(text) > 4999 for text in chunk):
+                 print("WARNING: Text too long to analyse - index",idx,"skipped!")
+             else:
+                 try:
+                     results = self.get_single_batch_analysis(index=idx,chunk=chunk)
+                 except Exception as error:
+                     print("There was an error with index",idx,error)
+                 else:
+                     if results:
+                         for key in results.keys():
+                             chunk_results[key].append(results[key])
+
+         return chunk_results
+
+     # Take batched responses and concatenate single lists of results, errors, and HTTP responses
+     def unbatch_results(self,result_type,results,batch_size=25):
+         unbatched_results = {}
+         unbatched_errors = {}
+         batch_responses = {}
+         for idx,batch in enumerate(results):
+             #print("Response for batch:",idx)
+             batch_responses[idx] = batch['ResponseMetadata']
+             result_list = batch['ResultList']
+             error_list = batch['ErrorList']
+             for r in result_list:
+                 ridx = idx*batch_size + r['Index']
+                 rdata = r[result_type]
+                 unbatched_results[ridx] = rdata
+             for e in error_list:
+                 eidx = idx*batch_size + e['Index']
+                 unbatched_errors[eidx] = 'ERROR' + e['ErrorCode'] + ': ' + e['ErrorMessage']
+         unbatched = {}
+         unbatched['results'] = unbatched_results
+         unbatched['errors'] = unbatched_errors
+         unbatched['responses'] = batch_responses
+         return unbatched
+
+     def check_long_text(self,df):
+         # Check for long reflections (too long for batch analysis)
+         long_df = df.copy()
+         long_df = long_df[long_df.text.str.len()>5000]
+         long_df['length'] = long_df.text.str.len()
+         return long_df
+
+     # #### CUSTOM ENTITY
+
+     def submit_custom_entity_job(self,job_name): #access_role_arn,entity_recogniser_arn):
+         job_str = f"{self.config.prefix}{job_name}{self.config.postfix}"
+
+         response = self.__comp_client.start_entities_detection_job(
+             InputDataConfig={
+                 'S3Uri': self.config.s3_input_uri,
+                 'InputFormat': 'ONE_DOC_PER_FILE'
+             },
+             OutputDataConfig={
+                 'S3Uri': self.config.s3_output_uri
+             },
+             DataAccessRoleArn=self.config.comprehend_access_role_arn,
+             JobName=job_str,
+             EntityRecognizerArn=self.config.reflexive_entity_arn,
+             LanguageCode='en'
+         )
+         self.job_id = response['JobId']
+         self.check_job_status() # force the creation of __job_properties
+         return response
+
+     def get_current_job_id(self):
+         return self.job_id
+
+     # Check job status
+     def check_job_status(self):
+         job_status = self.__comp_client.describe_entities_detection_job(
+             JobId=self.job_id
+         )
+         self.__job_properties = job_status['EntitiesDetectionJobProperties']
+         return self.__job_properties['JobStatus']
+
+     def get_job_details(self):
+         return self.__job_properties
+
+     #checked
+     def download_and_extract(self,s3):
+         local_output_dir = f"{self.config.local_path}{self.config.prefix}output{self.config.postfix}"
+         job_details = self.get_job_details()
+         s3Uri = job_details['OutputDataConfig']['S3Uri']
+         return s3.results_download_save_extract(s3Uri,local_output_dir)
+
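For orientation, here is a minimal usage sketch (not part of the wheel) of how the new AWS, S3, and Comprehend classes appear intended to fit together. It assumes a cfg.Config that exposes the attributes referenced above (aws_profile, prefix, postfix, s3_accesspoint_arn, analysis_types, ...); the sample texts and variable names are illustrative only.

# Usage sketch only -- not shipped in the package; the sample data and names are hypothetical.
import pandas as pd
from reflexive import cfg, session

config = cfg.Config()                      # assumed to carry aws_profile, prefix, postfix, etc.
aws = session.AWS(config)                  # creates the boto3 session and records region/account

s3 = session.S3(aws)                       # S3 client bound to the shared config
comprehend = session.Comprehend(aws)       # Comprehend client bound to the shared config

reflections = pd.Series(["Today I realised ...", "I was unsure about ..."])   # illustrative data
s3.upload_docs(reflections)                # one object per reflection under <prefix>files<postfix>/

chunks = [list(reflections)]               # batches of up to 25 texts, as the batch_detect_* APIs expect
batch_results = comprehend.get_multiple_batch_analysis(chunks)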
reflexive/util.py ADDED
@@ -0,0 +1,127 @@
+ import os
+ import json
+ import pandas as pd
+
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+ logger = logging.getLogger(__name__)
+
+ from sklearn.preprocessing import MinMaxScaler
+
+ # File functions
+ def get_data_path_name(config,name,ext):
+     return f"{config.local_path}{config.prefix}{name}{config.postfix}.{ext}"
+
+ def set_sub_dir(config,sub_dir=None):
+     # check that the sub_dir directory exists
+     if sub_dir:
+         local_dir = f"{config.local_path}{sub_dir}/"
+         logger.debug(f"local_dir: {local_dir}")
+         dirExists = os.path.exists(local_dir)
+         if not dirExists:
+             logger.info(f"Creating subdirectory: {local_dir}")
+             os.makedirs(local_dir)
+     else:
+         local_dir = config.local_path
+     return local_dir
+
+
+
+ # Function to write dictionaries to both json and csv
+ def writeDictJsonCSV(dictionary,path_file):
+     with open(f"{path_file}.json",'w') as fp:
+         fp.write(json.dumps(dictionary))
+
+     ngram_df = pd.DataFrame.from_dict(dictionary,orient='index')
+     ngram_df.to_csv(f"{path_file}.csv")
+
+ # Data functions
+ def sort_dict_by_value(d):
+     return dict(sorted(d.items(), key=lambda x:x[1], reverse=True))
+
+ def filter_dict_by_value(ngrams,min_val=3):
+     filtered_ngrams = {}
+     for k,v in ngrams.items():
+         if v >=min_val:
+             filtered_ngrams[k] = v
+     return filtered_ngrams
+
+ # Input a series and output a list of lists with each maxn elements
+ def series_to_chunked_list(series,maxn=25):
+     l = list(series)
+     return __chunk_list(l,maxn)
+
+ # Chunk a list into a list of lists with maxn elements
+ def __chunk_list(l,maxn=25):
+     return [l[i:i + maxn] for i in range(0, len(l), maxn)]
+
+ # Count named entities
+ def count_entities(entities):
+     counts = []
+     for k,v in entities.items():
+         counts.append((k,len(v)))
+     return sorted(counts, key=lambda x: x[1], reverse=True)
+
+ # Function for calculating proportions of features
+ def ratios(elements):
+     etotal = sum([v[1] for v in elements])
+     if etotal==0:
+         return elements
+     else:
+         proportioned = []
+         for element in elements:
+             prop_val = round((element[1]/etotal),4)
+             proportioned.append((element[0],prop_val))
+         return proportioned
+
+
+
+ # Count labels associated with strings
+ def count_labels(string_labels):
+     counts = dict()
+     for rt in string_labels:
+         counts[rt[1]] = counts.setdefault(rt[1],0) + 1
+     return sorted(counts.items(), key=lambda x: x[1], reverse=True)
+
+ def count_keys(key_count_dict):
+     counts = dict()
+     for k,v in key_count_dict.items():
+         counts[k] = counts.setdefault(k,0) + v
+     return sorted(counts.items(), key=lambda x: x[1], reverse=True)
+
+ # Total the values in list of tuples
+ def tuple_values_total(tuples):
+     tvs = [t[1] for t in tuples]
+     return sum(tvs)
+
+ #### SCALING AND NORMALISING
+
+ # Outliers
+
+ def outlier_fence(series):
+     bounds = {}
+     stats = series.describe()
+     iqr = stats['75%'] - stats['25%']
+     bounds["IQR"]=iqr
+     upper = stats['75%']+1.5*iqr
+     bounds["UPPER"]=upper
+     lower = stats['25%']-1.5*iqr
+     bounds["LOWER"]=lower
+     return bounds
+
+ # MinMax Scaling
+ def scale_min_max(df_cols):
+     scaler = MinMaxScaler()
+     return scaler.fit_transform(df_cols)
+
+ # Normalise domain term counts
+ def normalise_domain_counts(domain_counts,text_size):
+     norms = {}
+     for k,v in domain_counts.items():
+         norms[k] = round(v*text_size,3)
+     return norms
+
+ def normalise_scaled(df,feature,norm_feature = 'text_scaled'):
+     tempdf = df[[feature,norm_feature]].copy()
+     tempdf['norm_scaled'] = tempdf.apply(lambda r: round(r[feature]/(r[norm_feature]+0.01),4),axis=1)
+     return tempdf['norm_scaled']
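A short sketch of how the helpers above might be combined; the sample Series is hypothetical and the sketch is not part of the package.

# Usage sketch only -- the sample data is hypothetical.
import pandas as pd
from reflexive import util

texts = pd.Series(["a short reflection", "a somewhat longer reflection about the week ..."])

chunks = util.series_to_chunked_list(texts, maxn=25)     # list of lists, at most 25 texts per chunk
lengths = texts.str.len()

fence = util.outlier_fence(lengths)                      # {'IQR': ..., 'UPPER': ..., 'LOWER': ...}
scaled = util.scale_min_max(lengths.to_frame())          # MinMax-scaled numpy array, shape (n, 1)

Note that normalise_scaled expects the DataFrame to already carry the column named by norm_feature (by default 'text_scaled').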
@@ -1,30 +1,29 @@
-
- #from spacy import displacy
-
- #from reflexive.common.parameters import Parameters
-
- import logging
- try:
-     import coloredlogs
-     coloredlogs.install(level='INFO')
- except:
-     print("Colored logs not available")
+ from reflexive import session
+ from reflexive import cfg
 
  class Display:
+     aws:session.AWS = None
+     config:cfg.Config = None
 
-     logger = logging.getLogger(__name__)
+     defaults = {
+         "priority_tags": ["AR","EP","VR_EV_CN","ER_AF","RR","KP"],
+         "colours": {"VR_EV_CN": "#ff6644","ER_AF": "#dd44cc","AR": "#00cc00","EP": "#aacc33","RR": "#00aaff","KP":"#aaaacc"}}
 
-     def __init__(self): #,parameters:Parameters):
-         self.name="Display"
-         self.priority_tags = ["AR","EP","VR_EV_CN","ER_AF","RR","KP"]
-         self.colours = {"VR_EV_CN": "#ff6644","ER_AF": "#dd44cc","AR": "#00cc00","EP": "#aacc33","RR": "#00aaff","KP":"#aaaacc"}
-         self.options = {"ents": ["VR_EV_CN","ER_AF","AR","EP","RR","KP"], "colors": self.colours}
-
-
-
+     def __init__(self,aws):
+         self.aws = aws
+         self.config = self.aws.config
+         self.set_default_parameters()
+
+     def set_default_parameters(self):
+         priority_tags = self.defaults['priority_tags']
+         colours = self.defaults['colours']
+         options = {"ents": list(colours.keys()), "colors": colours}
+         self.config.set_display_parameters(priority_tags,colours,options)
+
+
      def add_reflexive_offsets(self,df):
          temp_df = df.copy()
-         temp_df['reflexive_offsets'] = temp_df.reflexiveResults.apply(self.collect_reflexive_offsets)
+         temp_df['reflexive_offsets'] = temp_df.ReflexiveResults.apply(self.collect_reflexive_offsets)
          return temp_df
 
      def add_keyphrase_offsets(self,df):
@@ -49,7 +49,7 @@ class Display:
          #pseudonym = record['pseudonym']
          #point_round = record['point_round']
          #title = f"{pseudonym} ({point_round}) - {timestamp}"
-         tags = self.priority_tags
+         tags = self.config.display_priority_tags
          text = record['text']
          reflexive_offsets = record['reflexive_offsets']
          keyphrase_offsets = record['keyphrase_offsets']
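A brief usage sketch of the reworked Display initialisation. It assumes this class lives in reflexive/visualise.py (suggested by the RECORD entries below) and that cfg.Config.set_display_parameters() stores the values read back as config.display_priority_tags in the second hunk above; both are inferences, not confirmed by the diff.

# Usage sketch only -- module name and Config behaviour are assumptions.
from reflexive import cfg, session, visualise

aws = session.AWS(cfg.Config())
display = visualise.Display(aws)              # copies the class-level defaults into the shared Config
print(aws.config.display_priority_tags)       # expected: ["AR", "EP", "VR_EV_CN", "ER_AF", "RR", "KP"]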
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: reflexive
- Version: 0.1.9
+ Version: 1.0.13
  Summary: Supports AWS Reflexive Expressions Analysis
  Home-page: https://github.com/nlytx/reflexive
  Author: Andrew Gibson
@@ -0,0 +1,12 @@
+ reflexive/__init__.py,sha256=Ke3gGqrVgPP2IRgifljQL8Ep3qVuuOf4LgZUkxdJQ2k,119
+ reflexive/analyse.py,sha256=UzWwgjAFNjeWFkCQ2o99g2vWajf17_OtSq4dFCvuPYU,17489
+ reflexive/cfg.py,sha256=Ges35G234P2lvOQHgPZQae5hMSOGyBsmp1bY_yQEKkk,4303
+ reflexive/session.py,sha256=MbqwTsYTgq_e_gw3mb1eRv6USs-zZ2cTCrvUNWuKfAQ,10067
+ reflexive/util.py,sha256=WQ1oyzDi1i8wQ6IBwBPk6IFy07YKhg-Ug2FsOGVJRJQ,3649
+ reflexive/visualise.py,sha256=weBNqd3uiCEg3bvLLNDPVxSkdjapm1jZrTw8cU3uZx8,4032
+ reflexive-1.0.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ reflexive-1.0.13.dist-info/LICENSE.txt,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+ reflexive-1.0.13.dist-info/METADATA,sha256=oT1mUVaMPVDIZ7oQdqe628lqmJROtjJhXfwuzDaRWmk,12038
+ reflexive-1.0.13.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ reflexive-1.0.13.dist-info/top_level.txt,sha256=pOMr-QGleRBRCFBozgvM-UUUmOjD_-naJfu1522E2V8,10
+ reflexive-1.0.13.dist-info/RECORD,,
@@ -1,196 +0,0 @@
-
-
- from reflexive.common.parameters import Parameters
- from reflexive.common.local import Local
- from reflexive.aws_connect.comprehend import Comprehend
- from reflexive.common.util import Util
-
- import json
- import logging
- import pandas as pd
-
- try:
-     import coloredlogs
-     coloredlogs.install(level='INFO')
- except:
-     print("Colored logs not available")
-
- class Nlp:
-
-     logger = logging.getLogger(__name__)
-
-     def __init__(self,parameters:Parameters,local:Local,comprehend:Comprehend):
-         self.__parameters = parameters.all_parameters()
-         self.logger.debug(f"Parameters: {self.__parameters}")
-         self.local_path = self.__parameters['local_path']
-         self.prefix = self.__parameters['prefix']
-         self.postfix = self.__parameters['postfix']
-         self.analysis_types = self.__parameters['analysis_types']
-         self.__local = local
-         self.__comprehend = comprehend
-
-
-
-     #### COMPREHEND ANALYSIS
-
-     def comprehend_analysis(self,df):
-         util = Util()
-         comprehend = self.__comprehend
-         self.analysis_types = self.__parameters['analysis_types']
-         #print(type(df.text))
-         # chunk the text for batch analysis
-         chunked_text = util.series_to_chunked_list(series=df.text)
-         print("Number of chunks:",len(chunked_text))
-         # start batch analysis
-         chunked_results = comprehend.get_multiple_batch_analysis(chunked_text)
-         print("Finished Analysis.")
-         # write to file
-         print("Writing data to file...")
-         with open(f"{self.local_path}{self.prefix}analysis_chunks{self.postfix}.json", "w") as fp:
-             json.dump(chunked_results,fp)
-         print("DONE!")
-         # unchunk
-         final_results = {}
-         for key in chunked_results.keys():
-             final_results[key] = comprehend.unbatch_results(self.analysis_types[key],chunked_results[key])
-             print("Finished Unbatching",key," - Writing data to file...")
-             filename = f"{self.local_path}{self.prefix}{key}{self.postfix}.json"
-             with open(filename, "w") as fp:
-                 json.dump(final_results[key],fp)
-             print("DONE!")
-         # Save final_results for reload if necessary
-         with open(f"{self.local_path}{self.prefix}final_results{self.postfix}.json", "w") as fp:
-             json.dump(final_results,fp)
-         return final_results
-
-     def check_results(self,results):
-         print("Checking for errors...")
-         for key in results.keys():
-             errors = results[key]['errors']
-             print(f"Errors for {key}: {errors}")
-             print()
-         print("Checking that we have results for all docs")
-         for key in results.keys():
-             num_results= len(results[key]['results'])
-             print(f"Number of results for {key}: {num_results}")
-         return errors
-
-     def add_results_to_df(self,results,df):
-         for key in results.keys():
-             rs = results[key]['results']
-             newresults = {}
-             for oldkey in rs.keys():
-                 newresults[int(oldkey)] = rs[oldkey] # Need to change keys to int to properly add to dataframe
-             df[key] = pd.Series(newresults)
-         return df
-
-     def nlp_analytics(self,df):
-         temp_df = df.copy()
-         temp_df = self.keyphrase_analytics(temp_df)
-         temp_df = self.named_entity_analytics(temp_df)
-         temp_df = self.targeted_sentiment_analytics(temp_df)
-         temp_df = self.syntax_analytics(temp_df)
-         return temp_df
-
-
-     def keyphrase_analytics(self,df):
-         util = Util()
-         df["key_phrases"] = df.KeyPhraseResults.apply(self.parse_keyPhraseResults)
-         df["key_phrase_counts"] = df.key_phrases.apply(util.count_keys)
-         df["key_phrases_total"] = df.key_phrase_counts.apply(util.tuple_values_total)
-         if (len(df)>1):
-             df["key_phrases_scaled"] = util.scale_min_max(df[['key_phrases_total']])
-         else:
-             df["key_phrases_scaled"] = 1
-         # Normalise based on text_scaled
-         df['key_phrases_norm'] = util.normalise_scaled(df,'key_phrases_scaled')
-         return df
-
-     def named_entity_analytics(self,df):
-         util = Util()
-         df["named_entities"] = df.TargetedSentimentResults.apply(self.parse_namedEntities)
-         df['named_entity_counts'] = df.named_entities.apply(util.count_entities)
-         df["named_entity_ratios"] = df.named_entity_counts.apply(util.ratios)
-         df["named_entities_total"] = df.named_entity_counts.apply(util.tuple_values_total)
-         if (len(df)>1):
-             df["named_entities_scaled"] = util.scale_min_max(df[['named_entities_total']])
-         else:
-             df["named_entities_scaled"] = 1
-         df['named_entities_norm'] = util.normalise_scaled(df,'named_entities_scaled')
-         return df
-
-     def targeted_sentiment_analytics(self,df):
-         util = Util()
-         df["targeted_sentiment"] = df.TargetedSentimentResults.apply(self.parse_targetedSentimentResults)
-         df['targeted_sentiment_counts'] = df.targeted_sentiment.apply(util.count_entities)
-         df["targeted_sentiment_ratios"] = df.targeted_sentiment_counts.apply(util.ratios)
-         df["targeted_sentiment_total"] = df.targeted_sentiment_counts.apply(util.tuple_values_total)
-         if (len(df)>1):
-             df["targeted_sentiment_scaled"] = util.scale_min_max(df[['targeted_sentiment_total']])
-         else:
-             df["targeted_sentiment_scaled"] = 1
-         df['targeted_sentiment_norm'] = util.normalise_scaled(df,'targeted_sentiment_scaled')
-         return df
-
-     def syntax_analytics(self,df):
-         util = Util()
-         df["pos_tags"] = df.SyntaxResults.apply(self.parse_syntaxResults)
-         df['pos_tag_counts'] = df.pos_tags.apply(util.count_labels)
-         df["pos_tag_ratios"] = df.pos_tag_counts.apply(util.ratios)
-         df["pos_tags_total"] = df.pos_tag_counts.apply(util.tuple_values_total)
-         if (len(df)>1):
-             df["pos_tags_scaled"] = util.scale_min_max(df[['pos_tags_total']])
-         else:
-             df["pos_tags_scaled"] = 1
-         df['pos_tags_norm'] = util.normalise_scaled(df,'pos_tags_scaled')
-         return df
-
-
-     # Parse key_phrases results - include all above threshold
-     def parse_keyPhraseResults(self,keyPhraseResults,threshold=0.95,min_count=1):
-         util = Util()
-         phrases = {}
-         filtered = [str.lower(r['Text']) for r in keyPhraseResults if r['Score'] > threshold]
-         for phrase in filtered:
-             phrases[phrase] = phrases.get(phrase,0)+1
-
-         filtered_phrases = {k:v for k,v in phrases.items() if v >= min_count}
-         return util.sort_dict_by_value(filtered_phrases)
-
-     # Parse syntax results - include specific postags
-     def parse_syntaxResults(self,syntax_results,postags_keep = ['ADV','VERB','AUX','ADJ','NOUN','PRON','PROPN']):
-         sequence = list()
-         for token in syntax_results:
-             tag = token['PartOfSpeech']['Tag']
-             if tag in postags_keep:
-                 sequence.append((str.lower(token['Text']),tag))
-         return sequence
-
-     # Parse targeted sentiment results - keep non-neutral above threshold
-
-     def parse_targetedSentimentResults(self,targetedSentiment_results,threshold = 0.4):
-         sents = dict()
-         for grp in targetedSentiment_results:
-             for mention in grp["Mentions"]:
-                 if mention['Score'] >= threshold:
-                     if not "NEUTRAL" in mention['MentionSentiment']['Sentiment']:
-                         k = mention['MentionSentiment']['Sentiment']
-                         text = str.lower(mention['Text'])
-                         sents.setdefault(k,{text}).add(text)
-         for k,v in sents.items():
-             sents[k] = list(v) # change set to list
-         return sents
-
-     # Parse targeted sentiment results for named entities
-     def parse_namedEntities(self,targetedSentimentResults,threshold = 0.1):
-         ents = dict()
-         for grp in targetedSentimentResults:
-             for mention in grp["Mentions"]:
-                 if mention['Score'] >= threshold:
-                     k = mention['Type']
-                     text = str.lower(mention['Text'])
-                     ents.setdefault(k,{text}).add(text)
-         for k,v in ents.items():
-             ents[k] = list(v) # change set to list
-         return ents
-