reflexive 0.1.7__2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
reflexive/__init__.py ADDED
@@ -0,0 +1,9 @@
+ from .common.parameters import *
+ from .common.local import *
+ from .common.util import *
+ from .visual.display import *
+ from .analyse.reflexive_expressions import *
+ from .analyse.aws_nlp import *
+ from .analyse.general import *
+ from .aws_connect.comprehend import *
+ from .aws_connect.s3 import *
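The wildcard imports above re-export the package's classes at the top level. Below is a minimal sketch of how they appear to be wired together, based only on the constructor signatures visible later in this diff; the `Parameters`, `Local` and `S3` constructors are not shown here, so their argument lists are assumptions.

```python
import reflexive as rx

# Assumption: Parameters, Local and S3 constructors are not part of this diff,
# so the arguments below are guesses for illustration only.
params = rx.Parameters()
local = rx.Local(params)
s3 = rx.S3(params)

# These match the __init__ signatures shown later in this diff.
comprehend = rx.Comprehend(params)
general = rx.General(params)
nlp = rx.Nlp(params, local, comprehend)
reflexive = rx.ReflexiveExpressions(params, s3, local, comprehend)
```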
File without changes
@@ -0,0 +1,196 @@
+
+
+ from reflexive.common.parameters import Parameters
+ from reflexive.common.local import Local
+ from reflexive.aws_connect.comprehend import Comprehend
+ from reflexive.common.util import Util
+
+ import json
+ import logging
+ import pandas as pd
+
+ try:
+     import coloredlogs
+     coloredlogs.install(level='INFO')
+ except ImportError:
+     print("Colored logs not available")
+
+ class Nlp:
+
+     logger = logging.getLogger(__name__)
+
+     def __init__(self,parameters:Parameters,local:Local,comprehend:Comprehend):
+         self.__parameters = parameters.all_parameters()
+         self.logger.debug(f"Parameters: {self.__parameters}")
+         self.local_path = self.__parameters['local_path']
+         self.prefix = self.__parameters['prefix']
+         self.postfix = self.__parameters['postfix']
+         self.analysis_types = self.__parameters['analysis_types']
+         self.__local = local
+         self.__comprehend = comprehend
+
+
+
+     #### COMPREHEND ANALYSIS
+
+     def comprehend_analysis(self,df):
+         util = Util()
+         comprehend = self.__comprehend
+         self.analysis_types = self.__parameters['analysis_types']
+         #print(type(df.text))
+         # chunk the text for batch analysis
+         chunked_text = util.series_to_chunked_list(series=df.text)
+         print("Number of chunks:",len(chunked_text))
+         # start batch analysis
+         chunked_results = comprehend.get_multiple_batch_analysis(chunked_text)
+         print("Finished Analysis.")
+         # write to file
+         print("Writing data to file...")
+         with open(f"{self.local_path}{self.prefix}analysis_chunks{self.postfix}.json", "w") as fp:
+             json.dump(chunked_results,fp)
+         print("DONE!")
+         # unchunk
+         final_results = {}
+         for key in chunked_results.keys():
+             final_results[key] = comprehend.unbatch_results(self.analysis_types[key],chunked_results[key])
+             print("Finished Unbatching",key," - Writing data to file...")
+             filename = f"{self.local_path}{self.prefix}{key}{self.postfix}.json"
+             with open(filename, "w") as fp:
+                 json.dump(final_results[key],fp)
+             print("DONE!")
+         # Save final_results for reload if necessary
+         with open(f"{self.local_path}{self.prefix}final_results{self.postfix}.json", "w") as fp:
+             json.dump(final_results,fp)
+         return final_results
+
+     def check_results(self,results):
+         print("Checking for errors...")
+         all_errors = {}
+         for key in results.keys():
+             errors = results[key]['errors']
+             all_errors[key] = errors
+             print(f"Errors for {key}: {errors}")
+         print()
+         print("Checking that we have results for all docs")
+         for key in results.keys():
+             num_results = len(results[key]['results'])
+             print(f"Number of results for {key}: {num_results}")
+         return all_errors
+
+     def add_results_to_df(self,results,df):
+         for key in results.keys():
+             rs = results[key]['results']
+             newresults = {}
+             for oldkey in rs.keys():
+                 newresults[int(oldkey)] = rs[oldkey] # Need to change keys to int to properly add to dataframe
+             df[key] = pd.Series(newresults)
+         return df
+
+     def nlp_analytics(self,df):
+         temp_df = df.copy()
+         temp_df = self.keyphrase_analytics(temp_df)
+         temp_df = self.named_entity_analytics(temp_df)
+         temp_df = self.targeted_sentiment_analytics(temp_df)
+         temp_df = self.syntax_analytics(temp_df)
+         return temp_df
+
+
+     def keyphrase_analytics(self,df):
+         util = Util()
+         df["key_phrases"] = df.KeyPhraseResults.apply(self.parse_keyPhraseResults)
+         df["key_phrase_counts"] = df.key_phrases.apply(util.count_keys)
+         df["key_phrases_total"] = df.key_phrase_counts.apply(util.tuple_values_total)
+         if (len(df)>1):
+             df["key_phrases_scaled"] = util.scale_min_max(df[['key_phrases_total']])
+         else:
+             df["key_phrases_scaled"] = 1
+         # Normalise based on text_scaled
+         df['key_phrases_norm'] = util.normalise_scaled(df,'key_phrases_scaled')
+         return df
+
+     def named_entity_analytics(self,df):
+         util = Util()
+         df["named_entities"] = df.TargetedSentimentResults.apply(self.parse_namedEntities)
+         df['named_entity_counts'] = df.named_entities.apply(util.count_entities)
+         df["named_entity_ratios"] = df.named_entity_counts.apply(util.ratios)
+         df["named_entities_total"] = df.named_entity_counts.apply(util.tuple_values_total)
+         if (len(df)>1):
+             df["named_entities_scaled"] = util.scale_min_max(df[['named_entities_total']])
+         else:
+             df["named_entities_scaled"] = 1
+         df['named_entities_norm'] = util.normalise_scaled(df,'named_entities_scaled')
+         return df
+
+     def targeted_sentiment_analytics(self,df):
+         util = Util()
+         df["targeted_sentiment"] = df.TargetedSentimentResults.apply(self.parse_targetedSentimentResults)
+         df['targeted_sentiment_counts'] = df.targeted_sentiment.apply(util.count_entities)
+         df["targeted_sentiment_ratios"] = df.targeted_sentiment_counts.apply(util.ratios)
+         df["targeted_sentiment_total"] = df.targeted_sentiment_counts.apply(util.tuple_values_total)
+         if (len(df)>1):
+             df["targeted_sentiment_scaled"] = util.scale_min_max(df[['targeted_sentiment_total']])
+         else:
+             df["targeted_sentiment_scaled"] = 1
+         df['targeted_sentiment_norm'] = util.normalise_scaled(df,'targeted_sentiment_scaled')
+         return df
+
+     def syntax_analytics(self,df):
+         util = Util()
+         df["pos_tags"] = df.SyntaxResults.apply(self.parse_syntaxResults)
+         df['pos_tag_counts'] = df.pos_tags.apply(util.count_labels)
+         df["pos_tag_ratios"] = df.pos_tag_counts.apply(util.ratios)
+         df["pos_tags_total"] = df.pos_tag_counts.apply(util.tuple_values_total)
+         if (len(df)>1):
+             df["pos_tags_scaled"] = util.scale_min_max(df[['pos_tags_total']])
+         else:
+             df["pos_tags_scaled"] = 1
+         df['pos_tags_norm'] = util.normalise_scaled(df,'pos_tags_scaled')
+         return df
+
+
+     # Parse key_phrases results - include all above threshold
+     def parse_keyPhraseResults(self,keyPhraseResults,threshold=0.95,min_count=1):
+         util = Util()
+         phrases = {}
+         filtered = [str.lower(r['Text']) for r in keyPhraseResults if r['Score'] > threshold]
+         for phrase in filtered:
+             phrases[phrase] = phrases.get(phrase,0)+1
+
+         filtered_phrases = {k:v for k,v in phrases.items() if v >= min_count}
+         return util.sort_dict_by_value(filtered_phrases)
+
+     # Parse syntax results - include specific postags
+     def parse_syntaxResults(self,syntax_results,postags_keep = ['ADV','VERB','AUX','ADJ','NOUN','PRON','PROPN']):
+         sequence = list()
+         for token in syntax_results:
+             tag = token['PartOfSpeech']['Tag']
+             if tag in postags_keep:
+                 sequence.append((str.lower(token['Text']),tag))
+         return sequence
+
+     # Parse targeted sentiment results - keep non-neutral above threshold
+
+     def parse_targetedSentimentResults(self,targetedSentiment_results,threshold = 0.4):
+         sents = dict()
+         for grp in targetedSentiment_results:
+             for mention in grp["Mentions"]:
+                 if mention['Score'] >= threshold:
+                     if not "NEUTRAL" in mention['MentionSentiment']['Sentiment']:
+                         k = mention['MentionSentiment']['Sentiment']
+                         text = str.lower(mention['Text'])
+                         sents.setdefault(k,{text}).add(text)
+         for k,v in sents.items():
+             sents[k] = list(v) # change set to list
+         return sents
+
+     # Parse targeted sentiment results for named entities
+     def parse_namedEntities(self,targetedSentimentResults,threshold = 0.1):
+         ents = dict()
+         for grp in targetedSentimentResults:
+             for mention in grp["Mentions"]:
+                 if mention['Score'] >= threshold:
+                     k = mention['Type']
+                     text = str.lower(mention['Text'])
+                     ents.setdefault(k,{text}).add(text)
+         for k,v in ents.items():
+             ents[k] = list(v) # change set to list
+         return ents
+
@@ -0,0 +1,128 @@
+
+ import logging,coloredlogs
+ import re
+ import pandas as pd
+ import json
+
+ from reflexive.common.parameters import Parameters
+ from reflexive.common.util import Util
+
+ coloredlogs.install(level='INFO')
+
+ class General:
+
+     logger = logging.getLogger(__name__)
+
+     def __init__(self,parameters:Parameters):
+         #print(parameters)
+         self.__parameters = parameters.all_parameters()
+         self.logger.debug(f"Parameters: {self.__parameters}")
+
+
+     def general_analytics(self,df):
+         util = Util()
+         custom_df = df.copy()
+         custom_df["text_length"] = df.text.apply(lambda x: len(x))
+         if (len(custom_df)>1):
+             custom_df["text_scaled"] = util.scale_min_max(custom_df[['text_length']])
+         else:
+             custom_df["text_scaled"] = 1
+         return custom_df
+
+
+     def remove_IQR_outliers(self,df):
+         tempdf = df.copy()
+         # Calculate text length
+         tempdf["text_length"] = tempdf.text.apply(lambda t: len(t))
+         fence = Util.outlier_fence(tempdf.text_length)
+         print(fence)
+         # Check change with removed outliers
+         checkdf = tempdf[tempdf.text_length<fence['UPPER']]
+         checkdf.reset_index(drop=True,inplace=True)
+         print("Original:",len(tempdf))
+         print(tempdf.describe())
+         print()
+         print("Outliers:",len(tempdf)-len(checkdf))
+         print()
+         print("No outliers:",len(checkdf))
+         print(checkdf.describe())
+         return checkdf
+
+     # Parse text for domain terms
+     def parse_domain_terms(self,text,domain_terms):
+         matched_terms = {}
+         for dtk,dtv in domain_terms.items():
+             matched_terms[dtk] = []
+             for term in dtv:
+                 if term[0]=='_': #acronym - treat as whole word
+                     regex = r"\b{}\b".format(term[1:])
+                     matches = re.findall(regex,str.lower(text))
+                     if len(matches)>0:
+                         matched_terms[dtk].append((term[1:],len(matches)))
+                 else:
+                     count = str.lower(text).count(term)
+                     if count > 0:
+                         matched_terms[dtk].append((term,count))
+         return matched_terms
+
+
+     def get_top_ngrams(self,text_series,min_val=3):
+         ngrams = {}
+         for text in text_series:
+             self.__ngrams345(text,ngrams)
+         #print("Generated 3,4,5 ngrams:", len(ngrams))
+         f_ngrams = self.filter_dict_by_value(ngrams,min_val)
+         return self.sort_dict_by_value(f_ngrams)
+
+     def get_top_ngrams_for_text(self,text,top_ngrams):
+         ngrams = self.__ngrams345(text,{})
+         return {key: ngrams[key] for key in top_ngrams.keys() if key in ngrams}
+
+     def ngram_counts(self,ref_top_ngrams):
+         return sum(ref_top_ngrams.values())
+
+     # Given text and number of terms, create ngrams from the text
+     def __make_ngrams(self,text, n=1):
+         # Replace all non-alphanumeric characters with spaces
+         s = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
+
+         tokens = [token for token in s.split(" ") if token != ""]
+         ngrams = zip(*[tokens[i:] for i in range(n)])
+         return [" ".join(ngram) for ngram in ngrams]
+
+     # Generate 3,4,5 -grams
+     def __ngrams345(self,text,ngrams):
+         ngrams3 = self.__make_ngrams(text,3)
+         for n in ngrams3:
+             ngrams[n] = ngrams.get(n,0)+1
+         ngrams4 = self.__make_ngrams(text,4)
+         for n in ngrams4:
+             ngrams[n] = ngrams.get(n,0)+1
+         ngrams5 = self.__make_ngrams(text,5)
+         for n in ngrams5:
+             ngrams[n] = ngrams.get(n,0)+1
+         return ngrams
+
+
+     # Count domain terms
+     def count_domain_terms(self,terms):
+         counts = {}
+         for k,v in terms.items():
+             for term in v:
+                 counts[k] = counts.setdefault(k,0) + term[1]
+         return counts
+
+
+     # Ratio between action POS and object POS
+     def action_object_ratio(self,pos_ratios,action_pos = ['VERB'],object_pos = ['NOUN','PROPN']):
+         ap = [s[1] for s in pos_ratios if s[0] in action_pos]
+         if ap:
+             aps = sum(ap)
+         else:
+             aps = 0
+         op = [s[1] for s in pos_ratios if s[0] in object_pos]
+         if op:
+             ops = sum(op)
+         else:
+             ops = 1 #avoid divide zero error - only happens with aps of 1
+         #print("aps",aps,"ops",ops)
+         return aps/ops
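For a sense of what `action_object_ratio` computes, here is a small hypothetical example; the input list mirrors the `(tag, ratio)` tuples that `util.ratios` appears to produce, and the values are made up for illustration.

```python
# Hypothetical POS ratio list: (tag, share-of-tokens) tuples.
pos_ratios = [('VERB', 0.3), ('NOUN', 0.4), ('PROPN', 0.1), ('ADJ', 0.2)]

# With the defaults action_pos=['VERB'] and object_pos=['NOUN','PROPN']:
#   aps = 0.3
#   ops = 0.4 + 0.1 = 0.5
#   ratio = 0.3 / 0.5 = 0.6
# If no object tags are present, ops falls back to 1 to avoid dividing by zero.
```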
@@ -0,0 +1,124 @@
+ #
+
+ import logging,coloredlogs
+ import pandas as pd
+ import json
+
+ from reflexive.common.parameters import Parameters
+ from reflexive.common.local import Local
+ from reflexive.common.util import Util
+ from reflexive.aws_connect.s3 import S3
+ from reflexive.aws_connect.comprehend import Comprehend
+
+
+ coloredlogs.install(level='INFO')
+
+ class ReflexiveExpressions:
+
+     logger = logging.getLogger(__name__)
+
+     def __init__(self,parameters:Parameters,aws_s3:S3,local:Local,comprehend:Comprehend):
+         #print(parameters)
+         self.__params = parameters
+         self.__parameters = parameters.all_parameters()
+         self.logger.debug(f"Parameters: {self.__parameters}")
+         self.prefix = self.__parameters['prefix']
+         self.postfix = self.__parameters['postfix']
+         self.local_path = self.__parameters['local_path']
+         self.__s3 = aws_s3
+         self.__local = local
+         self.__comprehend = comprehend
+
+
+     ######## REFLEXIVE EXPRESSION ANALYSIS FUNCTIONS
+
+     def analyse_reflexive_expressions(self,df): #,s3_bucket_name,access_role_arn,entity_recogniser_arn):
+         #self.__bucket_name = s3_bucket_name
+         text = df.text.str.replace('\r\n','\n',regex=False) # Comprehend treats \r\n as one character
+         # Upload reflections to S3 for analysis
+         self.__s3.upload_docs(text)
+
+         # Save a copy of reflections locally for offline analysis
+         self.__local.save_docs(text)
+
+         # Submit the job
+         return self.__comprehend.submit_custom_entity_job("reflexive_expressions_analysis") #submitReflexiveExpressionsJob(access_role_arn, entity_recogniser_arn)
+
+     def check_job_status(self):
+         return self.__comprehend.check_job_status()
+
+     def get_job_details(self):
+         return self.__comprehend.get_job_details()
+
+     def download_and_extract(self):
+         local_output_dir = f"{self.local_path}{self.prefix}output{self.postfix}"
+         job_details = self.get_job_details()
+         s3Uri = job_details['OutputDataConfig']['S3Uri']
+         return self.__s3.results_download_save_extract(s3Uri,local_output_dir)
+
+     def extractAnalysisFromResults(self,results):
+         analysis_output = dict()
+         for result in results:
+             j = json.loads(result)
+             #print(j)
+             idx = j["File"].split('_')[-1].split('.')[0]
+             analysis_output[int(idx)] = j["Entities"]
+         return analysis_output
+
+     def add_to_dataframe(self,df,results):
+         # Extract analysis from raw results
+         analysis_output = self.extractAnalysisFromResults(results)
+         # Add results to dataframe
+         results_df = df.copy()
+         results_df['reflexiveResults'] = pd.Series(analysis_output)
+         return results_df
+
+     def reflexive_analytics(self,df):
+         util = Util()
+         custom_df = df.copy()
+         # custom_df["text_length"] = df.text.apply(lambda x: len(x))
+         # if (len(custom_df)>1):
+         #     custom_df["text_scaled"] = util.scale_min_max(custom_df[['text_length']])
+         # else:
+         #     custom_df["text_scaled"] = 1
+         custom_df["reflexive_results"] = df.reflexiveResults
+         # The expressions and their reflexive expression labels
+         custom_df["reflexive_expressions"] = df.reflexiveResults.apply(self.parse_reflexiveResults)
+         # The counts for each label
+         custom_df["reflexive_counts"] = custom_df.reflexive_expressions.apply(util.count_labels)
+         # Ratios between reflexive expressions
+         custom_df["reflexive_ratio"] = custom_df.reflexive_counts.apply(util.ratios)
+         # Ratio vector
+         custom_df['ratio_vector'] = custom_df.reflexive_ratio.apply(self.make_ratio_vector)
+         # Get the diversity of reflexive types - out of 8 possible types
+         custom_df["reflexive_type_diversity"] = custom_df.reflexive_counts.apply(lambda x: len(x)/8)
+         # A total of all labels
+         custom_df["reflexive_total"] = custom_df.reflexive_counts.apply(util.tuple_values_total)
+         # MinMax scale the reflexive_total
+         if (len(custom_df)>1):
+             custom_df["reflexive_scaled"] = util.scale_min_max(custom_df[['reflexive_total']])
+         else:
+             custom_df["reflexive_scaled"] = 1
+         # Normalise based on text_scaled
+         custom_df['reflexive_norm'] = util.normalise_scaled(custom_df,'reflexive_scaled')
+         return custom_df
+
+
+     # Parse reflexive results - include all above threshold
+     def parse_reflexiveResults(self,reflexiveResults,threshold=0.5):
+         final_refs = list()
+         for ref in reflexiveResults:
+             if ref['Score'] > threshold:
+                 final_refs.append((str.lower(ref['Text']),ref['Type']))
+         return final_refs
+
+     # Function for creating a vector out of reflexive ratio - could be used for others
+     def make_ratio_vector(self,ratio_list,ref_codes = ['RR','ER','VR','AR','EP','AF','CN','EV']):
+         ratio_dict = dict(ratio_list)
+         vec = []
+         for rc in ref_codes:
+             if rc in ratio_dict.keys():
+                 vec.append(ratio_dict[rc])
+             else:
+                 vec.append(0)
+         return vec
File without changes
@@ -0,0 +1,205 @@
+ #
+
+
+ import boto3
+ import time
+ import json
+ import pandas as pd
+
+ from reflexive.common.parameters import Parameters
+ from reflexive.common.util import Util
+ from reflexive.aws_connect.s3 import S3
+
+ import logging
+ try:
+     import coloredlogs
+     coloredlogs.install(level='INFO')
+ except ImportError:
+     print("Colored logs not available")
+
+ class Comprehend:
+
+     logger = logging.getLogger(__name__)
+
+     def __init__(self,parameters:Parameters):
+         #print(parameters)
+         self.__parameters = parameters.all_parameters()
+         self.logger.debug(f"Parameters: {self.__parameters}")
+         self.region = self.__parameters['region']
+         self.access_role_arn = self.__parameters['comprehend_access_role_arn']
+         self.entity_recogniser_arn = self.__parameters['reflexive_entity_arn']
+         self.local_path = self.__parameters['local_path']
+         self.prefix = self.__parameters['prefix']
+         self.postfix = self.__parameters['postfix']
+         self.bucket_name = self.__parameters["bucket_name"]
+         self.files_folder = f"{self.prefix}files{self.postfix}"
+         self.results_folder = f"{self.prefix}results{self.postfix}"
+         self.input_uri = f"s3://{self.bucket_name}/{self.files_folder}/{self.prefix}"
+         self.output_uri = f"s3://{self.bucket_name}/{self.results_folder}/"
+         self.analysis_types = self.__parameters['analysis_types']
+         # create client
+         try:
+             self.logger.debug(f"Region:{self.region}")
+             self.__comp_client = boto3.client(service_name='comprehend',region_name=self.region)
+         except Exception as err:
+             self.logger.error(f"Unable to create Comprehend client: {err}")
+
+
+     def client(self):
+         return self.__comp_client
+
+
+     #### CUSTOM ENTITY
+
+     def submit_custom_entity_job(self,job_name): #access_role_arn,entity_recogniser_arn):
+         job_str = f"{self.prefix}{job_name}{self.postfix}"
+
+         response = self.__comp_client.start_entities_detection_job(
+             InputDataConfig={
+                 'S3Uri': self.input_uri,
+                 'InputFormat': 'ONE_DOC_PER_FILE'
+             },
+             OutputDataConfig={
+                 'S3Uri': self.output_uri
+             },
+             DataAccessRoleArn=self.access_role_arn,
+             JobName=job_str,
+             EntityRecognizerArn=self.entity_recogniser_arn,
+             LanguageCode='en'
+         )
+         self.job_id = response['JobId']
+         return response
+
+     # Check job status
+     def check_job_status(self):
+         job_status = self.__comp_client.describe_entities_detection_job(
+             JobId=self.job_id
+         )
+         self.__job_properties = job_status['EntitiesDetectionJobProperties']
+         return self.__job_properties['JobStatus']
+
+     def get_job_details(self):
+         return self.__job_properties
+
+
+     # Use AWS Comprehend to run batch key phrase, sentiment, targeted sentiment and syntax detection on a single chunk of texts
+     def get_single_batch_analysis(self,index,chunk):
+         comprehend = self.client()
+         results = {}
+         print("Analysing chunk",index)
+         print(" . key_phrase")
+         kpresult = comprehend.batch_detect_key_phrases(TextList=chunk,LanguageCode='en')
+         results['KeyPhraseResults'] = kpresult
+         #key_phrase_results.append(kpresult)
+         time.sleep(2)
+         print(" . sentiment")
+         senresult = comprehend.batch_detect_sentiment(TextList=chunk,LanguageCode='en')
+         results['SentimentResults'] = senresult
+         #sentiment_results.append(senresult)
+         time.sleep(2)
+         print(" . targeted_sentiment")
+         tsenresult = comprehend.batch_detect_targeted_sentiment(TextList=chunk,LanguageCode='en')
+         results['TargetedSentimentResults'] = tsenresult
+         #target_sent_results.append(tsenresult)
+         time.sleep(2)
+         print(" . syntax")
+         synresult = comprehend.batch_detect_syntax(TextList=chunk,LanguageCode='en')
+         results['SyntaxResults'] = synresult
+         #syntax_results.append(synresult)
+         time.sleep(2)
+         return results
+
+
+     # Use AWS Comprehend to run the batch analyses over every chunk of text
+     def get_multiple_batch_analysis(self,chunked_text):
+         chunk_results = {}
+         for key in self.analysis_types.keys():
+             chunk_results[key] = []
+
+         for idx,chunk in enumerate(chunked_text):
+             # Skip chunks containing texts over the batch analysis size limit
+             if any(len(text) > 4999 for text in chunk):
+                 print("WARNING: Text too long to analyse - index",idx,"skipped!")
+             else:
+                 results = None
+                 try:
+                     results = self.get_single_batch_analysis(index=idx,chunk=chunk)
+                 except Exception as error:
+                     print("There was an error with index",idx,error)
+                 finally:
+                     if results:
+                         for key in results.keys():
+                             chunk_results[key].append(results[key])
+
+         return chunk_results
+
+     # Take batched responses and concatenate single lists of results, errors, and http responses
+     def unbatch_results(self,result_type,results,batch_size=25):
+         unbatched_results = {}
+         unbatched_errors = {}
+         batch_responses = {}
+         for idx,batch in enumerate(results):
+             #print("Response for batch:",idx)
+             batch_responses[idx] = batch['ResponseMetadata']
+             result_list = batch['ResultList']
+             error_list = batch['ErrorList']
+             for r in result_list:
+                 ridx = idx*batch_size + r['Index']
+                 rdata = r[result_type]
+                 unbatched_results[ridx] = rdata
+             for e in error_list:
+                 eidx = e['Index']
+                 unbatched_errors[eidx] = 'ERROR' + e['ErrorCode'] + ': ' + e['ErrorMessage']
+         unbatched = {}
+         unbatched['results'] = unbatched_results
+         unbatched['errors'] = unbatched_errors
+         unbatched['responses'] = batch_responses
+         return unbatched
+
+
+     def check_long_text(self,df):
+         # Check for long reflections (too long for batch analysis)
+         long_df = df.copy()
+         long_df = long_df[long_df.text.str.len()>5000]
+         long_df['length'] = long_df.text.str.len()
+         return long_df
+
+
+     # def extract_result(self,result,batch,batch_params):
+     #     match batch:
+     #         case "KeyPhraseResults":
+     #             extracted = [r['Text'] for r in result if r['Score'] >= batch_params["min_score"]]
+     #         case "SentimentResults":
+     #             extracted = result
+     #         case "TargetedSentimentResults":
+     #             extracted = dict()
+     #             for r in result:
+     #                 for mention in r['Mentions']:
+     #                     if (mention['Score'] >= batch_params["min_score"]):
+     #                         text = mention['Text']
+     #                         key = f"{mention['Type']}_{mention['MentionSentiment']['Sentiment']}"
+     #                         if key in extracted.keys():
+     #                             extracted[key].add(text)
+     #                         else:
+     #                             extracted[key] = {text}
+     #         case "SyntaxResults":
+     #             tags = []
+     #             tokens = []
+     #             for r in result:
+     #                 pos = r['PartOfSpeech']
+     #                 tag = pos['Tag']
+     #                 if pos['Score'] < batch_params["max_score"]:
+     #                     tag = tag+"_?"
+     #                 tags.append(tag)
+     #                 tokens.append(r['Text'])
+     #
+     #             extracted = {'tokens':tokens,'tags':tags}
+     #         case other:
+     #             extracted = []
+     #     return extracted
+
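To make the index arithmetic in `unbatch_results` concrete, here is a small hypothetical walk-through of how batch positions map back to document indices; the batch indices and in-batch positions below are made up for illustration.

```python
# With the default batch_size of 25, a result with r['Index']=3 taken from the
# batch at position idx=2 maps back to document 2*25 + 3 = 53 of the original series.
batch_size = 25
for idx, r_index in [(0, 0), (0, 24), (1, 0), (2, 3)]:
    ridx = idx * batch_size + r_index
    print(f"batch {idx}, in-batch index {r_index} -> document {ridx}")
# batch 0, in-batch index 0  -> document 0
# batch 0, in-batch index 24 -> document 24
# batch 1, in-batch index 0  -> document 25
# batch 2, in-batch index 3  -> document 53
```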