reflexive 0.1.7__2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reflexive/__init__.py +9 -0
- reflexive/analyse/__init__.py +0 -0
- reflexive/analyse/aws_nlp.py +196 -0
- reflexive/analyse/general.py +128 -0
- reflexive/analyse/reflexive_expressions.py +124 -0
- reflexive/aws_connect/__init__.py +0 -0
- reflexive/aws_connect/comprehend.py +205 -0
- reflexive/aws_connect/s3.py +89 -0
- reflexive/common/__init__.py +0 -0
- reflexive/common/local.py +48 -0
- reflexive/common/parameters.py +74 -0
- reflexive/common/util.py +108 -0
- reflexive/visual/__init__.py +0 -0
- reflexive/visual/display.py +110 -0
- reflexive-0.1.7.dist-info/LICENSE +201 -0
- reflexive-0.1.7.dist-info/LICENSE.txt +202 -0
- reflexive-0.1.7.dist-info/METADATA +222 -0
- reflexive-0.1.7.dist-info/RECORD +20 -0
- reflexive-0.1.7.dist-info/WHEEL +5 -0
- reflexive-0.1.7.dist-info/top_level.txt +1 -0
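
Read together, the modules in this release form a small analysis pipeline: a Parameters object (reflexive/common/parameters.py) carries configuration such as the AWS region, S3 bucket name, Comprehend access-role and entity-recogniser ARNs, local file paths and the analysis_types map; S3, Local and Comprehend wrap the storage and AWS plumbing; and Nlp, General and ReflexiveExpressions build pandas DataFrame analytics on top of them. A minimal wiring sketch follows; the constructors of Parameters, S3 and Local live in files whose contents are not shown in this section, so those calls are placeholders rather than the package's documented API:

import pandas as pd
import reflexive

# Placeholder construction - Parameters, S3 and Local signatures are not visible in this diff section.
params = reflexive.Parameters()
s3 = reflexive.S3(params)
local = reflexive.Local(params)
comprehend = reflexive.Comprehend(params)  # boto3 Comprehend wrapper (see aws_connect/comprehend.py below)

df = pd.DataFrame({"text": ["First reflection ...", "Second reflection ..."]})
nlp = reflexive.Nlp(params, local, comprehend)                        # batch NLP analytics
refl = reflexive.ReflexiveExpressions(params, s3, local, comprehend)  # custom entity (reflexive expression) job
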
reflexive/__init__.py
ADDED
@@ -0,0 +1,9 @@
+from .common.parameters import *
+from .common.local import *
+from .common.util import *
+from .visual.display import *
+from .analyse.reflexive_expressions import *
+from .analyse.aws_nlp import *
+from .analyse.general import *
+from .aws_connect.comprehend import *
+from .aws_connect.s3 import *
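
Because __init__.py re-exports every submodule with wildcard imports, the classes defined in those submodules can be referenced straight from the package namespace (assuming none of the submodules restricts its exports with __all__):

import reflexive

params = reflexive.Parameters()  # rather than reflexive.common.parameters.Parameters()
util = reflexive.Util()
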
reflexive/analyse/__init__.py
File without changes

reflexive/analyse/aws_nlp.py
ADDED
@@ -0,0 +1,196 @@
+
+
+from reflexive.common.parameters import Parameters
+from reflexive.common.local import Local
+from reflexive.aws_connect.comprehend import Comprehend
+from reflexive.common.util import Util
+
+import json
+import logging
+import pandas as pd
+
+try:
+    import coloredlogs
+    coloredlogs.install(level='INFO')
+except:
+    print("Colored logs not available")
+
+class Nlp:
+
+    logger = logging.getLogger(__name__)
+
+    def __init__(self,parameters:Parameters,local:Local,comprehend:Comprehend):
+        self.__parameters = parameters.all_parameters()
+        self.logger.debug(f"Parameters: {self.__parameters}")
+        self.local_path = self.__parameters['local_path']
+        self.prefix = self.__parameters['prefix']
+        self.postfix = self.__parameters['postfix']
+        self.analysis_types = self.__parameters['analysis_types']
+        self.__local = local
+        self.__comprehend = comprehend
+
+
+
+    #### COMPREHEND ANALYSIS
+
+    def comprehend_analysis(self,df):
+        util = Util()
+        comprehend = self.__comprehend
+        self.analysis_types = self.__parameters['analysis_types']
+        #print(type(df.text))
+        # chunk the text for batch analysis
+        chunked_text = util.series_to_chunked_list(series=df.text)
+        print("Number of chunks:",len(chunked_text))
+        # start batch analysis
+        chunked_results = comprehend.get_multiple_batch_analysis(chunked_text)
+        print("Finished Analysis.")
+        # write to file
+        print("Writing data to file...")
+        with open(f"{self.local_path}{self.prefix}analysis_chunks{self.postfix}.json", "w") as fp:
+            json.dump(chunked_results,fp)
+        print("DONE!")
+        # unchunk
+        final_results = {}
+        for key in chunked_results.keys():
+            final_results[key] = comprehend.unbatch_results(self.analysis_types[key],chunked_results[key])
+            print("Finished Unbatching",key," - Writing data to file...")
+            filename = f"{self.local_path}{self.prefix}{key}{self.postfix}.json"
+            with open(filename, "w") as fp:
+                json.dump(final_results[key],fp)
+            print("DONE!")
+        # Save final_results for reload if necessary
+        with open(f"{self.local_path}{self.prefix}final_results{self.postfix}.json", "w") as fp:
+            json.dump(final_results,fp)
+        return final_results
+
+    def check_results(self,results):
+        print("Checking for errors...")
+        for key in results.keys():
+            errors = results[key]['errors']
+            print(f"Errors for {key}: {errors}")
+        print()
+        print("Checking that we have results for all docs")
+        for key in results.keys():
+            num_results= len(results[key]['results'])
+            print(f"Number of results for {key}: {num_results}")
+        return errors
+
+    def add_results_to_df(self,results,df):
+        for key in results.keys():
+            rs = results[key]['results']
+            newresults = {}
+            for oldkey in rs.keys():
+                newresults[int(oldkey)] = rs[oldkey] # Need to change keys to int to properly add to dataframe
+            df[key] = pd.Series(newresults)
+        return df
+
+    def nlp_analytics(self,df):
+        temp_df = df.copy()
+        temp_df = self.keyphrase_analytics(temp_df)
+        temp_df = self.named_entity_analytics(temp_df)
+        temp_df = self.targeted_sentiment_analytics(temp_df)
+        temp_df = self.syntax_analytics(temp_df)
+        return temp_df
+
+
+    def keyphrase_analytics(self,df):
+        util = Util()
+        df["key_phrases"] = df.KeyPhraseResults.apply(self.parse_keyPhraseResults)
+        df["key_phrase_counts"] = df.key_phrases.apply(util.count_keys)
+        df["key_phrases_total"] = df.key_phrase_counts.apply(util.tuple_values_total)
+        if (len(df)>1):
+            df["key_phrases_scaled"] = util.scale_min_max(df[['key_phrases_total']])
+        else:
+            df["key_phrases_scaled"] = 1
+        # Normalise based on text_scaled
+        df['key_phrases_norm'] = util.normalise_scaled(df,'key_phrases_scaled')
+        return df
+
+    def named_entity_analytics(self,df):
+        util = Util()
+        df["named_entities"] = df.TargetedSentimentResults.apply(self.parse_namedEntities)
+        df['named_entity_counts'] = df.named_entities.apply(util.count_entities)
+        df["named_entity_ratios"] = df.named_entity_counts.apply(util.ratios)
+        df["named_entities_total"] = df.named_entity_counts.apply(util.tuple_values_total)
+        if (len(df)>1):
+            df["named_entities_scaled"] = util.scale_min_max(df[['named_entities_total']])
+        else:
+            df["named_entities_scaled"] = 1
+        df['named_entities_norm'] = util.normalise_scaled(df,'named_entities_scaled')
+        return df
+
+    def targeted_sentiment_analytics(self,df):
+        util = Util()
+        df["targeted_sentiment"] = df.TargetedSentimentResults.apply(self.parse_targetedSentimentResults)
+        df['targeted_sentiment_counts'] = df.targeted_sentiment.apply(util.count_entities)
+        df["targeted_sentiment_ratios"] = df.targeted_sentiment_counts.apply(util.ratios)
+        df["targeted_sentiment_total"] = df.targeted_sentiment_counts.apply(util.tuple_values_total)
+        if (len(df)>1):
+            df["targeted_sentiment_scaled"] = util.scale_min_max(df[['targeted_sentiment_total']])
+        else:
+            df["targeted_sentiment_scaled"] = 1
+        df['targeted_sentiment_norm'] = util.normalise_scaled(df,'targeted_sentiment_scaled')
+        return df
+
+    def syntax_analytics(self,df):
+        util = Util()
+        df["pos_tags"] = df.SyntaxResults.apply(self.parse_syntaxResults)
+        df['pos_tag_counts'] = df.pos_tags.apply(util.count_labels)
+        df["pos_tag_ratios"] = df.pos_tag_counts.apply(util.ratios)
+        df["pos_tags_total"] = df.pos_tag_counts.apply(util.tuple_values_total)
+        if (len(df)>1):
+            df["pos_tags_scaled"] = util.scale_min_max(df[['pos_tags_total']])
+        else:
+            df["pos_tags_scaled"] = 1
+        df['pos_tags_norm'] = util.normalise_scaled(df,'pos_tags_scaled')
+        return df
+
+
+    # Parse key_phrases results - include all above threshold
+    def parse_keyPhraseResults(self,keyPhraseResults,threshold=0.95,min_count=1):
+        util = Util()
+        phrases = {}
+        filtered = [str.lower(r['Text']) for r in keyPhraseResults if r['Score'] > threshold]
+        for phrase in filtered:
+            phrases[phrase] = phrases.get(phrase,0)+1
+
+        filtered_phrases = {k:v for k,v in phrases.items() if v >= min_count}
+        return util.sort_dict_by_value(filtered_phrases)
+
+    # Parse syntax results - include specific postags
+    def parse_syntaxResults(self,syntax_results,postags_keep = ['ADV','VERB','AUX','ADJ','NOUN','PRON','PROPN']):
+        sequence = list()
+        for token in syntax_results:
+            tag = token['PartOfSpeech']['Tag']
+            if tag in postags_keep:
+                sequence.append((str.lower(token['Text']),tag))
+        return sequence
+
+    # Parse targeted sentiment results - keep non-neutral above threshold
+
+    def parse_targetedSentimentResults(self,targetedSentiment_results,threshold = 0.4):
+        sents = dict()
+        for grp in targetedSentiment_results:
+            for mention in grp["Mentions"]:
+                if mention['Score'] >= threshold:
+                    if not "NEUTRAL" in mention['MentionSentiment']['Sentiment']:
+                        k = mention['MentionSentiment']['Sentiment']
+                        text = str.lower(mention['Text'])
+                        sents.setdefault(k,{text}).add(text)
+        for k,v in sents.items():
+            sents[k] = list(v) # change set to list
+        return sents
+
+    # Parse targeted sentiment results for named entities
+    def parse_namedEntities(self,targetedSentimentResults,threshold = 0.1):
+        ents = dict()
+        for grp in targetedSentimentResults:
+            for mention in grp["Mentions"]:
+                if mention['Score'] >= threshold:
+                    k = mention['Type']
+                    text = str.lower(mention['Text'])
+                    ents.setdefault(k,{text}).add(text)
+        for k,v in ents.items():
+            ents[k] = list(v) # change set to list
+        return ents
+
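
A usage sketch for Nlp, assuming a DataFrame with a text column and a Parameters object supplying local_path, prefix, postfix and the analysis_types map; the in-line comments above suggest that util.normalise_scaled expects a text_scaled column (added by General.general_analytics), so the general analytics would normally be run first:

# Hypothetical run - params, local and comprehend as in the wiring sketch near the top of this diff.
nlp = Nlp(params, local, comprehend)

final_results = nlp.comprehend_analysis(df)    # batch Comprehend calls, JSON dumps under local_path
nlp.check_results(final_results)               # prints per-analysis errors and result counts
df = nlp.add_results_to_df(final_results, df)  # adds KeyPhraseResults, SentimentResults,
                                               # TargetedSentimentResults and SyntaxResults columns
df = nlp.nlp_analytics(df)                     # derives *_counts, *_ratios, *_total, *_scaled, *_norm columns
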
reflexive/analyse/general.py
ADDED
@@ -0,0 +1,128 @@
+
+import logging,coloredlogs
+import pandas as pd
+import json
+
+from reflexive.common.parameters import Parameters
+from reflexive.common.util import Util
+
+coloredlogs.install(level='INFO')
+
+class General:
+
+    logger = logging.getLogger(__name__)
+
+    def __init__(self,parameters:Parameters):
+        #print(parameters)
+        self.__parameters = parameters.all_parameters()
+        self.logger.debug(f"Parameters: {self.__parameters}")
+
+
+    def general_analytics(self,df):
+        util = Util()
+        custom_df = df.copy()
+        custom_df["text_length"] = df.text.apply(lambda x: len(x))
+        if (len(custom_df)>1):
+            custom_df["text_scaled"] = util.scale_min_max(custom_df[['text_length']])
+        else:
+            custom_df["text_scaled"] = 1
+        return custom_df
+
+
+    def remove_IQR_outliers(self,df):
+        tempdf = df.copy()
+        # Calculate text length
+        tempdf["text_length"] = tempdf.text.apply(lambda t: len(t))
+        fence = Util.outlier_fence(tempdf.text_length)
+        print(fence)
+        # Check change with removed outliers
+        checkdf = tempdf[tempdf.text_length<fence['UPPER']]
+        checkdf.reset_index(drop=True,inplace=True)
+        print("Original:",len(tempdf))
+        print(tempdf.describe())
+        print()
+        print("Outliers:",len(tempdf)-len(checkdf))
+        print()
+        print("No outliers:",len(checkdf))
+        print(checkdf.describe())
+        return checkdf
+
+    # Parse text for domain terms
+    def parse_domain_terms(self,text,domain_terms):
+        matched_terms = {}
+        for dtk,dtv in domain_terms.items():
+            matched_terms[dtk] = []
+            for term in dtv:
+                if term[0]=='_': #acronym - treat as whole word
+                    regex = r"\b{}\b".format(term[1:])
+                    matches = re.findall(regex,str.lower(text))
+                    if len(matches)>0:
+                        matched_terms[dtk].append((term[1:],len(matches)))
+                else:
+                    count = str.lower(text).count(term)
+                    if count > 0:
+                        matched_terms[dtk].append((term,count))
+        return matched_terms
+
+
+    def get_top_ngrams(self,text_series,min_val=3):
+        ngrams = {}
+        for text in text_series:
+            self.__ngrams345(text,ngrams)
+        #print("Generated 3,4,5 ngrams:", len(ngrams))
+        f_ngrams = self.filter_dict_by_value(ngrams,min_val)
+        return self.sort_dict_by_value(f_ngrams)
+
+    def get_top_ngrams_for_text(self,text,top_ngrams):
+        ngrams = self.__ngrams345(text,{})
+        return {key: ngrams[key] for key in top_ngrams.keys() if key in ngrams}
+
+    def ngram_counts(self,ref_top_ngrams):
+        return sum(ref_top_ngrams.values())
+
+    # Given text and number of terms, create ngrams from the text
+    def __make_ngrams(self,text, n=1):
+        # Replace all none alphanumeric characters with spaces
+        s = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
+
+        tokens = [token for token in s.split(" ") if token != ""]
+        ngrams = zip(*[tokens[i:] for i in range(n)])
+        return [" ".join(ngram) for ngram in ngrams]
+
+    # Generate 3,4,5 -grams
+    def __ngrams345(self,text,ngrams):
+        ngrams3 = self.__make_ngrams(text,3)
+        for n in ngrams3:
+            ngrams[n] = ngrams.get(n,0)+1
+        ngrams4 = self.__make_ngrams(text,4)
+        for n in ngrams4:
+            ngrams[n] = ngrams.get(n,0)+1
+        ngrams5 = self.__make_ngrams(text,5)
+        for n in ngrams5:
+            ngrams[n] = ngrams.get(n,0)+1
+        return ngrams
+
+
+    # Count domain terms
+    def count_domain_terms(self,terms):
+        counts = {}
+        for k,v in terms.items():
+            for term in v:
+                counts[k] = counts.setdefault(k,0) + term[1]
+        return counts
+
+
+    # Ratio between action POS and object POS
+    def action_object_ratio(self,pos_ratios,action_pos = ['VERB'],object_pos = ['NOUN','PROPN']):
+        ap = [s[1] for s in pos_ratios if s[0] in action_pos]
+        if ap:
+            aps = sum(ap)
+        else:
+            aps = 0
+        op = [s[1] for s in pos_ratios if s[0] in object_pos]
+        if op:
+            ops = sum(op)
+        else:
+            ops = 1 #avoid divide zero error - only happens with aps of 1
+        #print("aps",aps,"ops",ops)
+        return aps/ops
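
A sketch of the General helpers, again assuming a DataFrame with a text column; the domain_terms dictionary below is purely illustrative (terms prefixed with an underscore are treated as acronyms and matched as whole words). Note that parse_domain_terms and the private n-gram helpers call re.findall and re.sub even though no import re line appears in the hunk above:

general = General(params)  # params as in the wiring sketch near the top of this diff

df = general.general_analytics(df)    # adds text_length and text_scaled
df = general.remove_IQR_outliers(df)  # drops rows above the upper IQR fence on text_length

# Illustrative vocabulary only - not shipped with the package.
domain_terms = {"networking": ["_tcp", "_udp", "socket"],
                "reflection": ["i felt", "i learned"]}
df["domain_matches"] = df.text.apply(lambda t: general.parse_domain_terms(t, domain_terms))
df["domain_counts"] = df.domain_matches.apply(general.count_domain_terms)
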
reflexive/analyse/reflexive_expressions.py
ADDED
@@ -0,0 +1,124 @@
+#
+
+import logging,coloredlogs
+import pandas as pd
+import json
+
+from reflexive.common.parameters import Parameters
+from reflexive.common.local import Local
+from reflexive.common.util import Util
+from reflexive.aws_connect.s3 import S3
+from reflexive.aws_connect.comprehend import Comprehend
+
+
+coloredlogs.install(level='INFO')
+
+class ReflexiveExpressions:
+
+    logger = logging.getLogger(__name__)
+
+    def __init__(self,parameters:Parameters,aws_s3:S3,local:Local,comprehend:Comprehend):
+        #print(parameters)
+        self.__params = parameters
+        self.__parameters = parameters.all_parameters()
+        self.logger.debug(f"Parameters: {self.__parameters}")
+        self.prefix = self.__parameters['prefix']
+        self.postfix = self.__parameters['postfix']
+        self.local_path = self.__parameters['local_path']
+        self.__s3 = aws_s3
+        self.__local = local
+        self.__comprehend = comprehend
+
+
+    ######## REFLEXIVE EXPRESSION ANALYSIS FUNCTIONS
+
+    def analyse_reflexive_expressions(self,df): #,s3_bucket_name,access_role_arn,entity_recogniser_arn):
+        #self.__bucket_name = s3_bucket_name
+        text = df.text.replace('\r\n','\n') # Comprehend treats \r\n as one character
+        # Upload reflections to S3 for analysis
+        self.__s3.upload_docs(text)
+
+        # Save a copy of reflections locally for offline analysis
+        self.__local.save_docs(text)
+
+        # Submit the job
+        return self.__comprehend.submit_custom_entity_job("reflexive_expressions_analysis") #submitReflexiveExpressionsJob(access_role_arn, entity_recogniser_arn)
+
+    def check_job_status(self):
+        return self.__comprehend.check_job_status()
+
+    def get_job_details(self):
+        return self.__comprehend.get_job_details()
+
+    def download_and_extract(self):
+        local_output_dir = f"{self.local_path}{self.prefix}output{self.postfix}"
+        job_details = self.get_job_details()
+        s3Uri = job_details['OutputDataConfig']['S3Uri']
+        return self.__s3.results_download_save_extract(s3Uri,local_output_dir)
+
+    def extractAnalysisFromResults(self,results):
+        analysis_output = dict()
+        for result in results:
+            j = json.loads(result)
+            #print(j)
+            idx = j["File"].split('_')[-1].split('.')[0]
+            analysis_output[int(idx)] = j["Entities"]
+        return analysis_output
+
+    def add_to_dataframe(self,df,results):
+        # Extract analysis from raw results
+        analysis_output = self.extractAnalysisFromResults(results)
+        # Add results to dataframe
+        results_df = df.copy()
+        results_df['reflexiveResults'] = pd.Series(analysis_output)
+        return results_df
+
+    def reflexive_analytics(self,df):
+        util = Util()
+        custom_df = df.copy()
+        # custom_df["text_length"] = df.text.apply(lambda x: len(x))
+        # if (len(custom_df)>1):
+        #     custom_df["text_scaled"] = util.scale_min_max(custom_df[['text_length']])
+        # else:
+        #     custom_df["text_scaled"] = 1
+        custom_df["reflexive_results"] = df.reflexiveResults
+        # The expressions and their reflexive expression labels
+        custom_df["reflexive_expressions"] = df.reflexiveResults.apply(self.parse_reflexiveResults)
+        # The counts for each labels
+        custom_df["reflexive_counts"] = custom_df.reflexive_expressions.apply(util.count_labels)
+        # Ratios between reflexive expressions
+        custom_df["reflexive_ratio"] = custom_df.reflexive_counts.apply(util.ratios)
+        # Ratio vector
+        custom_df['ratio_vector'] = custom_df.reflexive_ratio.apply(self.make_ratio_vector)
+        # Get the diversity of reflexive types - out of 8 possible types
+        custom_df["reflexive_type_diversity"] = custom_df.reflexive_counts.apply(lambda x: len(x)/8)
+        # A total of all labels
+        custom_df["reflexive_total"] = custom_df.reflexive_counts.apply(util.tuple_values_total)
+        # MinMax scale the reflexive_total
+        if (len(custom_df)>1):
+            custom_df["reflexive_scaled"] = util.scale_min_max(custom_df[['reflexive_total']])
+        else:
+            custom_df["reflexive_scaled"] = 1
+        # Normalise based on text_scaled
+        custom_df['reflexive_norm'] = util.normalise_scaled(custom_df,'reflexive_scaled')
+        return custom_df
+
+
+    # Parse reflexive results - include all above threshold
+    def parse_reflexiveResults(self,reflexiveResults,threshold=0.5):
+        final_refs = list()
+        for ref in reflexiveResults:
+            if ref['Score'] > threshold:
+                final_refs.append((str.lower(ref['Text']),ref['Type']))
+        return final_refs
+
+    # Function for creating a vector out of reflexive ratio - could be used for others
+    def make_ratio_vector(self,ratio_list,ref_codes = ['RR','ER','VR','AR','EP','AF','CN','EV']):
+        ratio_dict = dict(ratio_list)
+        vec = []
+        for rc in ref_codes:
+            if rc in ratio_dict.keys():
+                vec.append(ratio_dict[rc])
+            else:
+                vec.append(0)
+        return vec
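
The custom entity workflow above is asynchronous: reflections are uploaded to S3 and saved locally, a Comprehend custom entities detection job is started against the trained recogniser, and the output is downloaded once the job finishes. A polling sketch, assuming S3.results_download_save_extract returns an iterable of JSON-lines strings in the shape extractAnalysisFromResults expects:

import time

refl = ReflexiveExpressions(params, s3, local, comprehend)
refl.analyse_reflexive_expressions(df)  # uploads df.text to S3, saves a local copy, submits the job

while refl.check_job_status() not in ("COMPLETED", "FAILED", "STOPPED"):
    time.sleep(60)  # custom entity jobs typically take several minutes

results = refl.download_and_extract()    # raw JSON-lines results from the S3 output location
df = refl.add_to_dataframe(df, results)  # adds the reflexiveResults column
df = refl.reflexive_analytics(df)        # expressions, counts, ratios, 8-code ratio_vector, diversity, totals
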
reflexive/aws_connect/__init__.py
File without changes

reflexive/aws_connect/comprehend.py
ADDED
@@ -0,0 +1,205 @@
+#
+
+
+import boto3
+import time
+import json
+import pandas as pd
+
+from reflexive.common.parameters import Parameters
+from reflexive.common.util import Util
+from reflexive.aws_connect.s3 import S3
+
+import logging
+try:
+    import coloredlogs
+    coloredlogs.install(level='INFO')
+except:
+    print("Colored logs not available")
+
+class Comprehend:
+
+    logger = logging.getLogger(__name__)
+
+    def __init__(self,parameters:Parameters):
+        #print(parameters)
+        self.__parameters = parameters.all_parameters()
+        self.logger.debug(f"Parameters: {self.__parameters}")
+        self.region = self.__parameters['region']
+        self.access_role_arn = self.__parameters['comprehend_access_role_arn']
+        self.entity_recogniser_arn = self.__parameters['reflexive_entity_arn']
+        self.local_path = self.__parameters['local_path']
+        self.prefix = self.__parameters['prefix']
+        self.postfix = self.__parameters['postfix']
+        self.bucket_name = self.__parameters["bucket_name"]
+        self.files_folder = f"{self.prefix}files{self.postfix}"
+        self.results_folder = f"{self.prefix}results{self.postfix}"
+        self.input_uri = f"s3://{self.bucket_name}/{self.files_folder}/{self.prefix}"
+        self.output_uri = f"s3://{self.bucket_name}/{self.results_folder}/"
+        self.analysis_types = self.__parameters['analysis_types']
+        # create client
+        try:
+            self.logger.debug(f"Region:{self.region}")
+            self.__comp_client = boto3.client(service_name='comprehend',region_name=self.region)
+        except Exception as err:
+            self.logger.error("Unable to create Comprehend client: ",err)
+
+
+    def client(self):
+        return self.__comp_client
+
+
+    #### CUSTOM ENTITY
+
+    def submit_custom_entity_job(self,job_name): #access_role_arn,entity_recogniser_arn):
+        job_str = f"{self.prefix}{job_name}{self.postfix}"
+
+        response = self.__comp_client.start_entities_detection_job(
+            InputDataConfig={
+                'S3Uri': self.input_uri,
+                'InputFormat': 'ONE_DOC_PER_FILE'
+            },
+            OutputDataConfig={
+                'S3Uri': self.output_uri
+            },
+            DataAccessRoleArn=self.access_role_arn,
+            JobName=job_str,
+            EntityRecognizerArn=self.entity_recogniser_arn,
+            LanguageCode='en'
+        )
+        self.job_id = response['JobId']
+        return response
+
+    # Check job status
+    def check_job_status(self):
+        job_status = self.__comp_client.describe_entities_detection_job(
+            JobId=self.job_id
+        )
+        self.__job_properties = job_status['EntitiesDetectionJobProperties']
+        return self.__job_properties['JobStatus']
+
+    def get_job_details(self):
+        return self.__job_properties
+
+
+    # Use AWS comprehend to get bulk key phrases from single batch of chunked text
+    def get_single_batch_analysis(self,index,chunk):
+        comprehend = self.client()
+        results = {}
+        print("Analysing chunk",index)
+        print(" . key_phrase")
+        kpresult = comprehend.batch_detect_key_phrases(TextList=chunk,LanguageCode='en')
+        results['KeyPhraseResults'] = kpresult
+        #key_phrase_results.append(kpresult)
+        time.sleep(2)
+        print(" . sentiment")
+        senresult = comprehend.batch_detect_sentiment(TextList=chunk,LanguageCode='en')
+        results['SentimentResults'] = senresult
+        #sentiment_results.append(senresult)
+        time.sleep(2)
+        print(" . targeted_sentiment")
+        tsenresult = comprehend.batch_detect_targeted_sentiment(TextList=chunk,LanguageCode='en')
+        results['TargetedSentimentResults'] = tsenresult
+        #target_sent_results.append(tsenresult)
+        time.sleep(2)
+        print(" . syntax")
+        synresult = comprehend.batch_detect_syntax(TextList=chunk,LanguageCode='en')
+        results['SyntaxResults'] = synresult
+        #syntax_results.append(synresult)
+        time.sleep(2)
+        return results
+
+
+    # Use AWS comprehend to get bulk key phrases from chunked text
+    def get_multiple_batch_analysis(self,chunked_text):
+        chunk_results = {}
+        for key in self.analysis_types.keys():
+            chunk_results[key] = []
+
+        for idx,chunk in enumerate(chunked_text):
+            if len(chunked_text) > 4999:
+                print("WARNING: Text too long to analyse - index",idx,"skipped!")
+            else:
+                try:
+                    results = self.get_single_batch_analysis(index=idx,chunk=chunk)
+                except(Exception) as error:
+                    print("There was an error with index",idx,error)
+                finally:
+                    if results:
+                        for key in results.keys():
+                            chunk_results[key].append(results[key])
+
+        return chunk_results
+
+    # Take batched responses and concenate single lists of results, errors, and http responses
+    def unbatch_results(self,result_type,results,batch_size=25):
+        unbatched_results = {}
+        unbatched_errors = {}
+        batch_responses = {}
+        for idx,batch in enumerate(results):
+            #print("Response for batch:",idx)
+            batch_responses[idx] = batch['ResponseMetadata']
+            result_list = batch['ResultList']
+            error_list = batch['ErrorList']
+            for r in result_list:
+                ridx = idx*batch_size + r['Index']
+                rdata = r[result_type]
+                unbatched_results[ridx] = rdata
+            for e in error_list:
+                eidx = e['Index']
+                unbatched_errors[eidx] = 'ERROR' + e['ErrorCode'] + ': ' + e['ErrorMessage']
+        unbatched = {}
+        unbatched['results'] = unbatched_results
+        unbatched['errors'] = unbatched_errors
+        unbatched['responses'] = batch_responses
+        return unbatched
+
+
+
+    def check_long_text(self,df):
+        # Check for long reflections (too long for batch analysis)
+        long_df = df.copy()
+        long_df = long_df[long_df.text.str.len()>5000]
+        long_df['length'] = long_df.text.str.len()
+        return long_df
+
+
+    # def extract_result(self,result,batch,batch_params):
+    #     match batch:
+    #         case "KeyPhraseResults":
+    #             extracted = [r['Text'] for r in result if r['Score'] >= batch_params["min_score"]]
+    #         case "SentimentResults":
+    #             extracted = result
+    #         case "TargetedSentimentResults":
+    #             extracted = dict()
+    #             for r in result:
+    #                 for mention in r['Mentions']:
+    #                     if (mention['Score'] >= batch_params["min_score"]):
+    #                         text = mention['Text']
+    #                         key = f"{mention['Type']}_{mention['MentionSentiment']['Sentiment']}"
+    #                         if key in extracted.keys():
+    #                             extracted[key].add(text)
+    #                         else:
+    #                             extracted[key] = {text}
+    #         case "SyntaxResults":
+    #             tags = []
+    #             tokens = []
+    #             for r in result:
+    #                 pos = r['PartOfSpeech']
+    #                 tag = pos['Tag']
+    #                 if pos['Score'] < batch_params["max_score"]:
+    #                     tag = tag+"_?"
+    #                 tags.append(tag)
+    #                 tokens.append(r['Text'])
+
+    #             extracted = {'tokens':tokens,'tags':tags}
+    #         case other:
+    #             extracted = []
+    #     return extracted
+
+
+
+
+
+
+
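
unbatch_results re-keys per-batch results back to positions in the original document list: a result at in-batch Index i inside batch idx is stored under key idx*batch_size + i, with batch_size defaulting to 25 (the per-request document limit of the Comprehend batch APIs). A small illustration with fabricated response fragments, showing only the re-keying:

# Fabricated fragments that mimic the shape of batch_detect_key_phrases responses.
fake_batches = [
    {"ResponseMetadata": {}, "ErrorList": [],
     "ResultList": [{"Index": 0, "KeyPhrases": ["phrases for doc 0"]},
                    {"Index": 1, "KeyPhrases": ["phrases for doc 1"]}]},
    {"ResponseMetadata": {}, "ErrorList": [],
     "ResultList": [{"Index": 0, "KeyPhrases": ["phrases for doc 25"]}]},
]

# comprehend as in the wiring sketch near the top of this diff.
unbatched = comprehend.unbatch_results("KeyPhrases", fake_batches)
# unbatched['results'] is keyed 0 and 1 (first batch) and 25 (second batch: 1*25 + 0)
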