reflexive-0.1.9-py3-none-any.whl → reflexive-1.0.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reflexive/__init__.py +5 -9
- reflexive/analyse.py +431 -0
- reflexive/cfg.py +118 -0
- reflexive/session.py +265 -0
- reflexive/util.py +125 -0
- reflexive/{visual/display.py → visualise.py} +22 -22
- {reflexive-0.1.9.dist-info → reflexive-1.0.13.dist-info}/METADATA +1 -1
- reflexive-1.0.13.dist-info/RECORD +12 -0
- reflexive/analyse/__init__.py +0 -0
- reflexive/analyse/aws_nlp.py +0 -196
- reflexive/analyse/general.py +0 -128
- reflexive/analyse/reflexive_expressions.py +0 -124
- reflexive/aws_connect/__init__.py +0 -0
- reflexive/aws_connect/comprehend.py +0 -205
- reflexive/aws_connect/s3.py +0 -89
- reflexive/common/__init__.py +0 -0
- reflexive/common/local.py +0 -48
- reflexive/common/parameters.py +0 -77
- reflexive/common/util.py +0 -108
- reflexive/visual/__init__.py +0 -0
- reflexive-0.1.9.dist-info/RECORD +0 -20
- {reflexive-0.1.9.dist-info → reflexive-1.0.13.dist-info}/LICENSE +0 -0
- {reflexive-0.1.9.dist-info → reflexive-1.0.13.dist-info}/LICENSE.txt +0 -0
- {reflexive-0.1.9.dist-info → reflexive-1.0.13.dist-info}/WHEEL +0 -0
- {reflexive-0.1.9.dist-info → reflexive-1.0.13.dist-info}/top_level.txt +0 -0
reflexive/session.py
ADDED
@@ -0,0 +1,265 @@
+
+import boto3
+import time
+import tarfile
+import json
+
+from reflexive import cfg
+import reflexive as rfx
+
+import logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+class AWS:
+
+    config = None
+    aws_session = None
+
+    def __init__(self,config:cfg.Config):
+        # on initialisation create a new session with provided profile (or with default profile)
+        #logger.error(config.get_parameters())
+        if config==None:
+            config = cfg.Config()
+        self.config = config
+        self.new_session()
+
+    def get_parameters(self):
+        return self.__dict__
+
+    def new_session(self):
+        logger.info("In new_session")
+        try:
+            self.aws_session = boto3.Session(profile_name=self.config.aws_profile)
+            self.config.aws_region = self.aws_session.region_name
+            self.config.aws_access_key = self.aws_session.get_credentials().access_key
+            logger.info("Created new AWS session in region %s for profile: %s",self.config.aws_region,self.config.aws_profile)
+
+        except Exception as e:
+            logger.error("Unable to create an AWS session: %s",repr(e))
+
+        try:
+            self.config.aws_account_number = self.aws_session.client('sts').get_caller_identity().get('Account')
+            logger.info("Retrieved account number from AWS")
+        except Exception as e:
+            logger.error("Unable to retrieve account number from AWS: %s",repr(e))
+
+        return self.aws_session
+
+
+class S3:
+
+    aws = None
+    config = None
+    __s3_client = None
+
+    def __init__(self,aws:AWS):
+        self.aws = aws
+        self.config = self.aws.config
+
+        # create client
+        try:
+            logger.debug(f"Region:{self.aws.aws_session.region_name}")
+            self.__s3_client = aws.aws_session.client(service_name='s3')
+        except Exception as err:
+            logger.error("Unable to create S3 client: ",err)
+
+    # Return the S3 client
+    def client(self):
+        return self.__s3_client
+
+    # Function to upload reflections to S3
+    def upload_docs(self,text_series):
+
+        files_folder = f"{self.config.prefix}files{self.config.postfix}"
+
+        s3 = self.__s3_client
+        s3ap = self.config.s3_accesspoint_arn
+        logger.debug(f"ACCESS POINT: {s3ap}")
+
+        logger.info(f"Uploading {len(text_series)} reflections to S3 ({files_folder})...")
+        logger.debug(f"({s3ap}/{files_folder})")
+        for idx in text_series.index:
+            file_name = f"{self.config.prefix}{idx}.txt"
+            file_body = text_series.iloc[idx]
+            logger.info(f"Uploading {file_name}")
+            #print(file_body)
+            response = s3.put_object(Body=file_body,Bucket=s3ap,Key=f"{files_folder}/{file_name}")
+            if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+                logger.error("------------------------------------------------------------")
+                logger.error(f"ERROR: There was a problem with {file_name}")
+                logger.error(response)
+                logger.error("------------------------------------------------------------")
+            else:
+                logger.info('Success')
+        logger.info("Finished uploading reflections to S3.")
+        return response
+
+    # download and save results
+    def results_download_save_extract(self,s3Uri,local_file_path):
+        s3 = self.__s3_client
+        output_key = s3Uri.split(self.config.s3_bucket_name)[1]
+        # download from S3 to local path
+        with open(f"{local_file_path}.tar.gz",'wb') as output_data:
+            s3.download_fileobj(self.config.s3_bucket_name,output_key[1:],output_data)
+
+        # extract the files from tar archive
+        files = list()
+        with tarfile.open(f"{local_file_path}.tar.gz", "r:gz") as tf:
+            for member in tf.getmembers():
+                f = tf.extractfile(member)
+                if f is not None:
+                    content = f.read()
+                    files.append(content)
+        #print("Number of files:",len(files))
+        # extract results and save and return
+        raw_results = files[0].decode("utf-8").split('\n')
+        raw_results.pop() # pop last item off as empty entry due to final \n
+        json_results = json.dumps(raw_results)
+        with open(f"{local_file_path}.json","w") as fp:
+            fp.write(json_results)
+        return json_results
+
+
+class Comprehend:
+
+    aws = None
+    config = None
+    __comp_client = None
+
+    def __init__(self,aws:AWS):
+        self.aws = aws
+        self.config = self.aws.config
+
+        # create client
+        try:
+            logger.debug(f"Region:{self.aws.aws_session.region_name}")
+            self.__comp_client = self.aws.aws_session.client(service_name='comprehend')
+        except Exception as err:
+            logger.error("Unable to create Comprehend client: ",err)
+
+    def client(self):
+        return self.__comp_client
+
+    # Use AWS comprehend to get bulk key phrases from single batch of chunked text
+    def get_single_batch_analysis(self,index,chunk):
+        comp_client = self.client()
+        results = {}
+        print("Analysing chunk",index)
+        print(" . key_phrase")
+        kpresult = comp_client.batch_detect_key_phrases(TextList=chunk,LanguageCode='en')
+        results['KeyPhraseResults'] = kpresult
+        #key_phrase_results.append(kpresult)
+        time.sleep(2)
+        print(" . sentiment")
+        senresult = comp_client.batch_detect_sentiment(TextList=chunk,LanguageCode='en')
+        results['SentimentResults'] = senresult
+        #sentiment_results.append(senresult)
+        time.sleep(2)
+        print(" . targeted_sentiment")
+        tsenresult = comp_client.batch_detect_targeted_sentiment(TextList=chunk,LanguageCode='en')
+        results['TargetedSentimentResults'] = tsenresult
+        #target_sent_results.append(tsenresult)
+        time.sleep(2)
+        print(" . syntax")
+        synresult = comp_client.batch_detect_syntax(TextList=chunk,LanguageCode='en')
+        results['SyntaxResults'] = synresult
+        #syntax_results.append(synresult)
+        time.sleep(2)
+        return results
+
+
+    # Use AWS comprehend to get bulk key phrases from chunked text
+    def get_multiple_batch_analysis(self,chunked_text):
+        chunk_results = {}
+        for key in self.config.analysis_types.keys():
+            chunk_results[key] = []
+
+        for idx,chunk in enumerate(chunked_text):
+            if len(chunked_text) > 4999:
+                print("WARNING: Text too long to analyse - index",idx,"skipped!")
+            else:
+                try:
+                    results = self.get_single_batch_analysis(index=idx,chunk=chunk)
+                except(Exception) as error:
+                    print("There was an error with index",idx,error)
+                finally:
+                    if results:
+                        for key in results.keys():
+                            chunk_results[key].append(results[key])
+
+        return chunk_results
+
+    # Take batched responses and concenate single lists of results, errors, and http responses
+    def unbatch_results(self,result_type,results,batch_size=25):
+        unbatched_results = {}
+        unbatched_errors = {}
+        batch_responses = {}
+        for idx,batch in enumerate(results):
+            #print("Response for batch:",idx)
+            batch_responses[idx] = batch['ResponseMetadata']
+            result_list = batch['ResultList']
+            error_list = batch['ErrorList']
+            for r in result_list:
+                ridx = idx*batch_size + r['Index']
+                rdata = r[result_type]
+                unbatched_results[ridx] = rdata
+            for e in error_list:
+                eidx = e['Index']
+                unbatched_errors[eidx] = 'ERROR' + e['ErrorCode'] + ': ' + e['ErrorMessage']
+        unbatched = {}
+        unbatched['results'] = unbatched_results
+        unbatched['errors'] = unbatched_errors
+        unbatched['responses'] = batch_responses
+        return unbatched
+
+    def check_long_text(self,df):
+        # Check for long reflections (too long for batch analysis)
+        long_df = df.copy()
+        long_df = long_df[long_df.text.str.len()>5000]
+        long_df['length'] = long_df.text.str.len()
+        return long_df
+
+    # #### CUSTOM ENTITY
+
+    def submit_custom_entity_job(self,job_name): #access_role_arn,entity_recogniser_arn):
+        job_str = f"{self.config.prefix}{job_name}{self.config.postfix}"
+
+        response = self.__comp_client.start_entities_detection_job(
+            InputDataConfig={
+                'S3Uri': self.config.s3_input_uri,
+                'InputFormat': 'ONE_DOC_PER_FILE'
+            },
+            OutputDataConfig={
+                'S3Uri': self.config.s3_output_uri
+            },
+            DataAccessRoleArn=self.config.comprehend_access_role_arn,
+            JobName=job_str,
+            EntityRecognizerArn=self.config.reflexive_entity_arn,
+            LanguageCode='en'
+        )
+        self.job_id = response['JobId']
+        self.check_job_status() # force the creation of __job_properties
+        return response
+
+    def get_current_job_id(self):
+        return self.job_id
+
+    # Check job status
+    def check_job_status(self):
+        job_status = self.__comp_client.describe_entities_detection_job(
+            JobId=self.job_id
+        )
+        self.__job_properties = job_status['EntitiesDetectionJobProperties']
+        return self.__job_properties['JobStatus']
+
+    def get_job_details(self):
+        return self.__job_properties
+
+    #checked
+    def download_and_extract(self,s3):
+        local_output_dir = f"{self.config.local_path}{self.config.prefix}output{self.config.postfix}"
+        job_details = self.get_job_details()
+        s3Uri = job_details['OutputDataConfig']['S3Uri']
+        return s3.results_download_save_extract(s3Uri,local_output_dir)
+
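The new reflexive/session.py consolidates the AWS session, S3 and Comprehend wrappers that previously lived under reflexive.aws_connect. A minimal usage sketch follows (not taken from the package docs: the Config defaults, AWS profile and the toy DataFrame are assumptions; the module, class and method names come from the diff above):

    import pandas as pd
    from reflexive import cfg, session, util

    config = cfg.Config()                # assumed to supply aws_profile, analysis_types, S3 settings
    aws = session.AWS(config)            # wraps boto3.Session and looks up the account number

    comprehend = session.Comprehend(aws)
    df = pd.DataFrame({"text": ["I felt I should have planned earlier.",
                                "Next time we will pilot the survey first."]})

    chunks = util.series_to_chunked_list(df.text, maxn=25)    # at most 25 docs per Comprehend batch call
    results = comprehend.get_multiple_batch_analysis(chunks)  # KeyPhrase/Sentiment/TargetedSentiment/Syntax

    s3 = session.S3(aws)
    s3.upload_docs(df.text)              # one <prefix><idx>.txt object per row, via the configured access point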
reflexive/util.py
ADDED
@@ -0,0 +1,125 @@
+import os
+
+import logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+from sklearn.preprocessing import MinMaxScaler
+
+# File functions
+def get_data_path_name(config,name,ext):
+    return f"{config.local_path}{config.prefix}{name}{config.postfix}.{ext}"
+
+def set_sub_dir(config,sub_dir=None):
+    # check dir sub_dir exists
+    if sub_dir:
+        local_dir = f"{config.local_path}{sub_dir}/"
+        logger.debug(f"local_dir: {local_dir}")
+        dirExists = os.path.exists(local_dir)
+        if not dirExists:
+            logger.info(f"Creating subdirectory: {local_dir}")
+            os.makedirs(local_dir)
+    else:
+        local_dir = local_path
+    return local_dir
+
+
+
+# Function to write dictionaries to both json and csv
+def writeDictJsonCSV(dictionary,path_file):
+    with open(f"{path_file}.json",'w') as fp:
+        fp.write(json.dumps(dictionary))
+
+    ngram_df = pd.DataFrame.from_dict(dictionary,orient='index')
+    ngram_df.to_csv(f"{path_file}.csv")
+
+# Data functions
+def sort_dict_by_value(d):
+    return dict(sorted(d.items(), key=lambda x:x[1], reverse=True))
+
+def filter_dict_by_value(ngrams,min_val=3):
+    filtered_ngrams = {}
+    for k,v in ngrams.items():
+        if v >=min_val:
+            filtered_ngrams[k] = v
+    return filtered_ngrams
+
+# Input a series and output a list of lists with each maxn elements
+def series_to_chunked_list(series,maxn=25):
+    l = list(series)
+    return __chunk_list(l,maxn)
+
+# Chunk a list into a list of lists with maxn elements
+def __chunk_list(l,maxn=25):
+    return [l[i:i + maxn] for i in range(0, len(l), maxn)]
+
+# Count named entities
+def count_entities(entities):
+    counts = []
+    for k,v in entities.items():
+        counts.append((k,len(v)))
+    return sorted(counts, key=lambda x: x[1], reverse=True)
+
+# Function for calculating proportions of features
+def ratios(elements):
+    etotal = sum([v[1] for v in elements])
+    if etotal==0:
+        return elements
+    else:
+        proportioned = []
+        for element in elements:
+            prop_val = round((element[1]/etotal),4)
+            proportioned.append((element[0],prop_val))
+        return proportioned
+
+
+
+# Count labels associated with strings
+def count_labels(string_labels):
+    counts = dict()
+    for rt in string_labels:
+        counts[rt[1]] = counts.setdefault(rt[1],0) + 1
+    return sorted(counts.items(), key=lambda x: x[1], reverse=True)
+
+def count_keys(key_count_dict):
+    counts = dict()
+    for k,v in key_count_dict.items():
+        counts[k] = counts.setdefault(k,0) + v
+    return sorted(counts.items(), key=lambda x: x[1], reverse=True)
+
+# Total the values in list of tuples
+def tuple_values_total(tuples):
+    tvs = [t[1] for t in tuples]
+    return sum(tvs)
+
+#### SCALING AND NORMALISING
+
+# Outliers
+
+def outlier_fence(series):
+    bounds = {}
+    stats = series.describe()
+    iqr = stats['75%'] - stats['25%']
+    bounds["IQR"]=iqr
+    upper = stats['75%']+1.5*iqr
+    bounds["UPPER"]=upper
+    lower = stats['25%']-1.5*iqr
+    bounds["LOWER"]=lower
+    return bounds
+
+# MinMax Scaling
+def scale_min_max(df_cols):
+    scaler = MinMaxScaler()
+    return scaler.fit_transform(df_cols)
+
+# Normalise domain term counts
+def normalise_domain_counts(domain_counts,text_size):
+    norms = {}
+    for k,v in domain_counts.items():
+        norms[k] = round(v*text_size,3)
+    return norms
+
+def normalise_scaled(df,feature,norm_feature = 'text_scaled'):
+    tempdf = df[[feature,norm_feature]].copy()
+    tempdf['norm_scaled'] = tempdf.apply(lambda r: round(r[feature]/(r[norm_feature]+0.01),4),axis=1)
+    return tempdf['norm_scaled']
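reflexive/util.py replaces the former Util class with module-level helpers. A short illustrative sketch of the chunking, proportion and outlier helpers (the toy data is invented; the function names and signatures are those in the diff above):

    import pandas as pd
    from reflexive import util

    lengths = pd.Series([120, 340, 290, 4100, 310])

    # Tukey-style fences built from the 25%/75% quartiles; returns the IQR/UPPER/LOWER keys defined above
    fences = util.outlier_fence(lengths)

    # Chunk any series into lists of at most maxn items (used for Comprehend batch limits)
    chunks = util.series_to_chunked_list(lengths, maxn=2)        # [[120, 340], [290, 4100], [310]]

    # Convert (label, count) tuples into proportions, and total them
    props = util.ratios([("NOUN", 6), ("VERB", 4)])              # [("NOUN", 0.6), ("VERB", 0.4)]
    total = util.tuple_values_total([("NOUN", 6), ("VERB", 4)])  # 10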
reflexive/{visual/display.py → visualise.py}
RENAMED
@@ -1,30 +1,30 @@
-
-
-
-#from reflexive.common.parameters import Parameters
-
-import logging
-try:
-    import coloredlogs
-    coloredlogs.install(level='INFO')
-except:
-    print("Colored logs not available")
+from reflexive import session
+from reflexive import cfg
 
 class Display:
+    aws:session.AWS = None
+    config:cfg.Config = None
 
-
+    defaults = {
+        "priority_tags": ["AR","EP","VR_EV_CN","ER_AF","RR","KP"],
+        "colours": {"VR_EV_CN": "#ff6644","ER_AF": "#dd44cc","AR": "#00cc00","EP": "#aacc33","RR": "#00aaff","KP":"#aaaacc"}}
 
-    def __init__(self):
-        self.
-        self.
-        self.
-        self.
-
-
-
+    def __init__(self,aws):
+        self.aws = aws
+        self.aws = aws
+        self.config = self.aws.config
+        self.set_default_parameters()
+
+    def set_default_parameters(self):
+        priority_tags = self.defaults['priority_tags']
+        colours = self.defaults['colours']
+        options = {"ents": colours.keys(), "colors": colours.values}
+        self.config.set_display_parameters(priority_tags,colours,options)
+
+
     def add_reflexive_offsets(self,df):
         temp_df = df.copy()
-        temp_df['reflexive_offsets'] = temp_df.
+        temp_df['reflexive_offsets'] = temp_df.ReflexiveResults.apply(self.collect_reflexive_offsets)
         return temp_df
 
     def add_keyphrase_offsets(self,df):
@@ -49,7 +49,7 @@ class Display:
         #pseudonym = record['pseudonym']
         #point_round = record['point_round']
         #title = f"{pseudonym} ({point_round}) - {timestamp}"
-        tags = self.
+        tags = self.config.display_priority_tags
         text = record['text']
         reflexive_offsets = record['reflexive_offsets']
         keyphrase_offsets = record['keyphrase_offsets']
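Display in the renamed visualise module is now constructed from the session.AWS wrapper and pushes its defaults into the shared Config. A hypothetical wiring sketch (the placeholder DataFrame and its ReflexiveResults/KeyPhraseResults columns are assumptions about upstream analysis output; the constructor and method names are from the diff above):

    import pandas as pd
    from reflexive import cfg, session, visualise

    aws = session.AWS(cfg.Config())
    display = visualise.Display(aws)     # stores aws and aws.config, then calls set_default_parameters()

    # placeholder for the DataFrame produced by the analysis step (columns assumed)
    results_df = pd.DataFrame({"text": [], "ReflexiveResults": [], "KeyPhraseResults": []})

    df = display.add_reflexive_offsets(results_df)   # applies collect_reflexive_offsets to ReflexiveResults
    df = display.add_keyphrase_offsets(df)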
reflexive-1.0.13.dist-info/RECORD
ADDED
@@ -0,0 +1,12 @@
+reflexive/__init__.py,sha256=Ke3gGqrVgPP2IRgifljQL8Ep3qVuuOf4LgZUkxdJQ2k,119
+reflexive/analyse.py,sha256=UzWwgjAFNjeWFkCQ2o99g2vWajf17_OtSq4dFCvuPYU,17489
+reflexive/cfg.py,sha256=Ges35G234P2lvOQHgPZQae5hMSOGyBsmp1bY_yQEKkk,4303
+reflexive/session.py,sha256=MbqwTsYTgq_e_gw3mb1eRv6USs-zZ2cTCrvUNWuKfAQ,10067
+reflexive/util.py,sha256=WQ1oyzDi1i8wQ6IBwBPk6IFy07YKhg-Ug2FsOGVJRJQ,3649
+reflexive/visualise.py,sha256=weBNqd3uiCEg3bvLLNDPVxSkdjapm1jZrTw8cU3uZx8,4032
+reflexive-1.0.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+reflexive-1.0.13.dist-info/LICENSE.txt,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+reflexive-1.0.13.dist-info/METADATA,sha256=oT1mUVaMPVDIZ7oQdqe628lqmJROtjJhXfwuzDaRWmk,12038
+reflexive-1.0.13.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+reflexive-1.0.13.dist-info/top_level.txt,sha256=pOMr-QGleRBRCFBozgvM-UUUmOjD_-naJfu1522E2V8,10
+reflexive-1.0.13.dist-info/RECORD,,
reflexive/analyse/__init__.py
DELETED
File without changes
reflexive/analyse/aws_nlp.py
DELETED
@@ -1,196 +0,0 @@
-
-
-from reflexive.common.parameters import Parameters
-from reflexive.common.local import Local
-from reflexive.aws_connect.comprehend import Comprehend
-from reflexive.common.util import Util
-
-import json
-import logging
-import pandas as pd
-
-try:
-    import coloredlogs
-    coloredlogs.install(level='INFO')
-except:
-    print("Colored logs not available")
-
-class Nlp:
-
-    logger = logging.getLogger(__name__)
-
-    def __init__(self,parameters:Parameters,local:Local,comprehend:Comprehend):
-        self.__parameters = parameters.all_parameters()
-        self.logger.debug(f"Parameters: {self.__parameters}")
-        self.local_path = self.__parameters['local_path']
-        self.prefix = self.__parameters['prefix']
-        self.postfix = self.__parameters['postfix']
-        self.analysis_types = self.__parameters['analysis_types']
-        self.__local = local
-        self.__comprehend = comprehend
-
-
-
-    #### COMPREHEND ANALYSIS
-
-    def comprehend_analysis(self,df):
-        util = Util()
-        comprehend = self.__comprehend
-        self.analysis_types = self.__parameters['analysis_types']
-        #print(type(df.text))
-        # chunk the text for batch analysis
-        chunked_text = util.series_to_chunked_list(series=df.text)
-        print("Number of chunks:",len(chunked_text))
-        # start batch analysis
-        chunked_results = comprehend.get_multiple_batch_analysis(chunked_text)
-        print("Finished Analysis.")
-        # write to file
-        print("Writing data to file...")
-        with open(f"{self.local_path}{self.prefix}analysis_chunks{self.postfix}.json", "w") as fp:
-            json.dump(chunked_results,fp)
-        print("DONE!")
-        # unchunk
-        final_results = {}
-        for key in chunked_results.keys():
-            final_results[key] = comprehend.unbatch_results(self.analysis_types[key],chunked_results[key])
-            print("Finished Unbatching",key," - Writing data to file...")
-            filename = f"{self.local_path}{self.prefix}{key}{self.postfix}.json"
-            with open(filename, "w") as fp:
-                json.dump(final_results[key],fp)
-            print("DONE!")
-        # Save final_results for reload if necessary
-        with open(f"{self.local_path}{self.prefix}final_results{self.postfix}.json", "w") as fp:
-            json.dump(final_results,fp)
-        return final_results
-
-    def check_results(self,results):
-        print("Checking for errors...")
-        for key in results.keys():
-            errors = results[key]['errors']
-            print(f"Errors for {key}: {errors}")
-        print()
-        print("Checking that we have results for all docs")
-        for key in results.keys():
-            num_results= len(results[key]['results'])
-            print(f"Number of results for {key}: {num_results}")
-        return errors
-
-    def add_results_to_df(self,results,df):
-        for key in results.keys():
-            rs = results[key]['results']
-            newresults = {}
-            for oldkey in rs.keys():
-                newresults[int(oldkey)] = rs[oldkey] # Need to change keys to int to properly add to dataframe
-            df[key] = pd.Series(newresults)
-        return df
-
-    def nlp_analytics(self,df):
-        temp_df = df.copy()
-        temp_df = self.keyphrase_analytics(temp_df)
-        temp_df = self.named_entity_analytics(temp_df)
-        temp_df = self.targeted_sentiment_analytics(temp_df)
-        temp_df = self.syntax_analytics(temp_df)
-        return temp_df
-
-
-    def keyphrase_analytics(self,df):
-        util = Util()
-        df["key_phrases"] = df.KeyPhraseResults.apply(self.parse_keyPhraseResults)
-        df["key_phrase_counts"] = df.key_phrases.apply(util.count_keys)
-        df["key_phrases_total"] = df.key_phrase_counts.apply(util.tuple_values_total)
-        if (len(df)>1):
-            df["key_phrases_scaled"] = util.scale_min_max(df[['key_phrases_total']])
-        else:
-            df["key_phrases_scaled"] = 1
-        # Normalise based on text_scaled
-        df['key_phrases_norm'] = util.normalise_scaled(df,'key_phrases_scaled')
-        return df
-
-    def named_entity_analytics(self,df):
-        util = Util()
-        df["named_entities"] = df.TargetedSentimentResults.apply(self.parse_namedEntities)
-        df['named_entity_counts'] = df.named_entities.apply(util.count_entities)
-        df["named_entity_ratios"] = df.named_entity_counts.apply(util.ratios)
-        df["named_entities_total"] = df.named_entity_counts.apply(util.tuple_values_total)
-        if (len(df)>1):
-            df["named_entities_scaled"] = util.scale_min_max(df[['named_entities_total']])
-        else:
-            df["named_entities_scaled"] = 1
-        df['named_entities_norm'] = util.normalise_scaled(df,'named_entities_scaled')
-        return df
-
-    def targeted_sentiment_analytics(self,df):
-        util = Util()
-        df["targeted_sentiment"] = df.TargetedSentimentResults.apply(self.parse_targetedSentimentResults)
-        df['targeted_sentiment_counts'] = df.targeted_sentiment.apply(util.count_entities)
-        df["targeted_sentiment_ratios"] = df.targeted_sentiment_counts.apply(util.ratios)
-        df["targeted_sentiment_total"] = df.targeted_sentiment_counts.apply(util.tuple_values_total)
-        if (len(df)>1):
-            df["targeted_sentiment_scaled"] = util.scale_min_max(df[['targeted_sentiment_total']])
-        else:
-            df["targeted_sentiment_scaled"] = 1
-        df['targeted_sentiment_norm'] = util.normalise_scaled(df,'targeted_sentiment_scaled')
-        return df
-
-    def syntax_analytics(self,df):
-        util = Util()
-        df["pos_tags"] = df.SyntaxResults.apply(self.parse_syntaxResults)
-        df['pos_tag_counts'] = df.pos_tags.apply(util.count_labels)
-        df["pos_tag_ratios"] = df.pos_tag_counts.apply(util.ratios)
-        df["pos_tags_total"] = df.pos_tag_counts.apply(util.tuple_values_total)
-        if (len(df)>1):
-            df["pos_tags_scaled"] = util.scale_min_max(df[['pos_tags_total']])
-        else:
-            df["pos_tags_scaled"] = 1
-        df['pos_tags_norm'] = util.normalise_scaled(df,'pos_tags_scaled')
-        return df
-
-
-    # Parse key_phrases results - include all above threshold
-    def parse_keyPhraseResults(self,keyPhraseResults,threshold=0.95,min_count=1):
-        util = Util()
-        phrases = {}
-        filtered = [str.lower(r['Text']) for r in keyPhraseResults if r['Score'] > threshold]
-        for phrase in filtered:
-            phrases[phrase] = phrases.get(phrase,0)+1
-
-        filtered_phrases = {k:v for k,v in phrases.items() if v >= min_count}
-        return util.sort_dict_by_value(filtered_phrases)
-
-    # Parse syntax results - include specific postags
-    def parse_syntaxResults(self,syntax_results,postags_keep = ['ADV','VERB','AUX','ADJ','NOUN','PRON','PROPN']):
-        sequence = list()
-        for token in syntax_results:
-            tag = token['PartOfSpeech']['Tag']
-            if tag in postags_keep:
-                sequence.append((str.lower(token['Text']),tag))
-        return sequence
-
-    # Parse targeted sentiment results - keep non-neutral above threshold
-
-    def parse_targetedSentimentResults(self,targetedSentiment_results,threshold = 0.4):
-        sents = dict()
-        for grp in targetedSentiment_results:
-            for mention in grp["Mentions"]:
-                if mention['Score'] >= threshold:
-                    if not "NEUTRAL" in mention['MentionSentiment']['Sentiment']:
-                        k = mention['MentionSentiment']['Sentiment']
-                        text = str.lower(mention['Text'])
-                        sents.setdefault(k,{text}).add(text)
-        for k,v in sents.items():
-            sents[k] = list(v) # change set to list
-        return sents
-
-    # Parse targeted sentiment results for named entities
-    def parse_namedEntities(self,targetedSentimentResults,threshold = 0.1):
-        ents = dict()
-        for grp in targetedSentimentResults:
-            for mention in grp["Mentions"]:
-                if mention['Score'] >= threshold:
-                    k = mention['Type']
-                    text = str.lower(mention['Text'])
-                    ents.setdefault(k,{text}).add(text)
-        for k,v in ents.items():
-            ents[k] = list(v) # change set to list
-        return ents
-