reflexive-1.2.8-py3-none-any.whl → reflexive-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reflexive/__init__.py +3 -18
- reflexive/analysis_functions.py +383 -0
- reflexive/display_functions.py +165 -0
- reflexive/res_analysis.py +217 -0
- reflexive/service.py +58 -0
- {reflexive-1.2.8.dist-info → reflexive-2.0.0.dist-info}/METADATA +1 -1
- reflexive-2.0.0.dist-info/RECORD +9 -0
- reflexive/analyse.py +0 -430
- reflexive/cfg.py +0 -116
- reflexive/res.py +0 -225
- reflexive/res_functions.py +0 -62
- reflexive/session.py +0 -264
- reflexive/util.py +0 -127
- reflexive/visualise.py +0 -355
- reflexive-1.2.8.dist-info/RECORD +0 -12
- {reflexive-1.2.8.dist-info → reflexive-2.0.0.dist-info}/WHEEL +0 -0
- {reflexive-1.2.8.dist-info → reflexive-2.0.0.dist-info}/licenses/LICENSE +0 -0
reflexive/res.py
DELETED
@@ -1,225 +0,0 @@
-import json
-from pandas import DataFrame
-from spacy import displacy
-from time import sleep
-from reflexive import Config
-from reflexive import AWS
-from reflexive import S3
-from reflexive import Comprehend
-from reflexive import Nlp
-from reflexive import Display
-from reflexive import RES_graph
-
-class Res_analyse:
-
-    config:Config
-    aws:AWS
-    s3:S3
-    comprehend:Comprehend
-    nlp:Nlp
-
-    def __init__(self,aws_profile="default") -> None:
-        return self._setup(aws_profile)
-
-    def _setup(self,aws_profile) -> None:
-        self.config = Config(aws_profile)
-        self.config.set_parameters(name_prefix="RES")
-        self.aws = AWS(self.config)
-        return None
-
-    def set_parameters(self,
-                       s3_access_point:str,
-                       s3_bucket_name:str,
-                       comprehend_service_role_name:str,
-                       reflexive_entity_name:str,
-                       reflexive_entity_version:str) -> str:
-        self.config.set_s3_parameters(s3_access_point,s3_bucket_name)
-        self.config.set_comprehend_parameters(comprehend_service_role_name)
-        self.config.set_comprehend_custom_entity_parameters(reflexive_entity_name,reflexive_entity_version)
-        params = self.config.get_parameters()
-        return json.dumps(params, indent=2)
-
-    def setup_aws(self) -> None:
-        # Create a new S3 client
-        self.s3 = S3(self.aws)
-        # Create a new Comprehend client
-        self.comprehend = Comprehend(self.aws)
-        # Create an Nlp object to perform analysis on the text
-        self.nlp = Nlp(self.aws)
-        return None
-
-    def get_basic_analytics(self,df:DataFrame) -> DataFrame:
-
-        # Text length - this is needed for comprehend analytics
-        df = self.nlp.text_length(df)
-        #df = nlp.remove_IQR_outliers(df)
-        # Comprehend analysis
-        results = self.nlp.comprehend_analysis(self.comprehend,df)
-        #print(results)
-        errors = self.nlp.check_results(results)
-        #print(errors)
-        if errors=={}:
-            print("No errors, so adding results to dataframe")
-            df = self.nlp.add_results_to_df(results,df)
-            df = self.nlp.comprehend_analytics(df)
-        return df
-
-    def get_reflexive_analytics(self,df:DataFrame) -> DataFrame:
-        # Reflexive expression analysis
-        response = self.nlp.analyse_reflexive_expressions(df,self.s3,self.comprehend)
-        #print(response)
-        job_id = self.comprehend.get_current_job_id()
-        print("Job ID:",job_id)
-        status = self.comprehend.check_job_status()
-        print("Status:",status)
-
-        # Get the details of the job
-        # details = comp.get_job_details()
-        # print("Job details:",details)
-
-        inc = 0
-        while status=="SUBMITTED" or status=="IN_PROGRESS":
-            print("Waiting 10 seconds...")
-            sleep(10)
-            status = self.comprehend.check_job_status()
-            print(f"Job status {inc}:",status)
-            inc += 1
-
-        # Download from S3 and extract results
-        print("Downloading and extracting results...")
-        results = self.comprehend.download_and_extract(self.s3)
-        print("RESULTS:")
-        print(results)
-
-        # Extract output of analysis and add to df
-        return self.nlp.add_to_dataframe(df,results)
-
-class Res_display:
-
-    res_analyse:Res_analyse
-    vis:Display
-
-    def __init__(self,res:Res_analyse) -> None:
-        return self._setup(res)
-
-    def _setup(self,res:Res_analyse) -> None:
-        self.res_analyse = res
-        self.vis = Display(res.aws)
-        return None
-
-    def show_text(self,df:DataFrame,inline=True) -> str:
-        df = self.vis.add_offsets(df)
-        disp_data = self.vis.create_displacy(df)
-        if inline:
-            displacy.render(disp_data,manual=True,style="ent", options=self.res_analyse.config.display_options)
-            html_out = "Set inline to false to produce HTML"
-        else:
-            html_out = displacy.render(disp_data,manual=True,style="ent", options=self.res_analyse.config.display_options,page=True,jupyter=False)
-        return html_out
-
-    def get_interactions(self,df:DataFrame) -> DataFrame:
-        #Get RE sequence
-        df = self._add_res_sequence(df)
-        df = self._add_res_interactions(df)
-        df = self._add_res_weights(df)
-        df = self._add_res_adj_matrix(df)
-        return df
-
-    def show_graph(self,df:DataFrame,scale=10,inline=True) -> str:
-        for am in df.res_adj_matrix:
-            if scale > 1:
-                sm = self._scale_adj_matrix(am,scale)
-            else:
-                sm = am
-            g = RES_graph(sm)
-            g.show()
-        return ""
-
-    def _scale_adj_matrix(self,adj_matrix,scale):
-        new_adj = []
-        for row in adj_matrix:
-            new_row = []
-            for c in row:
-                new_row.append(round(c*scale,1))
-            new_adj.append(new_row)
-        return new_adj
-
-    def _add_res_sequence(self,df):
-        temp_df = df.copy()
-        temp_df['res_sequence'] = temp_df.reflexive_expressions.apply(self._get_res_sequence)
-        return temp_df
-
-    def _add_res_interactions(self,df):
-        temp_df = df.copy()
-        temp_df['res_interactions'] = temp_df.res_sequence.apply(self._count_res_interactions)
-        return temp_df
-
-    def _add_res_weights(self,df):
-        temp_df = df.copy()
-        temp_df['res_weights'] = temp_df.res_interactions.apply(self._calc_res_weights)
-        return temp_df
-
-    def _add_res_adj_matrix(self,df):
-        temp_df = df.copy()
-        temp_df['res_adj_matrix'] = temp_df.res_weights.apply(self._create_adj_matrix)
-        return temp_df
-
-    def _get_res_sequence(self,reflexive_expressions):
-        re_seq = [label for re,label in reflexive_expressions]
-        res_seq = []
-        # Need to substitute new RES labels for old RE labels
-        for re in re_seq:
-            if re=='ER' or re=='VR':
-                res_seq.append('NR')
-            elif re=='EV':
-                res_seq.append('EP')
-            elif re=='CN':
-                res_seq.append('AF')
-            else:
-                res_seq.append(re)
-        return res_seq
-
-    def _empty_res_interactions(self) -> dict[tuple,int]:
-        RE_types = ['RR','NR','AR','AF','EP']
-        RE_interactions:dict[tuple,int] = dict()
-        for t1 in RE_types:
-            for t2 in RE_types:
-                entry = tuple(sorted((t1,t2)))
-                if entry not in RE_interactions.keys():
-                    RE_interactions[entry] = 0
-        return RE_interactions
-
-    def _count_res_interactions(self,re_sequence:list[str]) -> dict[tuple,int]:
-        re_ints = self._empty_res_interactions()
-        limit = len(re_sequence)-1
-        for i,s in enumerate(re_sequence):
-            if i < limit:
-                rei = tuple(sorted((s,re_sequence[i+1])))
-                #print(i,rei)
-                re_ints[rei] += 1
-        return re_ints
-
-    def _calc_res_weights(self,interactions:dict[tuple,int])->dict[tuple,float]:
-        max_count = max(interactions.values())
-        weights = dict()
-        for edge,count in interactions.items():
-            weights[edge] = round(count/(max_count),2)
-        return weights
-
-
-    def _create_adj_matrix(self,weights:dict[tuple,float])->list[list[float]]:
-        re_types = ["RR","NR","AR","AF","EP"]
-        matrix = []
-        for r in re_types:
-            row = []
-            for c in re_types:
-                key = tuple(sorted((r,c)))
-                #print(key)
-                weight = weights.get(key,0)
-                row.append(weight)
-            matrix.append(row)
-        return matrix
-
-
-
-
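For context on what this deletion removes, the following is a minimal, hypothetical sketch of how the 1.x Res_analyse/Res_display workflow above was typically driven, assuming the 1.x __init__ re-exported both classes. Every AWS identifier and the sample dataframe are placeholders, not real resources.

# Hypothetical 1.x usage, reconstructed from the deleted classes above.
import pandas as pd
from reflexive import Res_analyse, Res_display  # assumes the 1.x __init__ re-exported these

ra = Res_analyse(aws_profile="default")
print(ra.set_parameters(s3_access_point="example-access-point",
                        s3_bucket_name="example-bucket",
                        comprehend_service_role_name="ExampleComprehendRole",
                        reflexive_entity_name="ExampleRecogniser",
                        reflexive_entity_version="1"))
ra.setup_aws()                          # builds the S3, Comprehend and Nlp helpers

df = pd.DataFrame({"text": ["I felt I understood the task better this week."]})
df = ra.get_basic_analytics(df)         # text length plus batched Comprehend analytics
df = ra.get_reflexive_analytics(df)     # async custom-entity job, polled every 10 seconds

rd = Res_display(ra)
rd.show_text(df)                        # displaCy entity rendering of the expressions
df = rd.get_interactions(df)            # sequence -> interactions -> weights -> adjacency matrix
rd.show_graph(df, scale=10)             # one RES_graph per reflection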
reflexive/res_functions.py
DELETED
@@ -1,62 +0,0 @@
-# import text into dataframe
-# accepts either an iterable of strings, or an iterable over text files
-# returns a pandas series iterable of type string
-
-# clean text and calculate length
-# accepts an iterable of strings in form of pandas series of type string
-# returns a pandas series iterable of type int
-
-
-# chunk text and keep original index ref
-# accepts an iterable of dataframe rows
-# returns an iterable of dataframe rows with added column 'text_chunks' - list of strings
-
-# upload docs to s3 and save local copy - side effects
-# accepts an iterable of iterable of chunks (with ids)
-# returns an an iterable of s3 responses? URLs to S3 file?
-
-# initiate custom entity job on comprehend
-# no parameters
-# returns job id for checking status, and downloading
-
-# check status
-# accepts job id
-# returns status
-
-# download results
-# accepts job id
-# returns iterable of results
-
-# unpack results and load into dataframe
-
-
-# extract reflexive expressions into dataframe
-
-
-# get reflexive sequences
-
-
-# get interactions
-
-
-# create count adj matrix
-
-
-# create weighted adj matrix
-
-
-# save dataframe to file
-
-
-# visualise expressions in text
-
-
-# visualise reflexive sequence
-
-
-# visualise res graph
-
-
-#
-# Network analysis functions
-#
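The deleted module above was only a commented plan for the pipeline (import, clean, chunk, upload, run the Comprehend job, download, unpack, analyse, visualise). As a rough illustration of the first few commented steps only, here is a hypothetical sketch; the function names and signatures are invented for this illustration and never existed in the package.

# Hypothetical sketch of the first steps described in the comments above;
# names and signatures are illustrative only.
from typing import Iterable
import pandas as pd

def import_text(texts: Iterable[str]) -> pd.Series:
    # import text into a pandas Series of strings
    return pd.Series(list(texts), dtype="string")

def text_lengths(texts: pd.Series) -> pd.Series:
    # clean text and calculate length per document
    return texts.str.strip().str.len().astype(int)

def chunk_text(df: pd.DataFrame, max_chars: int = 4999) -> pd.DataFrame:
    # chunk text while keeping the original index ref; adds a 'text_chunks' column
    out = df.copy()
    out["text_chunks"] = out["text"].apply(
        lambda t: [t[i:i + max_chars] for i in range(0, len(t), max_chars)])
    return out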
reflexive/session.py
DELETED
@@ -1,264 +0,0 @@
-
-import boto3
-import time
-import tarfile
-import json
-
-from reflexive import cfg
-
-import logging
-#logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger(__name__)
-
-class AWS:
-
-    config = None
-    aws_session = None
-
-    def __init__(self,config:cfg.Config):
-        # on initialisation create a new session with provided profile (or with default profile)
-        #logger.error(config.get_parameters())
-        if config is None:
-            config = cfg.Config()
-        self.config = config
-        self.new_session()
-
-    def get_parameters(self):
-        return self.__dict__
-
-    def new_session(self):
-        logger.info("In new_session")
-        try:
-            self.aws_session = boto3.Session(profile_name=self.config.aws_profile)
-            self.config.aws_region = self.aws_session.region_name
-            self.config.aws_access_key = self.aws_session.get_credentials().access_key
-            logger.info("Created new AWS session in region %s for profile: %s",self.config.aws_region,self.config.aws_profile)
-
-        except Exception as e:
-            logger.error("Unable to create an AWS session: %s",repr(e))
-
-        try:
-            self.config.aws_account_number = self.aws_session.client('sts').get_caller_identity().get('Account')
-            logger.info("Retrieved account number from AWS")
-        except Exception as e:
-            logger.error("Unable to retrieve account number from AWS: %s",repr(e))
-
-        return self.aws_session
-
-
-class S3:
-
-    aws = None
-    config = None
-    __s3_client = None
-
-    def __init__(self,aws:AWS):
-        self.aws = aws
-        self.config = self.aws.config
-
-        # create client
-        try:
-            logger.debug(f"Region:{self.aws.aws_session.region_name}")
-            self.__s3_client = aws.aws_session.client(service_name='s3')
-        except Exception as err:
-            logger.error("Unable to create S3 client: ",err)
-
-    # Return the S3 client
-    def client(self):
-        return self.__s3_client
-
-    # Function to upload reflections to S3
-    def upload_docs(self,text_series):
-
-        files_folder = f"{self.config.prefix}files{self.config.postfix}"
-
-        s3 = self.__s3_client
-        s3ap = self.config.s3_accesspoint_arn
-        logger.debug(f"ACCESS POINT: {s3ap}")
-
-        logger.info(f"Uploading {len(text_series)} reflections to S3 ({files_folder})...")
-        logger.debug(f"({s3ap}/{files_folder})")
-        for idx in text_series.index:
-            file_name = f"{self.config.prefix}{idx}.txt"
-            file_body = text_series.iloc[idx]
-            logger.info(f"Uploading {file_name}")
-            #print(file_body)
-            response = s3.put_object(Body=file_body,Bucket=s3ap,Key=f"{files_folder}/{file_name}")
-            if response['ResponseMetadata']['HTTPStatusCode'] != 200:
-                logger.error("------------------------------------------------------------")
-                logger.error(f"ERROR: There was a problem with {file_name}")
-                logger.error(response)
-                logger.error("------------------------------------------------------------")
-            else:
-                logger.info('Success')
-        logger.info("Finished uploading reflections to S3.")
-        return response
-
-    # download and save results
-    def results_download_save_extract(self,s3Uri,local_file_path):
-        s3 = self.__s3_client
-        output_key = s3Uri.split(self.config.s3_bucket_name)[1]
-        # download from S3 to local path
-        with open(f"{local_file_path}.tar.gz",'wb') as output_data:
-            s3.download_fileobj(self.config.s3_bucket_name,output_key[1:],output_data)
-
-        # extract the files from tar archive
-        files = list()
-        with tarfile.open(f"{local_file_path}.tar.gz", "r:gz") as tf:
-            for member in tf.getmembers():
-                f = tf.extractfile(member)
-                if f is not None:
-                    content = f.read()
-                    files.append(content)
-        #print("Number of files:",len(files))
-        # extract results and save and return
-        raw_results = files[0].decode("utf-8").split('\n')
-        raw_results.pop() # pop last item off as empty entry due to final \n
-        json_results = json.dumps(raw_results)
-        with open(f"{local_file_path}.json","w") as fp:
-            fp.write(json_results)
-        return json_results
-
-
-class Comprehend:
-
-    aws = None
-    config = None
-    __comp_client = None
-
-    def __init__(self,aws:AWS):
-        self.aws = aws
-        self.config = self.aws.config
-
-        # create client
-        try:
-            logger.debug(f"Region:{self.aws.aws_session.region_name}")
-            self.__comp_client = self.aws.aws_session.client(service_name='comprehend')
-        except Exception as err:
-            logger.error("Unable to create Comprehend client: ",err)
-
-    def client(self):
-        return self.__comp_client
-
-    # Use AWS comprehend to get bulk key phrases from single batch of chunked text
-    def get_single_batch_analysis(self,index,chunk):
-        comp_client = self.client()
-        results = {}
-        print("Analysing chunk",index)
-        print(" . key_phrase")
-        kpresult = comp_client.batch_detect_key_phrases(TextList=chunk,LanguageCode='en')
-        results['KeyPhraseResults'] = kpresult
-        #key_phrase_results.append(kpresult)
-        time.sleep(2)
-        print(" . sentiment")
-        senresult = comp_client.batch_detect_sentiment(TextList=chunk,LanguageCode='en')
-        results['SentimentResults'] = senresult
-        #sentiment_results.append(senresult)
-        time.sleep(2)
-        print(" . targeted_sentiment")
-        tsenresult = comp_client.batch_detect_targeted_sentiment(TextList=chunk,LanguageCode='en')
-        results['TargetedSentimentResults'] = tsenresult
-        #target_sent_results.append(tsenresult)
-        time.sleep(2)
-        print(" . syntax")
-        synresult = comp_client.batch_detect_syntax(TextList=chunk,LanguageCode='en')
-        results['SyntaxResults'] = synresult
-        #syntax_results.append(synresult)
-        time.sleep(2)
-        return results
-
-
-    # Use AWS comprehend to get bulk key phrases from chunked text
-    def get_multiple_batch_analysis(self,chunked_text):
-        chunk_results = {}
-        for key in self.config.analysis_types.keys():
-            chunk_results[key] = []
-
-        for idx,chunk in enumerate(chunked_text):
-            if len(chunked_text) > 4999:
-                print("WARNING: Text too long to analyse - index",idx,"skipped!")
-            else:
-                try:
-                    results = self.get_single_batch_analysis(index=idx,chunk=chunk)
-                except(Exception) as error:
-                    print("There was an error with index",idx,error)
-                finally:
-                    if results:
-                        for key in results.keys():
-                            chunk_results[key].append(results[key])
-
-        return chunk_results
-
-    # Take batched responses and concenate single lists of results, errors, and http responses
-    def unbatch_results(self,result_type,results,batch_size=25):
-        unbatched_results = {}
-        unbatched_errors = {}
-        batch_responses = {}
-        for idx,batch in enumerate(results):
-            #print("Response for batch:",idx)
-            batch_responses[idx] = batch['ResponseMetadata']
-            result_list = batch['ResultList']
-            error_list = batch['ErrorList']
-            for r in result_list:
-                ridx = idx*batch_size + r['Index']
-                rdata = r[result_type]
-                unbatched_results[ridx] = rdata
-            for e in error_list:
-                eidx = e['Index']
-                unbatched_errors[eidx] = 'ERROR' + e['ErrorCode'] + ': ' + e['ErrorMessage']
-        unbatched = {}
-        unbatched['results'] = unbatched_results
-        unbatched['errors'] = unbatched_errors
-        unbatched['responses'] = batch_responses
-        return unbatched
-
-    def check_long_text(self,df):
-        # Check for long reflections (too long for batch analysis)
-        long_df = df.copy()
-        long_df = long_df[long_df.text.str.len()>5000]
-        long_df['length'] = long_df.text.str.len()
-        return long_df
-
-    # #### CUSTOM ENTITY
-
-    def submit_custom_entity_job(self,job_name): #access_role_arn,entity_recogniser_arn):
-        job_str = f"{self.config.prefix}{job_name}{self.config.postfix}"
-
-        response = self.__comp_client.start_entities_detection_job(
-            InputDataConfig={
-                'S3Uri': self.config.s3_input_uri,
-                'InputFormat': 'ONE_DOC_PER_FILE'
-            },
-            OutputDataConfig={
-                'S3Uri': self.config.s3_output_uri
-            },
-            DataAccessRoleArn=self.config.comprehend_access_role_arn,
-            JobName=job_str,
-            EntityRecognizerArn=self.config.reflexive_entity_arn,
-            LanguageCode='en'
-        )
-        self.job_id = response['JobId']
-        self.check_job_status() # force the creation of __job_properties
-        return response
-
-    def get_current_job_id(self):
-        return self.job_id
-
-    # Check job status
-    def check_job_status(self):
-        job_status = self.__comp_client.describe_entities_detection_job(
-            JobId=self.job_id
-        )
-        self.__job_properties = job_status['EntitiesDetectionJobProperties']
-        return self.__job_properties['JobStatus']
-
-    def get_job_details(self):
-        return self.__job_properties
-
-    #checked
-    def download_and_extract(self,s3):
-        local_output_dir = f"{self.config.local_path}{self.config.prefix}output{self.config.postfix}"
-        job_details = self.get_job_details()
-        s3Uri = job_details['OutputDataConfig']['S3Uri']
-        return s3.results_download_save_extract(s3Uri,local_output_dir)
-
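The index arithmetic in unbatch_results above (ridx = idx*batch_size + r['Index']) maps each batch-local index back to a global document index. A small self-contained illustration with fabricated response dictionaries shaped like Comprehend batch output; no AWS call is made:

# Illustration only: fake batches shaped like batch_detect_sentiment responses,
# flattened with the same offset arithmetic as unbatch_results.
batch_size = 2
batches = [
    {"ResponseMetadata": {"HTTPStatusCode": 200},
     "ResultList": [{"Index": 0, "Sentiment": "POSITIVE"},
                    {"Index": 1, "Sentiment": "NEUTRAL"}],
     "ErrorList": []},
    {"ResponseMetadata": {"HTTPStatusCode": 200},
     "ResultList": [{"Index": 0, "Sentiment": "NEGATIVE"}],
     "ErrorList": []},
]

flat = {}
for batch_idx, batch in enumerate(batches):
    for r in batch["ResultList"]:
        # global document index = batch offset + position within the batch
        flat[batch_idx * batch_size + r["Index"]] = r["Sentiment"]

print(flat)  # {0: 'POSITIVE', 1: 'NEUTRAL', 2: 'NEGATIVE'}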
reflexive/util.py
DELETED
@@ -1,127 +0,0 @@
-import os
-import json
-import pandas as pd
-from sklearn.preprocessing import MinMaxScaler
-import logging
-#logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger(__name__)
-
-
-
-# File functions
-def get_data_path_name(config,name,ext):
-    return f"{config.local_path}{config.prefix}{name}{config.postfix}.{ext}"
-
-def set_sub_dir(config,sub_dir=None):
-    # check dir sub_dir exists
-    if sub_dir:
-        local_dir = f"{config.local_path}{sub_dir}/"
-        logger.debug(f"local_dir: {local_dir}")
-        dirExists = os.path.exists(local_dir)
-        if not dirExists:
-            logger.info(f"Creating subdirectory: {local_dir}")
-            os.makedirs(local_dir)
-    else:
-        local_dir = config.local_path
-    return local_dir
-
-
-
-# Function to write dictionaries to both json and csv
-def writeDictJsonCSV(dictionary,path_file):
-    with open(f"{path_file}.json",'w') as fp:
-        fp.write(json.dumps(dictionary))
-
-    ngram_df = pd.DataFrame.from_dict(dictionary,orient='index')
-    ngram_df.to_csv(f"{path_file}.csv")
-
-# Data functions
-def sort_dict_by_value(d):
-    return dict(sorted(d.items(), key=lambda x:x[1], reverse=True))
-
-def filter_dict_by_value(ngrams,min_val=3):
-    filtered_ngrams = {}
-    for k,v in ngrams.items():
-        if v >=min_val:
-            filtered_ngrams[k] = v
-    return filtered_ngrams
-
-# Input a series and output a list of lists with each maxn elements
-def series_to_chunked_list(series,maxn=25):
-    lst = list(series)
-    return __chunk_list(lst,maxn)
-
-# Chunk a list into a list of lists with maxn elements
-def __chunk_list(lst,maxn=25):
-    return [lst[i:i + maxn] for i in range(0, len(lst), maxn)]
-
-# Count named entities
-def count_entities(entities):
-    counts = []
-    for k,v in entities.items():
-        counts.append((k,len(v)))
-    return sorted(counts, key=lambda x: x[1], reverse=True)
-
-# Function for calculating proportions of features
-def ratios(elements):
-    etotal = sum([v[1] for v in elements])
-    if etotal==0:
-        return elements
-    else:
-        proportioned = []
-        for element in elements:
-            prop_val = round((element[1]/etotal),4)
-            proportioned.append((element[0],prop_val))
-        return proportioned
-
-
-
-# Count labels associated with strings
-def count_labels(string_labels):
-    counts = dict()
-    for rt in string_labels:
-        counts[rt[1]] = counts.setdefault(rt[1],0) + 1
-    return sorted(counts.items(), key=lambda x: x[1], reverse=True)
-
-def count_keys(key_count_dict):
-    counts = dict()
-    for k,v in key_count_dict.items():
-        counts[k] = counts.setdefault(k,0) + v
-    return sorted(counts.items(), key=lambda x: x[1], reverse=True)
-
-# Total the values in list of tuples
-def tuple_values_total(tuples):
-    tvs = [t[1] for t in tuples]
-    return sum(tvs)
-
-#### SCALING AND NORMALISING
-
-# Outliers
-
-def outlier_fence(series):
-    bounds = {}
-    stats = series.describe()
-    iqr = stats['75%'] - stats['25%']
-    bounds["IQR"]=iqr
-    upper = stats['75%']+1.5*iqr
-    bounds["UPPER"]=upper
-    lower = stats['25%']-1.5*iqr
-    bounds["LOWER"]=lower
-    return bounds
-
-# MinMax Scaling
-def scale_min_max(df_cols):
-    scaler = MinMaxScaler()
-    return scaler.fit_transform(df_cols)
-
-# Normalise domain term counts
-def normalise_domain_counts(domain_counts,text_size):
-    norms = {}
-    for k,v in domain_counts.items():
-        norms[k] = round(v*text_size,3)
-    return norms
-
-def normalise_scaled(df,feature,norm_feature = 'text_scaled'):
-    tempdf = df[[feature,norm_feature]].copy()
-    tempdf['norm_scaled'] = tempdf.apply(lambda r: round(r[feature]/(r[norm_feature]+0.01),4),axis=1)
-    return tempdf['norm_scaled']