reflexive 2.2.0.tar.gz → 2.2.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reflexive-2.2.0/src/reflexive/analysis_functions.py → reflexive-2.2.2/.history/src/reflexive/analysis_functions_20251021145809.py +2 -0
- reflexive-2.2.2/.history/src/reflexive/analysis_functions_20251021150142.py +463 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/PKG-INFO +1 -1
- {reflexive-2.2.0 → reflexive-2.2.2}/pyproject.toml +1 -1
- reflexive-2.2.2/src/reflexive/analysis_functions.py +463 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/src/reflexive/res_analysis.py +4 -3
- {reflexive-2.2.0 → reflexive-2.2.2}/.gitignore +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/=3.12 +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/LICENSE +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/README.md +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.1-py3-none-any.whl +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.1.tar.gz +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.2-py3-none-any.whl +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.2.tar.gz +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.3-py3-none-any.whl +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.3.tar.gz +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.4-py3-none-any.whl +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.4.tar.gz +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.5-py3-none-any.whl +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.5.tar.gz +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.6-py3-none-any.whl +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.6.tar.gz +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.7-py3-none-any.whl +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.7.tar.gz +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.8-py3-none-any.whl +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.8.tar.gz +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/src/reflexive/__init__.py +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/src/reflexive/display_functions.py +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/src/reflexive/service.py +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/tests/__init__.py +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/synth_ref-datascientist.txt +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/synth_ref-footballer.txt +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/synth_ref-surgeon.txt +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/teach_ref-3.txt +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/teach_ref-4.txt +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/teach_ref-5.txt +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/tests/test_reflexive-v2.ipynb +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/tests/test_reflexive.ipynb +0 -0
- {reflexive-2.2.0 → reflexive-2.2.2}/tests/test_reflexive_extras.py +0 -0
reflexive-2.2.0/src/reflexive/analysis_functions.py → reflexive-2.2.2/.history/src/reflexive/analysis_functions_20251021145809.py (+2 -0)

@@ -140,6 +140,8 @@ def _count_res_interactions(re_sequence:list[str]) -> dict[tuple,int]:
 
 def _calc_res_weights(interactions:dict[tuple,int])->dict[tuple,float]:
     max_count = max(interactions.values())
+    if max_count == 0:
+        max_count = 0.0001
     weights = dict()
     for edge,count in interactions.items():
         weights[edge] = round(count/(max_count),2)
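For context, the two added lines guard the edge-weight normalisation against division by zero: in 2.2.0, a text that produces no adjacent label pairs leaves every interaction count at 0, so max(interactions.values()) is 0 and count/max_count raises ZeroDivisionError. A minimal sketch of the patched behaviour, using a standalone copy of the function rather than an import from the package:

# Standalone copy of the patched _calc_res_weights, for illustration only.
def _calc_res_weights(interactions: dict[tuple, int]) -> dict[tuple, float]:
    max_count = max(interactions.values())
    if max_count == 0:          # added in 2.2.2: avoid dividing by zero
        max_count = 0.0001
    weights = dict()
    for edge, count in interactions.items():
        weights[edge] = round(count / max_count, 2)
    return weights

print(_calc_res_weights({("AF", "RR"): 0, ("EP", "NR"): 0}))
# {('AF', 'RR'): 0.0, ('EP', 'NR'): 0.0}   (2.2.0 raised ZeroDivisionError here)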
reflexive-2.2.2/.history/src/reflexive/analysis_functions_20251021150142.py (+463 -0, new file; all lines added)

@@ -0,0 +1,463 @@
from typing import Callable
from pandas import (DataFrame,Series)
from datetime import datetime
from zoneinfo import ZoneInfo
from time import sleep
from functools import partial
import tarfile
import json
import os
from numpy import (
    asarray,
    dot
)
from numpy.linalg import norm
from itertools import chain
from graph_tool.all import (
    Graph,
    similarity,
    adjacency)


### PIPELINE FUNCTIONS

# Clean text using supplied function and calculate text length
# Used by RES_analyser.preprocess_text()

def _clean_text(df:DataFrame,text_cleaner:Callable[[str],str])->DataFrame:
    return (df
            .assign(text=lambda d: d.text.apply(text_cleaner))
            .assign(text_length=lambda d: [len(row.text) for row in d.itertuples()]))

# Upload text using supplied uploader function
# Used by RES_analyser.upload_text_to_s3()

def _upload_text(df:DataFrame,uploader:Callable,res_analyser)->DataFrame:
    upload = partial(uploader,aws_service=res_analyser.aws_service,config=res_analyser.config,logger=res_analyser.logger)
    return df.assign(uploaded=lambda d: [upload(str(row.Index),row.text) for row in d.itertuples()])

# Initiate comprehend analysis on S3 text documents

def _analyse_text(analyser:Callable,res_analyser)->dict:
    analyse = partial(analyser,
                      aws_service=res_analyser.aws_service,
                      config=res_analyser.config,
                      logger = res_analyser.logger)
    job_status = analyse()
    return job_status['EntitiesDetectionJobProperties']

# Add comprehend analysis results to dataframe
def _analysis_to_dataframe(df:DataFrame,results:list)->DataFrame:
    analysis = _extract_analysis(results=results)
    df['res_results']=Series(analysis)
    return df

# Offsets to dataframe
def _add_offsets(df:DataFrame,offset_cleaner,orphan_joiner)->DataFrame:
    return (df
            .assign(offsets=lambda d: d.res_results.apply(offset_cleaner))
            .assign(offsets_clean=lambda d: [orphan_joiner(row.text,row.offsets) for row in d.itertuples()]))

def _offset_cleaner(res_results):
    offsets = _collect_offsets(res_results)
    tuples = _offset_tuples(offsets)
    return _sorted_offsets(tuples)

def _orphan_joiner(text,offsets):
    otuples = _orphaned_I(text,offsets)
    offs = _orphaned_word(text,otuples)
    return _regroup(offs)

def _collect_offsets(rrs):
    new_rrs = {}
    for rr in rrs:
        if rr['Score']>0.6:
            ent_type = rr['Type']
            if ent_type in ['VR','ER']:
                label = "NR"
            elif ent_type in ['EP','EV']:
                label = "EP"
            elif ent_type in ['CN','AF']:
                label = "AF"
            else:
                label = ent_type
            new_rrs.setdefault(label,[]).append((rr['BeginOffset'],rr['EndOffset']))
    return new_rrs



#####

def _add_res_sequence(df):
    temp_df = df.copy()
    temp_df['res_sequence'] = temp_df.offsets_clean.apply(_get_res_sequence)
    return temp_df

def _add_res_interactions(df):
    temp_df = df.copy()
    temp_df['res_interactions'] = temp_df.res_sequence.apply(_count_res_interactions)
    return temp_df

def _add_res_weights(df):
    temp_df = df.copy()
    temp_df['res_weights'] = temp_df.res_interactions.apply(_calc_res_weights)
    return temp_df

def _add_semantic_weights(df,ranking_factors={}):
    temp_df = df.copy()
    ranks = partial(_calc_semantic_weights,factors=ranking_factors)
    temp_df['semantic_weights'] = temp_df.res_weights.apply(ranks)
    return temp_df

def _add_res_adj_matrix(df):
    temp_df = df.copy()
    temp_df['res_adj_matrix'] = temp_df.semantic_weights.apply(_create_adj_matrix)
    return temp_df

def _get_res_sequence(offsets_clean):
    return [label for label in offsets_clean.values()]


def _empty_res_interactions() -> dict[tuple,int]:
    RE_types = ['RR','NR','AR','AF','EP']
    RE_interactions:dict[tuple,int] = dict()
    for t1 in RE_types:
        for t2 in RE_types:
            entry = tuple(sorted((t1,t2)))
            if entry not in RE_interactions.keys():
                RE_interactions[entry] = 0
    return RE_interactions

def _count_res_interactions(re_sequence:list[str]) -> dict[tuple,int]:
    re_ints = _empty_res_interactions()
    limit = len(re_sequence)-1
    for i,s in enumerate(re_sequence):
        if i < limit:
            rei = tuple(sorted((s,re_sequence[i+1])))
            #print(i,rei)
            re_ints[rei] += 1
    return re_ints

def _calc_res_weights(interactions:dict[tuple,int])->dict[tuple,float]:
    max_count = max(interactions.values())
    weights = dict()
    for edge,count in interactions.items():
        if max_count != 0:
            weights[edge] = round(count/(max_count),2)
        else:
            weights[edge] = 0
    return weights



def _calc_semantic_weights(weights:dict[tuple,float], factors:dict[tuple,float]={})->dict[tuple,float]:
    if not factors:
        return weights
    else:
        for edge,w in weights.items():
            weights[edge] = factors[edge] * w
        return weights


def _create_adj_matrix(weights:dict[tuple,float])->list[list[float]]:
    re_types = ["RR","NR","AR","AF","EP"]
    matrix = []
    for r in re_types:
        row = []
        for c in re_types:
            key = tuple(sorted((r,c)))
            #print(key)
            weight = weights.get(key,0)
            row.append(weight)
        matrix.append(row)
    return matrix

### SIMILARITY ANALYSIS

def _jaccard_similarity(g1:Graph,g2:Graph)->float:
    return similarity(g1, g2,
                      eweight1=g1.ep['e_weights'], eweight2=g2.ep['e_weights'],
                      #label1=g1.vp['v_labels'], label2=g2.vp['v_labels'],
                      norm=True, p=1.0, distance=False, asymmetric=False)

# def _cosine_similarity(m1,m2)->float:
#     v1 = list(chain.from_iterable(m1))
#     v2 = list(chain.from_iterable(m2))
#     return np.dot(v1,v2)/(norm(v1)*norm(v2))

def _vectorise_adj(matrix):
    return list(chain.from_iterable((matrix[i][j] for j in range(i,5)) for i in range(5)))

def zero_pos(am):
    nm = []
    for r,row in enumerate(am):
        nr = []
        for c,weight in enumerate(row):
            if r < 3 or c < 3:
                nr.append(weight)
            else:
                nr.append(0)
        nm.append(nr)
    return nm

def zero_mod(am):
    nm = []
    for r,row in enumerate(am):
        nr = []
        for c,weight in enumerate(row):
            if r >= 3 or c >= 3:
                nr.append(weight)
            else:
                nr.append(0)
        nm.append(nr)
    return nm

def _adj_vector(adj_matrix):
    return _vectorise_adj(adj_matrix)

def _positional_vector(adj_matrix):
    return _vectorise_adj(zero_pos(adj_matrix))

def _modal_vector(adj_matrix):
    return _vectorise_adj(zero_mod(adj_matrix))

def _cosine(A,B):
    return dot(A,B)/(norm(A)*norm(B))

def _am4idx(df,idx:int):
    return df.res_adj_matrix[idx]

def _similarity(m1,m2,vector_func):
    return float(_cosine(vector_func(m1),vector_func(m2)))

def _res_similarity(df,idx1,idx2,vector_func):
    return _similarity(_am4idx(df,idx1),_am4idx(df,idx2),vector_func)

_interaction_similarity = partial(_res_similarity,vector_func=_adj_vector)
_positional_similarity = partial(_res_similarity,vector_func=_positional_vector)
_modal_similarity = partial(_res_similarity,vector_func=_modal_vector)

def _similarities(df,idx1,idx2):
    return {"interaction": _interaction_similarity(df,idx1,idx2),
            "positional": _positional_similarity(df,idx1,idx2),
            "modal":_modal_similarity(df,idx1,idx2)}



### PIPELINE SUPPORT FUNCTIONS

# Clean return characters and strip whitespace
# Used by preprocess_text()
def _whitespace_cleaner(text:str)->str:
    return text.strip().replace('\r\n','\n')

# Upload text to S3
def _s3_text_uploader(idx:str,text:str,aws_service,config:dict,logger)->bool:
    try:
        response = aws_service.s3_client.put_object(Body=text,
                                                    Bucket=aws_service.aws_params["s3_bucket_name"],
                                                    Key=f"{config["s3_source_dir"]}/{idx}.txt")
    except Exception as e:
        logger.error("There was an error when uploading text to s3 %s",repr(e))
        return False
    else:
        if response['ResponseMetadata']['HTTPStatusCode']==200:
            logger.debug(f"File {idx} uploaded successfully")
            return True
        else:
            logger.error(f"File {idx} did not upload successfully to S3: {response}")
            return False

# Analyse text with comprehend custom entity recognizer
def _comprehend_cer_analyser(aws_service,config,logger)->dict:
    try:
        response = aws_service.comprehend_client.start_entities_detection_job(
            InputDataConfig={
                'S3Uri': _comprehend_input_uri(aws_service.aws_params["s3_bucket_name"],
                                               config["s3_source_dir"]),
                'InputFormat': 'ONE_DOC_PER_FILE'
            },
            OutputDataConfig={
                'S3Uri': _comprehend_output_uri(aws_service.aws_params["s3_bucket_name"],
                                                config["s3_target_dir"])
            },
            DataAccessRoleArn=_comprehend_access_role_arn(aws_service.aws_params["comprehend_service_role_name"],
                                                          aws_service.aws_account_number),
            JobName=f"res_analysis_{_date_string()}",
            EntityRecognizerArn=_comprehend_cer_arn(aws_service.aws_session.region_name,
                                                    aws_service.aws_account_number,
                                                    aws_service.aws_params["reflexive_entity_name"],
                                                    aws_service.aws_params["reflexive_entity_version"]),
            LanguageCode='en'
        )
    except Exception as e:
        logger.error("There was an error when analysing text with comprehend %s",repr(e))
        return {"ERROR":repr(e)}
    else:
        return aws_service.comprehend_client.describe_entities_detection_job(JobId=response['JobId'])

# Monitor a CER Analysis Job
def _cer_job_progress(status:dict,aws_service,tz,output)->dict:
    # Submitted
    job_name = status['JobName']
    job_id = status['JobId']
    submit_time = status['SubmitTime'].astimezone(ZoneInfo(tz))
    output(f"RES_ANALYSIS JOB {job_name} ({job_id}) submitted at: {submit_time}")

    # In progress
    while status['JobStatus'] in ["SUBMITTED","IN_PROGRESS"]:
        time = datetime.now().astimezone(ZoneInfo(tz))
        job_status = status['JobStatus']
        output(f"{time} [{job_id}] {job_name} status: {job_status}")
        sleep(10)
        properties = aws_service.comprehend_client.describe_entities_detection_job(JobId=job_id)
        status=properties['EntitiesDetectionJobProperties']

    # Finished (complete or error)
    job_status = status['JobStatus']
    end_time = status['EndTime'].astimezone(ZoneInfo(tz))
    time_taken = end_time - submit_time
    output_url = status['OutputDataConfig']['S3Uri']
    output(f"RES_ANALYSIS JOB {job_name} ({job_id}) finished with status: {job_status} at: {end_time}")
    output(f"Analysis time: {str(time_taken)}")
    output(f"Results available at: {output_url}")
    return status


# Download from S3 to local
def _download_from_s3(res_analyser,status)->str:
    local_file_path = f"{res_analyser.config['local_data_dir']}/{status['JobName']}.tar.gz"
    bucket_name = res_analyser.aws_service.aws_params["s3_bucket_name"]
    try:
        output_key = status['OutputDataConfig']['S3Uri'].split(bucket_name)[1]
        with open(f"{local_file_path}",'wb') as output_data:
            res_analyser.aws_service.s3_client.download_fileobj(bucket_name,output_key[1:],output_data)
    except Exception as e:
        res_analyser.logger.error("An error occured when downloading results from S3: %s",repr(e))
        local_file_path = None
    return local_file_path

# Extract results from tar.gz file and save as json
def _extract_save_results(res_analyser,local_file_path)->list:
    # extract the tar archive
    files = list()
    with tarfile.open(f"{local_file_path}", "r:gz") as tf:
        for member in tf.getmembers():
            f = tf.extractfile(member)
            if f is not None:
                content = f.read()
                files.append(content)
    # extract results and save and return
    raw_results = files[0].decode("utf-8").split('\n')
    raw_results.pop() # pop last item off as empty entry due to final \n
    #
    #json_results = json.dumps(raw_results)
    #res_analyser.logger.info("raw_results>> ",raw_results)
    results = [json.loads(result) for result in raw_results]
    with open(f"{local_file_path[:-7]}.json","w") as fp:
        json.dump(results,fp)
    return results

# Get a dict of (index,entities) from cer analysis results
def _extract_analysis(results):
    file_ents = ((result["File"],result["Entities"]) for result in results)
    idx_ents = ((int(file.split('_')[-1].split('.')[0]),ents) for file,ents in file_ents)
    return dict(idx_ents)



# Comprehend access role arn
def _comprehend_access_role_arn(comprehend_service_role_name,aws_account_number):
    return f"arn:aws:iam::{aws_account_number}:role/service-role/{comprehend_service_role_name}"

# Comprehend input url
def _comprehend_input_uri(s3_bucket_name,s3_files,prefix=""):
    return f"s3://{s3_bucket_name}/{s3_files}/{prefix}"

# Comprehend output url
def _comprehend_output_uri(s3_bucket_name,s3_results):
    return f"s3://{s3_bucket_name}/{s3_results}/"

# Comprehend entity recognizer arn
def _comprehend_cer_arn(region,account_number,cer_name,cer_version):
    return f"arn:aws:comprehend:{region}:{account_number}:entity-recognizer/{cer_name}/version/{cer_version}"

## Offset functions

def _offset_tuples(offsets):
    for k,vs in offsets.items():
        for b,e in vs:
            yield (b,(e,k))

def _sorted_offsets(offsets):
    return sorted(offsets)

def _orphaned_I(text,offsets):
    for b,(e,t) in offsets:
        if 'I' in text[(b-2):(b-1)].strip():
            #print(text[(b-2):e],t)
            yield (b-2, (e,t))
        else:
            yield (b, (e,t))

def _orphaned_word(text,offsets):
    coffs = {}
    p = (0,(-2,''))
    for b,(e,t) in offsets:
        #print(p[1][0])
        if (p[1][0]+3)>=b:
            #print("Prev:",p,f"|{df.text[0][p[0]:p[1][0]]}|")
            #print("<--->",f"|{df.text[0][(p[1][0]+1):(b-1)]}|")
            #print("This:",b,e,t,f"|{df.text[0][b:e]}|")
            #print()
            if len((text[p[0]:p[1][0]]).split(' '))<2:
                #print(f"Removing {p[0]},{p[1][0]},{p[1][1]}")
                coffs.pop(p[0])
                #print(f"Replacing {b},{e},{t} with {p[0]},{e},{t}")
                coffs[p[0]] = (e,t)
                p=(p[0],(e,t))
            else:
                coffs[b] = (e,t)
                p = (b,(e,t))
        else:
            coffs[b] = (e,t)
            p = (b,(e,t))
    return coffs.items()

def _regroup(offsets):
    grouped = (((b,e),k) for (b,(e,k)) in offsets)
    return dict(grouped)




### UTILITY FUNCTIONS

# Create a reverse date string YYYYmmdd based on current local time
def _date_string()->str:
    return datetime.today().strftime('%Y%m%d')

# Get the current local working dir
def _local_path(dir)->str:
    return os.getcwd()+dir

# Check if local directory exists
def _dir_exists_local(dir:str)->bool:
    return os.path.exists(_local_path(dir))

# Return function to create directory
def _create_dir(dir)->str:
    os.makedirs(_local_path(dir))
    return _local_path(dir)

# Create local directory if required
def _create_local_dir(dir,logger)->str:
    if not _dir_exists_local(dir):
        try:
            path = _create_dir(dir)
        except Exception as e:
            logger.error("There was an error creating the local directory: %s",repr(e))
        finally:
            return path
    else:
        return _local_path(dir)
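To see how the pieces of this new module fit together, here is a hypothetical walk-through of its pure-Python core: a sequence of reflexive-expression labels becomes adjacent-pair counts, normalised edge weights, a 5x5 adjacency matrix over the types RR/NR/AR/AF/EP, and finally a cosine similarity between two documents. The import path assumes the module is installed as reflexive.analysis_functions and that its dependencies (pandas, numpy, graph_tool) are available; the underscore-prefixed helpers are private, and the label sequences below are invented for illustration.

# Hypothetical usage sketch, not part of the package.
from reflexive.analysis_functions import (
    _count_res_interactions, _calc_res_weights,
    _create_adj_matrix, _adj_vector, _cosine,
)

seq_a = ["RR", "NR", "EP", "RR", "AF"]   # made-up label sequence, document A
seq_b = ["RR", "RR", "NR", "EP", "EP"]   # made-up label sequence, document B

def res_adj_matrix(sequence):
    counts = _count_res_interactions(sequence)   # counts of adjacent label pairs
    weights = _calc_res_weights(counts)          # normalised by the largest count
    return _create_adj_matrix(weights)           # 5x5 matrix over RR/NR/AR/AF/EP

score = _cosine(_adj_vector(res_adj_matrix(seq_a)),
                _adj_vector(res_adj_matrix(seq_b)))
print(float(score))    # cosine of the two upper-triangle weight vectors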
reflexive-2.2.2/src/reflexive/analysis_functions.py (+463 -0, new file)

@@ -0,0 +1,463 @@
The content of this new module is identical, line for line, to the .history snapshot analysis_functions_20251021150142.py shown above, so it is not repeated here.
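The three similarity variants defined in the module above differ only in which cells of the adjacency matrix they keep: _adj_vector takes the full upper triangle, zero_pos blanks cells whose row and column are both in the last two types (AF, EP), and zero_mod blanks cells whose row and column are both in the first three types (RR, NR, AR). A small check of that masking, using standalone copies of the two helpers rewritten as comprehensions for brevity (behaviour is intended to match the originals):

# Standalone copies for illustration only.
def zero_pos(am):
    return [[w if r < 3 or c < 3 else 0 for c, w in enumerate(row)]
            for r, row in enumerate(am)]

def zero_mod(am):
    return [[w if r >= 3 or c >= 3 else 0 for c, w in enumerate(row)]
            for r, row in enumerate(am)]

toy = [[1.0] * 5 for _ in range(5)]        # rows/cols ordered RR, NR, AR, AF, EP
print(zero_pos(toy)[3][4], zero_pos(toy)[0][1])   # 0 1.0  (AF/EP-only cells dropped)
print(zero_mod(toy)[0][1], zero_mod(toy)[3][4])   # 0 1.0  (RR/NR/AR-only cells dropped)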
{reflexive-2.2.0 → reflexive-2.2.2}/src/reflexive/res_analysis.py (+4 -3)

@@ -152,9 +152,10 @@ class RES_analyser:
         file_names = []
         texts = []
         for file_name in sorted(os.listdir(file_path)):
-            [3 removed lines; their content is not rendered in this diff view]
+            if not file_name.startswith('.'):
+                file_names.append(file_name.split('.')[0])
+                with open(os.path.join(file_path,file_name),'r') as fp:
+                    texts.append(fp.read())
         return {"doc_name":file_names,"text":texts}
 
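The added guard means hidden files in the corpus directory (for example .DS_Store on macOS) are no longer read in as documents. A hypothetical check of the new filtering, with the directory and file names invented for illustration:

import os, tempfile

corpus = tempfile.mkdtemp()                  # throwaway corpus directory
for name, body in [("teach_ref-1.txt", "Today I reflected on ..."), (".DS_Store", "")]:
    with open(os.path.join(corpus, name), "w") as fp:
        fp.write(body)

# Same filter as the 2.2.2 loader above.
doc_names = [f.split('.')[0] for f in sorted(os.listdir(corpus))
             if not f.startswith('.')]
print(doc_names)    # ['teach_ref-1']  (the hidden file is skipped)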
All other files listed above are unchanged between 2.2.0 and 2.2.2.