reflexive 1.2.8__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reflexive-2.0.0/=3.12 +48 -0
- {reflexive-1.2.8 → reflexive-2.0.0}/PKG-INFO +1 -1
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.15-py3-none-any.whl → reflexive-2.0.0/dist_v1/reflexive-1.2.1-py3-none-any.whl +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.1.tar.gz +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.2-py3-none-any.whl +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.2.tar.gz +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.3-py3-none-any.whl +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.3.tar.gz +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.4-py3-none-any.whl +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.4.tar.gz +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.5-py3-none-any.whl +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.5.tar.gz +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.6-py3-none-any.whl +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.6.tar.gz +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.7-py3-none-any.whl +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.7.tar.gz +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.8-py3-none-any.whl +0 -0
- reflexive-2.0.0/dist_v1/reflexive-1.2.8.tar.gz +0 -0
- {reflexive-1.2.8 → reflexive-2.0.0}/pyproject.toml +1 -1
- reflexive-2.0.0/src/reflexive/__init__.py +4 -0
- reflexive-2.0.0/src/reflexive/analysis_functions.py +383 -0
- reflexive-2.0.0/src/reflexive/display_functions.py +165 -0
- reflexive-2.0.0/src/reflexive/res_analysis.py +217 -0
- reflexive-2.0.0/src/reflexive/service.py +58 -0
- reflexive-2.0.0/tests/annotated_0.html +119 -0
- reflexive-2.0.0/tests/annotated_1.html +174 -0
- reflexive-2.0.0/tests/annotated_2.html +169 -0
- reflexive-2.0.0/tests/data/df-250207.pkl +0 -0
- reflexive-2.0.0/tests/data/res_analysis_20250206.json +1 -0
- reflexive-2.0.0/tests/data/res_analysis_20250206.tar.gz +0 -0
- reflexive-2.0.0/tests/data/res_analysis_20250207.json +1 -0
- reflexive-2.0.0/tests/data/res_analysis_20250207.tar.gz +0 -0
- reflexive-2.0.0/tests/data_source/synth_ref-datascientist.txt +9 -0
- reflexive-2.0.0/tests/data_source/synth_ref-footballer.txt +7 -0
- reflexive-2.0.0/tests/data_source/synth_ref-surgeon.txt +9 -0
- reflexive-2.0.0/tests/data_source/teach_ref-3.txt +9 -0
- reflexive-2.0.0/tests/data_source/teach_ref-4.txt +7 -0
- reflexive-2.0.0/tests/data_source/teach_ref-5.txt +5 -0
- reflexive-2.0.0/tests/df-250207.pkl +0 -0
- reflexive-2.0.0/tests/test_reflexive.ipynb +1893 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.13-py3-none-any.whl +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.13.tar.gz +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.14-py3-none-any.whl +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.14.tar.gz +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.15.tar.gz +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.16-py3-none-any.whl +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.16.tar.gz +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.17-py3-none-any.whl +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.17.tar.gz +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.18-py3-none-any.whl +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.18.tar.gz +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.19-py3-none-any.whl +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.19.tar.gz +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.20-py3-none-any.whl +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.0.20.tar.gz +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.1.0-py3-none-any.whl +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.1.0.tar.gz +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.1.1-py3-none-any.whl +0 -0
- reflexive-1.2.8/old reflexive dist/reflexive-1.1.1.tar.gz +0 -0
- reflexive-1.2.8/src/reflexive/__init__.py +0 -19
- reflexive-1.2.8/src/reflexive/analyse.py +0 -430
- reflexive-1.2.8/src/reflexive/cfg.py +0 -116
- reflexive-1.2.8/src/reflexive/res.py +0 -225
- reflexive-1.2.8/src/reflexive/res_functions.py +0 -62
- reflexive-1.2.8/src/reflexive/session.py +0 -264
- reflexive-1.2.8/src/reflexive/util.py +0 -127
- reflexive-1.2.8/src/reflexive/visualise.py +0 -355
- {reflexive-1.2.8 → reflexive-2.0.0}/.gitignore +0 -0
- {reflexive-1.2.8 → reflexive-2.0.0}/LICENSE +0 -0
- {reflexive-1.2.8 → reflexive-2.0.0}/README.md +0 -0
- {reflexive-1.2.8 → reflexive-2.0.0}/tests/__init__.py +0 -0
- /reflexive-1.2.8/tests/test_reflexive.py → /reflexive-2.0.0/tests/test_reflexive_extras.py +0 -0
reflexive-2.0.0/=3.12
ADDED
@@ -0,0 +1,48 @@
+Channels:
+ - conda-forge
+ - defaults
+Platform: linux-aarch64
+Collecting package metadata (repodata.json): ...working... done
+Solving environment: ...working... done
+
+## Package Plan ##
+
+  environment location: /opt/conda/envs/reflexive
+
+  added / updated specs:
+    - python
+
+
+The following packages will be downloaded:
+
+    package                    |            build
+    ---------------------------|-----------------
+    libmpdec-4.0.0             |       h68df207_0         108 KB  conda-forge
+    pip-25.0                   |     pyh145f28c_0         1.2 MB  conda-forge
+    python-3.13.1              |h3e021d1_104_cp313        32.0 MB  conda-forge
+    python_abi-3.13            |          5_cp313           6 KB  conda-forge
+    ------------------------------------------------------------
+                                           Total:        33.3 MB
+
+The following NEW packages will be INSTALLED:
+
+  bzip2              conda-forge/linux-aarch64::bzip2-1.0.8-h68df207_7
+  ld_impl_linux-aar~ conda-forge/linux-aarch64::ld_impl_linux-aarch64-2.43-h80caac9_2
+  libexpat           conda-forge/linux-aarch64::libexpat-2.6.4-h5ad3122_0
+  libffi             conda-forge/linux-aarch64::libffi-3.4.2-h3557bc0_5
+  libgcc-ng          conda-forge/linux-aarch64::libgcc-ng-14.2.0-he9431aa_1
+  liblzma            conda-forge/linux-aarch64::liblzma-5.6.4-h86ecc28_0
+  libmpdec           conda-forge/linux-aarch64::libmpdec-4.0.0-h68df207_0
+  libsqlite          conda-forge/linux-aarch64::libsqlite-3.48.0-h5eb1b54_1
+  libuuid            conda-forge/linux-aarch64::libuuid-2.38.1-hb4cce97_0
+  libzlib            conda-forge/linux-aarch64::libzlib-1.3.1-h86ecc28_2
+  ncurses            conda-forge/linux-aarch64::ncurses-6.5-ha32ae93_3
+  pip                conda-forge/noarch::pip-25.0-pyh145f28c_0
+  python             conda-forge/linux-aarch64::python-3.13.1-h3e021d1_104_cp313
+  python_abi         conda-forge/linux-aarch64::python_abi-3.13-5_cp313
+  readline           conda-forge/linux-aarch64::readline-8.2-h8fc344f_1
+  tk                 conda-forge/linux-aarch64::tk-8.6.13-h194ca79_0
+  tzdata             conda-forge/noarch::tzdata-2025a-h78e105d_0
+
+
+Proceed ([y]/n)?
Binary files (16 entries): no textual diff is rendered for the wheel and tar.gz archives listed above.
reflexive-2.0.0/src/reflexive/analysis_functions.py
ADDED
@@ -0,0 +1,383 @@
+from typing import Callable
+from pandas import (DataFrame,Series)
+from datetime import datetime
+from zoneinfo import ZoneInfo
+from time import sleep
+from functools import partial
+import tarfile
+import json
+import os
+import numpy as np
+from numpy.linalg import norm
+from itertools import chain
+from graph_tool.all import (
+    Graph,
+    similarity,
+    adjacency)
+
+### PIPELINE FUNCTIONS
+
+# Clean text using supplied function and calculate text length
+# Used by RES_analyser.preprocess_text()
+
+def _clean_text(df:DataFrame,text_cleaner:Callable[[str],str])->DataFrame:
+    return (df
+            .assign(text=lambda d: d.text.apply(text_cleaner))
+            .assign(text_length=lambda d: [len(row.text) for row in d.itertuples()]))
+
+# Upload text using supplied uploader function
+# Used by RES_analyser.upload_text_to_s3()
+
+def _upload_text(df:DataFrame,uploader:Callable,res_analyser)->DataFrame:
+    upload = partial(uploader,aws_service=res_analyser.aws_service,config=res_analyser.config,logger=res_analyser.logger)
+    return df.assign(uploaded=lambda d: [upload(str(row.Index),row.text) for row in d.itertuples()])
+
+# Initiate comprehend analysis on S3 text documents
+
+def _analyse_text(analyser:Callable,res_analyser)->dict:
+    analyse = partial(analyser,
+                      aws_service=res_analyser.aws_service,
+                      config=res_analyser.config,
+                      logger = res_analyser.logger)
+    job_status = analyse()
+    return job_status['EntitiesDetectionJobProperties']
+
+# Add comprehend analysis results to dataframe
+def _analysis_to_dataframe(df:DataFrame,results:list)->DataFrame:
+    analysis = _extract_analysis(results=results)
+    df['res_results']=Series(analysis)
+    return df
+
+# Offsets to dataframe
+def _add_offsets(df:DataFrame,offset_cleaner,orphan_joiner)->DataFrame:
+    return (df
+            .assign(offsets=lambda d: d.res_results.apply(offset_cleaner))
+            .assign(offsets_clean=lambda d: [orphan_joiner(row.text,row.offsets) for row in d.itertuples()]))
+
+def _offset_cleaner(res_results):
+    offsets = _collect_offsets(res_results)
+    tuples = _offset_tuples(offsets)
+    return _sorted_offsets(tuples)
+
+def _orphan_joiner(text,offsets):
+    otuples = _orphaned_I(text,offsets)
+    offs = _orphaned_word(text,otuples)
+    return _regroup(offs)
+
+def _collect_offsets(rrs):
+    new_rrs = {}
+    for rr in rrs:
+        if rr['Score']>0.6:
+            ent_type = rr['Type']
+            if ent_type in ['VR','ER']:
+                label = "NR"
+            elif ent_type in ['EP','EV']:
+                label = "EP"
+            elif ent_type in ['CN','AF']:
+                label = "AF"
+            else:
+                label = ent_type
+            new_rrs.setdefault(label,[]).append((rr['BeginOffset'],rr['EndOffset']))
+    return new_rrs
+
+
+
+#####
+
+def _add_res_sequence(df):
+    temp_df = df.copy()
+    temp_df['res_sequence'] = temp_df.offsets_clean.apply(_get_res_sequence)
+    return temp_df
+
+def _add_res_interactions(df):
+    temp_df = df.copy()
+    temp_df['res_interactions'] = temp_df.res_sequence.apply(_count_res_interactions)
+    return temp_df
+
+def _add_res_weights(df):
+    temp_df = df.copy()
+    temp_df['res_weights'] = temp_df.res_interactions.apply(_calc_res_weights)
+    return temp_df
+
+def _add_res_adj_matrix(df):
+    temp_df = df.copy()
+    temp_df['res_adj_matrix'] = temp_df.res_weights.apply(_create_adj_matrix)
+    return temp_df
+
+def _get_res_sequence(offsets_clean):
+    return [label for label in offsets_clean.values()]
+
+
+def _empty_res_interactions() -> dict[tuple,int]:
+    RE_types = ['RR','NR','AR','AF','EP']
+    RE_interactions:dict[tuple,int] = dict()
+    for t1 in RE_types:
+        for t2 in RE_types:
+            entry = tuple(sorted((t1,t2)))
+            if entry not in RE_interactions.keys():
+                RE_interactions[entry] = 0
+    return RE_interactions
+
+def _count_res_interactions(re_sequence:list[str]) -> dict[tuple,int]:
+    re_ints = _empty_res_interactions()
+    limit = len(re_sequence)-1
+    for i,s in enumerate(re_sequence):
+        if i < limit:
+            rei = tuple(sorted((s,re_sequence[i+1])))
+            #print(i,rei)
+            re_ints[rei] += 1
+    return re_ints
+
+def _calc_res_weights(interactions:dict[tuple,int])->dict[tuple,float]:
+    max_count = max(interactions.values())
+    weights = dict()
+    for edge,count in interactions.items():
+        weights[edge] = round(count/(max_count),2)
+    return weights
+
+
+def _create_adj_matrix(weights:dict[tuple,float])->list[list[float]]:
+    re_types = ["RR","NR","AR","AF","EP"]
+    matrix = []
+    for r in re_types:
+        row = []
+        for c in re_types:
+            key = tuple(sorted((r,c)))
+            #print(key)
+            weight = weights.get(key,0)
+            row.append(weight)
+        matrix.append(row)
+    return matrix
+
+### GRAPH ANALYSIS
+
+def _jaccard_similarity(g1:Graph,g2:Graph)->float:
+    return similarity(g1, g2,
+                      eweight1=g1.ep['e_weights'], eweight2=g2.ep['e_weights'],
+                      #label1=g1.vp['v_labels'], label2=g2.vp['v_labels'],
+                      norm=True, p=1.0, distance=False, asymmetric=False)
+
+def _cosine_similarity(m1,m2)->float:
+    v1 = list(chain.from_iterable(m1))
+    v2 = list(chain.from_iterable(m2))
+    return np.dot(v1,v2)/(norm(v1)*norm(v2))
+
+
+
+### PIPELINE SUPPORT FUNCTIONS
+
+# Clean return characters and strip whitespace
+# Used by preprocess_text()
+def _whitespace_cleaner(text:str)->str:
+    return text.strip().replace('\r\n','\n')
+
+# Upload text to S3
+def _s3_text_uploader(idx:str,text:str,aws_service,config:dict,logger)->bool:
+    try:
+        response = aws_service.s3_client.put_object(Body=text,
+                                                    Bucket=aws_service.aws_params["s3_bucket_name"],
+                                                    Key=f"{config["s3_source_dir"]}/{idx}.txt")
+    except Exception as e:
+        logger.error("There was an error when uploading text to s3 %s",repr(e))
+        return False
+    else:
+        if response['ResponseMetadata']['HTTPStatusCode']==200:
+            logger.debug(f"File {idx} uploaded successfully")
+            return True
+        else:
+            logger.error(f"File {idx} did not upload successfully to S3: {response}")
+            return False
+
+# Analyse text with comprehend custom entity recognizer
+def _comprehend_cer_analyser(aws_service,config,logger)->dict:
+    try:
+        response = aws_service.comprehend_client.start_entities_detection_job(
+            InputDataConfig={
+                'S3Uri': _comprehend_input_uri(aws_service.aws_params["s3_bucket_name"],
+                                               config["s3_source_dir"]),
+                'InputFormat': 'ONE_DOC_PER_FILE'
+            },
+            OutputDataConfig={
+                'S3Uri': _comprehend_output_uri(aws_service.aws_params["s3_bucket_name"],
+                                                config["s3_target_dir"])
+            },
+            DataAccessRoleArn=_comprehend_access_role_arn(aws_service.aws_params["comprehend_service_role_name"],
+                                                          aws_service.aws_account_number),
+            JobName=f"res_analysis_{_date_string()}",
+            EntityRecognizerArn=_comprehend_cer_arn(aws_service.aws_session.region_name,
+                                                    aws_service.aws_account_number,
+                                                    aws_service.aws_params["reflexive_entity_name"],
+                                                    aws_service.aws_params["reflexive_entity_version"]),
+            LanguageCode='en'
+        )
+    except Exception as e:
+        logger.error("There was an error when analysing text with comprehend %s",repr(e))
+        return {"ERROR":repr(e)}
+    else:
+        return aws_service.comprehend_client.describe_entities_detection_job(JobId=response['JobId'])
+
+# Monitor a CER Analysis Job
+def _cer_job_progress(status:dict,aws_service,tz,output)->dict:
+    # Submitted
+    job_name = status['JobName']
+    job_id = status['JobId']
+    submit_time = status['SubmitTime'].astimezone(ZoneInfo(tz))
+    output(f"RES_ANALYSIS JOB {job_name} ({job_id}) submitted at: {submit_time}")
+
+    # In progress
+    while status['JobStatus'] in ["SUBMITTED","IN_PROGRESS"]:
+        time = datetime.now().astimezone(ZoneInfo(tz))
+        job_status = status['JobStatus']
+        output(f"{time} [{job_id}] {job_name} status: {job_status}")
+        sleep(10)
+        properties = aws_service.comprehend_client.describe_entities_detection_job(JobId=job_id)
+        status=properties['EntitiesDetectionJobProperties']
+
+    # Finished (complete or error)
+    job_status = status['JobStatus']
+    end_time = status['EndTime'].astimezone(ZoneInfo(tz))
+    time_taken = end_time - submit_time
+    output_url = status['OutputDataConfig']['S3Uri']
+    output(f"RES_ANALYSIS JOB {job_name} ({job_id}) finished with status: {job_status} at: {end_time}")
+    output(f"Analysis time: {str(time_taken)}")
+    output(f"Results available at: {output_url}")
+    return status
+
+
+# Download from S3 to local
+def _download_from_s3(res_analyser,status)->str:
+    local_file_path = f"{res_analyser.config['local_data_dir']}/{status['JobName']}.tar.gz"
+    bucket_name = res_analyser.aws_service.aws_params["s3_bucket_name"]
+    try:
+        output_key = status['OutputDataConfig']['S3Uri'].split(bucket_name)[1]
+        with open(f"{local_file_path}",'wb') as output_data:
+            res_analyser.aws_service.s3_client.download_fileobj(bucket_name,output_key[1:],output_data)
+    except Exception as e:
+        res_analyser.logger.error("An error occured when downloading results from S3: %s",repr(e))
+        local_file_path = None
+    return local_file_path
+
+# Extract results from tar.gz file and save as json
+def _extract_save_results(res_analyser,local_file_path)->list:
+    # extract the tar archive
+    files = list()
+    with tarfile.open(f"{local_file_path}", "r:gz") as tf:
+        for member in tf.getmembers():
+            f = tf.extractfile(member)
+            if f is not None:
+                content = f.read()
+                files.append(content)
+    # extract results and save and return
+    raw_results = files[0].decode("utf-8").split('\n')
+    raw_results.pop() # pop last item off as empty entry due to final \n
+    #
+    #json_results = json.dumps(raw_results)
+    #res_analyser.logger.info("raw_results>> ",raw_results)
+    results = [json.loads(result) for result in raw_results]
+    with open(f"{local_file_path[:-7]}.json","w") as fp:
+        json.dump(results,fp)
+    return results
+
+# Get a dict of (index,entities) from cer analysis results
+def _extract_analysis(results):
+    file_ents = ((result["File"],result["Entities"]) for result in results)
+    idx_ents = ((int(file.split('_')[-1].split('.')[0]),ents) for file,ents in file_ents)
+    return dict(idx_ents)
+
+
+
+# Comprehend access role arn
+def _comprehend_access_role_arn(comprehend_service_role_name,aws_account_number):
+    return f"arn:aws:iam::{aws_account_number}:role/service-role/{comprehend_service_role_name}"
+
+# Comprehend input url
+def _comprehend_input_uri(s3_bucket_name,s3_files,prefix=""):
+    return f"s3://{s3_bucket_name}/{s3_files}/{prefix}"
+
+# Comprehend output url
+def _comprehend_output_uri(s3_bucket_name,s3_results):
+    return f"s3://{s3_bucket_name}/{s3_results}/"
+
+# Comprehend entity recognizer arn
+def _comprehend_cer_arn(region,account_number,cer_name,cer_version):
+    return f"arn:aws:comprehend:{region}:{account_number}:entity-recognizer/{cer_name}/version/{cer_version}"
+
+## Offset functions
+
+def _offset_tuples(offsets):
+    for k,vs in offsets.items():
+        for b,e in vs:
+            yield (b,(e,k))
+
+def _sorted_offsets(offsets):
+    return sorted(offsets)
+
+def _orphaned_I(text,offsets):
+    for b,(e,t) in offsets:
+        if 'I' in text[(b-2):(b-1)].strip():
+            #print(text[(b-2):e],t)
+            yield (b-2, (e,t))
+        else:
+            yield (b, (e,t))
+
+def _orphaned_word(text,offsets):
+    coffs = {}
+    p = (0,(-2,''))
+    for b,(e,t) in offsets:
+        #print(p[1][0])
+        if (p[1][0]+3)>=b:
+            #print("Prev:",p,f"|{df.text[0][p[0]:p[1][0]]}|")
+            #print("<--->",f"|{df.text[0][(p[1][0]+1):(b-1)]}|")
+            #print("This:",b,e,t,f"|{df.text[0][b:e]}|")
+            #print()
+            if len((text[p[0]:p[1][0]]).split(' '))<2:
+                #print(f"Removing {p[0]},{p[1][0]},{p[1][1]}")
+                coffs.pop(p[0])
+                #print(f"Replacing {b},{e},{t} with {p[0]},{e},{t}")
+                coffs[p[0]] = (e,t)
+                p=(p[0],(e,t))
+            else:
+                coffs[b] = (e,t)
+                p = (b,(e,t))
+        else:
+            coffs[b] = (e,t)
+            p = (b,(e,t))
+    return coffs.items()
+
+def _regroup(offsets):
+    grouped = (((b,e),k) for (b,(e,k)) in offsets)
+    return dict(grouped)
+
+
+
+
+### UTILITY FUNCTIONS
+
+# Create a reverse date string YYYYmmdd based on current local time
+def _date_string()->str:
+    return datetime.today().strftime('%Y%m%d')
+
+# Get the current local working dir
+def _local_path(dir)->str:
+    return os.getcwd()+dir
+
+# Check if local directory exists
+def _dir_exists_local(dir:str)->bool:
+    return os.path.exists(_local_path(dir))
+
+# Return function to create directory
+def _create_dir(dir)->str:
+    os.makedirs(_local_path(dir))
+    return _local_path(dir)
+
+# Create local directory if required
+def _create_local_dir(dir,logger)->str:
+    if not _dir_exists_local(dir):
+        try:
+            path = _create_dir(dir)
+        except Exception as e:
+            logger.error("There was an error creating the local directory: %s",repr(e))
+        finally:
+            return path
+    else:
+        return _local_path(dir)
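For orientation (not part of the package diff): the interaction-counting helpers added above are pure functions, so they can be exercised without any AWS setup. A minimal sketch, assuming reflexive 2.0.0 is installed together with its graph-tool, numpy and pandas dependencies so that reflexive.analysis_functions imports cleanly; the sequence value is a hypothetical example.

from reflexive.analysis_functions import (
    _count_res_interactions,
    _calc_res_weights,
    _create_adj_matrix,
)

# hypothetical label sequence, as produced by _get_res_sequence()
sequence = ["RR", "NR", "RR", "EP", "AF"]

interactions = _count_res_interactions(sequence)  # counts of adjacent label pairs
weights = _calc_res_weights(interactions)         # counts normalised by the maximum count
matrix = _create_adj_matrix(weights)              # 5x5 weights over RR/NR/AR/AF/EP
print(matrix)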
reflexive-2.0.0/src/reflexive/display_functions.py
ADDED
@@ -0,0 +1,165 @@
+from graph_tool.all import (
+    Graph,
+    VertexPropertyMap,
+    EdgePropertyMap,
+    GraphPropertyMap,
+    graph_draw,
+    ungroup_vector_property,
+    group_vector_property
+)
+import cairo
+from itertools import chain
+from spacy import displacy
+
+# Text Display functions
+
+def _create_displacy_ents(name,text,offsets):
+    return {"text": text,
+            "ents": [{"start":s,"end":e,"label":l}for (s,e),l in offsets.items()],
+            "title": name}
+
+def _render_annotated_text(ents,inline=True):
+    #default to inline
+    page_opt = False
+    jupyter_opt = True
+    if not inline:
+        page_opt = True
+        jupyter_opt = False
+
+    return displacy.render(ents,manual=True,style="ent", options=_get_text_display_options(),page=page_opt,jupyter=jupyter_opt)
+
+def _get_text_display_options():
+    colours = dict([(prop['lbl'],prop['clr']) for prop in _res_graph_properties().values()])
+    return {"ents": list(colours.keys()), "colors": colours}
+
+# RES properties for all graphs
+def _res_graph_properties()->dict:
+    return {0:{ "lbl":"RR",
+                "pos":(0.2,6.5),
+                "clr":"#00AEEF"},
+            1:{ "lbl":"NR",
+                "pos":(5,10),
+                "clr":"#ED1B23"},
+            2:{ "lbl":"AR",
+                "pos":(9.8,6.5),
+                "clr":"#00A64F"},
+            3:{ "lbl":"AF",
+                "pos":(7.9,1),
+                "clr":"#EC008C"},
+            4:{ "lbl":"EP",
+                "pos":(2.1,1),
+                "clr":"#FFF200"}}
+
+# Create a graph from an adjacency matrix
+def _create_graph(matrix,id)->Graph:
+    if matrix:
+        graph = _graph_from_edges(dict(_matrix_to_dict(matrix)))
+    else:
+        graph = _graph_no_edges()
+    prop_list = _res_graph_properties().values()
+    graph.vp["v_positions"] = graph.new_vp("vector<double>",vals=[prop['pos'] for prop in prop_list])
+    graph.vp["v_labels"] = graph.new_vp("string",vals=[prop['lbl'] for prop in prop_list])
+    graph.gp["id"] = graph.new_gp("string",val=id)
+    return graph
+
+# # Vertex properties common to all graphs
+# v_lbl = graph.new_vp("string",vals=_get_prop_values('lbl'))
+# v_pos = graph.new_vp("vector<double>",vals=_get_prop_values('pos'))
+# # Make propertyMaps internal to the graph
+# graph.vp["v_colour"] = v_clr
+# graph.vp["v_position"] = v_pos
+# graph.vp["v_label"] = v_lbl
+# graph.ep["e_weights"] = e_weight
+
+def _graph_from_edges(edges:dict)->Graph:
+    graph = Graph(g=edges.keys(),directed=False)
+    graph.ep["e_weights"] = graph.new_ep("double",vals=edges.values())
+    graph.ep["e_widths"] = graph.new_ep("double",vals=_scale_weights(edges.values()))
+    graph.vp["v_colours"] = _get_vcolours_from_edges(graph)
+    return graph
+
+def _scale_weights(weights,factor=5):
+    return [round(w*factor,1) for w in weights]
+
+def _graph_no_edges()->Graph:
+    graph = Graph(g=_empty_edge_dict(),directed=False)
+    graph.ep["e_weights"] = graph.new_ep("double")
+    graph.ep["e_widths"] = graph.new_ep("double")
+    graph.vp["v_colours"] = graph.new_vp("string",val="#cccccc")
+    return graph
+
+def _get_vcolours_from_edges(graph:Graph)->VertexPropertyMap:
+    prop_list:dict[int,dict] = _res_graph_properties()
+    for i in _isolated_vertices(graph):
+        prop_list[i]['clr']= "#cccccc"
+    return graph.new_vp("string",[prop['clr'] for prop in prop_list.values()])
+
+def _isolated_vertices(graph):
+    edgelist = chain.from_iterable([sorted((int(e.source()),int(e.target()))) for e in graph.edges()])
+    return set(range(5)) - set([e for e in set(edgelist)])
+
+#
+def _matrix_to_dict(matrix):
+    egen = ((((tuple(sorted((r,c))),w)) for c,w in enumerate(row) if w>0) for r,row in enumerate(matrix) if sum(row)>0)
+    return dict(chain.from_iterable(egen))
+# edges = {}
+# for r,row in enumerate(matrix):
+#     # if empty row, add to iso_vertices
+#     # if sum(row) == 0:
+#     #     self.iso_vertices.add(r)
+#     # else:
+#     if sum(row) > 0: # edge exists
+#         for c,weight in enumerate(row):
+#             if weight > 0:
+#                 edge = tuple(sorted((r,c)))
+#                 #print("r,c:",edge," - ",weight)
+#                 edges[edge] = weight
+# return edges
+
+#
+def _empty_edge_dict():
+    empty_edges = {}
+    for idx in range(5): #self.gt_props.keys():
+        empty_edges[idx] = []
+    return empty_edges
+
+#
+def _get_prop_values(key):
+    values_list = self.gt_props.values()
+    return [p[key] for p in values_list]
+
+# flip coordinates for graph-tool
+def _flipY(vpositions):
+    x, y = ungroup_vector_property(vpositions, [0, 1])
+    y.fa *= -1
+    y.fa -= y.fa.min()
+    return group_vector_property([x, y])
+
+#
+def _draw_graph(graph:Graph,inline=True):
+
+    positions = _flipY(graph.vp["v_positions"])
+    labels = graph.vp["v_labels"]
+    colors = graph.vp["v_colours"]
+    widths = graph.ep["e_widths"]
+    graph_draw(graph, inline=inline,output_size=(300,300),fit_view=0.7,
+               pos=positions,
+               vertex_text=labels,
+               vertex_font_family="sans serif",
+               vertex_font_size=18,
+               vertex_font_weight=cairo.FONT_WEIGHT_BOLD,
+               vertex_fill_color=colors,
+               vertex_size = 50,
+               vertex_halo=False,
+               vertex_pen_width=1.2,
+               vertex_color="#999999",
+               edge_pen_width=widths)
+
+# def get_vertex_labels(self):
+#     return self._get_prop_values('lbl')
+
+# def get_vertex_colours(self):
+#     return self._get_prop_values('clr')
+
+# def get_vertex_positions(self):
+#     return self._get_prop_values('pos')