reflexive 1.2.8.tar.gz → 2.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. reflexive-2.0.0/=3.12 +48 -0
  2. {reflexive-1.2.8 → reflexive-2.0.0}/PKG-INFO +1 -1
  3. reflexive-1.2.8/old reflexive dist/reflexive-1.0.15-py3-none-any.whl → reflexive-2.0.0/dist_v1/reflexive-1.2.1-py3-none-any.whl +0 -0
  4. reflexive-2.0.0/dist_v1/reflexive-1.2.1.tar.gz +0 -0
  5. reflexive-2.0.0/dist_v1/reflexive-1.2.2-py3-none-any.whl +0 -0
  6. reflexive-2.0.0/dist_v1/reflexive-1.2.2.tar.gz +0 -0
  7. reflexive-2.0.0/dist_v1/reflexive-1.2.3-py3-none-any.whl +0 -0
  8. reflexive-2.0.0/dist_v1/reflexive-1.2.3.tar.gz +0 -0
  9. reflexive-2.0.0/dist_v1/reflexive-1.2.4-py3-none-any.whl +0 -0
  10. reflexive-2.0.0/dist_v1/reflexive-1.2.4.tar.gz +0 -0
  11. reflexive-2.0.0/dist_v1/reflexive-1.2.5-py3-none-any.whl +0 -0
  12. reflexive-2.0.0/dist_v1/reflexive-1.2.5.tar.gz +0 -0
  13. reflexive-2.0.0/dist_v1/reflexive-1.2.6-py3-none-any.whl +0 -0
  14. reflexive-2.0.0/dist_v1/reflexive-1.2.6.tar.gz +0 -0
  15. reflexive-2.0.0/dist_v1/reflexive-1.2.7-py3-none-any.whl +0 -0
  16. reflexive-2.0.0/dist_v1/reflexive-1.2.7.tar.gz +0 -0
  17. reflexive-2.0.0/dist_v1/reflexive-1.2.8-py3-none-any.whl +0 -0
  18. reflexive-2.0.0/dist_v1/reflexive-1.2.8.tar.gz +0 -0
  19. {reflexive-1.2.8 → reflexive-2.0.0}/pyproject.toml +1 -1
  20. reflexive-2.0.0/src/reflexive/__init__.py +4 -0
  21. reflexive-2.0.0/src/reflexive/analysis_functions.py +383 -0
  22. reflexive-2.0.0/src/reflexive/display_functions.py +165 -0
  23. reflexive-2.0.0/src/reflexive/res_analysis.py +217 -0
  24. reflexive-2.0.0/src/reflexive/service.py +58 -0
  25. reflexive-2.0.0/tests/annotated_0.html +119 -0
  26. reflexive-2.0.0/tests/annotated_1.html +174 -0
  27. reflexive-2.0.0/tests/annotated_2.html +169 -0
  28. reflexive-2.0.0/tests/data/df-250207.pkl +0 -0
  29. reflexive-2.0.0/tests/data/res_analysis_20250206.json +1 -0
  30. reflexive-2.0.0/tests/data/res_analysis_20250206.tar.gz +0 -0
  31. reflexive-2.0.0/tests/data/res_analysis_20250207.json +1 -0
  32. reflexive-2.0.0/tests/data/res_analysis_20250207.tar.gz +0 -0
  33. reflexive-2.0.0/tests/data_source/synth_ref-datascientist.txt +9 -0
  34. reflexive-2.0.0/tests/data_source/synth_ref-footballer.txt +7 -0
  35. reflexive-2.0.0/tests/data_source/synth_ref-surgeon.txt +9 -0
  36. reflexive-2.0.0/tests/data_source/teach_ref-3.txt +9 -0
  37. reflexive-2.0.0/tests/data_source/teach_ref-4.txt +7 -0
  38. reflexive-2.0.0/tests/data_source/teach_ref-5.txt +5 -0
  39. reflexive-2.0.0/tests/df-250207.pkl +0 -0
  40. reflexive-2.0.0/tests/test_reflexive.ipynb +1893 -0
  41. reflexive-1.2.8/old reflexive dist/reflexive-1.0.13-py3-none-any.whl +0 -0
  42. reflexive-1.2.8/old reflexive dist/reflexive-1.0.13.tar.gz +0 -0
  43. reflexive-1.2.8/old reflexive dist/reflexive-1.0.14-py3-none-any.whl +0 -0
  44. reflexive-1.2.8/old reflexive dist/reflexive-1.0.14.tar.gz +0 -0
  45. reflexive-1.2.8/old reflexive dist/reflexive-1.0.15.tar.gz +0 -0
  46. reflexive-1.2.8/old reflexive dist/reflexive-1.0.16-py3-none-any.whl +0 -0
  47. reflexive-1.2.8/old reflexive dist/reflexive-1.0.16.tar.gz +0 -0
  48. reflexive-1.2.8/old reflexive dist/reflexive-1.0.17-py3-none-any.whl +0 -0
  49. reflexive-1.2.8/old reflexive dist/reflexive-1.0.17.tar.gz +0 -0
  50. reflexive-1.2.8/old reflexive dist/reflexive-1.0.18-py3-none-any.whl +0 -0
  51. reflexive-1.2.8/old reflexive dist/reflexive-1.0.18.tar.gz +0 -0
  52. reflexive-1.2.8/old reflexive dist/reflexive-1.0.19-py3-none-any.whl +0 -0
  53. reflexive-1.2.8/old reflexive dist/reflexive-1.0.19.tar.gz +0 -0
  54. reflexive-1.2.8/old reflexive dist/reflexive-1.0.20-py3-none-any.whl +0 -0
  55. reflexive-1.2.8/old reflexive dist/reflexive-1.0.20.tar.gz +0 -0
  56. reflexive-1.2.8/old reflexive dist/reflexive-1.1.0-py3-none-any.whl +0 -0
  57. reflexive-1.2.8/old reflexive dist/reflexive-1.1.0.tar.gz +0 -0
  58. reflexive-1.2.8/old reflexive dist/reflexive-1.1.1-py3-none-any.whl +0 -0
  59. reflexive-1.2.8/old reflexive dist/reflexive-1.1.1.tar.gz +0 -0
  60. reflexive-1.2.8/src/reflexive/__init__.py +0 -19
  61. reflexive-1.2.8/src/reflexive/analyse.py +0 -430
  62. reflexive-1.2.8/src/reflexive/cfg.py +0 -116
  63. reflexive-1.2.8/src/reflexive/res.py +0 -225
  64. reflexive-1.2.8/src/reflexive/res_functions.py +0 -62
  65. reflexive-1.2.8/src/reflexive/session.py +0 -264
  66. reflexive-1.2.8/src/reflexive/util.py +0 -127
  67. reflexive-1.2.8/src/reflexive/visualise.py +0 -355
  68. {reflexive-1.2.8 → reflexive-2.0.0}/.gitignore +0 -0
  69. {reflexive-1.2.8 → reflexive-2.0.0}/LICENSE +0 -0
  70. {reflexive-1.2.8 → reflexive-2.0.0}/README.md +0 -0
  71. {reflexive-1.2.8 → reflexive-2.0.0}/tests/__init__.py +0 -0
  72. /reflexive-1.2.8/tests/test_reflexive.py → /reflexive-2.0.0/tests/test_reflexive_extras.py +0 -0
reflexive-2.0.0/=3.12 ADDED
@@ -0,0 +1,48 @@
+ Channels:
+ - conda-forge
+ - defaults
+ Platform: linux-aarch64
+ Collecting package metadata (repodata.json): ...working... done
+ Solving environment: ...working... done
+
+ ## Package Plan ##
+
+ environment location: /opt/conda/envs/reflexive
+
+ added / updated specs:
+ - python
+
+
+ The following packages will be downloaded:
+
+ package | build
+ ---------------------------|-----------------
+ libmpdec-4.0.0 | h68df207_0 108 KB conda-forge
+ pip-25.0 | pyh145f28c_0 1.2 MB conda-forge
+ python-3.13.1 |h3e021d1_104_cp313 32.0 MB conda-forge
+ python_abi-3.13 | 5_cp313 6 KB conda-forge
+ ------------------------------------------------------------
+ Total: 33.3 MB
+
+ The following NEW packages will be INSTALLED:
+
+ bzip2 conda-forge/linux-aarch64::bzip2-1.0.8-h68df207_7
+ ld_impl_linux-aar~ conda-forge/linux-aarch64::ld_impl_linux-aarch64-2.43-h80caac9_2
+ libexpat conda-forge/linux-aarch64::libexpat-2.6.4-h5ad3122_0
+ libffi conda-forge/linux-aarch64::libffi-3.4.2-h3557bc0_5
+ libgcc-ng conda-forge/linux-aarch64::libgcc-ng-14.2.0-he9431aa_1
+ liblzma conda-forge/linux-aarch64::liblzma-5.6.4-h86ecc28_0
+ libmpdec conda-forge/linux-aarch64::libmpdec-4.0.0-h68df207_0
+ libsqlite conda-forge/linux-aarch64::libsqlite-3.48.0-h5eb1b54_1
+ libuuid conda-forge/linux-aarch64::libuuid-2.38.1-hb4cce97_0
+ libzlib conda-forge/linux-aarch64::libzlib-1.3.1-h86ecc28_2
+ ncurses conda-forge/linux-aarch64::ncurses-6.5-ha32ae93_3
+ pip conda-forge/noarch::pip-25.0-pyh145f28c_0
+ python conda-forge/linux-aarch64::python-3.13.1-h3e021d1_104_cp313
+ python_abi conda-forge/linux-aarch64::python_abi-3.13-5_cp313
+ readline conda-forge/linux-aarch64::readline-8.2-h8fc344f_1
+ tk conda-forge/linux-aarch64::tk-8.6.13-h194ca79_0
+ tzdata conda-forge/noarch::tzdata-2025a-h78e105d_0
+
+
+ Proceed ([y]/n)?
{reflexive-1.2.8 → reflexive-2.0.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: reflexive
- Version: 1.2.8
+ Version: 2.0.0
  Summary: Supports AWS Reflexive Expressions Systems (RES) Analysis
  Project-URL: Repository, https://github.com/nlytx/reflexive.git
  Author-email: Andrew Gibson <andrew@nlytx.io>
{reflexive-1.2.8 → reflexive-2.0.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "reflexive"
- version = "1.2.8"
+ version = "2.0.0"
  description = "Supports AWS Reflexive Expressions Systems (RES) Analysis"
  authors = [{name = "Andrew Gibson", email = "andrew@nlytx.io"}]
  readme = "README.md"
reflexive-2.0.0/src/reflexive/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from reflexive.service import (AWS_service)
+ from reflexive.res_analysis import (RES_analyser,RES_visualiser)
+
+ __all__ = ["AWS_service","RES_analyser","RES_visualiser"]
reflexive-2.0.0/src/reflexive/analysis_functions.py ADDED
@@ -0,0 +1,383 @@
+ from typing import Callable
+ from pandas import (DataFrame,Series)
+ from datetime import datetime
+ from zoneinfo import ZoneInfo
+ from time import sleep
+ from functools import partial
+ import tarfile
+ import json
+ import os
+ import numpy as np
+ from numpy.linalg import norm
+ from itertools import chain
+ from graph_tool.all import (
+     Graph,
+     similarity,
+     adjacency)
+
+ ### PIPELINE FUNCTIONS
+
+ # Clean text using supplied function and calculate text length
+ # Used by RES_analyser.preprocess_text()
+
+ def _clean_text(df:DataFrame,text_cleaner:Callable[[str],str])->DataFrame:
+     return (df
+             .assign(text=lambda d: d.text.apply(text_cleaner))
+             .assign(text_length=lambda d: [len(row.text) for row in d.itertuples()]))
+
+ # Upload text using supplied uploader function
+ # Used by RES_analyser.upload_text_to_s3()
+
+ def _upload_text(df:DataFrame,uploader:Callable,res_analyser)->DataFrame:
+     upload = partial(uploader,aws_service=res_analyser.aws_service,config=res_analyser.config,logger=res_analyser.logger)
+     return df.assign(uploaded=lambda d: [upload(str(row.Index),row.text) for row in d.itertuples()])
+
+ # Initiate comprehend analysis on S3 text documents
+
+ def _analyse_text(analyser:Callable,res_analyser)->dict:
+     analyse = partial(analyser,
+                       aws_service=res_analyser.aws_service,
+                       config=res_analyser.config,
+                       logger = res_analyser.logger)
+     job_status = analyse()
+     return job_status['EntitiesDetectionJobProperties']
+
+ # Add comprehend analysis results to dataframe
+ def _analysis_to_dataframe(df:DataFrame,results:list)->DataFrame:
+     analysis = _extract_analysis(results=results)
+     df['res_results']=Series(analysis)
+     return df
+
+ # Offsets to dataframe
+ def _add_offsets(df:DataFrame,offset_cleaner,orphan_joiner)->DataFrame:
+     return (df
+             .assign(offsets=lambda d: d.res_results.apply(offset_cleaner))
+             .assign(offsets_clean=lambda d: [orphan_joiner(row.text,row.offsets) for row in d.itertuples()]))
+
+ def _offset_cleaner(res_results):
+     offsets = _collect_offsets(res_results)
+     tuples = _offset_tuples(offsets)
+     return _sorted_offsets(tuples)
+
+ def _orphan_joiner(text,offsets):
+     otuples = _orphaned_I(text,offsets)
+     offs = _orphaned_word(text,otuples)
+     return _regroup(offs)
+
+ def _collect_offsets(rrs):
+     new_rrs = {}
+     for rr in rrs:
+         if rr['Score']>0.6:
+             ent_type = rr['Type']
+             if ent_type in ['VR','ER']:
+                 label = "NR"
+             elif ent_type in ['EP','EV']:
+                 label = "EP"
+             elif ent_type in ['CN','AF']:
+                 label = "AF"
+             else:
+                 label = ent_type
+             new_rrs.setdefault(label,[]).append((rr['BeginOffset'],rr['EndOffset']))
+     return new_rrs
+
+
+
+
+ #####
+
+ def _add_res_sequence(df):
+     temp_df = df.copy()
+     temp_df['res_sequence'] = temp_df.offsets_clean.apply(_get_res_sequence)
+     return temp_df
+
+ def _add_res_interactions(df):
+     temp_df = df.copy()
+     temp_df['res_interactions'] = temp_df.res_sequence.apply(_count_res_interactions)
+     return temp_df
+
+ def _add_res_weights(df):
+     temp_df = df.copy()
+     temp_df['res_weights'] = temp_df.res_interactions.apply(_calc_res_weights)
+     return temp_df
+
+ def _add_res_adj_matrix(df):
+     temp_df = df.copy()
+     temp_df['res_adj_matrix'] = temp_df.res_weights.apply(_create_adj_matrix)
+     return temp_df
+
+ def _get_res_sequence(offsets_clean):
+     return [label for label in offsets_clean.values()]
+
+
+ def _empty_res_interactions() -> dict[tuple,int]:
+     RE_types = ['RR','NR','AR','AF','EP']
+     RE_interactions:dict[tuple,int] = dict()
+     for t1 in RE_types:
+         for t2 in RE_types:
+             entry = tuple(sorted((t1,t2)))
+             if entry not in RE_interactions.keys():
+                 RE_interactions[entry] = 0
+     return RE_interactions
+
+ def _count_res_interactions(re_sequence:list[str]) -> dict[tuple,int]:
+     re_ints = _empty_res_interactions()
+     limit = len(re_sequence)-1
+     for i,s in enumerate(re_sequence):
+         if i < limit:
+             rei = tuple(sorted((s,re_sequence[i+1])))
+             #print(i,rei)
+             re_ints[rei] += 1
+     return re_ints
+
+ def _calc_res_weights(interactions:dict[tuple,int])->dict[tuple,float]:
+     max_count = max(interactions.values())
+     weights = dict()
+     for edge,count in interactions.items():
+         weights[edge] = round(count/(max_count),2)
+     return weights
+
+
+ def _create_adj_matrix(weights:dict[tuple,float])->list[list[float]]:
+     re_types = ["RR","NR","AR","AF","EP"]
+     matrix = []
+     for r in re_types:
+         row = []
+         for c in re_types:
+             key = tuple(sorted((r,c)))
+             #print(key)
+             weight = weights.get(key,0)
+             row.append(weight)
+         matrix.append(row)
+     return matrix
+
+ ### GRAPH ANALYSIS
+
+ def _jaccard_similarity(g1:Graph,g2:Graph)->float:
+     return similarity(g1, g2,
+                       eweight1=g1.ep['e_weights'], eweight2=g2.ep['e_weights'],
+                       #label1=g1.vp['v_labels'], label2=g2.vp['v_labels'],
+                       norm=True, p=1.0, distance=False, asymmetric=False)
+
+ def _cosine_similarity(m1,m2)->float:
+     v1 = list(chain.from_iterable(m1))
+     v2 = list(chain.from_iterable(m2))
+     return np.dot(v1,v2)/(norm(v1)*norm(v2))
+
+
+
+
+ ### PIPELINE SUPPORT FUNCTIONS
+
+ # Clean return characters and strip whitespace
+ # Used by preprocess_text()
+ def _whitespace_cleaner(text:str)->str:
+     return text.strip().replace('\r\n','\n')
+
+ # Upload text to S3
+ def _s3_text_uploader(idx:str,text:str,aws_service,config:dict,logger)->bool:
+     try:
+         response = aws_service.s3_client.put_object(Body=text,
+                                                     Bucket=aws_service.aws_params["s3_bucket_name"],
+                                                     Key=f"{config["s3_source_dir"]}/{idx}.txt")
+     except Exception as e:
+         logger.error("There was an error when uploading text to s3 %s",repr(e))
+         return False
+     else:
+         if response['ResponseMetadata']['HTTPStatusCode']==200:
+             logger.debug(f"File {idx} uploaded successfully")
+             return True
+         else:
+             logger.error(f"File {idx} did not upload successfully to S3: {response}")
+             return False
+
+ # Analyse text with comprehend custom entity recognizer
+ def _comprehend_cer_analyser(aws_service,config,logger)->dict:
+     try:
+         response = aws_service.comprehend_client.start_entities_detection_job(
+             InputDataConfig={
+                 'S3Uri': _comprehend_input_uri(aws_service.aws_params["s3_bucket_name"],
+                                                config["s3_source_dir"]),
+                 'InputFormat': 'ONE_DOC_PER_FILE'
+             },
+             OutputDataConfig={
+                 'S3Uri': _comprehend_output_uri(aws_service.aws_params["s3_bucket_name"],
+                                                 config["s3_target_dir"])
+             },
+             DataAccessRoleArn=_comprehend_access_role_arn(aws_service.aws_params["comprehend_service_role_name"],
+                                                           aws_service.aws_account_number),
+             JobName=f"res_analysis_{_date_string()}",
+             EntityRecognizerArn=_comprehend_cer_arn(aws_service.aws_session.region_name,
+                                                     aws_service.aws_account_number,
+                                                     aws_service.aws_params["reflexive_entity_name"],
+                                                     aws_service.aws_params["reflexive_entity_version"]),
+             LanguageCode='en'
+         )
+     except Exception as e:
+         logger.error("There was an error when analysing text with comprehend %s",repr(e))
+         return {"ERROR":repr(e)}
+     else:
+         return aws_service.comprehend_client.describe_entities_detection_job(JobId=response['JobId'])
+
+ # Monitor a CER Analysis Job
+ def _cer_job_progress(status:dict,aws_service,tz,output)->dict:
+     # Submitted
+     job_name = status['JobName']
+     job_id = status['JobId']
+     submit_time = status['SubmitTime'].astimezone(ZoneInfo(tz))
+     output(f"RES_ANALYSIS JOB {job_name} ({job_id}) submitted at: {submit_time}")
+
+     # In progress
+     while status['JobStatus'] in ["SUBMITTED","IN_PROGRESS"]:
+         time = datetime.now().astimezone(ZoneInfo(tz))
+         job_status = status['JobStatus']
+         output(f"{time} [{job_id}] {job_name} status: {job_status}")
+         sleep(10)
+         properties = aws_service.comprehend_client.describe_entities_detection_job(JobId=job_id)
+         status=properties['EntitiesDetectionJobProperties']
+
+     # Finished (complete or error)
+     job_status = status['JobStatus']
+     end_time = status['EndTime'].astimezone(ZoneInfo(tz))
+     time_taken = end_time - submit_time
+     output_url = status['OutputDataConfig']['S3Uri']
+     output(f"RES_ANALYSIS JOB {job_name} ({job_id}) finished with status: {job_status} at: {end_time}")
+     output(f"Analysis time: {str(time_taken)}")
+     output(f"Results available at: {output_url}")
+     return status
+
+
+ # Download from S3 to local
+ def _download_from_s3(res_analyser,status)->str:
+     local_file_path = f"{res_analyser.config['local_data_dir']}/{status['JobName']}.tar.gz"
+     bucket_name = res_analyser.aws_service.aws_params["s3_bucket_name"]
+     try:
+         output_key = status['OutputDataConfig']['S3Uri'].split(bucket_name)[1]
+         with open(f"{local_file_path}",'wb') as output_data:
+             res_analyser.aws_service.s3_client.download_fileobj(bucket_name,output_key[1:],output_data)
+     except Exception as e:
+         res_analyser.logger.error("An error occured when downloading results from S3: %s",repr(e))
+         local_file_path = None
+     return local_file_path
+
+ # Extract results from tar.gz file and save as json
+ def _extract_save_results(res_analyser,local_file_path)->list:
+     # extract the tar archive
+     files = list()
+     with tarfile.open(f"{local_file_path}", "r:gz") as tf:
+         for member in tf.getmembers():
+             f = tf.extractfile(member)
+             if f is not None:
+                 content = f.read()
+                 files.append(content)
+     # extract results and save and return
+     raw_results = files[0].decode("utf-8").split('\n')
+     raw_results.pop() # pop last item off as empty entry due to final \n
+     #
+     #json_results = json.dumps(raw_results)
+     #res_analyser.logger.info("raw_results>> ",raw_results)
+     results = [json.loads(result) for result in raw_results]
+     with open(f"{local_file_path[:-7]}.json","w") as fp:
+         json.dump(results,fp)
+     return results
+
+ # Get a dict of (index,entities) from cer analysis results
+ def _extract_analysis(results):
+     file_ents = ((result["File"],result["Entities"]) for result in results)
+     idx_ents = ((int(file.split('_')[-1].split('.')[0]),ents) for file,ents in file_ents)
+     return dict(idx_ents)
+
+
+
+
+ # Comprehend access role arn
+ def _comprehend_access_role_arn(comprehend_service_role_name,aws_account_number):
+     return f"arn:aws:iam::{aws_account_number}:role/service-role/{comprehend_service_role_name}"
+
+ # Comprehend input url
+ def _comprehend_input_uri(s3_bucket_name,s3_files,prefix=""):
+     return f"s3://{s3_bucket_name}/{s3_files}/{prefix}"
+
+ # Comprehend output url
+ def _comprehend_output_uri(s3_bucket_name,s3_results):
+     return f"s3://{s3_bucket_name}/{s3_results}/"
+
+ # Comprehend entity recognizer arn
+ def _comprehend_cer_arn(region,account_number,cer_name,cer_version):
+     return f"arn:aws:comprehend:{region}:{account_number}:entity-recognizer/{cer_name}/version/{cer_version}"
+
+ ## Offset functions
+
+ def _offset_tuples(offsets):
+     for k,vs in offsets.items():
+         for b,e in vs:
+             yield (b,(e,k))
+
+ def _sorted_offsets(offsets):
+     return sorted(offsets)
+
+ def _orphaned_I(text,offsets):
+     for b,(e,t) in offsets:
+         if 'I' in text[(b-2):(b-1)].strip():
+             #print(text[(b-2):e],t)
+             yield (b-2, (e,t))
+         else:
+             yield (b, (e,t))
+
+ def _orphaned_word(text,offsets):
+     coffs = {}
+     p = (0,(-2,''))
+     for b,(e,t) in offsets:
+         #print(p[1][0])
+         if (p[1][0]+3)>=b:
+             #print("Prev:",p,f"|{df.text[0][p[0]:p[1][0]]}|")
+             #print("<--->",f"|{df.text[0][(p[1][0]+1):(b-1)]}|")
+             #print("This:",b,e,t,f"|{df.text[0][b:e]}|")
+             #print()
+             if len((text[p[0]:p[1][0]]).split(' '))<2:
+                 #print(f"Removing {p[0]},{p[1][0]},{p[1][1]}")
+                 coffs.pop(p[0])
+                 #print(f"Replacing {b},{e},{t} with {p[0]},{e},{t}")
+                 coffs[p[0]] = (e,t)
+                 p=(p[0],(e,t))
+             else:
+                 coffs[b] = (e,t)
+                 p = (b,(e,t))
+         else:
+             coffs[b] = (e,t)
+             p = (b,(e,t))
+     return coffs.items()
+
+ def _regroup(offsets):
+     grouped = (((b,e),k) for (b,(e,k)) in offsets)
+     return dict(grouped)
+
+
+
+
+
+ ### UTILITY FUNCTIONS
+
+ # Create a reverse date string YYYYmmdd based on current local time
+ def _date_string()->str:
+     return datetime.today().strftime('%Y%m%d')
+
+ # Get the current local working dir
+ def _local_path(dir)->str:
+     return os.getcwd()+dir
+
+ # Check if local directory exists
+ def _dir_exists_local(dir:str)->bool:
+     return os.path.exists(_local_path(dir))
+
+ # Return function to create directory
+ def _create_dir(dir)->str:
+     os.makedirs(_local_path(dir))
+     return _local_path(dir)
+
+ # Create local directory if required
+ def _create_local_dir(dir,logger)->str:
+     if not _dir_exists_local(dir):
+         try:
+             path = _create_dir(dir)
+         except Exception as e:
+             logger.error("There was an error creating the local directory: %s",repr(e))
+         finally:
+             return path
+     else:
+         return _local_path(dir)
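The helpers from _add_res_sequence() through _create_adj_matrix() turn a document's sequence of RES labels into a weighted 5×5 adjacency matrix: adjacent label pairs are counted, counts are normalised by the largest count, and the result is laid out over the fixed label order RR, NR, AR, AF, EP. Below is a small self-contained sketch of that transformation on an invented label sequence, re-implemented locally so it runs without AWS or graph_tool.

# Standalone sketch of the interaction-counting step above; the label sequence is invented.
from itertools import chain
import numpy as np

RE_TYPES = ["RR", "NR", "AR", "AF", "EP"]

def count_interactions(seq):
    # one count per adjacent pair of labels, keyed by the sorted pair (as _count_res_interactions does)
    counts = {tuple(sorted((a, b))): 0 for a in RE_TYPES for b in RE_TYPES}
    for a, b in zip(seq, seq[1:]):
        counts[tuple(sorted((a, b)))] += 1
    return counts

def to_adj_matrix(counts):
    # normalise by the largest count over the fixed label order (as _calc_res_weights / _create_adj_matrix do)
    max_count = max(counts.values())
    return [[round(counts[tuple(sorted((r, c)))] / max_count, 2) for c in RE_TYPES] for r in RE_TYPES]

seq = ["RR", "NR", "RR", "EP", "AF"]             # toy RES label sequence
matrix = to_adj_matrix(count_interactions(seq))

# cosine similarity between two flattened matrices, mirroring _cosine_similarity()
v1 = np.array(list(chain.from_iterable(matrix)), dtype=float)
v2 = np.array(list(chain.from_iterable(matrix)), dtype=float)
print(float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))))   # 1.0 for identical matrices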
reflexive-2.0.0/src/reflexive/display_functions.py ADDED
@@ -0,0 +1,165 @@
+ from graph_tool.all import (
+     Graph,
+     VertexPropertyMap,
+     EdgePropertyMap,
+     GraphPropertyMap,
+     graph_draw,
+     ungroup_vector_property,
+     group_vector_property
+     )
+ import cairo
+ from itertools import chain
+ from spacy import displacy
+
+ # Text Display functions
+
+ def _create_displacy_ents(name,text,offsets):
+     return {"text": text,
+             "ents": [{"start":s,"end":e,"label":l}for (s,e),l in offsets.items()],
+             "title": name}
+
+ def _render_annotated_text(ents,inline=True):
+     #default to inline
+     page_opt = False
+     jupyter_opt = True
+     if not inline:
+         page_opt = True
+         jupyter_opt = False
+
+     return displacy.render(ents,manual=True,style="ent", options=_get_text_display_options(),page=page_opt,jupyter=jupyter_opt)
+
+ def _get_text_display_options():
+     colours = dict([(prop['lbl'],prop['clr']) for prop in _res_graph_properties().values()])
+     return {"ents": list(colours.keys()), "colors": colours}
+
+ # RES properties for all graphs
+ def _res_graph_properties()->dict:
+     return {0:{ "lbl":"RR",
+                 "pos":(0.2,6.5),
+                 "clr":"#00AEEF"},
+             1:{ "lbl":"NR",
+                 "pos":(5,10),
+                 "clr":"#ED1B23"},
+             2:{ "lbl":"AR",
+                 "pos":(9.8,6.5),
+                 "clr":"#00A64F"},
+             3:{ "lbl":"AF",
+                 "pos":(7.9,1),
+                 "clr":"#EC008C"},
+             4:{ "lbl":"EP",
+                 "pos":(2.1,1),
+                 "clr":"#FFF200"}}
+
+ # Create a graph from an adjacency matrix
+ def _create_graph(matrix,id)->Graph:
+     if matrix:
+         graph = _graph_from_edges(dict(_matrix_to_dict(matrix)))
+     else:
+         graph = _graph_no_edges()
+     prop_list = _res_graph_properties().values()
+     graph.vp["v_positions"] = graph.new_vp("vector<double>",vals=[prop['pos'] for prop in prop_list])
+     graph.vp["v_labels"] = graph.new_vp("string",vals=[prop['lbl'] for prop in prop_list])
+     graph.gp["id"] = graph.new_gp("string",val=id)
+     return graph
+
+ # # Vertex properties common to all graphs
+ # v_lbl = graph.new_vp("string",vals=_get_prop_values('lbl'))
+ # v_pos = graph.new_vp("vector<double>",vals=_get_prop_values('pos'))
+ # # Make propertyMaps internal to the graph
+ # graph.vp["v_colour"] = v_clr
+ # graph.vp["v_position"] = v_pos
+ # graph.vp["v_label"] = v_lbl
+ # graph.ep["e_weights"] = e_weight
+
+ def _graph_from_edges(edges:dict)->Graph:
+     graph = Graph(g=edges.keys(),directed=False)
+     graph.ep["e_weights"] = graph.new_ep("double",vals=edges.values())
+     graph.ep["e_widths"] = graph.new_ep("double",vals=_scale_weights(edges.values()))
+     graph.vp["v_colours"] = _get_vcolours_from_edges(graph)
+     return graph
+
+ def _scale_weights(weights,factor=5):
+     return [round(w*factor,1) for w in weights]
+
+ def _graph_no_edges()->Graph:
+     graph = Graph(g=_empty_edge_dict(),directed=False)
+     graph.ep["e_weights"] = graph.new_ep("double")
+     graph.ep["e_widths"] = graph.new_ep("double")
+     graph.vp["v_colours"] = graph.new_vp("string",val="#cccccc")
+     return graph
+
+ def _get_vcolours_from_edges(graph:Graph)->VertexPropertyMap:
+     prop_list:dict[int,dict] = _res_graph_properties()
+     for i in _isolated_vertices(graph):
+         prop_list[i]['clr']= "#cccccc"
+     return graph.new_vp("string",[prop['clr'] for prop in prop_list.values()])
+
+ def _isolated_vertices(graph):
+     edgelist = chain.from_iterable([sorted((int(e.source()),int(e.target()))) for e in graph.edges()])
+     return set(range(5)) - set([e for e in set(edgelist)])
+
+ #
+ def _matrix_to_dict(matrix):
+     egen = ((((tuple(sorted((r,c))),w)) for c,w in enumerate(row) if w>0) for r,row in enumerate(matrix) if sum(row)>0)
+     return dict(chain.from_iterable(egen))
+     # edges = {}
+     # for r,row in enumerate(matrix):
+     #     # if empty row, add to iso_vertices
+     #     # if sum(row) == 0:
+     #     #     self.iso_vertices.add(r)
+     #     # else:
+     #     if sum(row) > 0: # edge exists
+     #         for c,weight in enumerate(row):
+     #             if weight > 0:
+     #                 edge = tuple(sorted((r,c)))
+     #                 #print("r,c:",edge," - ",weight)
+     #                 edges[edge] = weight
+     # return edges
+
+ #
+ def _empty_edge_dict():
+     empty_edges = {}
+     for idx in range(5): #self.gt_props.keys():
+         empty_edges[idx] = []
+     return empty_edges
+
+ #
+ def _get_prop_values(key):
+     values_list = self.gt_props.values()
+     return [p[key] for p in values_list]
+
+ # flip coordinates for graph-tool
+ def _flipY(vpositions):
+     x, y = ungroup_vector_property(vpositions, [0, 1])
+     y.fa *= -1
+     y.fa -= y.fa.min()
+     return group_vector_property([x, y])
+
+ #
+ def _draw_graph(graph:Graph,inline=True):
+
+     positions = _flipY(graph.vp["v_positions"])
+     labels = graph.vp["v_labels"]
+     colors = graph.vp["v_colours"]
+     widths = graph.ep["e_widths"]
+     graph_draw(graph, inline=inline,output_size=(300,300),fit_view=0.7,
+                pos=positions,
+                vertex_text=labels,
+                vertex_font_family="sans serif",
+                vertex_font_size=18,
+                vertex_font_weight=cairo.FONT_WEIGHT_BOLD,
+                vertex_fill_color=colors,
+                vertex_size = 50,
+                vertex_halo=False,
+                vertex_pen_width=1.2,
+                vertex_color="#999999",
+                edge_pen_width=widths)
+
+ # def get_vertex_labels(self):
+ #     return self._get_prop_values('lbl')
+
+ # def get_vertex_colours(self):
+ #     return self._get_prop_values('clr')
+
+ # def get_vertex_positions(self):
+ #     return self._get_prop_values('pos')
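_create_displacy_ents() and _render_annotated_text() drive spaCy's displacy in manual mode, so the highlighted spans and colours come from the RES offsets rather than from a spaCy pipeline (the annotated_*.html files under tests/ are outputs of this path). Below is a minimal sketch of that rendering call with an invented sentence and spans; the colour map copies the values defined in _res_graph_properties() above.

# Manual displacy rendering in the style of _create_displacy_ents()/_render_annotated_text().
# The sentence and spans are invented; the colours are the ones from _res_graph_properties().
from spacy import displacy

colours = {"RR": "#00AEEF", "NR": "#ED1B23", "AR": "#00A64F", "AF": "#EC008C", "EP": "#FFF200"}

doc = {"text": "I felt unsure at first, but I learned to ask for help.",
       "ents": [{"start": 0, "end": 13, "label": "AF"},
                {"start": 28, "end": 53, "label": "EP"}],
       "title": "annotated_0"}

# page=True / jupyter=False returns a complete HTML page, as in the inline=False branch above
html = displacy.render(doc, manual=True, style="ent",
                       options={"ents": list(colours), "colors": colours},
                       page=True, jupyter=False)
print(html[:80])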