reflexive 2.2.0__tar.gz → 2.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. reflexive-2.2.0/src/reflexive/analysis_functions.py → reflexive-2.2.2/.history/src/reflexive/analysis_functions_20251021145809.py +2 -0
  2. reflexive-2.2.2/.history/src/reflexive/analysis_functions_20251021150142.py +463 -0
  3. {reflexive-2.2.0 → reflexive-2.2.2}/PKG-INFO +1 -1
  4. {reflexive-2.2.0 → reflexive-2.2.2}/pyproject.toml +1 -1
  5. reflexive-2.2.2/src/reflexive/analysis_functions.py +463 -0
  6. {reflexive-2.2.0 → reflexive-2.2.2}/src/reflexive/res_analysis.py +4 -3
  7. {reflexive-2.2.0 → reflexive-2.2.2}/.gitignore +0 -0
  8. {reflexive-2.2.0 → reflexive-2.2.2}/=3.12 +0 -0
  9. {reflexive-2.2.0 → reflexive-2.2.2}/LICENSE +0 -0
  10. {reflexive-2.2.0 → reflexive-2.2.2}/README.md +0 -0
  11. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.1-py3-none-any.whl +0 -0
  12. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.1.tar.gz +0 -0
  13. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.2-py3-none-any.whl +0 -0
  14. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.2.tar.gz +0 -0
  15. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.3-py3-none-any.whl +0 -0
  16. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.3.tar.gz +0 -0
  17. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.4-py3-none-any.whl +0 -0
  18. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.4.tar.gz +0 -0
  19. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.5-py3-none-any.whl +0 -0
  20. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.5.tar.gz +0 -0
  21. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.6-py3-none-any.whl +0 -0
  22. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.6.tar.gz +0 -0
  23. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.7-py3-none-any.whl +0 -0
  24. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.7.tar.gz +0 -0
  25. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.8-py3-none-any.whl +0 -0
  26. {reflexive-2.2.0 → reflexive-2.2.2}/dist_v1/reflexive-1.2.8.tar.gz +0 -0
  27. {reflexive-2.2.0 → reflexive-2.2.2}/src/reflexive/__init__.py +0 -0
  28. {reflexive-2.2.0 → reflexive-2.2.2}/src/reflexive/display_functions.py +0 -0
  29. {reflexive-2.2.0 → reflexive-2.2.2}/src/reflexive/service.py +0 -0
  30. {reflexive-2.2.0 → reflexive-2.2.2}/tests/__init__.py +0 -0
  31. {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/synth_ref-datascientist.txt +0 -0
  32. {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/synth_ref-footballer.txt +0 -0
  33. {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/synth_ref-surgeon.txt +0 -0
  34. {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/teach_ref-3.txt +0 -0
  35. {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/teach_ref-4.txt +0 -0
  36. {reflexive-2.2.0 → reflexive-2.2.2}/tests/data_source/teach_ref-5.txt +0 -0
  37. {reflexive-2.2.0 → reflexive-2.2.2}/tests/test_reflexive-v2.ipynb +0 -0
  38. {reflexive-2.2.0 → reflexive-2.2.2}/tests/test_reflexive.ipynb +0 -0
  39. {reflexive-2.2.0 → reflexive-2.2.2}/tests/test_reflexive_extras.py +0 -0
@@ -140,6 +140,8 @@ def _count_res_interactions(re_sequence:list[str]) -> dict[tuple,int]:
 
 def _calc_res_weights(interactions:dict[tuple,int])->dict[tuple,float]:
     max_count = max(interactions.values())
+    if max_count == 0:
+        max_count = 0.0001
     weights = dict()
     for edge,count in interactions.items():
         weights[edge] = round(count/(max_count),2)
@@ -0,0 +1,463 @@
+from typing import Callable
+from pandas import (DataFrame,Series)
+from datetime import datetime
+from zoneinfo import ZoneInfo
+from time import sleep
+from functools import partial
+import tarfile
+import json
+import os
+from numpy import (
+    asarray,
+    dot
+)
+from numpy.linalg import norm
+from itertools import chain
+from graph_tool.all import (
+    Graph,
+    similarity,
+    adjacency)
+
+
+### PIPELINE FUNCTIONS
+
+# Clean text using supplied function and calculate text length
+# Used by RES_analyser.preprocess_text()
+
+def _clean_text(df:DataFrame,text_cleaner:Callable[[str],str])->DataFrame:
+    return (df
+            .assign(text=lambda d: d.text.apply(text_cleaner))
+            .assign(text_length=lambda d: [len(row.text) for row in d.itertuples()]))
+
+# Upload text using supplied uploader function
+# Used by RES_analyser.upload_text_to_s3()
+
+def _upload_text(df:DataFrame,uploader:Callable,res_analyser)->DataFrame:
+    upload = partial(uploader,aws_service=res_analyser.aws_service,config=res_analyser.config,logger=res_analyser.logger)
+    return df.assign(uploaded=lambda d: [upload(str(row.Index),row.text) for row in d.itertuples()])
+
+# Initiate comprehend analysis on S3 text documents
+
+def _analyse_text(analyser:Callable,res_analyser)->dict:
+    analyse = partial(analyser,
+                      aws_service=res_analyser.aws_service,
+                      config=res_analyser.config,
+                      logger = res_analyser.logger)
+    job_status = analyse()
+    return job_status['EntitiesDetectionJobProperties']
+
+# Add comprehend analysis results to dataframe
+def _analysis_to_dataframe(df:DataFrame,results:list)->DataFrame:
+    analysis = _extract_analysis(results=results)
+    df['res_results']=Series(analysis)
+    return df
+
+# Offsets to dataframe
+def _add_offsets(df:DataFrame,offset_cleaner,orphan_joiner)->DataFrame:
+    return (df
+            .assign(offsets=lambda d: d.res_results.apply(offset_cleaner))
+            .assign(offsets_clean=lambda d: [orphan_joiner(row.text,row.offsets) for row in d.itertuples()]))
+
+def _offset_cleaner(res_results):
+    offsets = _collect_offsets(res_results)
+    tuples = _offset_tuples(offsets)
+    return _sorted_offsets(tuples)
+
+def _orphan_joiner(text,offsets):
+    otuples = _orphaned_I(text,offsets)
+    offs = _orphaned_word(text,otuples)
+    return _regroup(offs)
+
+def _collect_offsets(rrs):
+    new_rrs = {}
+    for rr in rrs:
+        if rr['Score']>0.6:
+            ent_type = rr['Type']
+            if ent_type in ['VR','ER']:
+                label = "NR"
+            elif ent_type in ['EP','EV']:
+                label = "EP"
+            elif ent_type in ['CN','AF']:
+                label = "AF"
+            else:
+                label = ent_type
+            new_rrs.setdefault(label,[]).append((rr['BeginOffset'],rr['EndOffset']))
+    return new_rrs
+
+
+
+#####
+
+def _add_res_sequence(df):
+    temp_df = df.copy()
+    temp_df['res_sequence'] = temp_df.offsets_clean.apply(_get_res_sequence)
+    return temp_df
+
+def _add_res_interactions(df):
+    temp_df = df.copy()
+    temp_df['res_interactions'] = temp_df.res_sequence.apply(_count_res_interactions)
+    return temp_df
+
+def _add_res_weights(df):
+    temp_df = df.copy()
+    temp_df['res_weights'] = temp_df.res_interactions.apply(_calc_res_weights)
+    return temp_df
+
+def _add_semantic_weights(df,ranking_factors={}):
+    temp_df = df.copy()
+    ranks = partial(_calc_semantic_weights,factors=ranking_factors)
+    temp_df['semantic_weights'] = temp_df.res_weights.apply(ranks)
+    return temp_df
+
+def _add_res_adj_matrix(df):
+    temp_df = df.copy()
+    temp_df['res_adj_matrix'] = temp_df.semantic_weights.apply(_create_adj_matrix)
+    return temp_df
+
+def _get_res_sequence(offsets_clean):
+    return [label for label in offsets_clean.values()]
+
+
+def _empty_res_interactions() -> dict[tuple,int]:
+    RE_types = ['RR','NR','AR','AF','EP']
+    RE_interactions:dict[tuple,int] = dict()
+    for t1 in RE_types:
+        for t2 in RE_types:
+            entry = tuple(sorted((t1,t2)))
+            if entry not in RE_interactions.keys():
+                RE_interactions[entry] = 0
+    return RE_interactions
+
+def _count_res_interactions(re_sequence:list[str]) -> dict[tuple,int]:
+    re_ints = _empty_res_interactions()
+    limit = len(re_sequence)-1
+    for i,s in enumerate(re_sequence):
+        if i < limit:
+            rei = tuple(sorted((s,re_sequence[i+1])))
+            #print(i,rei)
+            re_ints[rei] += 1
+    return re_ints
+
+def _calc_res_weights(interactions:dict[tuple,int])->dict[tuple,float]:
+    max_count = max(interactions.values())
+    weights = dict()
+    for edge,count in interactions.items():
+        if max_count != 0:
+            weights[edge] = round(count/(max_count),2)
+        else:
+            weights[edge] = 0
+    return weights
+
+
+
+def _calc_semantic_weights(weights:dict[tuple,float], factors:dict[tuple,float]={})->dict[tuple,float]:
+    if not factors:
+        return weights
+    else:
+        for edge,w in weights.items():
+            weights[edge] = factors[edge] * w
+        return weights
+
+
+def _create_adj_matrix(weights:dict[tuple,float])->list[list[float]]:
+    re_types = ["RR","NR","AR","AF","EP"]
+    matrix = []
+    for r in re_types:
+        row = []
+        for c in re_types:
+            key = tuple(sorted((r,c)))
+            #print(key)
+            weight = weights.get(key,0)
+            row.append(weight)
+        matrix.append(row)
+    return matrix
+
+### SIMILARITY ANALYSIS
+
+def _jaccard_similarity(g1:Graph,g2:Graph)->float:
+    return similarity(g1, g2,
+                      eweight1=g1.ep['e_weights'], eweight2=g2.ep['e_weights'],
+                      #label1=g1.vp['v_labels'], label2=g2.vp['v_labels'],
+                      norm=True, p=1.0, distance=False, asymmetric=False)
+
+# def _cosine_similarity(m1,m2)->float:
+#     v1 = list(chain.from_iterable(m1))
+#     v2 = list(chain.from_iterable(m2))
+#     return np.dot(v1,v2)/(norm(v1)*norm(v2))
+
+def _vectorise_adj(matrix):
+    return list(chain.from_iterable((matrix[i][j] for j in range(i,5)) for i in range(5)))
+
+def zero_pos(am):
+    nm = []
+    for r,row in enumerate(am):
+        nr = []
+        for c,weight in enumerate(row):
+            if r < 3 or c < 3:
+                nr.append(weight)
+            else:
+                nr.append(0)
+        nm.append(nr)
+    return nm
+
+def zero_mod(am):
+    nm = []
+    for r,row in enumerate(am):
+        nr = []
+        for c,weight in enumerate(row):
+            if r >= 3 or c >= 3:
+                nr.append(weight)
+            else:
+                nr.append(0)
+        nm.append(nr)
+    return nm
+
+def _adj_vector(adj_matrix):
+    return _vectorise_adj(adj_matrix)
+
+def _positional_vector(adj_matrix):
+    return _vectorise_adj(zero_pos(adj_matrix))
+
+def _modal_vector(adj_matrix):
+    return _vectorise_adj(zero_mod(adj_matrix))
+
+def _cosine(A,B):
+    return dot(A,B)/(norm(A)*norm(B))
+
+def _am4idx(df,idx:int):
+    return df.res_adj_matrix[idx]
+
+def _similarity(m1,m2,vector_func):
+    return float(_cosine(vector_func(m1),vector_func(m2)))
+
+def _res_similarity(df,idx1,idx2,vector_func):
+    return _similarity(_am4idx(df,idx1),_am4idx(df,idx2),vector_func)
+
+_interaction_similarity = partial(_res_similarity,vector_func=_adj_vector)
+_positional_similarity = partial(_res_similarity,vector_func=_positional_vector)
+_modal_similarity = partial(_res_similarity,vector_func=_modal_vector)
+
+def _similarities(df,idx1,idx2):
+    return {"interaction": _interaction_similarity(df,idx1,idx2),
+            "positional": _positional_similarity(df,idx1,idx2),
+            "modal":_modal_similarity(df,idx1,idx2)}
+
+
+
+### PIPELINE SUPPORT FUNCTIONS
+
+# Clean return characters and strip whitespace
+# Used by preprocess_text()
+def _whitespace_cleaner(text:str)->str:
+    return text.strip().replace('\r\n','\n')
+
+# Upload text to S3
+def _s3_text_uploader(idx:str,text:str,aws_service,config:dict,logger)->bool:
+    try:
+        response = aws_service.s3_client.put_object(Body=text,
+                                                    Bucket=aws_service.aws_params["s3_bucket_name"],
+                                                    Key=f"{config["s3_source_dir"]}/{idx}.txt")
+    except Exception as e:
+        logger.error("There was an error when uploading text to s3 %s",repr(e))
+        return False
+    else:
+        if response['ResponseMetadata']['HTTPStatusCode']==200:
+            logger.debug(f"File {idx} uploaded successfully")
+            return True
+        else:
+            logger.error(f"File {idx} did not upload successfully to S3: {response}")
+            return False
+
+# Analyse text with comprehend custom entity recognizer
+def _comprehend_cer_analyser(aws_service,config,logger)->dict:
+    try:
+        response = aws_service.comprehend_client.start_entities_detection_job(
+            InputDataConfig={
+                'S3Uri': _comprehend_input_uri(aws_service.aws_params["s3_bucket_name"],
+                                               config["s3_source_dir"]),
+                'InputFormat': 'ONE_DOC_PER_FILE'
+            },
+            OutputDataConfig={
+                'S3Uri': _comprehend_output_uri(aws_service.aws_params["s3_bucket_name"],
+                                                config["s3_target_dir"])
+            },
+            DataAccessRoleArn=_comprehend_access_role_arn(aws_service.aws_params["comprehend_service_role_name"],
+                                                          aws_service.aws_account_number),
+            JobName=f"res_analysis_{_date_string()}",
+            EntityRecognizerArn=_comprehend_cer_arn(aws_service.aws_session.region_name,
+                                                    aws_service.aws_account_number,
+                                                    aws_service.aws_params["reflexive_entity_name"],
+                                                    aws_service.aws_params["reflexive_entity_version"]),
+            LanguageCode='en'
+        )
+    except Exception as e:
+        logger.error("There was an error when analysing text with comprehend %s",repr(e))
+        return {"ERROR":repr(e)}
+    else:
+        return aws_service.comprehend_client.describe_entities_detection_job(JobId=response['JobId'])
+
+# Monitor a CER Analysis Job
+def _cer_job_progress(status:dict,aws_service,tz,output)->dict:
+    # Submitted
+    job_name = status['JobName']
+    job_id = status['JobId']
+    submit_time = status['SubmitTime'].astimezone(ZoneInfo(tz))
+    output(f"RES_ANALYSIS JOB {job_name} ({job_id}) submitted at: {submit_time}")
+
+    # In progress
+    while status['JobStatus'] in ["SUBMITTED","IN_PROGRESS"]:
+        time = datetime.now().astimezone(ZoneInfo(tz))
+        job_status = status['JobStatus']
+        output(f"{time} [{job_id}] {job_name} status: {job_status}")
+        sleep(10)
+        properties = aws_service.comprehend_client.describe_entities_detection_job(JobId=job_id)
+        status=properties['EntitiesDetectionJobProperties']
+
+    # Finished (complete or error)
+    job_status = status['JobStatus']
+    end_time = status['EndTime'].astimezone(ZoneInfo(tz))
+    time_taken = end_time - submit_time
+    output_url = status['OutputDataConfig']['S3Uri']
+    output(f"RES_ANALYSIS JOB {job_name} ({job_id}) finished with status: {job_status} at: {end_time}")
+    output(f"Analysis time: {str(time_taken)}")
+    output(f"Results available at: {output_url}")
+    return status
+
+
+# Download from S3 to local
+def _download_from_s3(res_analyser,status)->str:
+    local_file_path = f"{res_analyser.config['local_data_dir']}/{status['JobName']}.tar.gz"
+    bucket_name = res_analyser.aws_service.aws_params["s3_bucket_name"]
+    try:
+        output_key = status['OutputDataConfig']['S3Uri'].split(bucket_name)[1]
+        with open(f"{local_file_path}",'wb') as output_data:
+            res_analyser.aws_service.s3_client.download_fileobj(bucket_name,output_key[1:],output_data)
+    except Exception as e:
+        res_analyser.logger.error("An error occured when downloading results from S3: %s",repr(e))
+        local_file_path = None
+    return local_file_path
+
+# Extract results from tar.gz file and save as json
+def _extract_save_results(res_analyser,local_file_path)->list:
+    # extract the tar archive
+    files = list()
+    with tarfile.open(f"{local_file_path}", "r:gz") as tf:
+        for member in tf.getmembers():
+            f = tf.extractfile(member)
+            if f is not None:
+                content = f.read()
+                files.append(content)
+    # extract results and save and return
+    raw_results = files[0].decode("utf-8").split('\n')
+    raw_results.pop() # pop last item off as empty entry due to final \n
+    #
+    #json_results = json.dumps(raw_results)
+    #res_analyser.logger.info("raw_results>> ",raw_results)
+    results = [json.loads(result) for result in raw_results]
+    with open(f"{local_file_path[:-7]}.json","w") as fp:
+        json.dump(results,fp)
+    return results
+
+# Get a dict of (index,entities) from cer analysis results
+def _extract_analysis(results):
+    file_ents = ((result["File"],result["Entities"]) for result in results)
+    idx_ents = ((int(file.split('_')[-1].split('.')[0]),ents) for file,ents in file_ents)
+    return dict(idx_ents)
+
+
+
+# Comprehend access role arn
+def _comprehend_access_role_arn(comprehend_service_role_name,aws_account_number):
+    return f"arn:aws:iam::{aws_account_number}:role/service-role/{comprehend_service_role_name}"
+
+# Comprehend input url
+def _comprehend_input_uri(s3_bucket_name,s3_files,prefix=""):
+    return f"s3://{s3_bucket_name}/{s3_files}/{prefix}"
+
+# Comprehend output url
+def _comprehend_output_uri(s3_bucket_name,s3_results):
+    return f"s3://{s3_bucket_name}/{s3_results}/"
+
+# Comprehend entity recognizer arn
+def _comprehend_cer_arn(region,account_number,cer_name,cer_version):
+    return f"arn:aws:comprehend:{region}:{account_number}:entity-recognizer/{cer_name}/version/{cer_version}"
+
+## Offset functions
+
+def _offset_tuples(offsets):
+    for k,vs in offsets.items():
+        for b,e in vs:
+            yield (b,(e,k))
+
+def _sorted_offsets(offsets):
+    return sorted(offsets)
+
+def _orphaned_I(text,offsets):
+    for b,(e,t) in offsets:
+        if 'I' in text[(b-2):(b-1)].strip():
+            #print(text[(b-2):e],t)
+            yield (b-2, (e,t))
+        else:
+            yield (b, (e,t))
+
+def _orphaned_word(text,offsets):
+    coffs = {}
+    p = (0,(-2,''))
+    for b,(e,t) in offsets:
+        #print(p[1][0])
+        if (p[1][0]+3)>=b:
+            #print("Prev:",p,f"|{df.text[0][p[0]:p[1][0]]}|")
+            #print("<--->",f"|{df.text[0][(p[1][0]+1):(b-1)]}|")
+            #print("This:",b,e,t,f"|{df.text[0][b:e]}|")
+            #print()
+            if len((text[p[0]:p[1][0]]).split(' '))<2:
+                #print(f"Removing {p[0]},{p[1][0]},{p[1][1]}")
+                coffs.pop(p[0])
+                #print(f"Replacing {b},{e},{t} with {p[0]},{e},{t}")
+                coffs[p[0]] = (e,t)
+                p=(p[0],(e,t))
+            else:
+                coffs[b] = (e,t)
+                p = (b,(e,t))
+        else:
+            coffs[b] = (e,t)
+            p = (b,(e,t))
+    return coffs.items()

+def _regroup(offsets):
+    grouped = (((b,e),k) for (b,(e,k)) in offsets)
+    return dict(grouped)
+
+
+
+
+### UTILITY FUNCTIONS
+
+# Create a reverse date string YYYYmmdd based on current local time
+def _date_string()->str:
+    return datetime.today().strftime('%Y%m%d')
+
+# Get the current local working dir
+def _local_path(dir)->str:
+    return os.getcwd()+dir
+
+# Check if local directory exists
+def _dir_exists_local(dir:str)->bool:
+    return os.path.exists(_local_path(dir))
+
+# Return function to create directory
+def _create_dir(dir)->str:
+    os.makedirs(_local_path(dir))
+    return _local_path(dir)
+
+# Create local directory if required
+def _create_local_dir(dir,logger)->str:
+    if not _dir_exists_local(dir):
+        try:
+            path = _create_dir(dir)
+        except Exception as e:
+            logger.error("There was an error creating the local directory: %s",repr(e))
+        finally:
+            return path
+    else:
+        return _local_path(dir)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: reflexive
-Version: 2.2.0
+Version: 2.2.2
 Summary: Supports AWS Reflexive Expressions Systems (RES) Analysis
 Project-URL: Repository, https://github.com/nlytx/reflexive.git
 Author-email: Andrew Gibson <andrew@nlytx.io>
@@ -1,6 +1,6 @@
 [project]
 name = "reflexive"
-version = "2.2.0"
+version = "2.2.2"
 description = "Supports AWS Reflexive Expressions Systems (RES) Analysis"
 authors = [{name = "Andrew Gibson", email = "andrew@nlytx.io"}]
 readme = "README.md"
@@ -0,0 +1,463 @@
+from typing import Callable
+from pandas import (DataFrame,Series)
+from datetime import datetime
+from zoneinfo import ZoneInfo
+from time import sleep
+from functools import partial
+import tarfile
+import json
+import os
+from numpy import (
+    asarray,
+    dot
+)
+from numpy.linalg import norm
+from itertools import chain
+from graph_tool.all import (
+    Graph,
+    similarity,
+    adjacency)
+
+
+### PIPELINE FUNCTIONS
+
+# Clean text using supplied function and calculate text length
+# Used by RES_analyser.preprocess_text()
+
+def _clean_text(df:DataFrame,text_cleaner:Callable[[str],str])->DataFrame:
+    return (df
+            .assign(text=lambda d: d.text.apply(text_cleaner))
+            .assign(text_length=lambda d: [len(row.text) for row in d.itertuples()]))
+
+# Upload text using supplied uploader function
+# Used by RES_analyser.upload_text_to_s3()
+
+def _upload_text(df:DataFrame,uploader:Callable,res_analyser)->DataFrame:
+    upload = partial(uploader,aws_service=res_analyser.aws_service,config=res_analyser.config,logger=res_analyser.logger)
+    return df.assign(uploaded=lambda d: [upload(str(row.Index),row.text) for row in d.itertuples()])
+
+# Initiate comprehend analysis on S3 text documents
+
+def _analyse_text(analyser:Callable,res_analyser)->dict:
+    analyse = partial(analyser,
+                      aws_service=res_analyser.aws_service,
+                      config=res_analyser.config,
+                      logger = res_analyser.logger)
+    job_status = analyse()
+    return job_status['EntitiesDetectionJobProperties']
+
+# Add comprehend analysis results to dataframe
+def _analysis_to_dataframe(df:DataFrame,results:list)->DataFrame:
+    analysis = _extract_analysis(results=results)
+    df['res_results']=Series(analysis)
+    return df
+
+# Offsets to dataframe
+def _add_offsets(df:DataFrame,offset_cleaner,orphan_joiner)->DataFrame:
+    return (df
+            .assign(offsets=lambda d: d.res_results.apply(offset_cleaner))
+            .assign(offsets_clean=lambda d: [orphan_joiner(row.text,row.offsets) for row in d.itertuples()]))
+
+def _offset_cleaner(res_results):
+    offsets = _collect_offsets(res_results)
+    tuples = _offset_tuples(offsets)
+    return _sorted_offsets(tuples)
+
+def _orphan_joiner(text,offsets):
+    otuples = _orphaned_I(text,offsets)
+    offs = _orphaned_word(text,otuples)
+    return _regroup(offs)
+
+def _collect_offsets(rrs):
+    new_rrs = {}
+    for rr in rrs:
+        if rr['Score']>0.6:
+            ent_type = rr['Type']
+            if ent_type in ['VR','ER']:
+                label = "NR"
+            elif ent_type in ['EP','EV']:
+                label = "EP"
+            elif ent_type in ['CN','AF']:
+                label = "AF"
+            else:
+                label = ent_type
+            new_rrs.setdefault(label,[]).append((rr['BeginOffset'],rr['EndOffset']))
+    return new_rrs
+
+
+
+#####
+
+def _add_res_sequence(df):
+    temp_df = df.copy()
+    temp_df['res_sequence'] = temp_df.offsets_clean.apply(_get_res_sequence)
+    return temp_df
+
+def _add_res_interactions(df):
+    temp_df = df.copy()
+    temp_df['res_interactions'] = temp_df.res_sequence.apply(_count_res_interactions)
+    return temp_df
+
+def _add_res_weights(df):
+    temp_df = df.copy()
+    temp_df['res_weights'] = temp_df.res_interactions.apply(_calc_res_weights)
+    return temp_df
+
+def _add_semantic_weights(df,ranking_factors={}):
+    temp_df = df.copy()
+    ranks = partial(_calc_semantic_weights,factors=ranking_factors)
+    temp_df['semantic_weights'] = temp_df.res_weights.apply(ranks)
+    return temp_df
+
+def _add_res_adj_matrix(df):
+    temp_df = df.copy()
+    temp_df['res_adj_matrix'] = temp_df.semantic_weights.apply(_create_adj_matrix)
+    return temp_df
+
+def _get_res_sequence(offsets_clean):
+    return [label for label in offsets_clean.values()]
+
+
+def _empty_res_interactions() -> dict[tuple,int]:
+    RE_types = ['RR','NR','AR','AF','EP']
+    RE_interactions:dict[tuple,int] = dict()
+    for t1 in RE_types:
+        for t2 in RE_types:
+            entry = tuple(sorted((t1,t2)))
+            if entry not in RE_interactions.keys():
+                RE_interactions[entry] = 0
+    return RE_interactions
+
+def _count_res_interactions(re_sequence:list[str]) -> dict[tuple,int]:
+    re_ints = _empty_res_interactions()
+    limit = len(re_sequence)-1
+    for i,s in enumerate(re_sequence):
+        if i < limit:
+            rei = tuple(sorted((s,re_sequence[i+1])))
+            #print(i,rei)
+            re_ints[rei] += 1
+    return re_ints
+
+def _calc_res_weights(interactions:dict[tuple,int])->dict[tuple,float]:
+    max_count = max(interactions.values())
+    weights = dict()
+    for edge,count in interactions.items():
+        if max_count != 0:
+            weights[edge] = round(count/(max_count),2)
+        else:
+            weights[edge] = 0
+    return weights
+
+
+
+def _calc_semantic_weights(weights:dict[tuple,float], factors:dict[tuple,float]={})->dict[tuple,float]:
+    if not factors:
+        return weights
+    else:
+        for edge,w in weights.items():
+            weights[edge] = factors[edge] * w
+        return weights
+
+
+def _create_adj_matrix(weights:dict[tuple,float])->list[list[float]]:
+    re_types = ["RR","NR","AR","AF","EP"]
+    matrix = []
+    for r in re_types:
+        row = []
+        for c in re_types:
+            key = tuple(sorted((r,c)))
+            #print(key)
+            weight = weights.get(key,0)
+            row.append(weight)
+        matrix.append(row)
+    return matrix
+
+### SIMILARITY ANALYSIS
+
+def _jaccard_similarity(g1:Graph,g2:Graph)->float:
+    return similarity(g1, g2,
+                      eweight1=g1.ep['e_weights'], eweight2=g2.ep['e_weights'],
+                      #label1=g1.vp['v_labels'], label2=g2.vp['v_labels'],
+                      norm=True, p=1.0, distance=False, asymmetric=False)
+
+# def _cosine_similarity(m1,m2)->float:
+#     v1 = list(chain.from_iterable(m1))
+#     v2 = list(chain.from_iterable(m2))
+#     return np.dot(v1,v2)/(norm(v1)*norm(v2))
+
+def _vectorise_adj(matrix):
+    return list(chain.from_iterable((matrix[i][j] for j in range(i,5)) for i in range(5)))
+
+def zero_pos(am):
+    nm = []
+    for r,row in enumerate(am):
+        nr = []
+        for c,weight in enumerate(row):
+            if r < 3 or c < 3:
+                nr.append(weight)
+            else:
+                nr.append(0)
+        nm.append(nr)
+    return nm
+
+def zero_mod(am):
+    nm = []
+    for r,row in enumerate(am):
+        nr = []
+        for c,weight in enumerate(row):
+            if r >= 3 or c >= 3:
+                nr.append(weight)
+            else:
+                nr.append(0)
+        nm.append(nr)
+    return nm
+
+def _adj_vector(adj_matrix):
+    return _vectorise_adj(adj_matrix)
+
+def _positional_vector(adj_matrix):
+    return _vectorise_adj(zero_pos(adj_matrix))
+
+def _modal_vector(adj_matrix):
+    return _vectorise_adj(zero_mod(adj_matrix))
+
+def _cosine(A,B):
+    return dot(A,B)/(norm(A)*norm(B))
+
+def _am4idx(df,idx:int):
+    return df.res_adj_matrix[idx]
+
+def _similarity(m1,m2,vector_func):
+    return float(_cosine(vector_func(m1),vector_func(m2)))
+
+def _res_similarity(df,idx1,idx2,vector_func):
+    return _similarity(_am4idx(df,idx1),_am4idx(df,idx2),vector_func)
+
+_interaction_similarity = partial(_res_similarity,vector_func=_adj_vector)
+_positional_similarity = partial(_res_similarity,vector_func=_positional_vector)
+_modal_similarity = partial(_res_similarity,vector_func=_modal_vector)
+
+def _similarities(df,idx1,idx2):
+    return {"interaction": _interaction_similarity(df,idx1,idx2),
+            "positional": _positional_similarity(df,idx1,idx2),
+            "modal":_modal_similarity(df,idx1,idx2)}
+
+
+
+### PIPELINE SUPPORT FUNCTIONS
+
+# Clean return characters and strip whitespace
+# Used by preprocess_text()
+def _whitespace_cleaner(text:str)->str:
+    return text.strip().replace('\r\n','\n')
+
+# Upload text to S3
+def _s3_text_uploader(idx:str,text:str,aws_service,config:dict,logger)->bool:
+    try:
+        response = aws_service.s3_client.put_object(Body=text,
+                                                    Bucket=aws_service.aws_params["s3_bucket_name"],
+                                                    Key=f"{config["s3_source_dir"]}/{idx}.txt")
+    except Exception as e:
+        logger.error("There was an error when uploading text to s3 %s",repr(e))
+        return False
+    else:
+        if response['ResponseMetadata']['HTTPStatusCode']==200:
+            logger.debug(f"File {idx} uploaded successfully")
+            return True
+        else:
+            logger.error(f"File {idx} did not upload successfully to S3: {response}")
+            return False
+
+# Analyse text with comprehend custom entity recognizer
+def _comprehend_cer_analyser(aws_service,config,logger)->dict:
+    try:
+        response = aws_service.comprehend_client.start_entities_detection_job(
+            InputDataConfig={
+                'S3Uri': _comprehend_input_uri(aws_service.aws_params["s3_bucket_name"],
+                                               config["s3_source_dir"]),
+                'InputFormat': 'ONE_DOC_PER_FILE'
+            },
+            OutputDataConfig={
+                'S3Uri': _comprehend_output_uri(aws_service.aws_params["s3_bucket_name"],
+                                                config["s3_target_dir"])
+            },
+            DataAccessRoleArn=_comprehend_access_role_arn(aws_service.aws_params["comprehend_service_role_name"],
+                                                          aws_service.aws_account_number),
+            JobName=f"res_analysis_{_date_string()}",
+            EntityRecognizerArn=_comprehend_cer_arn(aws_service.aws_session.region_name,
+                                                    aws_service.aws_account_number,
+                                                    aws_service.aws_params["reflexive_entity_name"],
+                                                    aws_service.aws_params["reflexive_entity_version"]),
+            LanguageCode='en'
+        )
+    except Exception as e:
+        logger.error("There was an error when analysing text with comprehend %s",repr(e))
+        return {"ERROR":repr(e)}
+    else:
+        return aws_service.comprehend_client.describe_entities_detection_job(JobId=response['JobId'])
+
+# Monitor a CER Analysis Job
+def _cer_job_progress(status:dict,aws_service,tz,output)->dict:
+    # Submitted
+    job_name = status['JobName']
+    job_id = status['JobId']
+    submit_time = status['SubmitTime'].astimezone(ZoneInfo(tz))
+    output(f"RES_ANALYSIS JOB {job_name} ({job_id}) submitted at: {submit_time}")
+
+    # In progress
+    while status['JobStatus'] in ["SUBMITTED","IN_PROGRESS"]:
+        time = datetime.now().astimezone(ZoneInfo(tz))
+        job_status = status['JobStatus']
+        output(f"{time} [{job_id}] {job_name} status: {job_status}")
+        sleep(10)
+        properties = aws_service.comprehend_client.describe_entities_detection_job(JobId=job_id)
+        status=properties['EntitiesDetectionJobProperties']
+
+    # Finished (complete or error)
+    job_status = status['JobStatus']
+    end_time = status['EndTime'].astimezone(ZoneInfo(tz))
+    time_taken = end_time - submit_time
+    output_url = status['OutputDataConfig']['S3Uri']
+    output(f"RES_ANALYSIS JOB {job_name} ({job_id}) finished with status: {job_status} at: {end_time}")
+    output(f"Analysis time: {str(time_taken)}")
+    output(f"Results available at: {output_url}")
+    return status
+
+
+# Download from S3 to local
+def _download_from_s3(res_analyser,status)->str:
+    local_file_path = f"{res_analyser.config['local_data_dir']}/{status['JobName']}.tar.gz"
+    bucket_name = res_analyser.aws_service.aws_params["s3_bucket_name"]
+    try:
+        output_key = status['OutputDataConfig']['S3Uri'].split(bucket_name)[1]
+        with open(f"{local_file_path}",'wb') as output_data:
+            res_analyser.aws_service.s3_client.download_fileobj(bucket_name,output_key[1:],output_data)
+    except Exception as e:
+        res_analyser.logger.error("An error occured when downloading results from S3: %s",repr(e))
+        local_file_path = None
+    return local_file_path
+
+# Extract results from tar.gz file and save as json
+def _extract_save_results(res_analyser,local_file_path)->list:
+    # extract the tar archive
+    files = list()
+    with tarfile.open(f"{local_file_path}", "r:gz") as tf:
+        for member in tf.getmembers():
+            f = tf.extractfile(member)
+            if f is not None:
+                content = f.read()
+                files.append(content)
+    # extract results and save and return
+    raw_results = files[0].decode("utf-8").split('\n')
+    raw_results.pop() # pop last item off as empty entry due to final \n
+    #
+    #json_results = json.dumps(raw_results)
+    #res_analyser.logger.info("raw_results>> ",raw_results)
+    results = [json.loads(result) for result in raw_results]
+    with open(f"{local_file_path[:-7]}.json","w") as fp:
+        json.dump(results,fp)
+    return results
+
+# Get a dict of (index,entities) from cer analysis results
+def _extract_analysis(results):
+    file_ents = ((result["File"],result["Entities"]) for result in results)
+    idx_ents = ((int(file.split('_')[-1].split('.')[0]),ents) for file,ents in file_ents)
+    return dict(idx_ents)
+
+
+
+# Comprehend access role arn
+def _comprehend_access_role_arn(comprehend_service_role_name,aws_account_number):
+    return f"arn:aws:iam::{aws_account_number}:role/service-role/{comprehend_service_role_name}"
+
+# Comprehend input url
+def _comprehend_input_uri(s3_bucket_name,s3_files,prefix=""):
+    return f"s3://{s3_bucket_name}/{s3_files}/{prefix}"
+
+# Comprehend output url
+def _comprehend_output_uri(s3_bucket_name,s3_results):
+    return f"s3://{s3_bucket_name}/{s3_results}/"
+
+# Comprehend entity recognizer arn
+def _comprehend_cer_arn(region,account_number,cer_name,cer_version):
+    return f"arn:aws:comprehend:{region}:{account_number}:entity-recognizer/{cer_name}/version/{cer_version}"
+
+## Offset functions
+
+def _offset_tuples(offsets):
+    for k,vs in offsets.items():
+        for b,e in vs:
+            yield (b,(e,k))
+
+def _sorted_offsets(offsets):
+    return sorted(offsets)
+
+def _orphaned_I(text,offsets):
+    for b,(e,t) in offsets:
+        if 'I' in text[(b-2):(b-1)].strip():
+            #print(text[(b-2):e],t)
+            yield (b-2, (e,t))
+        else:
+            yield (b, (e,t))
+
+def _orphaned_word(text,offsets):
+    coffs = {}
+    p = (0,(-2,''))
+    for b,(e,t) in offsets:
+        #print(p[1][0])
+        if (p[1][0]+3)>=b:
+            #print("Prev:",p,f"|{df.text[0][p[0]:p[1][0]]}|")
+            #print("<--->",f"|{df.text[0][(p[1][0]+1):(b-1)]}|")
+            #print("This:",b,e,t,f"|{df.text[0][b:e]}|")
+            #print()
+            if len((text[p[0]:p[1][0]]).split(' '))<2:
+                #print(f"Removing {p[0]},{p[1][0]},{p[1][1]}")
+                coffs.pop(p[0])
+                #print(f"Replacing {b},{e},{t} with {p[0]},{e},{t}")
+                coffs[p[0]] = (e,t)
+                p=(p[0],(e,t))
+            else:
+                coffs[b] = (e,t)
+                p = (b,(e,t))
+        else:
+            coffs[b] = (e,t)
+            p = (b,(e,t))
+    return coffs.items()

+def _regroup(offsets):
+    grouped = (((b,e),k) for (b,(e,k)) in offsets)
+    return dict(grouped)
+
+
+
+
+### UTILITY FUNCTIONS
+
+# Create a reverse date string YYYYmmdd based on current local time
+def _date_string()->str:
+    return datetime.today().strftime('%Y%m%d')
+
+# Get the current local working dir
+def _local_path(dir)->str:
+    return os.getcwd()+dir
+
+# Check if local directory exists
+def _dir_exists_local(dir:str)->bool:
+    return os.path.exists(_local_path(dir))
+
+# Return function to create directory
+def _create_dir(dir)->str:
+    os.makedirs(_local_path(dir))
+    return _local_path(dir)
+
+# Create local directory if required
+def _create_local_dir(dir,logger)->str:
+    if not _dir_exists_local(dir):
+        try:
+            path = _create_dir(dir)
+        except Exception as e:
+            logger.error("There was an error creating the local directory: %s",repr(e))
+        finally:
+            return path
+    else:
+        return _local_path(dir)
@@ -152,9 +152,10 @@ class RES_analyser:
         file_names = []
         texts = []
         for file_name in sorted(os.listdir(file_path)):
-            file_names.append(file_name.split('.')[0])
-            with open(os.path.join(file_path,file_name),'r') as fp:
-                texts.append(fp.read())
+            if not file_name.startswith('.'):
+                file_names.append(file_name.split('.')[0])
+                with open(os.path.join(file_path,file_name),'r') as fp:
+                    texts.append(fp.read())
         return {"doc_name":file_names,"text":texts}
 
 