reflexive 0.1.9__py3-none-any.whl → 1.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reflexive/__init__.py +5 -9
- reflexive/analyse.py +431 -0
- reflexive/cfg.py +118 -0
- reflexive/session.py +265 -0
- reflexive/util.py +125 -0
- reflexive/{visual/display.py → visualise.py} +22 -22
- {reflexive-0.1.9.dist-info → reflexive-1.0.13.dist-info}/METADATA +1 -1
- reflexive-1.0.13.dist-info/RECORD +12 -0
- reflexive/analyse/__init__.py +0 -0
- reflexive/analyse/aws_nlp.py +0 -196
- reflexive/analyse/general.py +0 -128
- reflexive/analyse/reflexive_expressions.py +0 -124
- reflexive/aws_connect/__init__.py +0 -0
- reflexive/aws_connect/comprehend.py +0 -205
- reflexive/aws_connect/s3.py +0 -89
- reflexive/common/__init__.py +0 -0
- reflexive/common/local.py +0 -48
- reflexive/common/parameters.py +0 -77
- reflexive/common/util.py +0 -108
- reflexive/visual/__init__.py +0 -0
- reflexive-0.1.9.dist-info/RECORD +0 -20
- {reflexive-0.1.9.dist-info → reflexive-1.0.13.dist-info}/LICENSE +0 -0
- {reflexive-0.1.9.dist-info → reflexive-1.0.13.dist-info}/LICENSE.txt +0 -0
- {reflexive-0.1.9.dist-info → reflexive-1.0.13.dist-info}/WHEEL +0 -0
- {reflexive-0.1.9.dist-info → reflexive-1.0.13.dist-info}/top_level.txt +0 -0
reflexive/analyse/general.py
DELETED
@@ -1,128 +0,0 @@
-
-import logging,coloredlogs
-import pandas as pd
-import json
-
-from reflexive.common.parameters import Parameters
-from reflexive.common.util import Util
-
-coloredlogs.install(level='INFO')
-
-class General:
-
-    logger = logging.getLogger(__name__)
-
-    def __init__(self,parameters:Parameters):
-        #print(parameters)
-        self.__parameters = parameters.all_parameters()
-        self.logger.debug(f"Parameters: {self.__parameters}")
-
-
-    def general_analytics(self,df):
-        util = Util()
-        custom_df = df.copy()
-        custom_df["text_length"] = df.text.apply(lambda x: len(x))
-        if (len(custom_df)>1):
-            custom_df["text_scaled"] = util.scale_min_max(custom_df[['text_length']])
-        else:
-            custom_df["text_scaled"] = 1
-        return custom_df
-
-
-    def remove_IQR_outliers(self,df):
-        tempdf = df.copy()
-        # Calculate text length
-        tempdf["text_length"] = tempdf.text.apply(lambda t: len(t))
-        fence = Util.outlier_fence(tempdf.text_length)
-        print(fence)
-        # Check change with removed outliers
-        checkdf = tempdf[tempdf.text_length<fence['UPPER']]
-        checkdf.reset_index(drop=True,inplace=True)
-        print("Original:",len(tempdf))
-        print(tempdf.describe())
-        print()
-        print("Outliers:",len(tempdf)-len(checkdf))
-        print()
-        print("No outliers:",len(checkdf))
-        print(checkdf.describe())
-        return checkdf
-
-    # Parse text for domain terms
-    def parse_domain_terms(self,text,domain_terms):
-        matched_terms = {}
-        for dtk,dtv in domain_terms.items():
-            matched_terms[dtk] = []
-            for term in dtv:
-                if term[0]=='_': #acronym - treat as whole word
-                    regex = r"\b{}\b".format(term[1:])
-                    matches = re.findall(regex,str.lower(text))
-                    if len(matches)>0:
-                        matched_terms[dtk].append((term[1:],len(matches)))
-                else:
-                    count = str.lower(text).count(term)
-                    if count > 0:
-                        matched_terms[dtk].append((term,count))
-        return matched_terms
-
-
-    def get_top_ngrams(self,text_series,min_val=3):
-        ngrams = {}
-        for text in text_series:
-            self.__ngrams345(text,ngrams)
-        #print("Generated 3,4,5 ngrams:", len(ngrams))
-        f_ngrams = self.filter_dict_by_value(ngrams,min_val)
-        return self.sort_dict_by_value(f_ngrams)
-
-    def get_top_ngrams_for_text(self,text,top_ngrams):
-        ngrams = self.__ngrams345(text,{})
-        return {key: ngrams[key] for key in top_ngrams.keys() if key in ngrams}
-
-    def ngram_counts(self,ref_top_ngrams):
-        return sum(ref_top_ngrams.values())
-
-    # Given text and number of terms, create ngrams from the text
-    def __make_ngrams(self,text, n=1):
-        # Replace all none alphanumeric characters with spaces
-        s = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
-
-        tokens = [token for token in s.split(" ") if token != ""]
-        ngrams = zip(*[tokens[i:] for i in range(n)])
-        return [" ".join(ngram) for ngram in ngrams]
-
-    # Generate 3,4,5 -grams
-    def __ngrams345(self,text,ngrams):
-        ngrams3 = self.__make_ngrams(text,3)
-        for n in ngrams3:
-            ngrams[n] = ngrams.get(n,0)+1
-        ngrams4 = self.__make_ngrams(text,4)
-        for n in ngrams4:
-            ngrams[n] = ngrams.get(n,0)+1
-        ngrams5 = self.__make_ngrams(text,5)
-        for n in ngrams5:
-            ngrams[n] = ngrams.get(n,0)+1
-        return ngrams
-
-
-    # Count domain terms
-    def count_domain_terms(self,terms):
-        counts = {}
-        for k,v in terms.items():
-            for term in v:
-                counts[k] = counts.setdefault(k,0) + term[1]
-        return counts
-
-
-    # Ratio between action POS and object POS
-    def action_object_ratio(self,pos_ratios,action_pos = ['VERB'],object_pos = ['NOUN','PROPN']):
-        ap = [s[1] for s in pos_ratios if s[0] in action_pos]
-        if ap:
-            aps = sum(ap)
-        else:
-            aps = 0
-        op = [s[1] for s in pos_ratios if s[0] in object_pos]
-        if op:
-            ops = sum(op)
-        else:
-            ops = 1 #avoid divide zero error - only happens with aps of 1
-        #print("aps",aps,"ops",ops)
-        return aps/ops
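For orientation, a minimal usage sketch of the removed General helpers as they appeared in 0.1.9. The DataFrame contents and the "dev" term list are invented for illustration, and reflexive==0.1.9 is assumed to be installed; this is not part of the package diff.

# Illustrative only -- not part of the package diff. Assumes reflexive==0.1.9.
import pandas as pd

from reflexive.common.parameters import Parameters
from reflexive.analyse.general import General

# Invented sample data: the old API expected a DataFrame with a `text` column.
df = pd.DataFrame({"text": ["I think I learned a lot this week.",
                            "We built the api and tested it thoroughly."]})

params = Parameters(profile="default")   # logs an error and continues without credentials if no AWS profile exists
general = General(params)

# Adds text_length and text_scaled columns (min-max scaled only when len(df) > 1).
analytics_df = general.general_analytics(df)

# Counts domain terms; a leading "_" marked an acronym to be matched as a whole word.
# Note: parse_domain_terms used `re`, which general.py never imported, so
# "_"-prefixed terms raised NameError in 0.1.9 as published.
terms = general.parse_domain_terms(df.text[1], {"dev": ["api", "test"]})
print(analytics_df[["text", "text_length"]])
print(terms)   # e.g. {'dev': [('api', 1), ('test', 1)]}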
reflexive/analyse/reflexive_expressions.py
DELETED
@@ -1,124 +0,0 @@
-#
-
-import logging,coloredlogs
-import pandas as pd
-import json
-
-from reflexive.common.parameters import Parameters
-from reflexive.common.local import Local
-from reflexive.common.util import Util
-from reflexive.aws_connect.s3 import S3
-from reflexive.aws_connect.comprehend import Comprehend
-
-
-coloredlogs.install(level='INFO')
-
-class ReflexiveExpressions:
-
-    logger = logging.getLogger(__name__)
-
-    def __init__(self,parameters:Parameters,aws_s3:S3,local:Local,comprehend:Comprehend):
-        #print(parameters)
-        self.__params = parameters
-        self.__parameters = parameters.all_parameters()
-        self.logger.debug(f"Parameters: {self.__parameters}")
-        self.prefix = self.__parameters['prefix']
-        self.postfix = self.__parameters['postfix']
-        self.local_path = self.__parameters['local_path']
-        self.__s3 = aws_s3
-        self.__local = local
-        self.__comprehend = comprehend
-
-
-    ######## REFLEXIVE EXPRESSION ANALYSIS FUNCTIONS
-
-    def analyse_reflexive_expressions(self,df): #,s3_bucket_name,access_role_arn,entity_recogniser_arn):
-        #self.__bucket_name = s3_bucket_name
-        text = df.text.replace('\r\n','\n') # Comprehend treats \r\n as one character
-        # Upload reflections to S3 for analysis
-        self.__s3.upload_docs(text)
-
-        # Save a copy of reflections locally for offline analysis
-        self.__local.save_docs(text)
-
-        # Submit the job
-        return self.__comprehend.submit_custom_entity_job("reflexive_expressions_analysis") #submitReflexiveExpressionsJob(access_role_arn, entity_recogniser_arn)
-
-    def check_job_status(self):
-        return self.__comprehend.check_job_status()
-
-    def get_job_details(self):
-        return self.__comprehend.get_job_details()
-
-    def download_and_extract(self):
-        local_output_dir = f"{self.local_path}{self.prefix}output{self.postfix}"
-        job_details = self.get_job_details()
-        s3Uri = job_details['OutputDataConfig']['S3Uri']
-        return self.__s3.results_download_save_extract(s3Uri,local_output_dir)
-
-    def extractAnalysisFromResults(self,results):
-        analysis_output = dict()
-        for result in results:
-            j = json.loads(result)
-            #print(j)
-            idx = j["File"].split('_')[-1].split('.')[0]
-            analysis_output[int(idx)] = j["Entities"]
-        return analysis_output
-
-    def add_to_dataframe(self,df,results):
-        # Extract analysis from raw results
-        analysis_output = self.extractAnalysisFromResults(results)
-        # Add results to dataframe
-        results_df = df.copy()
-        results_df['reflexiveResults'] = pd.Series(analysis_output)
-        return results_df
-
-    def reflexive_analytics(self,df):
-        util = Util()
-        custom_df = df.copy()
-        # custom_df["text_length"] = df.text.apply(lambda x: len(x))
-        # if (len(custom_df)>1):
-        #     custom_df["text_scaled"] = util.scale_min_max(custom_df[['text_length']])
-        # else:
-        #     custom_df["text_scaled"] = 1
-        custom_df["reflexive_results"] = df.reflexiveResults
-        # The expressions and their reflexive expression labels
-        custom_df["reflexive_expressions"] = df.reflexiveResults.apply(self.parse_reflexiveResults)
-        # The counts for each labels
-        custom_df["reflexive_counts"] = custom_df.reflexive_expressions.apply(util.count_labels)
-        # Ratios between reflexive expressions
-        custom_df["reflexive_ratio"] = custom_df.reflexive_counts.apply(util.ratios)
-        # Ratio vector
-        custom_df['ratio_vector'] = custom_df.reflexive_ratio.apply(self.make_ratio_vector)
-        # Get the diversity of reflexive types - out of 8 possible types
-        custom_df["reflexive_type_diversity"] = custom_df.reflexive_counts.apply(lambda x: len(x)/8)
-        # A total of all labels
-        custom_df["reflexive_total"] = custom_df.reflexive_counts.apply(util.tuple_values_total)
-        # MinMax scale the reflexive_total
-        if (len(custom_df)>1):
-            custom_df["reflexive_scaled"] = util.scale_min_max(custom_df[['reflexive_total']])
-        else:
-            custom_df["reflexive_scaled"] = 1
-        # Normalise based on text_scaled
-        custom_df['reflexive_norm'] = util.normalise_scaled(custom_df,'reflexive_scaled')
-        return custom_df
-
-
-    # Parse reflexive results - include all above threshold
-    def parse_reflexiveResults(self,reflexiveResults,threshold=0.5):
-        final_refs = list()
-        for ref in reflexiveResults:
-            if ref['Score'] > threshold:
-                final_refs.append((str.lower(ref['Text']),ref['Type']))
-        return final_refs
-
-    # Function for creating a vector out of reflexive ratio - could be used for others
-    def make_ratio_vector(self,ratio_list,ref_codes = ['RR','ER','VR','AR','EP','AF','CN','EV']):
-        ratio_dict = dict(ratio_list)
-        vec = []
-        for rc in ref_codes:
-            if rc in ratio_dict.keys():
-                vec.append(ratio_dict[rc])
-            else:
-                vec.append(0)
-        return vec
reflexive/aws_connect/__init__.py
DELETED
File without changes
reflexive/aws_connect/comprehend.py
DELETED
@@ -1,205 +0,0 @@
-#
-
-
-import boto3
-import time
-import json
-import pandas as pd
-
-from reflexive.common.parameters import Parameters
-from reflexive.common.util import Util
-from reflexive.aws_connect.s3 import S3
-
-import logging
-try:
-    import coloredlogs
-    coloredlogs.install(level='INFO')
-except:
-    print("Colored logs not available")
-
-class Comprehend:
-
-    logger = logging.getLogger(__name__)
-
-    def __init__(self,parameters:Parameters):
-        #print(parameters)
-        self.__parameters = parameters.all_parameters()
-        self.logger.debug(f"Parameters: {self.__parameters}")
-        self.region = self.__parameters['region']
-        self.access_role_arn = self.__parameters['comprehend_access_role_arn']
-        self.entity_recogniser_arn = self.__parameters['reflexive_entity_arn']
-        self.local_path = self.__parameters['local_path']
-        self.prefix = self.__parameters['prefix']
-        self.postfix = self.__parameters['postfix']
-        self.bucket_name = self.__parameters["bucket_name"]
-        self.files_folder = f"{self.prefix}files{self.postfix}"
-        self.results_folder = f"{self.prefix}results{self.postfix}"
-        self.input_uri = f"s3://{self.bucket_name}/{self.files_folder}/{self.prefix}"
-        self.output_uri = f"s3://{self.bucket_name}/{self.results_folder}/"
-        self.analysis_types = self.__parameters['analysis_types']
-        # create client
-        try:
-            self.logger.debug(f"Region:{self.region}")
-            self.__comp_client = boto3.client(service_name='comprehend',region_name=self.region)
-        except Exception as err:
-            self.logger.error("Unable to create Comprehend client: ",err)
-
-
-    def client(self):
-        return self.__comp_client
-
-
-    #### CUSTOM ENTITY
-
-    def submit_custom_entity_job(self,job_name): #access_role_arn,entity_recogniser_arn):
-        job_str = f"{self.prefix}{job_name}{self.postfix}"
-
-        response = self.__comp_client.start_entities_detection_job(
-            InputDataConfig={
-                'S3Uri': self.input_uri,
-                'InputFormat': 'ONE_DOC_PER_FILE'
-            },
-            OutputDataConfig={
-                'S3Uri': self.output_uri
-            },
-            DataAccessRoleArn=self.access_role_arn,
-            JobName=job_str,
-            EntityRecognizerArn=self.entity_recogniser_arn,
-            LanguageCode='en'
-        )
-        self.job_id = response['JobId']
-        return response
-
-    # Check job status
-    def check_job_status(self):
-        job_status = self.__comp_client.describe_entities_detection_job(
-            JobId=self.job_id
-        )
-        self.__job_properties = job_status['EntitiesDetectionJobProperties']
-        return self.__job_properties['JobStatus']
-
-    def get_job_details(self):
-        return self.__job_properties
-
-
-    # Use AWS comprehend to get bulk key phrases from single batch of chunked text
-    def get_single_batch_analysis(self,index,chunk):
-        comprehend = self.client()
-        results = {}
-        print("Analysing chunk",index)
-        print(" . key_phrase")
-        kpresult = comprehend.batch_detect_key_phrases(TextList=chunk,LanguageCode='en')
-        results['KeyPhraseResults'] = kpresult
-        #key_phrase_results.append(kpresult)
-        time.sleep(2)
-        print(" . sentiment")
-        senresult = comprehend.batch_detect_sentiment(TextList=chunk,LanguageCode='en')
-        results['SentimentResults'] = senresult
-        #sentiment_results.append(senresult)
-        time.sleep(2)
-        print(" . targeted_sentiment")
-        tsenresult = comprehend.batch_detect_targeted_sentiment(TextList=chunk,LanguageCode='en')
-        results['TargetedSentimentResults'] = tsenresult
-        #target_sent_results.append(tsenresult)
-        time.sleep(2)
-        print(" . syntax")
-        synresult = comprehend.batch_detect_syntax(TextList=chunk,LanguageCode='en')
-        results['SyntaxResults'] = synresult
-        #syntax_results.append(synresult)
-        time.sleep(2)
-        return results
-
-
-    # Use AWS comprehend to get bulk key phrases from chunked text
-    def get_multiple_batch_analysis(self,chunked_text):
-        chunk_results = {}
-        for key in self.analysis_types.keys():
-            chunk_results[key] = []
-
-        for idx,chunk in enumerate(chunked_text):
-            if len(chunked_text) > 4999:
-                print("WARNING: Text too long to analyse - index",idx,"skipped!")
-            else:
-                try:
-                    results = self.get_single_batch_analysis(index=idx,chunk=chunk)
-                except(Exception) as error:
-                    print("There was an error with index",idx,error)
-                finally:
-                    if results:
-                        for key in results.keys():
-                            chunk_results[key].append(results[key])
-
-        return chunk_results
-
-    # Take batched responses and concenate single lists of results, errors, and http responses
-    def unbatch_results(self,result_type,results,batch_size=25):
-        unbatched_results = {}
-        unbatched_errors = {}
-        batch_responses = {}
-        for idx,batch in enumerate(results):
-            #print("Response for batch:",idx)
-            batch_responses[idx] = batch['ResponseMetadata']
-            result_list = batch['ResultList']
-            error_list = batch['ErrorList']
-            for r in result_list:
-                ridx = idx*batch_size + r['Index']
-                rdata = r[result_type]
-                unbatched_results[ridx] = rdata
-            for e in error_list:
-                eidx = e['Index']
-                unbatched_errors[eidx] = 'ERROR' + e['ErrorCode'] + ': ' + e['ErrorMessage']
-        unbatched = {}
-        unbatched['results'] = unbatched_results
-        unbatched['errors'] = unbatched_errors
-        unbatched['responses'] = batch_responses
-        return unbatched
-
-
-
-    def check_long_text(self,df):
-        # Check for long reflections (too long for batch analysis)
-        long_df = df.copy()
-        long_df = long_df[long_df.text.str.len()>5000]
-        long_df['length'] = long_df.text.str.len()
-        return long_df
-
-
-    # def extract_result(self,result,batch,batch_params):
-    #     match batch:
-    #         case "KeyPhraseResults":
-    #             extracted = [r['Text'] for r in result if r['Score'] >= batch_params["min_score"]]
-    #         case "SentimentResults":
-    #             extracted = result
-    #         case "TargetedSentimentResults":
-    #             extracted = dict()
-    #             for r in result:
-    #                 for mention in r['Mentions']:
-    #                     if (mention['Score'] >= batch_params["min_score"]):
-    #                         text = mention['Text']
-    #                         key = f"{mention['Type']}_{mention['MentionSentiment']['Sentiment']}"
-    #                         if key in extracted.keys():
-    #                             extracted[key].add(text)
-    #                         else:
-    #                             extracted[key] = {text}
-    #         case "SyntaxResults":
-    #             tags = []
-    #             tokens = []
-    #             for r in result:
-    #                 pos = r['PartOfSpeech']
-    #                 tag = pos['Tag']
-    #                 if pos['Score'] < batch_params["max_score"]:
-    #                     tag = tag+"_?"
-    #                 tags.append(tag)
-    #                 tokens.append(r['Text'])
-
-    #             extracted = {'tokens':tokens,'tags':tags}
-    #         case other:
-    #             extracted = []
-    #     return extracted
-
-
-
-
-
-
-
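The removed unbatch_results re-keyed each item of a batched Comprehend response by its absolute row index (batch index * batch_size + in-batch index). A standalone sketch of that arithmetic follows; the response stubs are fabricated to mirror the shape the removed code expected, and no AWS call is made.

# Illustrative only -- fabricated stubs shaped like the batch responses the
# removed unbatch_results expected; no AWS call is made here.
batch_size = 25
fake_batches = [
    {"ResponseMetadata": {"HTTPStatusCode": 200},
     "ResultList": [{"Index": 0, "Sentiment": "POSITIVE"},
                    {"Index": 1, "Sentiment": "NEUTRAL"}],
     "ErrorList": []},
    {"ResponseMetadata": {"HTTPStatusCode": 200},
     "ResultList": [{"Index": 0, "Sentiment": "NEGATIVE"}],
     "ErrorList": [{"Index": 1, "ErrorCode": "TextSizeLimitExceededException",
                    "ErrorMessage": "document too long"}]},
]

unbatched = {}
for idx, batch in enumerate(fake_batches):
    for r in batch["ResultList"]:
        # Same arithmetic as the removed code: idx*batch_size + r['Index']
        unbatched[idx * batch_size + r["Index"]] = r["Sentiment"]

print(unbatched)  # {0: 'POSITIVE', 1: 'NEUTRAL', 25: 'NEGATIVE'}

Note that the removed implementation applied this offset only to results; error entries kept their batch-relative e['Index'], so errors from different batches could overwrite one another in the returned dict.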
reflexive/aws_connect/s3.py
DELETED
@@ -1,89 +0,0 @@
-#
-
-import logging,coloredlogs
-import boto3
-import pandas as pd
-import tarfile
-import json
-
-from reflexive.common.parameters import Parameters
-
-coloredlogs.install(level='INFO')
-
-class S3:
-
-    logger = logging.getLogger(__name__)
-
-    def __init__(self,parameters:Parameters):
-        #print(parameters)
-        # set local parameters
-        self.__parameters = parameters.all_parameters()
-        self.logger.debug(f"Parameters: {self.__parameters}")
-        self.region = self.__parameters['region']
-        self.prefix = self.__parameters['prefix']
-        self.postfix = self.__parameters['postfix']
-        self.s3_access_point_arn = self.__parameters["s3_accesspoint_arn"]
-        self.bucket_name = self.__parameters["bucket_name"]
-        # create client
-        try:
-            self.logger.debug(f"Region:{self.region}")
-            self.__s3_client = boto3.client(service_name='s3',region_name=self.region)
-        except Exception as err:
-            self.logger.error("Unable to create S3 client: ",err)
-
-
-    # Return the S3 client
-    def client(self):
-        return self.__s3_client
-
-    # Function to upload reflections to S3
-    def upload_docs(self,text_series):
-        #self.__prefix, self.__postfix
-        files_folder = f"{self.prefix}files{self.postfix}"
-
-        s3 = self.__s3_client
-        s3ap = self.s3_access_point_arn
-        self.logger.debug(f"ACCESS POINT: {s3ap}")
-
-        self.logger.info(f"Uploading {len(text_series)} reflections to S3 ({files_folder})...")
-        self.logger.debug(f"({s3ap}/{files_folder})")
-        for idx in text_series.index:
-            file_name = f"{self.prefix}{idx}.txt"
-            file_body = text_series.iloc[idx]
-            self.logger.info(f"Uploading {file_name}")
-            #print(file_body)
-            response = s3.put_object(Body=file_body,Bucket=s3ap,Key=f"{files_folder}/{file_name}")
-            if response['ResponseMetadata']['HTTPStatusCode'] != 200:
-                self.logger.error("------------------------------------------------------------")
-                self.logger.error(f"ERROR: There was a problem with {file_name}")
-                self.logger.error(response)
-                self.logger.error("------------------------------------------------------------")
-            else:
-                self.logger.info('Success')
-        self.logger.info("Finished uploading reflections to S3.")
-        return response
-
-    # download and save results
-    def results_download_save_extract(self,s3Uri,local_file_path):
-        s3 = self.__s3_client
-        output_key = s3Uri.split(self.bucket_name)[1]
-        # download from S3 to local path
-        with open(f"{local_file_path}.tar.gz",'wb') as output_data:
-            s3.download_fileobj(self.bucket_name,output_key[1:],output_data)
-
-        # extract the files from tar archive
-        files = list()
-        with tarfile.open(f"{local_file_path}.tar.gz", "r:gz") as tf:
-            for member in tf.getmembers():
-                f = tf.extractfile(member)
-                if f is not None:
-                    content = f.read()
-                    files.append(content)
-        #print("Number of files:",len(files))
-        # extract results and save and return
-        raw_results = files[0].decode("utf-8").split('\n')
-        raw_results.pop() # pop last item off as empty entry due to final \n
-        with open(f"{local_file_path}.json","w") as fp:
-            fp.write(json.dumps(raw_results))
-        return raw_results
-
reflexive/common/__init__.py
DELETED
File without changes
reflexive/common/local.py
DELETED
@@ -1,48 +0,0 @@
-
-import os
-import logging,coloredlogs
-#import boto3
-import pandas as pd
-
-from reflexive.common.parameters import Parameters
-
-coloredlogs.install(level='INFO')
-
-class Local:
-
-    logger = logging.getLogger(__name__)
-
-    def __init__(self,parameters:Parameters):
-        self.__parameters = parameters.all_parameters()
-        self.logger.debug(f"Parameters: {self.__parameters}")
-        self.local_path = self.__parameters['local_path']
-        self.local_dir = self.local_path
-        self.logger.info(f"Path: {self.local_path}")
-        self.prefix = self.__parameters['prefix']
-        self.postfix = self.__parameters['postfix']
-
-    def get_data_path_name(self,name,ext):
-        return f"{self.local_path}{self.prefix}{name}{self.postfix}.{ext}"
-
-    def set_sub_dir(self,sub_dir=None):
-        # check dir sub_dir exists
-        if sub_dir:
-            self.local_dir = f"{self.local_path}{sub_dir}/"
-            self.logger.debug(f"local_dir: {self.local_dir}")
-            dirExists = os.path.exists(self.local_dir)
-            if not dirExists:
-                self.logger.info(f"Creating subdirectory: {self.local_dir}")
-                os.makedirs(self.local_dir)
-        else:
-            self.local_dir = self.local_path
-
-    def save_docs(self,text_series,):
-        self.logger.info(f"Saving {len(text_series)} docs to {self.local_dir}...")
-        for idx in text_series.index:
-            file_name = f"{self.prefix}{idx}.txt"
-            file_body = text_series.iloc[idx]
-            self.logger.info(f"Saving {file_name}")
-            with open(f"{self.local_dir}{file_name}",'w') as fp:
-                fp.write(file_body)
-        self.logger.info("Finished saving reflections locally.")
-
reflexive/common/parameters.py
DELETED
@@ -1,77 +0,0 @@
-# Store the parameters for connecting to AWS
-
-import os
-import logging,coloredlogs
-from datetime import datetime
-import boto3
-
-coloredlogs.install(level='INFO')
-
-class Parameters:
-
-    logger = logging.getLogger(__name__)
-
-    def __init__(self,profile="default",name_prefix="refex",local_path=None,date_string=None):
-        working_dir = os.getcwd()
-        try:
-            aws_session = boto3.Session(profile_name=profile)
-            self.region = aws_session.region_name
-            self.logger.info("AWS region:",self.region)
-            self.access_key = aws_session.get_credentials().access_key
-            self.logger.debug("AWS access key:",self.access_key)
-            self.account_number = aws_session.client('sts').get_caller_identity().get('Account')
-        except Exception as err:
-            self.logger.error("Unable to retrieve AWS credentials",err)
-            self.access_key = None
-            self.region = None
-            self.account_number = None
-
-        # AWS specific
-
-        self.analysis_types = {
-            "KeyPhraseResults":"KeyPhrases",
-            "SentimentResults":"Sentiment",
-            "TargetedSentimentResults":"Entities",
-            "SyntaxResults":"SyntaxTokens"
-        }
-        # General parameters
-
-        if not local_path:
-            self.logger.warning("No path supplied, creating a data directory...")
-            #print(f"WD: {working_dir}")
-            data_dir = working_dir+"/data/"
-            if not os.path.exists(data_dir):
-                os.makedirs(data_dir)
-                self.logger.info("Created:",data_dir)
-            self.local_path = data_dir
-        else:
-            data_dir = local_path
-            if not os.path.exists(data_dir):
-                self.logger.warning("Path does not exist, creating directory")
-                os.makedirs(data_dir)
-                self.logger.info(f"Created {data_dir}")
-            self.local_path = local_path
-        if not date_string:
-            date_string = datetime.today().strftime('%Y%m%d')
-            self.logger.warning(f"No date_string supplied, using today: {date_string}")
-        self.date_string = date_string
-        self.prefix = f"{name_prefix}_"
-        self.postfix = f"-{date_string}"
-        return None
-
-    def all_parameters(self):
-        return self.__dict__
-
-    def set_s3_parameters(self,s3_access_point,bucket_name):
-        self.s3_access_point = s3_access_point
-        self.bucket_name = bucket_name
-        self.s3_accesspoint_arn = f"arn:aws:s3:{self.region}:{self.account_number}:accesspoint/{s3_access_point}"
-
-    def set_comprehend_parameters(self,comprehend_service_role_name):
-        self.comprehend_service_role_name = comprehend_service_role_name
-        self.comprehend_access_role_arn = f"arn:aws:iam::{self.account_number}:role/service-role/{comprehend_service_role_name}"
-
-    def set_comprehend_custom_entity_parameters(self,reflexive_entity_name,reflexive_entity_version):
-        self.reflexive_entity_name = reflexive_entity_name
-        self.reflexive_entity_version = reflexive_entity_version
-        self.reflexive_entity_arn = f"arn:aws:comprehend:{self.region}:{self.account_number}:entity-recognizer/{self.reflexive_entity_name}/version/{self.reflexive_entity_version}"
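Taken together, the removed 0.1.9 modules composed roughly as follows. This is a hedged end-to-end sketch, not part of the diff: the access point, bucket, role, and recogniser names are placeholders, and reflexive==0.1.9 with working AWS credentials is assumed.

# Illustrative only -- not part of the package diff. Placeholder AWS resources.
import pandas as pd

from reflexive.common.parameters import Parameters
from reflexive.common.local import Local
from reflexive.aws_connect.s3 import S3
from reflexive.aws_connect.comprehend import Comprehend
from reflexive.analyse.reflexive_expressions import ReflexiveExpressions

params = Parameters(profile="default", name_prefix="refex")
params.set_s3_parameters("my-access-point", "my-bucket")                     # placeholders
params.set_comprehend_parameters("MyComprehendServiceRole")                  # placeholder
params.set_comprehend_custom_entity_parameters("reflexive-recogniser", "1")  # placeholders

refex = ReflexiveExpressions(params, S3(params), Local(params), Comprehend(params))

df = pd.DataFrame({"text": ["Reflection one ...", "Reflection two ..."]})
refex.analyse_reflexive_expressions(df)   # uploads docs to S3 and starts the custom entity job
print(refex.check_job_status())           # poll until the job reports COMPLETED, then:
results = refex.download_and_extract()
df_with_results = refex.add_to_dataframe(df, results)
analytics = refex.reflexive_analytics(df_with_results)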