opener-opinion-detector-base 2.0.1 → 2.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/core/python-scripts/README.md +78 -3
- data/core/python-scripts/classify_kaf_naf_file.py +94 -94
- data/core/python-scripts/models.cfg +1 -0
- data/core/python-scripts/scripts/config_manager.py +3 -0
- data/core/python-scripts/scripts/extract_features.py +0 -3
- data/core/python-scripts/scripts/relation_classifier.py +1 -1
- data/core/vendor/src/crfsuite/crfsuite.sln +42 -42
- data/core/vendor/src/liblbfgs/lbfgs.sln +26 -26
- data/ext/hack/Rakefile +5 -2
- data/lib/opener/opinion_detectors/base.rb +19 -15
- data/lib/opener/opinion_detectors/base/version.rb +1 -1
- data/lib/opener/opinion_detectors/configuration_creator.rb +6 -8
- data/lib/opener/opinion_detectors/de.rb +1 -1
- data/lib/opener/opinion_detectors/es.rb +7 -0
- data/lib/opener/opinion_detectors/fr.rb +7 -0
- data/opener-opinion-detector-base.gemspec +0 -1
- data/pre_install_requirements.txt +3 -0
- metadata +41 -85
- data/core/packages/KafNafParser-1.4.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +0 -10
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +0 -22
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +0 -1
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +0 -47
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +0 -1
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +0 -390
- data/core/site-packages/pre_build/KafNafParser/__init__.py +0 -14
- data/core/site-packages/pre_build/KafNafParser/constituency_data.py +0 -125
- data/core/site-packages/pre_build/KafNafParser/coreference_data.py +0 -52
- data/core/site-packages/pre_build/KafNafParser/dependency_data.py +0 -78
- data/core/site-packages/pre_build/KafNafParser/entity_data.py +0 -59
- data/core/site-packages/pre_build/KafNafParser/external_references_data.py +0 -41
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +0 -2
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +0 -205
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +0 -309
- data/core/site-packages/pre_build/KafNafParser/features_data.py +0 -131
- data/core/site-packages/pre_build/KafNafParser/header_data.py +0 -127
- data/core/site-packages/pre_build/KafNafParser/opinion_data.py +0 -211
- data/core/site-packages/pre_build/KafNafParser/references_data.py +0 -23
- data/core/site-packages/pre_build/KafNafParser/span_data.py +0 -63
- data/core/site-packages/pre_build/KafNafParser/term_data.py +0 -111
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +0 -42
- data/core/site-packages/pre_build/KafNafParser/text_data.py +0 -99
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +0 -10
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +0 -14
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +0 -1
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +0 -23
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +0 -1
- data/core/site-packages/pre_build/VUA_pylib/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/common/common.py +0 -28
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +0 -156
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +0 -121
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +0 -72
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +0 -10
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +0 -7
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +0 -1
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +0 -11
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +0 -1
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +0 -165
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +0 -439
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +0 -7
- data/pre_build_requirements.txt +0 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8cab19f98d9ee9c6ae4938a3be0cebb666126e44
|
4
|
+
data.tar.gz: d83ded3deb19fe5b7cead1ba079cb6bf585c9593
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bb3d6a9b3d9d6fd3a3fa496b6a2de948398d250e49540dd89bd24d1efe98e9837e7bcb5dfef3b973c5fb069f4425dca2cf2c01d66bfdb148884d8af20a0c8792
|
7
|
+
data.tar.gz: ac1f7d71ec3160f279b0e2c8954ea1ee5bf29f56630d7e249a16c46c3498a091a511df97f89606d1bbaece012e011aff0bdd30bd50b84643e0993d8f9598f68a
|
@@ -2,14 +2,13 @@
|
|
2
2
|
|
3
3
|
##Introduction##
|
4
4
|
|
5
|
-
|
6
5
|
Opinion miner based on machine learning that can be trained using a list of
|
7
6
|
KAF/NAF files. It is important to notice that the opinion miner module will not call
|
8
7
|
to any external module to obtain features. It will read all the features from the input KAF/NAF file,
|
9
8
|
so you have to make sure that your input file contains all the required information in advance (tokens,
|
10
|
-
terms, polarities, constituents, entitiess, dependencies...)
|
9
|
+
terms, polarities, constituents, entities, dependencies...).
|
11
10
|
|
12
|
-
The task is divided into 2 steps
|
11
|
+
The task is generally divided into 2 steps:
|
13
12
|
* Detection of opinion entities (holder, target and expression): using
|
14
13
|
Conditional Random Fields
|
15
14
|
* Opinion entity linking (expression<-target and expression<-holder): using
|
@@ -79,6 +78,82 @@ of CRFsuite and SVMLight. This file will be passed to the main script to detect
|
|
79
78
|
cat my_file.kaf | classify_kaf_naf_file.py your_config_file.cfg
|
80
79
|
````
|
81
80
|
|
81
|
+
There are two basic functionalities:
|
82
|
+
|
83
|
+
* Training: from a corpus of opinion annotated files, induce and learn the models for detecting opinions
|
84
|
+
* Classification: using the previous models, find and extract opinions in new text files.
|
85
|
+
|
86
|
+
We provide models already trained and evaluated on hotel, news, attractions and restaurants domains for all the languages covered
|
87
|
+
by the OpeNER project. Most of the users will just focus on this classification step, using the models that we provide. Some others
|
88
|
+
will need to retrain the system to adapt it to a new domain or language. In the next sections we will introduce these 2 different
|
89
|
+
usages of the opinion miner deluxe.
|
90
|
+
|
91
|
+
##Classification##
|
92
|
+
|
93
|
+
In this case you have the models already trained (either you trained them yourself or got the ones we provide) and you want just to detect
|
94
|
+
the opinions in a new file. The input format of your file needs to be valid KAF format. The script that performs the classification is the script
|
95
|
+
`classify_kaf_naf_file.py`. You can get information about the available parameters by running the script with the parameter -h.
|
96
|
+
```shell
|
97
|
+
classify_kaf_naf_file.py -h
|
98
|
+
usage: classify_kaf_naf_file.py [-h]
|
99
|
+
(-m MODEL_FOLDER | -d DOMAIN | -show-models)
|
100
|
+
[-keep-opinions] [-no-time]
|
101
|
+
|
102
|
+
Detect opinion triples in a KAF/NAF file
|
103
|
+
|
104
|
+
optional arguments:
|
105
|
+
-h, --help show this help message and exit
|
106
|
+
-m MODEL_FOLDER Folder storing the trained models
|
107
|
+
-d DOMAIN The domain where the models were trained
|
108
|
+
-show-models Show the models available and finish
|
109
|
+
-keep-opinions Keep the opinions from the input (by default will be deleted)
|
110
|
+
-no-time No include time in timestamp (for testing)
|
111
|
+
```
|
112
|
+
|
113
|
+
The script reads the input KAF file from the standard input and will write the output KAF into the standard output. The main parameter is the model that
|
114
|
+
will be used. There are two ways of specifying this parameter:
|
115
|
+
* By using the -m FOLDER option, by means of which we can specify that we would like to use exactly the folder stored in the path FOLDER
|
116
|
+
* By using the -d DOMAIN option, where DOMAIN is the domain where the model that we want to use was trained.
|
117
|
+
|
118
|
+
We can list the available models by running:
|
119
|
+
```shell
|
120
|
+
classify_kaf_naf_file.py -show-models
|
121
|
+
#########################
|
122
|
+
Models available
|
123
|
+
#########################
|
124
|
+
Model 0
|
125
|
+
Lang: en
|
126
|
+
Domain: hotel
|
127
|
+
Folder: final_models/en/hotel_cfg1
|
128
|
+
Desc: Trained with config1 in the last version of hotel annotations
|
129
|
+
Model 1
|
130
|
+
Lang: en
|
131
|
+
Domain: news
|
132
|
+
Folder: final_models/en/news_cfg1
|
133
|
+
Desc: Trained with config1 using only the sentences annotated with news
|
134
|
+
....
|
135
|
+
....
|
136
|
+
```
|
137
|
+
|
138
|
+
You can train and use as many models as you want. You will need the file `models.cfg` which contains the metadata about which models
|
139
|
+
are available and how to refer to them (the domain). This is an example of the content of this file:
|
140
|
+
```shell
|
141
|
+
#LANG|domain|pathtomodel|description
|
142
|
+
en|hotel|final_models/en/hotel_cfg1|Trained with config1 in the last version of hotel annotations
|
143
|
+
en|news|final_models/en/news_cfg1|Trained with config1 using only the sentences annotated with news
|
144
|
+
nl|hotel|final_models/nl/hotel_cfg1|Trained with config1 in the last version of hotel annotations
|
145
|
+
nl|news|final_models/nl/news_cfg1|Trained with config1 using only the sentences annotated with news
|
146
|
+
```
|
147
|
+
So in each line a model is specified and represented using 4 fields, the language, the domain identifier (which will be used later to refer to this model),
|
148
|
+
the path to the folder and a text with a description. The language for the KAF file will be read directly from the KAF header, and considering this model
|
149
|
+
and the domain id provided to the script, the proper model will be loaded and used.
|
150
|
+
|
151
|
+
So if you want to tag a file with Dutch text called input.nl.kaf with the models trained on hotel reviews, and store the result on the file output.nl.kaf you just
|
152
|
+
should call the program as:
|
153
|
+
```shell
|
154
|
+
cat input.nl.kaf | python classify_kaf_naf_file.py -d hotel > output.nl.kaf
|
155
|
+
```
|
156
|
+
|
82
157
|
##Training your own models##
|
83
158
|
|
84
159
|
You will first need to install all the given requirements and then follow these steps:
|
@@ -8,7 +8,7 @@ this_folder = os.path.dirname(os.path.realpath(__file__))
|
|
8
8
|
|
9
9
|
# This updates the load path to ensure that the local site-packages directory
|
10
10
|
# can be used to load packages (e.g. a locally installed copy of lxml).
|
11
|
-
sys.path.append(os.path.join(this_folder, '../site-packages/
|
11
|
+
sys.path.append(os.path.join(this_folder, '../site-packages/pre_install'))
|
12
12
|
|
13
13
|
import csv
|
14
14
|
from tempfile import NamedTemporaryFile
|
@@ -20,17 +20,16 @@ import argparse
|
|
20
20
|
from scripts import lexicons as lexicons_manager
|
21
21
|
from scripts.config_manager import Cconfig_manager
|
22
22
|
from scripts.extract_features import extract_features_from_kaf_naf_file
|
23
|
-
from scripts.crfutils import extract_features_to_crf
|
23
|
+
from scripts.crfutils import extract_features_to_crf
|
24
24
|
from scripts.link_entities_distance import link_entities_distance
|
25
25
|
from scripts.relation_classifier import link_entities_svm
|
26
|
-
from
|
27
|
-
from VUA_pylib import *
|
26
|
+
from KafNafParserPy import *
|
28
27
|
|
29
28
|
|
30
29
|
DEBUG=0
|
31
30
|
|
32
31
|
my_config_manager = Cconfig_manager()
|
33
|
-
__this_folder = os.
|
32
|
+
__this_folder = os.path.dirname(os.path.realpath(__file__))
|
34
33
|
separator = '\t'
|
35
34
|
__desc = 'Deluxe opinion miner (CRF+SVM)'
|
36
35
|
__last_edited = '10jan2014'
|
@@ -59,7 +58,7 @@ def match_crfsuite_out(crfout,list_token_ids):
|
|
59
58
|
if inside:
|
60
59
|
matches.append((current,current_type))
|
61
60
|
current = []
|
62
|
-
inside = False
|
61
|
+
inside = False
|
63
62
|
else:
|
64
63
|
if line=='O':
|
65
64
|
if inside:
|
@@ -73,8 +72,8 @@ def match_crfsuite_out(crfout,list_token_ids):
|
|
73
72
|
if inside:
|
74
73
|
matches.append((current,current_type))
|
75
74
|
current = [list_token_ids[num_token]]
|
76
|
-
inside = True
|
77
|
-
current_type = value
|
75
|
+
inside = True
|
76
|
+
current_type = value
|
78
77
|
elif my_type == 'I':
|
79
78
|
if inside:
|
80
79
|
current.append(list_token_ids[num_token])
|
@@ -92,42 +91,42 @@ def match_crfsuite_out(crfout,list_token_ids):
|
|
92
91
|
def extract_features(kaf_naf_obj):
|
93
92
|
feat_file_desc = NamedTemporaryFile(delete=False)
|
94
93
|
feat_file_desc.close()
|
95
|
-
|
94
|
+
|
96
95
|
out_file = feat_file_desc.name
|
97
96
|
err_file = out_file+'.log'
|
98
|
-
|
97
|
+
|
99
98
|
expressions_lexicon = None
|
100
99
|
targets_lexicon = None
|
101
100
|
if my_config_manager.get_use_training_lexicons():
|
102
101
|
expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
|
103
102
|
target_lexicon_filename = my_config_manager.get_target_lexicon_filename()
|
104
|
-
|
103
|
+
|
105
104
|
expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
|
106
105
|
targets_lexicon =lexicons_manager.load_lexicon(target_lexicon_filename)
|
107
106
|
|
108
107
|
#def extract_features_from_kaf_naf_file(knaf_obj,out_file=None,log_file=None,include_class=True,accepted_opinions=None, exp_lex= None):
|
109
108
|
labels, separator,polarities_skipped = extract_features_from_kaf_naf_file(kaf_naf_obj,out_file,err_file,include_class=False, exp_lex=expressions_lexicon,tar_lex=targets_lexicon)
|
110
109
|
return out_file, err_file
|
111
|
-
|
112
|
-
|
110
|
+
|
111
|
+
|
113
112
|
def convert_to_crf(input_file,templates):
|
114
113
|
out_desc = NamedTemporaryFile(delete=False)
|
115
114
|
out_desc.close()
|
116
|
-
|
115
|
+
|
117
116
|
out_crf = out_desc.name
|
118
|
-
|
117
|
+
|
119
118
|
##Load description of features
|
120
119
|
path_feat_desc = my_config_manager.get_feature_desc_filename()
|
121
120
|
fic = open(path_feat_desc)
|
122
121
|
fields = fic.read().strip()
|
123
122
|
fic.close()
|
124
123
|
####
|
125
|
-
|
124
|
+
|
126
125
|
extract_features_to_crf(input_file,out_crf,fields,separator,templates,possible_classes=None)
|
127
126
|
return out_crf
|
128
|
-
|
129
|
-
|
130
|
-
|
127
|
+
|
128
|
+
|
129
|
+
|
131
130
|
def run_crfsuite_tag(input_file,model_file):
|
132
131
|
crfsuite = my_config_manager.get_crfsuite_binary()
|
133
132
|
cmd = [crfsuite]
|
@@ -150,8 +149,8 @@ def run_crfsuite_tag(input_file,model_file):
|
|
150
149
|
|
151
150
|
def detect_expressions(tab_feat_file,list_token_ids):
|
152
151
|
#1) Convert to the correct CRF
|
153
|
-
templates = my_config_manager.get_templates_expr()
|
154
|
-
|
152
|
+
templates = my_config_manager.get_templates_expr()
|
153
|
+
|
155
154
|
crf_exp_file = convert_to_crf(tab_feat_file,templates)
|
156
155
|
logging.debug('File with crf format for EXPRESSIONS '+crf_exp_file)
|
157
156
|
if DEBUG:
|
@@ -161,10 +160,10 @@ def detect_expressions(tab_feat_file,list_token_ids):
|
|
161
160
|
print>>sys.stderr,f.read()
|
162
161
|
f.close()
|
163
162
|
print>>sys.stderr,'#'*50
|
164
|
-
|
163
|
+
|
165
164
|
model_file = my_config_manager.get_filename_model_expression()
|
166
165
|
output_crf,error_crf = run_crfsuite_tag(crf_exp_file,model_file)
|
167
|
-
|
166
|
+
|
168
167
|
logging.debug('Expressions crf error: '+error_crf)
|
169
168
|
matches_exp = match_crfsuite_out(output_crf, list_token_ids)
|
170
169
|
if DEBUG:
|
@@ -175,19 +174,19 @@ def detect_expressions(tab_feat_file,list_token_ids):
|
|
175
174
|
print>>sys.stderr,'MATCHES:',str(matches_exp)
|
176
175
|
print>>sys.stderr,'TEMP FILE:',crf_exp_file
|
177
176
|
print>>sys.stderr,'#'*50
|
178
|
-
|
179
|
-
|
177
|
+
|
178
|
+
|
180
179
|
logging.debug('Detector expressions out: '+str(matches_exp))
|
181
180
|
os.remove(crf_exp_file)
|
182
181
|
return matches_exp
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
182
|
+
|
183
|
+
|
184
|
+
|
185
|
+
|
186
|
+
|
188
187
|
def detect_targets(tab_feat_file, list_token_ids):
|
189
188
|
templates_target = my_config_manager.get_templates_target()
|
190
|
-
|
189
|
+
|
191
190
|
crf_target_file = convert_to_crf(tab_feat_file,templates_target)
|
192
191
|
logging.debug('File with crf format for TARGETS '+crf_target_file)
|
193
192
|
if DEBUG:
|
@@ -197,13 +196,13 @@ def detect_targets(tab_feat_file, list_token_ids):
|
|
197
196
|
print>>sys.stderr,f.read()
|
198
197
|
f.close()
|
199
198
|
print>>sys.stderr,'#'*50
|
200
|
-
|
199
|
+
|
201
200
|
model_target_file = my_config_manager.get_filename_model_target()
|
202
201
|
out_crf_target,error_crf = run_crfsuite_tag(crf_target_file, model_target_file)
|
203
202
|
logging.debug('TARGETS crf error: '+error_crf)
|
204
203
|
|
205
204
|
matches_tar = match_crfsuite_out(out_crf_target, list_token_ids)
|
206
|
-
|
205
|
+
|
207
206
|
if DEBUG:
|
208
207
|
print>>sys.stderr,'#'*50
|
209
208
|
print>>sys.stderr,'CRF output for TARGETS'
|
@@ -211,18 +210,18 @@ def detect_targets(tab_feat_file, list_token_ids):
|
|
211
210
|
print>>sys.stderr,'List token ids:',str(list_token_ids)
|
212
211
|
print>>sys.stderr,'MATCHES:',str(matches_tar)
|
213
212
|
print>>sys.stderr,'#'*50
|
214
|
-
|
213
|
+
|
215
214
|
logging.debug('Detector targets out: '+str(matches_tar))
|
216
215
|
os.remove(crf_target_file)
|
217
216
|
return matches_tar
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
|
221
|
+
|
223
222
|
def detect_holders(tab_feat_file, list_token_ids):
|
224
223
|
templates_holder = my_config_manager.get_templates_holder()
|
225
|
-
|
224
|
+
|
226
225
|
crf_holder_file = convert_to_crf(tab_feat_file,templates_holder)
|
227
226
|
logging.debug('File with crf format for HOLDERS '+crf_holder_file)
|
228
227
|
if DEBUG:
|
@@ -232,7 +231,7 @@ def detect_holders(tab_feat_file, list_token_ids):
|
|
232
231
|
print>>sys.stderr,f.read()
|
233
232
|
f.close()
|
234
233
|
print>>sys.stderr,'#'*50
|
235
|
-
|
234
|
+
|
236
235
|
model_holder_file = my_config_manager.get_filename_model_holder()
|
237
236
|
out_crf_holder,error_crf = run_crfsuite_tag(crf_holder_file, model_holder_file)
|
238
237
|
logging.debug('HOLDERS crf error: '+error_crf)
|
@@ -246,12 +245,12 @@ def detect_holders(tab_feat_file, list_token_ids):
|
|
246
245
|
print>>sys.stderr,'List token ids:',str(list_token_ids)
|
247
246
|
print>>sys.stderr,'MATCHES:',str(matches_holder)
|
248
247
|
print>>sys.stderr,'#'*50
|
249
|
-
|
248
|
+
|
250
249
|
logging.debug('Detector HOLDERS out: '+str(matches_holder))
|
251
250
|
os.remove(crf_holder_file)
|
252
251
|
return matches_holder
|
253
|
-
|
254
|
-
|
252
|
+
|
253
|
+
|
255
254
|
|
256
255
|
|
257
256
|
|
@@ -267,19 +266,19 @@ def map_tokens_to_terms(list_tokens,knaf_obj):
|
|
267
266
|
terms_for_token[tokid] = [termid]
|
268
267
|
else:
|
269
268
|
terms_for_token[tokid].append(termid)
|
270
|
-
|
269
|
+
|
271
270
|
ret = set()
|
272
271
|
for my_id in list_tokens:
|
273
272
|
term_ids = terms_for_token[my_id]
|
274
273
|
ret |= set(term_ids)
|
275
274
|
return sorted(list(ret))
|
276
|
-
|
277
|
-
|
278
|
-
|
275
|
+
|
276
|
+
|
277
|
+
|
279
278
|
def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=True,include_polarity_strength=True):
|
280
279
|
num_opinion = 0
|
281
280
|
for type_exp, span_exp, span_tar, span_hol in triples:
|
282
|
-
#Map tokens to terms
|
281
|
+
#Map tokens to terms
|
283
282
|
if map_to_terms:
|
284
283
|
span_exp_terms = map_tokens_to_terms(span_exp,kaf_obj)
|
285
284
|
span_tar_terms = map_tokens_to_terms(span_tar,kaf_obj)
|
@@ -288,16 +287,16 @@ def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=Tr
|
|
288
287
|
span_hol_terms = span_hol
|
289
288
|
span_tar_terms = span_tar
|
290
289
|
span_exp_terms = span_exp
|
291
|
-
|
290
|
+
|
292
291
|
##Creating holder
|
293
292
|
span_hol = Cspan()
|
294
293
|
span_hol.create_from_ids(span_hol_terms)
|
295
294
|
my_hol = Cholder()
|
296
295
|
my_hol.set_span(span_hol)
|
297
|
-
|
296
|
+
|
298
297
|
hol_text = ' '.join(text_for_tid[tid] for tid in span_hol_terms)
|
299
298
|
my_hol.set_comment(hol_text)
|
300
|
-
|
299
|
+
|
301
300
|
#Creating target
|
302
301
|
span_tar = Cspan()
|
303
302
|
span_tar.create_from_ids(span_tar_terms)
|
@@ -318,7 +317,7 @@ def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=Tr
|
|
318
317
|
exp_text = ' '.join(text_for_tid[tid] for tid in span_exp_terms)
|
319
318
|
my_exp.set_comment(exp_text)
|
320
319
|
#########################
|
321
|
-
|
320
|
+
|
322
321
|
#To get the first possible ID not already used
|
323
322
|
new_id = None
|
324
323
|
while True:
|
@@ -332,32 +331,33 @@ def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=Tr
|
|
332
331
|
new_opinion.set_id(new_id)
|
333
332
|
if len(span_hol_terms) != 0: #To avoid empty holders
|
334
333
|
new_opinion.set_holder(my_hol)
|
335
|
-
|
334
|
+
|
336
335
|
if len(span_tar_terms) != 0: #To avoid empty targets
|
337
336
|
new_opinion.set_target(my_tar)
|
338
|
-
|
337
|
+
|
339
338
|
new_opinion.set_expression(my_exp)
|
340
|
-
|
339
|
+
|
341
340
|
knaf_obj.add_opinion(new_opinion)
|
342
|
-
|
341
|
+
|
343
342
|
##
|
344
343
|
# input_file_stream can be a filename or a stream
|
345
344
|
# output_file_stream can be a filename or a stream
|
346
345
|
#Config file must be a string filename
|
347
346
|
def tag_file_with_opinions(input_file_stream, output_file_stream,model_folder,kaf_obj=None, remove_existing_opinions=True,include_polarity_strength=True,timestamp=True):
|
347
|
+
|
348
348
|
config_filename = os.path.join(model_folder)
|
349
349
|
if not os.path.exists(config_filename):
|
350
350
|
print>>sys.stderr,'Config file not found on:',config_filename
|
351
351
|
sys.exit(-1)
|
352
|
-
|
352
|
+
|
353
353
|
my_config_manager.set_current_folder(__this_folder)
|
354
354
|
my_config_manager.set_config(config_filename)
|
355
|
-
|
355
|
+
|
356
356
|
if kaf_obj is not None:
|
357
357
|
knaf_obj = kaf_obj
|
358
358
|
else:
|
359
359
|
knaf_obj = KafNafParser(input_file_stream)
|
360
|
-
|
360
|
+
|
361
361
|
#Create a temporary file
|
362
362
|
out_feat_file, err_feat_file = extract_features(knaf_obj)
|
363
363
|
if DEBUG:
|
@@ -367,7 +367,7 @@ def tag_file_with_opinions(input_file_stream, output_file_stream,model_folder,ka
|
|
367
367
|
print>>sys.stderr,f.read()
|
368
368
|
f.close()
|
369
369
|
print>>sys.stderr,'#'*50
|
370
|
-
|
370
|
+
|
371
371
|
#get all the tokens in order
|
372
372
|
list_token_ids = []
|
373
373
|
text_for_wid = {}
|
@@ -378,67 +378,67 @@ def tag_file_with_opinions(input_file_stream, output_file_stream,model_folder,ka
|
|
378
378
|
s_id = token_obj.get_sent()
|
379
379
|
w_id = token_obj.get_id()
|
380
380
|
text_for_wid[w_id] = token
|
381
|
-
|
381
|
+
|
382
382
|
list_token_ids.append(w_id)
|
383
383
|
sentence_for_token[w_id] = s_id
|
384
|
-
|
384
|
+
|
385
385
|
for term in knaf_obj.get_terms():
|
386
386
|
tid = term.get_id()
|
387
387
|
toks = [text_for_wid.get(wid,'') for wid in term.get_span().get_span_ids()]
|
388
388
|
text_for_tid[tid] = ' '.join(toks)
|
389
389
|
|
390
|
-
|
390
|
+
|
391
391
|
expressions = detect_expressions(out_feat_file,list_token_ids)
|
392
392
|
targets = detect_targets(out_feat_file, list_token_ids)
|
393
393
|
holders = detect_holders(out_feat_file, list_token_ids)
|
394
|
-
|
394
|
+
|
395
395
|
os.remove(out_feat_file)
|
396
396
|
os.remove(err_feat_file)
|
397
397
|
|
398
398
|
if DEBUG:
|
399
399
|
print>>sys.stderr,"Expressions detected:"
|
400
400
|
for e in expressions:
|
401
|
-
print>>sys.stderr,'\t',e, ' '.join([text_for_wid[wid] for wid in e[0] ])
|
401
|
+
print>>sys.stderr,'\t',e, ' '.join([text_for_wid[wid] for wid in e[0] ])
|
402
402
|
print>>sys.stderr
|
403
|
-
|
403
|
+
|
404
404
|
print>>sys.stderr,'Targets detected'
|
405
405
|
for t in targets:
|
406
|
-
print>>sys.stderr,'\t',t, ' '.join([text_for_wid[wid] for wid in t[0] ])
|
406
|
+
print>>sys.stderr,'\t',t, ' '.join([text_for_wid[wid] for wid in t[0] ])
|
407
407
|
print>>sys.stderr
|
408
|
-
|
408
|
+
|
409
409
|
print>>sys.stderr,'Holders',holders
|
410
410
|
for h in holders:
|
411
|
-
print>>sys.stderr,'\t',h, ' '.join([text_for_wid[wid] for wid in h[0] ])
|
411
|
+
print>>sys.stderr,'\t',h, ' '.join([text_for_wid[wid] for wid in h[0] ])
|
412
412
|
print>>sys.stderr
|
413
|
-
|
414
|
-
|
413
|
+
|
414
|
+
|
415
415
|
# Entity linker based on distances
|
416
416
|
####triples = link_entities_distance(expressions,targets,holders,sentence_for_token)
|
417
|
-
|
417
|
+
|
418
418
|
triples = link_entities_svm(expressions, targets, holders, knaf_obj, my_config_manager)
|
419
|
-
|
419
|
+
|
420
420
|
ids_used = set()
|
421
421
|
if remove_existing_opinions:
|
422
422
|
knaf_obj.remove_opinion_layer()
|
423
423
|
else:
|
424
424
|
for opi in knaf_obj.get_opinions():
|
425
425
|
ids_used.add(opi.get_id())
|
426
|
-
|
427
|
-
|
428
|
-
add_opinions_to_knaf(triples, knaf_obj,text_for_tid,ids_used, map_to_terms=False,include_polarity_strength=include_polarity_strength)
|
429
|
-
|
426
|
+
|
427
|
+
|
428
|
+
add_opinions_to_knaf(triples, knaf_obj,text_for_tid,ids_used, map_to_terms=False,include_polarity_strength=include_polarity_strength)
|
429
|
+
|
430
430
|
#Adding linguistic processor
|
431
431
|
my_lp = Clp()
|
432
432
|
my_lp.set_name(__desc)
|
433
433
|
my_lp.set_version(__last_edited+'_'+__version)
|
434
434
|
if timestamp:
|
435
|
-
|
435
|
+
my_lp.set_timestamp() ##Set to the current date and time
|
436
436
|
else:
|
437
|
-
|
437
|
+
my_lp.set_timestamp('*')
|
438
438
|
knaf_obj.add_linguistic_processor('opinions',my_lp)
|
439
439
|
knaf_obj.dump(output_file_stream)
|
440
|
-
|
441
|
-
|
440
|
+
|
441
|
+
|
442
442
|
|
443
443
|
def obtain_predefined_model(lang,domain,just_show=False):
|
444
444
|
#This function will read the models from the file models.cfg and will return
|
@@ -451,7 +451,7 @@ def obtain_predefined_model(lang,domain,just_show=False):
|
|
451
451
|
print '#'*25
|
452
452
|
print 'Models available'
|
453
453
|
print '#'*25
|
454
|
-
|
454
|
+
|
455
455
|
nm = 0
|
456
456
|
for line in fic:
|
457
457
|
if line[0]!='#':
|
@@ -471,15 +471,15 @@ def obtain_predefined_model(lang,domain,just_show=False):
|
|
471
471
|
if just_show:
|
472
472
|
print '#'*25
|
473
473
|
return use_this_model
|
474
|
-
|
474
|
+
|
475
475
|
if __name__ == '__main__':
|
476
|
-
|
476
|
+
|
477
477
|
argument_parser = argparse.ArgumentParser(description='Detect opinion triples in a KAF/NAF file')
|
478
478
|
group = argument_parser.add_mutually_exclusive_group(required=True)
|
479
479
|
group.add_argument('-m',dest='model_folder',help='Folder storing the trained models')
|
480
480
|
group.add_argument('-d', dest='domain',help='The domain where the models were trained')
|
481
481
|
group.add_argument('-show-models', dest='show_models', action='store_true',help='Show the models available and finish')
|
482
|
-
|
482
|
+
|
483
483
|
argument_parser.add_argument('-keep-opinions',dest='keep_opinions',action='store_true',help='Keep the opinions from the input (by default will be deleted)')
|
484
484
|
argument_parser.add_argument('-no-time',dest='timestamp',action='store_false',help='No include time in timestamp (for testing)')
|
485
485
|
arguments = argument_parser.parse_args()
|
@@ -487,7 +487,7 @@ if __name__ == '__main__':
|
|
487
487
|
if arguments.show_models:
|
488
488
|
obtain_predefined_model(None,None,just_show=True)
|
489
489
|
sys.exit(0)
|
490
|
-
|
490
|
+
|
491
491
|
knaf_obj = KafNafParser(sys.stdin)
|
492
492
|
model_folder = None
|
493
493
|
if arguments.model_folder is not None:
|
@@ -496,12 +496,12 @@ if __name__ == '__main__':
|
|
496
496
|
#Obtain the language
|
497
497
|
lang = knaf_obj.get_language()
|
498
498
|
model_folder = obtain_predefined_model(lang,arguments.domain)
|
499
|
-
|
500
|
-
|
499
|
+
|
500
|
+
|
501
501
|
tag_file_with_opinions(None, sys.stdout,model_folder,kaf_obj=knaf_obj,remove_existing_opinions=(not arguments.keep_opinions),timestamp=arguments.timestamp)
|
502
502
|
sys.exit(0)
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
503
|
+
|
504
|
+
|
505
|
+
|
506
|
+
|
507
|
+
|