opener-opinion-detector-base 2.0.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/core/python-scripts/README.md +78 -3
- data/core/python-scripts/classify_kaf_naf_file.py +94 -94
- data/core/python-scripts/models.cfg +1 -0
- data/core/python-scripts/scripts/config_manager.py +3 -0
- data/core/python-scripts/scripts/extract_features.py +0 -3
- data/core/python-scripts/scripts/relation_classifier.py +1 -1
- data/core/vendor/src/crfsuite/crfsuite.sln +42 -42
- data/core/vendor/src/liblbfgs/lbfgs.sln +26 -26
- data/ext/hack/Rakefile +5 -2
- data/lib/opener/opinion_detectors/base.rb +19 -15
- data/lib/opener/opinion_detectors/base/version.rb +1 -1
- data/lib/opener/opinion_detectors/configuration_creator.rb +6 -8
- data/lib/opener/opinion_detectors/de.rb +1 -1
- data/lib/opener/opinion_detectors/es.rb +7 -0
- data/lib/opener/opinion_detectors/fr.rb +7 -0
- data/opener-opinion-detector-base.gemspec +0 -1
- data/pre_install_requirements.txt +3 -0
- metadata +41 -85
- data/core/packages/KafNafParser-1.4.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +0 -10
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +0 -22
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +0 -1
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +0 -47
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +0 -1
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +0 -390
- data/core/site-packages/pre_build/KafNafParser/__init__.py +0 -14
- data/core/site-packages/pre_build/KafNafParser/constituency_data.py +0 -125
- data/core/site-packages/pre_build/KafNafParser/coreference_data.py +0 -52
- data/core/site-packages/pre_build/KafNafParser/dependency_data.py +0 -78
- data/core/site-packages/pre_build/KafNafParser/entity_data.py +0 -59
- data/core/site-packages/pre_build/KafNafParser/external_references_data.py +0 -41
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +0 -2
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +0 -205
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +0 -309
- data/core/site-packages/pre_build/KafNafParser/features_data.py +0 -131
- data/core/site-packages/pre_build/KafNafParser/header_data.py +0 -127
- data/core/site-packages/pre_build/KafNafParser/opinion_data.py +0 -211
- data/core/site-packages/pre_build/KafNafParser/references_data.py +0 -23
- data/core/site-packages/pre_build/KafNafParser/span_data.py +0 -63
- data/core/site-packages/pre_build/KafNafParser/term_data.py +0 -111
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +0 -42
- data/core/site-packages/pre_build/KafNafParser/text_data.py +0 -99
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +0 -10
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +0 -14
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +0 -1
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +0 -23
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +0 -1
- data/core/site-packages/pre_build/VUA_pylib/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/common/common.py +0 -28
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +0 -156
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +0 -121
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +0 -1
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +0 -72
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +0 -10
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +0 -7
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +0 -1
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +0 -11
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +0 -1
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +0 -165
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +0 -439
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +0 -7
- data/pre_build_requirements.txt +0 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8cab19f98d9ee9c6ae4938a3be0cebb666126e44
|
4
|
+
data.tar.gz: d83ded3deb19fe5b7cead1ba079cb6bf585c9593
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bb3d6a9b3d9d6fd3a3fa496b6a2de948398d250e49540dd89bd24d1efe98e9837e7bcb5dfef3b973c5fb069f4425dca2cf2c01d66bfdb148884d8af20a0c8792
|
7
|
+
data.tar.gz: ac1f7d71ec3160f279b0e2c8954ea1ee5bf29f56630d7e249a16c46c3498a091a511df97f89606d1bbaece012e011aff0bdd30bd50b84643e0993d8f9598f68a
|
@@ -2,14 +2,13 @@
|
|
2
2
|
|
3
3
|
##Introduction##
|
4
4
|
|
5
|
-
|
6
5
|
Opinion miner based on machine learning that can be trained using a list of
|
7
6
|
KAF/NAF files. It is important to notice that the opinion miner module will not call
|
8
7
|
to any external module to obtain features. It will read all the features from the input KAF/NAF file,
|
9
8
|
so you have to make sure that your input file contains all the required information in advance (tokens,
|
10
|
-
terms, polarities, constituents, entitiess, dependencies...)
|
9
|
+
terms, polarities, constituents, entities, dependencies...).
|
11
10
|
|
12
|
-
The task is divided into 2 steps
|
11
|
+
The task is generally divided into 2 steps
|
13
12
|
* Detection of opinion entities (holder, target and expression): using
|
14
13
|
Conditional Random Fields
|
15
14
|
* Opinion entity linking (expression<-target and expression<-holder): using
|
@@ -79,6 +78,82 @@ of CRFsuite and SVMLight. This file will be passed to the main script to detect
|
|
79
78
|
cat my_file.kaf | classify_kaf_naf_file.py your_config_file.cfg
|
80
79
|
````
|
81
80
|
|
81
|
+
There are two basic functionalities:
|
82
|
+
|
83
|
+
* Training: from a corpus of opinion annotated files, induce and learn the models for detecting opinions
|
84
|
+
* Classification: using the previous models, find and extract opinions in new text files.
|
85
|
+
|
86
|
+
We provide models already trained and evaluated on hotel, news, attractions and restaurants domains for all the languages covered
|
87
|
+
by the OpeNER project. Most of the users will just focus on this classification step, using the models that we provide. Some others
|
88
|
+
will need to retrain the system to adapt it to a new domain or language. In the next sections we will introduce these 2 different
|
89
|
+
usages of the opinion miner deluxe.
|
90
|
+
|
91
|
+
##Classification##
|
92
|
+
|
93
|
+
In this case you have the models already trained (either you trained them yourself or got the ones we provide) and you want just to detect
|
94
|
+
the opinions in a new file. The input format of your file needs to be valid KAF format. The script that performs the classification is the script
|
95
|
+
`classify_kaf_naf_file.py`. You can get information about the available parameters by running the script with the parameter -h.
|
96
|
+
```shell
|
97
|
+
classify_kaf_naf_file.py -h
|
98
|
+
usage: classify_kaf_naf_file.py [-h]
|
99
|
+
(-m MODEL_FOLDER | -d DOMAIN | -show-models)
|
100
|
+
[-keep-opinions] [-no-time]
|
101
|
+
|
102
|
+
Detect opinion triples in a KAF/NAF file
|
103
|
+
|
104
|
+
optional arguments:
|
105
|
+
-h, --help show this help message and exit
|
106
|
+
-m MODEL_FOLDER Folder storing the trained models
|
107
|
+
-d DOMAIN The domain where the models were trained
|
108
|
+
-show-models Show the models available and finish
|
109
|
+
-keep-opinions Keep the opinions from the input (by default will be deleted)
|
110
|
+
-no-time No include time in timestamp (for testing)
|
111
|
+
```
|
112
|
+
|
113
|
+
The script reads the input KAF file from the standard input and will write the output KAF into the standard output. The main parameter is the model that
|
114
|
+
will be used. There are two ways of specifying this parameter:
|
115
|
+
* By using the -m FOLDER option, by means of which we can specify that we would like to use exactly the folder stored in the path FOLDER
|
116
|
+
* By using the -d DOMAIN option, where DOMAIN is the domain where the model that we want to use was trained.
|
117
|
+
|
118
|
+
We can get which are the models available by running:
|
119
|
+
```shell
|
120
|
+
classify_kaf_naf_file.py -show-models
|
121
|
+
#########################
|
122
|
+
Models available
|
123
|
+
#########################
|
124
|
+
Model 0
|
125
|
+
Lang: en
|
126
|
+
Domain: hotel
|
127
|
+
Folder: final_models/en/hotel_cfg1
|
128
|
+
Desc: Trained with config1 in the last version of hotel annotations
|
129
|
+
Model 1
|
130
|
+
Lang: en
|
131
|
+
Domain: news
|
132
|
+
Folder: final_models/en/news_cfg1
|
133
|
+
Desc: Trained with config1 using only the sentences annotated with news
|
134
|
+
....
|
135
|
+
....
|
136
|
+
```
|
137
|
+
|
138
|
+
You can train and use as many models as you want. You will need the file `models.cfg` which contains the metadata about which models
|
139
|
+
are available and how to refer to them (the domain). This is an example of the content of this file:
|
140
|
+
```shell
|
141
|
+
#LANG|domain|pathtomodel|description
|
142
|
+
en|hotel|final_models/en/hotel_cfg1|Trained with config1 in the last version of hotel annotations
|
143
|
+
en|news|final_models/en/news_cfg1|Trained with config1 using only the sentences annotated with news
|
144
|
+
nl|hotel|final_models/nl/hotel_cfg1|Trained with config1 in the last version of hotel annotations
|
145
|
+
nl|news|final_models/nl/news_cfg1|Trained with config1 using only the sentences annotated with news
|
146
|
+
```
|
147
|
+
So in each line a model is specified and represented using 4 fields, the language, the domain identifier (which will be used later to refer to this model),
|
148
|
+
the path to the folder and a text with a description. The language for the KAF file will be read directly from the KAF header, and considering this model
|
149
|
+
and the domain id provided to the script, the proper model will be loaded and used.
|
150
|
+
|
151
|
+
So if you want to tag a file with Dutch text called input.nl.kaf with the models trained on hotel reviews, and store the result on the file output.nl.kaf you just
|
152
|
+
should call the program as:
|
153
|
+
```shell
|
154
|
+
cat input.nl.kaf | python classify_kaf_naf_file.py -d hotel > output.nl.kaf
|
155
|
+
```
|
156
|
+
|
82
157
|
##Training your own models##
|
83
158
|
|
84
159
|
You will first need to install all the given requirements and then follow these steps:
|
@@ -8,7 +8,7 @@ this_folder = os.path.dirname(os.path.realpath(__file__))
|
|
8
8
|
|
9
9
|
# This updates the load path to ensure that the local site-packages directory
|
10
10
|
# can be used to load packages (e.g. a locally installed copy of lxml).
|
11
|
-
sys.path.append(os.path.join(this_folder, '../site-packages/
|
11
|
+
sys.path.append(os.path.join(this_folder, '../site-packages/pre_install'))
|
12
12
|
|
13
13
|
import csv
|
14
14
|
from tempfile import NamedTemporaryFile
|
@@ -20,17 +20,16 @@ import argparse
|
|
20
20
|
from scripts import lexicons as lexicons_manager
|
21
21
|
from scripts.config_manager import Cconfig_manager
|
22
22
|
from scripts.extract_features import extract_features_from_kaf_naf_file
|
23
|
-
from scripts.crfutils import extract_features_to_crf
|
23
|
+
from scripts.crfutils import extract_features_to_crf
|
24
24
|
from scripts.link_entities_distance import link_entities_distance
|
25
25
|
from scripts.relation_classifier import link_entities_svm
|
26
|
-
from
|
27
|
-
from VUA_pylib import *
|
26
|
+
from KafNafParserPy import *
|
28
27
|
|
29
28
|
|
30
29
|
DEBUG=0
|
31
30
|
|
32
31
|
my_config_manager = Cconfig_manager()
|
33
|
-
__this_folder = os.
|
32
|
+
__this_folder = os.path.dirname(os.path.realpath(__file__))
|
34
33
|
separator = '\t'
|
35
34
|
__desc = 'Deluxe opinion miner (CRF+SVM)'
|
36
35
|
__last_edited = '10jan2014'
|
@@ -59,7 +58,7 @@ def match_crfsuite_out(crfout,list_token_ids):
|
|
59
58
|
if inside:
|
60
59
|
matches.append((current,current_type))
|
61
60
|
current = []
|
62
|
-
inside = False
|
61
|
+
inside = False
|
63
62
|
else:
|
64
63
|
if line=='O':
|
65
64
|
if inside:
|
@@ -73,8 +72,8 @@ def match_crfsuite_out(crfout,list_token_ids):
|
|
73
72
|
if inside:
|
74
73
|
matches.append((current,current_type))
|
75
74
|
current = [list_token_ids[num_token]]
|
76
|
-
inside = True
|
77
|
-
current_type = value
|
75
|
+
inside = True
|
76
|
+
current_type = value
|
78
77
|
elif my_type == 'I':
|
79
78
|
if inside:
|
80
79
|
current.append(list_token_ids[num_token])
|
@@ -92,42 +91,42 @@ def match_crfsuite_out(crfout,list_token_ids):
|
|
92
91
|
def extract_features(kaf_naf_obj):
|
93
92
|
feat_file_desc = NamedTemporaryFile(delete=False)
|
94
93
|
feat_file_desc.close()
|
95
|
-
|
94
|
+
|
96
95
|
out_file = feat_file_desc.name
|
97
96
|
err_file = out_file+'.log'
|
98
|
-
|
97
|
+
|
99
98
|
expressions_lexicon = None
|
100
99
|
targets_lexicon = None
|
101
100
|
if my_config_manager.get_use_training_lexicons():
|
102
101
|
expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
|
103
102
|
target_lexicon_filename = my_config_manager.get_target_lexicon_filename()
|
104
|
-
|
103
|
+
|
105
104
|
expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
|
106
105
|
targets_lexicon =lexicons_manager.load_lexicon(target_lexicon_filename)
|
107
106
|
|
108
107
|
#def extract_features_from_kaf_naf_file(knaf_obj,out_file=None,log_file=None,include_class=True,accepted_opinions=None, exp_lex= None):
|
109
108
|
labels, separator,polarities_skipped = extract_features_from_kaf_naf_file(kaf_naf_obj,out_file,err_file,include_class=False, exp_lex=expressions_lexicon,tar_lex=targets_lexicon)
|
110
109
|
return out_file, err_file
|
111
|
-
|
112
|
-
|
110
|
+
|
111
|
+
|
113
112
|
def convert_to_crf(input_file,templates):
|
114
113
|
out_desc = NamedTemporaryFile(delete=False)
|
115
114
|
out_desc.close()
|
116
|
-
|
115
|
+
|
117
116
|
out_crf = out_desc.name
|
118
|
-
|
117
|
+
|
119
118
|
##Load description of features
|
120
119
|
path_feat_desc = my_config_manager.get_feature_desc_filename()
|
121
120
|
fic = open(path_feat_desc)
|
122
121
|
fields = fic.read().strip()
|
123
122
|
fic.close()
|
124
123
|
####
|
125
|
-
|
124
|
+
|
126
125
|
extract_features_to_crf(input_file,out_crf,fields,separator,templates,possible_classes=None)
|
127
126
|
return out_crf
|
128
|
-
|
129
|
-
|
130
|
-
|
127
|
+
|
128
|
+
|
129
|
+
|
131
130
|
def run_crfsuite_tag(input_file,model_file):
|
132
131
|
crfsuite = my_config_manager.get_crfsuite_binary()
|
133
132
|
cmd = [crfsuite]
|
@@ -150,8 +149,8 @@ def run_crfsuite_tag(input_file,model_file):
|
|
150
149
|
|
151
150
|
def detect_expressions(tab_feat_file,list_token_ids):
|
152
151
|
#1) Convert to the correct CRF
|
153
|
-
templates = my_config_manager.get_templates_expr()
|
154
|
-
|
152
|
+
templates = my_config_manager.get_templates_expr()
|
153
|
+
|
155
154
|
crf_exp_file = convert_to_crf(tab_feat_file,templates)
|
156
155
|
logging.debug('File with crf format for EXPRESSIONS '+crf_exp_file)
|
157
156
|
if DEBUG:
|
@@ -161,10 +160,10 @@ def detect_expressions(tab_feat_file,list_token_ids):
|
|
161
160
|
print>>sys.stderr,f.read()
|
162
161
|
f.close()
|
163
162
|
print>>sys.stderr,'#'*50
|
164
|
-
|
163
|
+
|
165
164
|
model_file = my_config_manager.get_filename_model_expression()
|
166
165
|
output_crf,error_crf = run_crfsuite_tag(crf_exp_file,model_file)
|
167
|
-
|
166
|
+
|
168
167
|
logging.debug('Expressions crf error: '+error_crf)
|
169
168
|
matches_exp = match_crfsuite_out(output_crf, list_token_ids)
|
170
169
|
if DEBUG:
|
@@ -175,19 +174,19 @@ def detect_expressions(tab_feat_file,list_token_ids):
|
|
175
174
|
print>>sys.stderr,'MATCHES:',str(matches_exp)
|
176
175
|
print>>sys.stderr,'TEMP FILE:',crf_exp_file
|
177
176
|
print>>sys.stderr,'#'*50
|
178
|
-
|
179
|
-
|
177
|
+
|
178
|
+
|
180
179
|
logging.debug('Detector expressions out: '+str(matches_exp))
|
181
180
|
os.remove(crf_exp_file)
|
182
181
|
return matches_exp
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
182
|
+
|
183
|
+
|
184
|
+
|
185
|
+
|
186
|
+
|
188
187
|
def detect_targets(tab_feat_file, list_token_ids):
|
189
188
|
templates_target = my_config_manager.get_templates_target()
|
190
|
-
|
189
|
+
|
191
190
|
crf_target_file = convert_to_crf(tab_feat_file,templates_target)
|
192
191
|
logging.debug('File with crf format for TARGETS '+crf_target_file)
|
193
192
|
if DEBUG:
|
@@ -197,13 +196,13 @@ def detect_targets(tab_feat_file, list_token_ids):
|
|
197
196
|
print>>sys.stderr,f.read()
|
198
197
|
f.close()
|
199
198
|
print>>sys.stderr,'#'*50
|
200
|
-
|
199
|
+
|
201
200
|
model_target_file = my_config_manager.get_filename_model_target()
|
202
201
|
out_crf_target,error_crf = run_crfsuite_tag(crf_target_file, model_target_file)
|
203
202
|
logging.debug('TARGETS crf error: '+error_crf)
|
204
203
|
|
205
204
|
matches_tar = match_crfsuite_out(out_crf_target, list_token_ids)
|
206
|
-
|
205
|
+
|
207
206
|
if DEBUG:
|
208
207
|
print>>sys.stderr,'#'*50
|
209
208
|
print>>sys.stderr,'CRF output for TARGETS'
|
@@ -211,18 +210,18 @@ def detect_targets(tab_feat_file, list_token_ids):
|
|
211
210
|
print>>sys.stderr,'List token ids:',str(list_token_ids)
|
212
211
|
print>>sys.stderr,'MATCHES:',str(matches_tar)
|
213
212
|
print>>sys.stderr,'#'*50
|
214
|
-
|
213
|
+
|
215
214
|
logging.debug('Detector targets out: '+str(matches_tar))
|
216
215
|
os.remove(crf_target_file)
|
217
216
|
return matches_tar
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
|
221
|
+
|
223
222
|
def detect_holders(tab_feat_file, list_token_ids):
|
224
223
|
templates_holder = my_config_manager.get_templates_holder()
|
225
|
-
|
224
|
+
|
226
225
|
crf_holder_file = convert_to_crf(tab_feat_file,templates_holder)
|
227
226
|
logging.debug('File with crf format for HOLDERS '+crf_holder_file)
|
228
227
|
if DEBUG:
|
@@ -232,7 +231,7 @@ def detect_holders(tab_feat_file, list_token_ids):
|
|
232
231
|
print>>sys.stderr,f.read()
|
233
232
|
f.close()
|
234
233
|
print>>sys.stderr,'#'*50
|
235
|
-
|
234
|
+
|
236
235
|
model_holder_file = my_config_manager.get_filename_model_holder()
|
237
236
|
out_crf_holder,error_crf = run_crfsuite_tag(crf_holder_file, model_holder_file)
|
238
237
|
logging.debug('HOLDERS crf error: '+error_crf)
|
@@ -246,12 +245,12 @@ def detect_holders(tab_feat_file, list_token_ids):
|
|
246
245
|
print>>sys.stderr,'List token ids:',str(list_token_ids)
|
247
246
|
print>>sys.stderr,'MATCHES:',str(matches_holder)
|
248
247
|
print>>sys.stderr,'#'*50
|
249
|
-
|
248
|
+
|
250
249
|
logging.debug('Detector HOLDERS out: '+str(matches_holder))
|
251
250
|
os.remove(crf_holder_file)
|
252
251
|
return matches_holder
|
253
|
-
|
254
|
-
|
252
|
+
|
253
|
+
|
255
254
|
|
256
255
|
|
257
256
|
|
@@ -267,19 +266,19 @@ def map_tokens_to_terms(list_tokens,knaf_obj):
|
|
267
266
|
terms_for_token[tokid] = [termid]
|
268
267
|
else:
|
269
268
|
terms_for_token[tokid].append(termid)
|
270
|
-
|
269
|
+
|
271
270
|
ret = set()
|
272
271
|
for my_id in list_tokens:
|
273
272
|
term_ids = terms_for_token[my_id]
|
274
273
|
ret |= set(term_ids)
|
275
274
|
return sorted(list(ret))
|
276
|
-
|
277
|
-
|
278
|
-
|
275
|
+
|
276
|
+
|
277
|
+
|
279
278
|
def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=True,include_polarity_strength=True):
|
280
279
|
num_opinion = 0
|
281
280
|
for type_exp, span_exp, span_tar, span_hol in triples:
|
282
|
-
#Map tokens to terms
|
281
|
+
#Map tokens to terms
|
283
282
|
if map_to_terms:
|
284
283
|
span_exp_terms = map_tokens_to_terms(span_exp,kaf_obj)
|
285
284
|
span_tar_terms = map_tokens_to_terms(span_tar,kaf_obj)
|
@@ -288,16 +287,16 @@ def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=Tr
|
|
288
287
|
span_hol_terms = span_hol
|
289
288
|
span_tar_terms = span_tar
|
290
289
|
span_exp_terms = span_exp
|
291
|
-
|
290
|
+
|
292
291
|
##Creating holder
|
293
292
|
span_hol = Cspan()
|
294
293
|
span_hol.create_from_ids(span_hol_terms)
|
295
294
|
my_hol = Cholder()
|
296
295
|
my_hol.set_span(span_hol)
|
297
|
-
|
296
|
+
|
298
297
|
hol_text = ' '.join(text_for_tid[tid] for tid in span_hol_terms)
|
299
298
|
my_hol.set_comment(hol_text)
|
300
|
-
|
299
|
+
|
301
300
|
#Creating target
|
302
301
|
span_tar = Cspan()
|
303
302
|
span_tar.create_from_ids(span_tar_terms)
|
@@ -318,7 +317,7 @@ def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=Tr
|
|
318
317
|
exp_text = ' '.join(text_for_tid[tid] for tid in span_exp_terms)
|
319
318
|
my_exp.set_comment(exp_text)
|
320
319
|
#########################
|
321
|
-
|
320
|
+
|
322
321
|
#To get the first possible ID not already used
|
323
322
|
new_id = None
|
324
323
|
while True:
|
@@ -332,32 +331,33 @@ def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=Tr
|
|
332
331
|
new_opinion.set_id(new_id)
|
333
332
|
if len(span_hol_terms) != 0: #To avoid empty holders
|
334
333
|
new_opinion.set_holder(my_hol)
|
335
|
-
|
334
|
+
|
336
335
|
if len(span_tar_terms) != 0: #To avoid empty targets
|
337
336
|
new_opinion.set_target(my_tar)
|
338
|
-
|
337
|
+
|
339
338
|
new_opinion.set_expression(my_exp)
|
340
|
-
|
339
|
+
|
341
340
|
knaf_obj.add_opinion(new_opinion)
|
342
|
-
|
341
|
+
|
343
342
|
##
|
344
343
|
# Input_file_stream can be a filename of a stream
|
345
344
|
# Opoutfile_trasm can be a filename of a stream
|
346
345
|
#Config file must be a string filename
|
347
346
|
def tag_file_with_opinions(input_file_stream, output_file_stream,model_folder,kaf_obj=None, remove_existing_opinions=True,include_polarity_strength=True,timestamp=True):
|
347
|
+
|
348
348
|
config_filename = os.path.join(model_folder)
|
349
349
|
if not os.path.exists(config_filename):
|
350
350
|
print>>sys.stderr,'Config file not found on:',config_filename
|
351
351
|
sys.exit(-1)
|
352
|
-
|
352
|
+
|
353
353
|
my_config_manager.set_current_folder(__this_folder)
|
354
354
|
my_config_manager.set_config(config_filename)
|
355
|
-
|
355
|
+
|
356
356
|
if kaf_obj is not None:
|
357
357
|
knaf_obj = kaf_obj
|
358
358
|
else:
|
359
359
|
knaf_obj = KafNafParser(input_file_stream)
|
360
|
-
|
360
|
+
|
361
361
|
#Create a temporary file
|
362
362
|
out_feat_file, err_feat_file = extract_features(knaf_obj)
|
363
363
|
if DEBUG:
|
@@ -367,7 +367,7 @@ def tag_file_with_opinions(input_file_stream, output_file_stream,model_folder,ka
|
|
367
367
|
print>>sys.stderr,f.read()
|
368
368
|
f.close()
|
369
369
|
print>>sys.stderr,'#'*50
|
370
|
-
|
370
|
+
|
371
371
|
#get all the tokens in order
|
372
372
|
list_token_ids = []
|
373
373
|
text_for_wid = {}
|
@@ -378,67 +378,67 @@ def tag_file_with_opinions(input_file_stream, output_file_stream,model_folder,ka
|
|
378
378
|
s_id = token_obj.get_sent()
|
379
379
|
w_id = token_obj.get_id()
|
380
380
|
text_for_wid[w_id] = token
|
381
|
-
|
381
|
+
|
382
382
|
list_token_ids.append(w_id)
|
383
383
|
sentence_for_token[w_id] = s_id
|
384
|
-
|
384
|
+
|
385
385
|
for term in knaf_obj.get_terms():
|
386
386
|
tid = term.get_id()
|
387
387
|
toks = [text_for_wid.get(wid,'') for wid in term.get_span().get_span_ids()]
|
388
388
|
text_for_tid[tid] = ' '.join(toks)
|
389
389
|
|
390
|
-
|
390
|
+
|
391
391
|
expressions = detect_expressions(out_feat_file,list_token_ids)
|
392
392
|
targets = detect_targets(out_feat_file, list_token_ids)
|
393
393
|
holders = detect_holders(out_feat_file, list_token_ids)
|
394
|
-
|
394
|
+
|
395
395
|
os.remove(out_feat_file)
|
396
396
|
os.remove(err_feat_file)
|
397
397
|
|
398
398
|
if DEBUG:
|
399
399
|
print>>sys.stderr,"Expressions detected:"
|
400
400
|
for e in expressions:
|
401
|
-
print>>sys.stderr,'\t',e, ' '.join([text_for_wid[wid] for wid in e[0] ])
|
401
|
+
print>>sys.stderr,'\t',e, ' '.join([text_for_wid[wid] for wid in e[0] ])
|
402
402
|
print>>sys.stderr
|
403
|
-
|
403
|
+
|
404
404
|
print>>sys.stderr,'Targets detected'
|
405
405
|
for t in targets:
|
406
|
-
print>>sys.stderr,'\t',t, ' '.join([text_for_wid[wid] for wid in t[0] ])
|
406
|
+
print>>sys.stderr,'\t',t, ' '.join([text_for_wid[wid] for wid in t[0] ])
|
407
407
|
print>>sys.stderr
|
408
|
-
|
408
|
+
|
409
409
|
print>>sys.stderr,'Holders',holders
|
410
410
|
for h in holders:
|
411
|
-
print>>sys.stderr,'\t',h, ' '.join([text_for_wid[wid] for wid in h[0] ])
|
411
|
+
print>>sys.stderr,'\t',h, ' '.join([text_for_wid[wid] for wid in h[0] ])
|
412
412
|
print>>sys.stderr
|
413
|
-
|
414
|
-
|
413
|
+
|
414
|
+
|
415
415
|
# Entity linker based on distances
|
416
416
|
####triples = link_entities_distance(expressions,targets,holders,sentence_for_token)
|
417
|
-
|
417
|
+
|
418
418
|
triples = link_entities_svm(expressions, targets, holders, knaf_obj, my_config_manager)
|
419
|
-
|
419
|
+
|
420
420
|
ids_used = set()
|
421
421
|
if remove_existing_opinions:
|
422
422
|
knaf_obj.remove_opinion_layer()
|
423
423
|
else:
|
424
424
|
for opi in knaf_obj.get_opinions():
|
425
425
|
ids_used.add(opi.get_id())
|
426
|
-
|
427
|
-
|
428
|
-
add_opinions_to_knaf(triples, knaf_obj,text_for_tid,ids_used, map_to_terms=False,include_polarity_strength=include_polarity_strength)
|
429
|
-
|
426
|
+
|
427
|
+
|
428
|
+
add_opinions_to_knaf(triples, knaf_obj,text_for_tid,ids_used, map_to_terms=False,include_polarity_strength=include_polarity_strength)
|
429
|
+
|
430
430
|
#Adding linguistic processor
|
431
431
|
my_lp = Clp()
|
432
432
|
my_lp.set_name(__desc)
|
433
433
|
my_lp.set_version(__last_edited+'_'+__version)
|
434
434
|
if timestamp:
|
435
|
-
|
435
|
+
my_lp.set_timestamp() ##Set to the current date and time
|
436
436
|
else:
|
437
|
-
|
437
|
+
my_lp.set_timestamp('*')
|
438
438
|
knaf_obj.add_linguistic_processor('opinions',my_lp)
|
439
439
|
knaf_obj.dump(output_file_stream)
|
440
|
-
|
441
|
-
|
440
|
+
|
441
|
+
|
442
442
|
|
443
443
|
def obtain_predefined_model(lang,domain,just_show=False):
|
444
444
|
#This function will read the models from the file models.cfg and will return
|
@@ -451,7 +451,7 @@ def obtain_predefined_model(lang,domain,just_show=False):
|
|
451
451
|
print '#'*25
|
452
452
|
print 'Models available'
|
453
453
|
print '#'*25
|
454
|
-
|
454
|
+
|
455
455
|
nm = 0
|
456
456
|
for line in fic:
|
457
457
|
if line[0]!='#':
|
@@ -471,15 +471,15 @@ def obtain_predefined_model(lang,domain,just_show=False):
|
|
471
471
|
if just_show:
|
472
472
|
print '#'*25
|
473
473
|
return use_this_model
|
474
|
-
|
474
|
+
|
475
475
|
if __name__ == '__main__':
|
476
|
-
|
476
|
+
|
477
477
|
argument_parser = argparse.ArgumentParser(description='Detect opinion triples in a KAF/NAF file')
|
478
478
|
group = argument_parser.add_mutually_exclusive_group(required=True)
|
479
479
|
group.add_argument('-m',dest='model_folder',help='Folder storing the trained models')
|
480
480
|
group.add_argument('-d', dest='domain',help='The domain where the models were trained')
|
481
481
|
group.add_argument('-show-models', dest='show_models', action='store_true',help='Show the models available and finish')
|
482
|
-
|
482
|
+
|
483
483
|
argument_parser.add_argument('-keep-opinions',dest='keep_opinions',action='store_true',help='Keep the opinions from the input (by default will be deleted)')
|
484
484
|
argument_parser.add_argument('-no-time',dest='timestamp',action='store_false',help='No include time in timestamp (for testing)')
|
485
485
|
arguments = argument_parser.parse_args()
|
@@ -487,7 +487,7 @@ if __name__ == '__main__':
|
|
487
487
|
if arguments.show_models:
|
488
488
|
obtain_predefined_model(None,None,just_show=True)
|
489
489
|
sys.exit(0)
|
490
|
-
|
490
|
+
|
491
491
|
knaf_obj = KafNafParser(sys.stdin)
|
492
492
|
model_folder = None
|
493
493
|
if arguments.model_folder is not None:
|
@@ -496,12 +496,12 @@ if __name__ == '__main__':
|
|
496
496
|
#Obtain the language
|
497
497
|
lang = knaf_obj.get_language()
|
498
498
|
model_folder = obtain_predefined_model(lang,arguments.domain)
|
499
|
-
|
500
|
-
|
499
|
+
|
500
|
+
|
501
501
|
tag_file_with_opinions(None, sys.stdout,model_folder,kaf_obj=knaf_obj,remove_existing_opinions=(not arguments.keep_opinions),timestamp=arguments.timestamp)
|
502
502
|
sys.exit(0)
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
503
|
+
|
504
|
+
|
505
|
+
|
506
|
+
|
507
|
+
|