opener-opinion-detector-base 2.0.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/core/python-scripts/README.md +78 -3
  3. data/core/python-scripts/classify_kaf_naf_file.py +94 -94
  4. data/core/python-scripts/models.cfg +1 -0
  5. data/core/python-scripts/scripts/config_manager.py +3 -0
  6. data/core/python-scripts/scripts/extract_features.py +0 -3
  7. data/core/python-scripts/scripts/relation_classifier.py +1 -1
  8. data/core/vendor/src/crfsuite/crfsuite.sln +42 -42
  9. data/core/vendor/src/liblbfgs/lbfgs.sln +26 -26
  10. data/ext/hack/Rakefile +5 -2
  11. data/lib/opener/opinion_detectors/base.rb +19 -15
  12. data/lib/opener/opinion_detectors/base/version.rb +1 -1
  13. data/lib/opener/opinion_detectors/configuration_creator.rb +6 -8
  14. data/lib/opener/opinion_detectors/de.rb +1 -1
  15. data/lib/opener/opinion_detectors/es.rb +7 -0
  16. data/lib/opener/opinion_detectors/fr.rb +7 -0
  17. data/opener-opinion-detector-base.gemspec +0 -1
  18. data/pre_install_requirements.txt +3 -0
  19. metadata +41 -85
  20. data/core/packages/KafNafParser-1.4.tar.gz +0 -0
  21. data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
  22. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +0 -10
  23. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +0 -22
  24. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +0 -1
  25. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +0 -47
  26. data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +0 -1
  27. data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +0 -390
  28. data/core/site-packages/pre_build/KafNafParser/__init__.py +0 -14
  29. data/core/site-packages/pre_build/KafNafParser/constituency_data.py +0 -125
  30. data/core/site-packages/pre_build/KafNafParser/coreference_data.py +0 -52
  31. data/core/site-packages/pre_build/KafNafParser/dependency_data.py +0 -78
  32. data/core/site-packages/pre_build/KafNafParser/entity_data.py +0 -59
  33. data/core/site-packages/pre_build/KafNafParser/external_references_data.py +0 -41
  34. data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +0 -2
  35. data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +0 -205
  36. data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +0 -309
  37. data/core/site-packages/pre_build/KafNafParser/features_data.py +0 -131
  38. data/core/site-packages/pre_build/KafNafParser/header_data.py +0 -127
  39. data/core/site-packages/pre_build/KafNafParser/opinion_data.py +0 -211
  40. data/core/site-packages/pre_build/KafNafParser/references_data.py +0 -23
  41. data/core/site-packages/pre_build/KafNafParser/span_data.py +0 -63
  42. data/core/site-packages/pre_build/KafNafParser/term_data.py +0 -111
  43. data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +0 -42
  44. data/core/site-packages/pre_build/KafNafParser/text_data.py +0 -99
  45. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +0 -10
  46. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +0 -14
  47. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +0 -1
  48. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +0 -23
  49. data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +0 -1
  50. data/core/site-packages/pre_build/VUA_pylib/__init__.py +0 -1
  51. data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +0 -1
  52. data/core/site-packages/pre_build/VUA_pylib/common/common.py +0 -28
  53. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +0 -1
  54. data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +0 -156
  55. data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +0 -1
  56. data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +0 -121
  57. data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +0 -1
  58. data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +0 -72
  59. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +0 -10
  60. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +0 -7
  61. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +0 -1
  62. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +0 -11
  63. data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +0 -1
  64. data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +0 -165
  65. data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +0 -439
  66. data/core/site-packages/pre_build/VUKafParserPy/__init__.py +0 -7
  67. data/pre_build_requirements.txt +0 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9c8aef27fcd7c10ed7176b0a73bec253436c9efb
4
- data.tar.gz: 161e62461eded780c02c261e4f980f78ba0577e6
3
+ metadata.gz: 8cab19f98d9ee9c6ae4938a3be0cebb666126e44
4
+ data.tar.gz: d83ded3deb19fe5b7cead1ba079cb6bf585c9593
5
5
  SHA512:
6
- metadata.gz: 5723e548b534b9a743646de5e08adb3d59715b77bd931f381082ca3ee8924db9ed433f7655af6a9d10cef402350c9f60c4ef89afec5c2b87c90e6504a5cdb01d
7
- data.tar.gz: 3f9e5386c7fb1400d4204c232e1498b44af8958ef576cd51441b25f707f65cc331785a3b6bff86b8e36f0e854feef817b270cd0d76727eb4a95a7346d40ca0dc
6
+ metadata.gz: bb3d6a9b3d9d6fd3a3fa496b6a2de948398d250e49540dd89bd24d1efe98e9837e7bcb5dfef3b973c5fb069f4425dca2cf2c01d66bfdb148884d8af20a0c8792
7
+ data.tar.gz: ac1f7d71ec3160f279b0e2c8954ea1ee5bf29f56630d7e249a16c46c3498a091a511df97f89606d1bbaece012e011aff0bdd30bd50b84643e0993d8f9598f68a
@@ -2,14 +2,13 @@
2
2
 
3
3
  ##Introduction##
4
4
 
5
-
6
5
  Opinion miner based on machine learning that can be trained using a list of
7
6
  KAF/NAF files. It is important to notice that the opinion miner module will not call
8
7
  to any external module to obtain features. It will read all the features from the input KAF/NAF file,
9
8
  so you have to make sure that your input file contains all the required information in advance (tokens,
10
- terms, polarities, constituents, entitiess, dependencies...)
9
+ terms, polarities, constituents, entitiess, dependencies...).
11
10
 
12
- The task is divided into 2 steps
11
+ The task is generally divided into 2 steps
13
12
  * Detection of opinion entities (holder, target and expression): using
14
13
  Conditional Random Fields
15
14
  * Opinion entity linking (expression<-target and expression-<holder): using
@@ -79,6 +78,82 @@ of CRFsuite and SVMLight. This file will be passed to the main script to detect
79
78
  cat my_file.kaf | classify_kaf_naf_file.py your_config_file.cfg
80
79
  ````
81
80
 
81
+ There are two basic functionalities:
82
+
83
+ * Training: from a corpus of opinion annotated files, induce and learn the models for detecting opinions
84
+ * Classification: using the previous models, find and extract opinions in new text files.
85
+
86
+ We provide models already trained and evaluated on hotel, news, attractions and restaurants domains for all the languages covered
87
+ by the OpeNER project. Most of the users will just focus on this classification step, using the models that we provide. Some others
88
+ will need to retrain the system to adapt it to a new domain or language. In the next sections we will introduce these 2 different
89
+ usages of the opinion miner deluxe
90
+
91
+ ##Classification##
92
+
93
+ In this case you have the models already trained (either you trained them yourself or got the ones we provide) and you want just to detect
94
+ the opinions in a new file. The input format of your file needs to be valid KAF format. The script that performs the classification is the script
95
+ `classify_kaf_naf_file.py`. You can get information about the available parameters by running the script with the parameter -h.
96
+ ```shell
97
+ classify_kaf_naf_file.py -h
98
+ usage: classify_kaf_naf_file.py [-h]
99
+ (-m MODEL_FOLDER | -d DOMAIN | -show-models)
100
+ [-keep-opinions] [-no-time]
101
+
102
+ Detect opinion triples in a KAF/NAF file
103
+
104
+ optional arguments:
105
+ -h, --help show this help message and exit
106
+ -m MODEL_FOLDER Folder storing the trained models
107
+ -d DOMAIN The domain where the models were trained
108
+ -show-models Show the models available and finish
109
+ -keep-opinions Keep the opinions from the input (by default will be deleted)
110
+ -no-time No include time in timestamp (for testing)
111
+ ```
112
+
113
+ The script reads the input KAF file from the standard input and will write the output KAF into the standard output. The main parameter is the model that
114
+ will be used. There are two ways of specifying this parameter:
115
+ * By using the -m FOLDER option, by means of which we can specify that we would like to use exactly the folder stored in the path FOLDER
116
+ * By using the -d DOMAIN option, where DOMAIN is the domain where the model that we want to use was trained.
117
+
118
+ We can get which are the models available by running:
119
+ ```shell
120
+ classify_kaf_naf_file.py -show-models
121
+ #########################
122
+ Models available
123
+ #########################
124
+ Model 0
125
+ Lang: en
126
+ Domain: hotel
127
+ Folder: final_models/en/hotel_cfg1
128
+ Desc: Trained with config1 in the last version of hotel annotations
129
+ Model 1
130
+ Lang: en
131
+ Domain: news
132
+ Folder: final_models/en/news_cfg1
133
+ Desc: Trained with config1 using only the sentences annotated with news
134
+ ....
135
+ ....
136
+ ```
137
+
138
+ You can train and use as many models as you want. You will need the file `models.cfg` which contains the metadata about which models
139
+ are available and how to refer to them (the domain). This is an example of the content of this file:
140
+ ```shell
141
+ #LANG|domain|pathtomodel|description
142
+ en|hotel|final_models/en/hotel_cfg1|Trained with config1 in the last version of hotel annotations
143
+ en|news|final_models/en/news_cfg1|Trained with config1 using only the sentences annotated with news
144
+ nl|hotel|final_models/nl/hotel_cfg1|Trained with config1 in the last version of hotel annotations
145
+ nl|news|final_models/nl/news_cfg1|Trained with config1 using only the sentences annotated with news
146
+ ```
147
+ So in each line a model is specified and represented using 4 fields, the language, the domain identifier (which will be used later to refer to this model),
148
+ the path to the folder and a text with a description. The language for the KAF file will be read directly from the KAF header, and considering this model
149
+ and the domain id provided to the script, the proper model will be loaded and used.
150
+
151
+ So if you want to tag a file with Dutch text called input.nl.kaf with the models trained on hotel reviews, and store the result on the file output.nl.kaf you just
152
+ should call the program as:
153
+ ```shell
154
+ cat input.nl.kaf | python classify_kaf_naf_file.py -d hotel > output.nl.kaf
155
+ ```
156
+
82
157
  ##Training your own models##
83
158
 
84
159
  You will need first to install all the requirements given and then follow these steps:
@@ -8,7 +8,7 @@ this_folder = os.path.dirname(os.path.realpath(__file__))
8
8
 
9
9
  # This updates the load path to ensure that the local site-packages directory
10
10
  # can be used to load packages (e.g. a locally installed copy of lxml).
11
- sys.path.append(os.path.join(this_folder, '../site-packages/pre_build'))
11
+ sys.path.append(os.path.join(this_folder, '../site-packages/pre_install'))
12
12
 
13
13
  import csv
14
14
  from tempfile import NamedTemporaryFile
@@ -20,17 +20,16 @@ import argparse
20
20
  from scripts import lexicons as lexicons_manager
21
21
  from scripts.config_manager import Cconfig_manager
22
22
  from scripts.extract_features import extract_features_from_kaf_naf_file
23
- from scripts.crfutils import extract_features_to_crf
23
+ from scripts.crfutils import extract_features_to_crf
24
24
  from scripts.link_entities_distance import link_entities_distance
25
25
  from scripts.relation_classifier import link_entities_svm
26
- from KafNafParser import *
27
- from VUA_pylib import *
26
+ from KafNafParserPy import *
28
27
 
29
28
 
30
29
  DEBUG=0
31
30
 
32
31
  my_config_manager = Cconfig_manager()
33
- __this_folder = os.getcwd()
32
+ __this_folder = os.path.dirname(os.path.realpath(__file__))
34
33
  separator = '\t'
35
34
  __desc = 'Deluxe opinion miner (CRF+SVM)'
36
35
  __last_edited = '10jan2014'
@@ -59,7 +58,7 @@ def match_crfsuite_out(crfout,list_token_ids):
59
58
  if inside:
60
59
  matches.append((current,current_type))
61
60
  current = []
62
- inside = False
61
+ inside = False
63
62
  else:
64
63
  if line=='O':
65
64
  if inside:
@@ -73,8 +72,8 @@ def match_crfsuite_out(crfout,list_token_ids):
73
72
  if inside:
74
73
  matches.append((current,current_type))
75
74
  current = [list_token_ids[num_token]]
76
- inside = True
77
- current_type = value
75
+ inside = True
76
+ current_type = value
78
77
  elif my_type == 'I':
79
78
  if inside:
80
79
  current.append(list_token_ids[num_token])
@@ -92,42 +91,42 @@ def match_crfsuite_out(crfout,list_token_ids):
92
91
  def extract_features(kaf_naf_obj):
93
92
  feat_file_desc = NamedTemporaryFile(delete=False)
94
93
  feat_file_desc.close()
95
-
94
+
96
95
  out_file = feat_file_desc.name
97
96
  err_file = out_file+'.log'
98
-
97
+
99
98
  expressions_lexicon = None
100
99
  targets_lexicon = None
101
100
  if my_config_manager.get_use_training_lexicons():
102
101
  expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
103
102
  target_lexicon_filename = my_config_manager.get_target_lexicon_filename()
104
-
103
+
105
104
  expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
106
105
  targets_lexicon =lexicons_manager.load_lexicon(target_lexicon_filename)
107
106
 
108
107
  #def extract_features_from_kaf_naf_file(knaf_obj,out_file=None,log_file=None,include_class=True,accepted_opinions=None, exp_lex= None):
109
108
  labels, separator,polarities_skipped = extract_features_from_kaf_naf_file(kaf_naf_obj,out_file,err_file,include_class=False, exp_lex=expressions_lexicon,tar_lex=targets_lexicon)
110
109
  return out_file, err_file
111
-
112
-
110
+
111
+
113
112
  def convert_to_crf(input_file,templates):
114
113
  out_desc = NamedTemporaryFile(delete=False)
115
114
  out_desc.close()
116
-
115
+
117
116
  out_crf = out_desc.name
118
-
117
+
119
118
  ##Load description of features
120
119
  path_feat_desc = my_config_manager.get_feature_desc_filename()
121
120
  fic = open(path_feat_desc)
122
121
  fields = fic.read().strip()
123
122
  fic.close()
124
123
  ####
125
-
124
+
126
125
  extract_features_to_crf(input_file,out_crf,fields,separator,templates,possible_classes=None)
127
126
  return out_crf
128
-
129
-
130
-
127
+
128
+
129
+
131
130
  def run_crfsuite_tag(input_file,model_file):
132
131
  crfsuite = my_config_manager.get_crfsuite_binary()
133
132
  cmd = [crfsuite]
@@ -150,8 +149,8 @@ def run_crfsuite_tag(input_file,model_file):
150
149
 
151
150
  def detect_expressions(tab_feat_file,list_token_ids):
152
151
  #1) Convert to the correct CRF
153
- templates = my_config_manager.get_templates_expr()
154
-
152
+ templates = my_config_manager.get_templates_expr()
153
+
155
154
  crf_exp_file = convert_to_crf(tab_feat_file,templates)
156
155
  logging.debug('File with crf format for EXPRESSIONS '+crf_exp_file)
157
156
  if DEBUG:
@@ -161,10 +160,10 @@ def detect_expressions(tab_feat_file,list_token_ids):
161
160
  print>>sys.stderr,f.read()
162
161
  f.close()
163
162
  print>>sys.stderr,'#'*50
164
-
163
+
165
164
  model_file = my_config_manager.get_filename_model_expression()
166
165
  output_crf,error_crf = run_crfsuite_tag(crf_exp_file,model_file)
167
-
166
+
168
167
  logging.debug('Expressions crf error: '+error_crf)
169
168
  matches_exp = match_crfsuite_out(output_crf, list_token_ids)
170
169
  if DEBUG:
@@ -175,19 +174,19 @@ def detect_expressions(tab_feat_file,list_token_ids):
175
174
  print>>sys.stderr,'MATCHES:',str(matches_exp)
176
175
  print>>sys.stderr,'TEMP FILE:',crf_exp_file
177
176
  print>>sys.stderr,'#'*50
178
-
179
-
177
+
178
+
180
179
  logging.debug('Detector expressions out: '+str(matches_exp))
181
180
  os.remove(crf_exp_file)
182
181
  return matches_exp
183
-
184
-
185
-
186
-
187
-
182
+
183
+
184
+
185
+
186
+
188
187
  def detect_targets(tab_feat_file, list_token_ids):
189
188
  templates_target = my_config_manager.get_templates_target()
190
-
189
+
191
190
  crf_target_file = convert_to_crf(tab_feat_file,templates_target)
192
191
  logging.debug('File with crf format for TARGETS '+crf_target_file)
193
192
  if DEBUG:
@@ -197,13 +196,13 @@ def detect_targets(tab_feat_file, list_token_ids):
197
196
  print>>sys.stderr,f.read()
198
197
  f.close()
199
198
  print>>sys.stderr,'#'*50
200
-
199
+
201
200
  model_target_file = my_config_manager.get_filename_model_target()
202
201
  out_crf_target,error_crf = run_crfsuite_tag(crf_target_file, model_target_file)
203
202
  logging.debug('TARGETS crf error: '+error_crf)
204
203
 
205
204
  matches_tar = match_crfsuite_out(out_crf_target, list_token_ids)
206
-
205
+
207
206
  if DEBUG:
208
207
  print>>sys.stderr,'#'*50
209
208
  print>>sys.stderr,'CRF output for TARGETS'
@@ -211,18 +210,18 @@ def detect_targets(tab_feat_file, list_token_ids):
211
210
  print>>sys.stderr,'List token ids:',str(list_token_ids)
212
211
  print>>sys.stderr,'MATCHES:',str(matches_tar)
213
212
  print>>sys.stderr,'#'*50
214
-
213
+
215
214
  logging.debug('Detector targets out: '+str(matches_tar))
216
215
  os.remove(crf_target_file)
217
216
  return matches_tar
218
-
219
-
220
-
221
-
222
-
217
+
218
+
219
+
220
+
221
+
223
222
  def detect_holders(tab_feat_file, list_token_ids):
224
223
  templates_holder = my_config_manager.get_templates_holder()
225
-
224
+
226
225
  crf_holder_file = convert_to_crf(tab_feat_file,templates_holder)
227
226
  logging.debug('File with crf format for HOLDERS '+crf_holder_file)
228
227
  if DEBUG:
@@ -232,7 +231,7 @@ def detect_holders(tab_feat_file, list_token_ids):
232
231
  print>>sys.stderr,f.read()
233
232
  f.close()
234
233
  print>>sys.stderr,'#'*50
235
-
234
+
236
235
  model_holder_file = my_config_manager.get_filename_model_holder()
237
236
  out_crf_holder,error_crf = run_crfsuite_tag(crf_holder_file, model_holder_file)
238
237
  logging.debug('HOLDERS crf error: '+error_crf)
@@ -246,12 +245,12 @@ def detect_holders(tab_feat_file, list_token_ids):
246
245
  print>>sys.stderr,'List token ids:',str(list_token_ids)
247
246
  print>>sys.stderr,'MATCHES:',str(matches_holder)
248
247
  print>>sys.stderr,'#'*50
249
-
248
+
250
249
  logging.debug('Detector HOLDERS out: '+str(matches_holder))
251
250
  os.remove(crf_holder_file)
252
251
  return matches_holder
253
-
254
-
252
+
253
+
255
254
 
256
255
 
257
256
 
@@ -267,19 +266,19 @@ def map_tokens_to_terms(list_tokens,knaf_obj):
267
266
  terms_for_token[tokid] = [termid]
268
267
  else:
269
268
  terms_for_token[tokid].append(termid)
270
-
269
+
271
270
  ret = set()
272
271
  for my_id in list_tokens:
273
272
  term_ids = terms_for_token[my_id]
274
273
  ret |= set(term_ids)
275
274
  return sorted(list(ret))
276
-
277
-
278
-
275
+
276
+
277
+
279
278
  def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=True,include_polarity_strength=True):
280
279
  num_opinion = 0
281
280
  for type_exp, span_exp, span_tar, span_hol in triples:
282
- #Map tokens to terms
281
+ #Map tokens to terms
283
282
  if map_to_terms:
284
283
  span_exp_terms = map_tokens_to_terms(span_exp,kaf_obj)
285
284
  span_tar_terms = map_tokens_to_terms(span_tar,kaf_obj)
@@ -288,16 +287,16 @@ def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=Tr
288
287
  span_hol_terms = span_hol
289
288
  span_tar_terms = span_tar
290
289
  span_exp_terms = span_exp
291
-
290
+
292
291
  ##Creating holder
293
292
  span_hol = Cspan()
294
293
  span_hol.create_from_ids(span_hol_terms)
295
294
  my_hol = Cholder()
296
295
  my_hol.set_span(span_hol)
297
-
296
+
298
297
  hol_text = ' '.join(text_for_tid[tid] for tid in span_hol_terms)
299
298
  my_hol.set_comment(hol_text)
300
-
299
+
301
300
  #Creating target
302
301
  span_tar = Cspan()
303
302
  span_tar.create_from_ids(span_tar_terms)
@@ -318,7 +317,7 @@ def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=Tr
318
317
  exp_text = ' '.join(text_for_tid[tid] for tid in span_exp_terms)
319
318
  my_exp.set_comment(exp_text)
320
319
  #########################
321
-
320
+
322
321
  #To get the first possible ID not already used
323
322
  new_id = None
324
323
  while True:
@@ -332,32 +331,33 @@ def add_opinions_to_knaf(triples,knaf_obj,text_for_tid,ids_used, map_to_terms=Tr
332
331
  new_opinion.set_id(new_id)
333
332
  if len(span_hol_terms) != 0: #To avoid empty holders
334
333
  new_opinion.set_holder(my_hol)
335
-
334
+
336
335
  if len(span_tar_terms) != 0: #To avoid empty targets
337
336
  new_opinion.set_target(my_tar)
338
-
337
+
339
338
  new_opinion.set_expression(my_exp)
340
-
339
+
341
340
  knaf_obj.add_opinion(new_opinion)
342
-
341
+
343
342
  ##
344
343
  # Input_file_stream can be a filename of a stream
345
344
  # Opoutfile_trasm can be a filename of a stream
346
345
  #Config file must be a string filename
347
346
  def tag_file_with_opinions(input_file_stream, output_file_stream,model_folder,kaf_obj=None, remove_existing_opinions=True,include_polarity_strength=True,timestamp=True):
347
+
348
348
  config_filename = os.path.join(model_folder)
349
349
  if not os.path.exists(config_filename):
350
350
  print>>sys.stderr,'Config file not found on:',config_filename
351
351
  sys.exit(-1)
352
-
352
+
353
353
  my_config_manager.set_current_folder(__this_folder)
354
354
  my_config_manager.set_config(config_filename)
355
-
355
+
356
356
  if kaf_obj is not None:
357
357
  knaf_obj = kaf_obj
358
358
  else:
359
359
  knaf_obj = KafNafParser(input_file_stream)
360
-
360
+
361
361
  #Create a temporary file
362
362
  out_feat_file, err_feat_file = extract_features(knaf_obj)
363
363
  if DEBUG:
@@ -367,7 +367,7 @@ def tag_file_with_opinions(input_file_stream, output_file_stream,model_folder,ka
367
367
  print>>sys.stderr,f.read()
368
368
  f.close()
369
369
  print>>sys.stderr,'#'*50
370
-
370
+
371
371
  #get all the tokens in order
372
372
  list_token_ids = []
373
373
  text_for_wid = {}
@@ -378,67 +378,67 @@ def tag_file_with_opinions(input_file_stream, output_file_stream,model_folder,ka
378
378
  s_id = token_obj.get_sent()
379
379
  w_id = token_obj.get_id()
380
380
  text_for_wid[w_id] = token
381
-
381
+
382
382
  list_token_ids.append(w_id)
383
383
  sentence_for_token[w_id] = s_id
384
-
384
+
385
385
  for term in knaf_obj.get_terms():
386
386
  tid = term.get_id()
387
387
  toks = [text_for_wid.get(wid,'') for wid in term.get_span().get_span_ids()]
388
388
  text_for_tid[tid] = ' '.join(toks)
389
389
 
390
-
390
+
391
391
  expressions = detect_expressions(out_feat_file,list_token_ids)
392
392
  targets = detect_targets(out_feat_file, list_token_ids)
393
393
  holders = detect_holders(out_feat_file, list_token_ids)
394
-
394
+
395
395
  os.remove(out_feat_file)
396
396
  os.remove(err_feat_file)
397
397
 
398
398
  if DEBUG:
399
399
  print>>sys.stderr,"Expressions detected:"
400
400
  for e in expressions:
401
- print>>sys.stderr,'\t',e, ' '.join([text_for_wid[wid] for wid in e[0] ])
401
+ print>>sys.stderr,'\t',e, ' '.join([text_for_wid[wid] for wid in e[0] ])
402
402
  print>>sys.stderr
403
-
403
+
404
404
  print>>sys.stderr,'Targets detected'
405
405
  for t in targets:
406
- print>>sys.stderr,'\t',t, ' '.join([text_for_wid[wid] for wid in t[0] ])
406
+ print>>sys.stderr,'\t',t, ' '.join([text_for_wid[wid] for wid in t[0] ])
407
407
  print>>sys.stderr
408
-
408
+
409
409
  print>>sys.stderr,'Holders',holders
410
410
  for h in holders:
411
- print>>sys.stderr,'\t',h, ' '.join([text_for_wid[wid] for wid in h[0] ])
411
+ print>>sys.stderr,'\t',h, ' '.join([text_for_wid[wid] for wid in h[0] ])
412
412
  print>>sys.stderr
413
-
414
-
413
+
414
+
415
415
  # Entity linker based on distances
416
416
  ####triples = link_entities_distance(expressions,targets,holders,sentence_for_token)
417
-
417
+
418
418
  triples = link_entities_svm(expressions, targets, holders, knaf_obj, my_config_manager)
419
-
419
+
420
420
  ids_used = set()
421
421
  if remove_existing_opinions:
422
422
  knaf_obj.remove_opinion_layer()
423
423
  else:
424
424
  for opi in knaf_obj.get_opinions():
425
425
  ids_used.add(opi.get_id())
426
-
427
-
428
- add_opinions_to_knaf(triples, knaf_obj,text_for_tid,ids_used, map_to_terms=False,include_polarity_strength=include_polarity_strength)
429
-
426
+
427
+
428
+ add_opinions_to_knaf(triples, knaf_obj,text_for_tid,ids_used, map_to_terms=False,include_polarity_strength=include_polarity_strength)
429
+
430
430
  #Adding linguistic processor
431
431
  my_lp = Clp()
432
432
  my_lp.set_name(__desc)
433
433
  my_lp.set_version(__last_edited+'_'+__version)
434
434
  if timestamp:
435
- my_lp.set_timestamp() ##Set to the current date and time
435
+ my_lp.set_timestamp() ##Set to the current date and time
436
436
  else:
437
- my_lp.set_timestamp('*')
437
+ my_lp.set_timestamp('*')
438
438
  knaf_obj.add_linguistic_processor('opinions',my_lp)
439
439
  knaf_obj.dump(output_file_stream)
440
-
441
-
440
+
441
+
442
442
 
443
443
  def obtain_predefined_model(lang,domain,just_show=False):
444
444
  #This function will read the models from the file models.cfg and will return
@@ -451,7 +451,7 @@ def obtain_predefined_model(lang,domain,just_show=False):
451
451
  print '#'*25
452
452
  print 'Models available'
453
453
  print '#'*25
454
-
454
+
455
455
  nm = 0
456
456
  for line in fic:
457
457
  if line[0]!='#':
@@ -471,15 +471,15 @@ def obtain_predefined_model(lang,domain,just_show=False):
471
471
  if just_show:
472
472
  print '#'*25
473
473
  return use_this_model
474
-
474
+
475
475
  if __name__ == '__main__':
476
-
476
+
477
477
  argument_parser = argparse.ArgumentParser(description='Detect opinion triples in a KAF/NAF file')
478
478
  group = argument_parser.add_mutually_exclusive_group(required=True)
479
479
  group.add_argument('-m',dest='model_folder',help='Folder storing the trained models')
480
480
  group.add_argument('-d', dest='domain',help='The domain where the models were trained')
481
481
  group.add_argument('-show-models', dest='show_models', action='store_true',help='Show the models available and finish')
482
-
482
+
483
483
  argument_parser.add_argument('-keep-opinions',dest='keep_opinions',action='store_true',help='Keep the opinions from the input (by default will be deleted)')
484
484
  argument_parser.add_argument('-no-time',dest='timestamp',action='store_false',help='No include time in timestamp (for testing)')
485
485
  arguments = argument_parser.parse_args()
@@ -487,7 +487,7 @@ if __name__ == '__main__':
487
487
  if arguments.show_models:
488
488
  obtain_predefined_model(None,None,just_show=True)
489
489
  sys.exit(0)
490
-
490
+
491
491
  knaf_obj = KafNafParser(sys.stdin)
492
492
  model_folder = None
493
493
  if arguments.model_folder is not None:
@@ -496,12 +496,12 @@ if __name__ == '__main__':
496
496
  #Obtain the language
497
497
  lang = knaf_obj.get_language()
498
498
  model_folder = obtain_predefined_model(lang,arguments.domain)
499
-
500
-
499
+
500
+
501
501
  tag_file_with_opinions(None, sys.stdout,model_folder,kaf_obj=knaf_obj,remove_existing_opinions=(not arguments.keep_opinions),timestamp=arguments.timestamp)
502
502
  sys.exit(0)
503
-
504
-
505
-
506
-
507
-
503
+
504
+
505
+
506
+
507
+