opener-opinion-detector-base 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +101 -0
- data/bin/opinion-detector-base +19 -0
- data/core/annotation.cfg.erb +9 -0
- data/core/packages/KafNafParser-1.4.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
- data/core/python-scripts/LICENSE +339 -0
- data/core/python-scripts/README.md +226 -0
- data/core/python-scripts/classify_kaf_naf_file.py +499 -0
- data/core/python-scripts/cross_validation.py +634 -0
- data/core/python-scripts/generate_folds.py +134 -0
- data/core/python-scripts/models.cfg +10 -0
- data/core/python-scripts/my_templates/README +33 -0
- data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
- data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
- data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
- data/core/python-scripts/my_templates/templates_exp.txt +10 -0
- data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
- data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
- data/core/python-scripts/my_templates/templates_holder.txt +10 -0
- data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
- data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
- data/core/python-scripts/my_templates/templates_target.txt +10 -0
- data/core/python-scripts/run_all_experiments.sh +49 -0
- data/core/python-scripts/run_basic.py +20 -0
- data/core/python-scripts/run_experiment.sh +42 -0
- data/core/python-scripts/scripts/__init__.py +1 -0
- data/core/python-scripts/scripts/config_manager.py +314 -0
- data/core/python-scripts/scripts/crfutils.py +215 -0
- data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
- data/core/python-scripts/scripts/extract_features.py +376 -0
- data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
- data/core/python-scripts/scripts/lexicons.py +44 -0
- data/core/python-scripts/scripts/link_entities_distance.py +77 -0
- data/core/python-scripts/scripts/relation_classifier.py +250 -0
- data/core/python-scripts/train.py +566 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
- data/core/vendor/src/crfsuite/AUTHORS +1 -0
- data/core/vendor/src/crfsuite/COPYING +27 -0
- data/core/vendor/src/crfsuite/ChangeLog +103 -0
- data/core/vendor/src/crfsuite/INSTALL +236 -0
- data/core/vendor/src/crfsuite/Makefile.am +19 -0
- data/core/vendor/src/crfsuite/Makefile.in +783 -0
- data/core/vendor/src/crfsuite/README +183 -0
- data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
- data/core/vendor/src/crfsuite/autogen.sh +38 -0
- data/core/vendor/src/crfsuite/compile +143 -0
- data/core/vendor/src/crfsuite/config.guess +1502 -0
- data/core/vendor/src/crfsuite/config.h.in +198 -0
- data/core/vendor/src/crfsuite/config.sub +1714 -0
- data/core/vendor/src/crfsuite/configure +14273 -0
- data/core/vendor/src/crfsuite/configure.in +149 -0
- data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
- data/core/vendor/src/crfsuite/depcomp +630 -0
- data/core/vendor/src/crfsuite/example/chunking.py +49 -0
- data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
- data/core/vendor/src/crfsuite/example/ner.py +270 -0
- data/core/vendor/src/crfsuite/example/pos.py +78 -0
- data/core/vendor/src/crfsuite/example/template.py +88 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
- data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
- data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
- data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
- data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
- data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
- data/core/vendor/src/crfsuite/frontend/main.c +137 -0
- data/core/vendor/src/crfsuite/frontend/option.c +93 -0
- data/core/vendor/src/crfsuite/frontend/option.h +86 -0
- data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
- data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
- data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
- data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
- data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
- data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
- data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
- data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
- data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
- data/core/vendor/src/crfsuite/include/os.h +61 -0
- data/core/vendor/src/crfsuite/install-sh +520 -0
- data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
- data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
- data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
- data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
- data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
- data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
- data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
- data/core/vendor/src/crfsuite/missing +376 -0
- data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
- data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
- data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
- data/core/vendor/src/crfsuite/swig/export.i +32 -0
- data/core/vendor/src/crfsuite/swig/python/README +92 -0
- data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
- data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
- data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
- data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
- data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
- data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
- data/core/vendor/src/liblbfgs/AUTHORS +1 -0
- data/core/vendor/src/liblbfgs/COPYING +22 -0
- data/core/vendor/src/liblbfgs/ChangeLog +120 -0
- data/core/vendor/src/liblbfgs/INSTALL +231 -0
- data/core/vendor/src/liblbfgs/Makefile.am +10 -0
- data/core/vendor/src/liblbfgs/Makefile.in +638 -0
- data/core/vendor/src/liblbfgs/NEWS +0 -0
- data/core/vendor/src/liblbfgs/README +71 -0
- data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
- data/core/vendor/src/liblbfgs/autogen.sh +38 -0
- data/core/vendor/src/liblbfgs/config.guess +1411 -0
- data/core/vendor/src/liblbfgs/config.h.in +64 -0
- data/core/vendor/src/liblbfgs/config.sub +1500 -0
- data/core/vendor/src/liblbfgs/configure +21146 -0
- data/core/vendor/src/liblbfgs/configure.in +107 -0
- data/core/vendor/src/liblbfgs/depcomp +522 -0
- data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
- data/core/vendor/src/liblbfgs/install-sh +322 -0
- data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
- data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
- data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
- data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
- data/core/vendor/src/liblbfgs/missing +353 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
- data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
- data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
- data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
- data/core/vendor/src/svm_light/LICENSE.txt +59 -0
- data/core/vendor/src/svm_light/Makefile +105 -0
- data/core/vendor/src/svm_light/kernel.h +40 -0
- data/core/vendor/src/svm_light/svm_classify.c +197 -0
- data/core/vendor/src/svm_light/svm_common.c +985 -0
- data/core/vendor/src/svm_light/svm_common.h +301 -0
- data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
- data/core/vendor/src/svm_light/svm_learn.c +4147 -0
- data/core/vendor/src/svm_light/svm_learn.h +169 -0
- data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
- data/core/vendor/src/svm_light/svm_loqo.c +211 -0
- data/ext/hack/Rakefile +17 -0
- data/ext/hack/support.rb +88 -0
- data/lib/opener/opinion_detectors/base.rb +112 -0
- data/lib/opener/opinion_detectors/base/version.rb +7 -0
- data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
- data/lib/opener/opinion_detectors/de.rb +7 -0
- data/lib/opener/opinion_detectors/en.rb +7 -0
- data/lib/opener/opinion_detectors/it.rb +7 -0
- data/lib/opener/opinion_detectors/nl.rb +6 -0
- data/opener-opinion-detector-base.gemspec +35 -0
- data/pre_build_requirements.txt +3 -0
- metadata +374 -0
@@ -0,0 +1,376 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
import sys
|
4
|
+
import codecs
|
5
|
+
import csv
|
6
|
+
import os
|
7
|
+
from operator import itemgetter
|
8
|
+
|
9
|
+
#from VUA_pylib.lexicon import MPQA_subjectivity_lexicon
|
10
|
+
|
11
|
+
|
12
|
+
def get_first_term_id(token_data,term_data,this_ids):
    """Return the term id of ``this_ids`` that starts earliest in the text.

    token_data -- dict: token id --> tuple whose item [2] is the token position
    term_data  -- dict: term id --> tuple whose item [2] is the list of token
                  ids spanned by the term
    this_ids   -- term ids to choose from

    Terms that cannot be positioned (id missing from term_data, empty span, or
    span tokens missing from token_data) are ignored instead of aborting the
    whole extraction. If no term can be positioned the first id of this_ids is
    returned. Ties are resolved in favour of the id listed first.

    Raises IndexError when this_ids is empty.
    """
    candidates = list(this_ids)
    first_tid = None
    first_pos = None
    for tid in candidates:
        term_info = term_data.get(tid)
        if term_info is None:
            continue
        positions = [token_data[tok_id][2] for tok_id in term_info[2] if tok_id in token_data]
        if not positions:
            continue
        start_pos = min(positions)
        # Strict comparison keeps the first listed id when two terms start at the same token
        if first_pos is None or start_pos < first_pos:
            first_tid = tid
            first_pos = start_pos

    if first_pos is None:
        # Nothing could be positioned: fall back to the order given by the caller
        return candidates[0]
    return first_tid
|
20
|
+
|
21
|
+
|
22
|
+
def get_mapping_from_lexicon(token_ids,lexicon):
    """Map token ids to the lexicon value of the expression that covers them.

    token_ids -- iterable of (token_text, token_id) pairs in text order
    lexicon   -- dict: expression (one or more blank-separated tokens) --> value

    The tokens are joined with single blanks and every lexicon expression is
    searched as a whole-token match (it must be surrounded by blanks).
    Returns a dict: token id --> value. When matches overlap, the match that
    covers fewer tokens wins and the overlapping longer match is dropped
    entirely.
    """
    # Index: character offset --> token id. Offsets count the characters of the
    # tokens joined by one blank, WITHOUT the leading blank of 'text'. Hence the
    # offset at which ' expression ' is found in 'text' (its leading blank) is
    # also the index offset of the first character of the expression.
    offset_to_tid = {}
    offset = 0
    pieces = [' ']
    for token, tid in token_ids:
        for _ in token:
            offset_to_tid[offset] = tid
            offset += 1
        pieces.append(token)
        pieces.append(' ')
        offset += 1   # the blank that follows every token
    text = ''.join(pieces)

    all_extracted = []  # List of (set_of_token_ids, value)
    for substring, polarity in lexicon.items():
        if not substring:
            # An empty expression covers no token, and searching for it would never
            # advance past a double blank (produced by an empty token): endless loop
            continue
        needle = ' '+substring+' '
        search_from = 0
        while True:
            start = text.find(needle,search_from)
            if start == -1:
                break
            end = start + len(substring)
            # Resume on the last character of the match so that its trailing blank can
            # be the leading blank of the next occurrence
            search_from = end
            ids = set(offset_to_tid[pos] for pos in range(start,end) if pos in offset_to_tid)
            if ids:
                all_extracted.append((ids,polarity))

    final_selected = {}
    # Shortest matches first: if w15 has been selected, (w14,w15,w16) will not be selected later
    for ids, polarity in sorted(all_extracted, key=lambda match: len(match[0])):
        if any(this_id in final_selected for this_id in ids):
            continue
        for this_id in ids:
            final_selected[this_id] = polarity
    return final_selected
|
62
|
+
|
63
|
+
|
64
|
+
def load_propagation_lexicon(propagation_lex_filename):
    """Create a lexicon (dict) lemma --> polarity from a propagated lexicon file.

    The file is read as UTF-8, one entry per line, fields separated by ';'.
    The polarity is taken from the 3rd field and the lemma from the 5th one.
    A later line for the same lemma overwrites an earlier one.

    If the file does not exist a message is written to stderr and an empty
    dict is returned. Blank lines are skipped; non-blank lines with fewer than
    5 fields are skipped with a warning on stderr.
    """
    propagated_lexicon = {}
    if not os.path.exists(propagation_lex_filename):
        sys.stderr.write('The propagated lexicon on %s does not exist\n' % propagation_lex_filename)
        return propagated_lexicon

    # Read bytes and decode each line explicitly: same result under Python 2 and 3,
    # and the 'with' block closes the file even if a line cannot be decoded
    with open(propagation_lex_filename,'rb') as fic:
        for num_line, raw_line in enumerate(fic, 1):
            line = raw_line.decode('utf-8').rstrip()
            if not line:
                continue
            tokens = line.split(';')
            if len(tokens) < 5:
                sys.stderr.write('Skipping malformed line %d of %s\n' % (num_line, propagation_lex_filename))
                continue
            lemma = tokens[4]
            polarity = tokens[2]
            propagated_lexicon[lemma] = polarity
    return propagated_lexicon
|
78
|
+
|
79
|
+
|
80
|
+
|
81
|
+
def extract_features_from_kaf_naf_file(knaf_obj,out_file=None,log_file=None,include_class=True,accepted_opinions=None, exp_lex= None, tar_lex=None, propagation_lex_filename=None):
    """Write one tab-separated feature row per token of a KAF/NAF document.

    knaf_obj                 -- parsed document; must offer get_filename, get_tokens,
                                get_terms, get_entities, get_properties, get_opinions
                                and get_constituency_extractor (presumably a
                                KafNafParser object -- verify against the caller)
    out_file                 -- if given, rows are APPENDED to this file (sys.stdout is
                                redirected while the function runs); otherwise rows go
                                to the current sys.stdout
    log_file                 -- if given, a UTF-8 log is written to this file (overwritten)
    include_class            -- if True the opinion layer is read and used to build the
                                B-/I- class of every term; otherwise every class is 'O'
    accepted_opinions        -- optional dict: expression polarity --> label to use.
                                Opinions whose polarity is not a key are skipped
    exp_lex, tar_lex         -- optional lexicons (expression --> value) matched against
                                the token text with get_mapping_from_lexicon
    propagation_lex_filename -- optional file loaded with load_propagation_lexicon

    Rows of different sentences are separated by an empty line and an empty
    line is always written at the end. Tokens not linked to a known term
    produce no row.

    Returns (labels, separator, polarities_found_and_skipped): the column names
    (an empty list if no row was written), the column separator and the list of
    polarities of the opinions skipped because of accepted_opinions.
    """
    labels = []
    polarities_found_and_skipped = []
    separator = '\t'
    restore_out = None
    log_desc = None

    try:
        if log_file is not None:
            log_desc = codecs.open(log_file, 'w', encoding='UTF-8')
        # Every write to the log is guarded by this flag: there is no log_desc without log_file
        log_on = log_desc is not None

        if out_file is not None:
            redirected_out = open(out_file,'a')
            restore_out = sys.stdout
            sys.stdout = redirected_out

        if log_on:
            print>>log_desc,'Extracting features from ',knaf_obj.get_filename()

        ###########################
        ## EXTRACTING TOKENS #######
        token_data = {}   ## token_data['w_1'] = ('house','s_1',position)
        tokens_in_order = []
        tokens_ids = []
        for num_token, token_obj in enumerate(knaf_obj.get_tokens()):
            token = token_obj.get_text()
            s_id = token_obj.get_sent()
            w_id = token_obj.get_id()
            tokens_ids.append((token,w_id))
            token_data[w_id] = (token,s_id,num_token)
            tokens_in_order.append(w_id)
        if log_on:
            print>>log_desc,' Number of tokens: ',len(tokens_in_order)
        ###########################

        #Lexicons from the training data
        mapping_wid_polarity = {}
        if exp_lex is not None:
            mapping_wid_polarity = get_mapping_from_lexicon(tokens_ids,exp_lex)

        mapping_wid_aspect = {}
        if tar_lex is not None:
            mapping_wid_aspect = get_mapping_from_lexicon(tokens_ids, tar_lex)

        propagated_lex = {}
        if propagation_lex_filename is not None:
            #Lexicon of [lemma] ==> polarity
            propagated_lex = load_propagation_lexicon(propagation_lex_filename)

        ###########################
        ## EXTRACTING TERMS #######
        term_data = {}   #(term_lemma,term_pos,term_span,polarity)
        term_for_token = {}
        for term_obj in knaf_obj.get_terms():
            term_id = term_obj.get_id()
            term_lemma = term_obj.get_lemma()
            term_pos = term_obj.get_morphofeat()
            # if there is no morphofeat feature, we try to get the pos from the 'pos' attrib
            if term_pos is None:
                term_pos = term_obj.get_pos()
            if term_pos is not None:
                term_pos = term_pos.split(' ')[0]   # Only the first blank-separated chunk of the pos string
            else:
                term_pos = 'unknown'

            term_span = term_obj.get_span().get_span_ids()

            # Polarity of the term, or its sentiment modifier if there is no polarity, or '-'
            sentiment = term_obj.get_sentiment()
            polarity = None
            if sentiment is not None:
                polarity = sentiment.get_polarity()
                if polarity is None:
                    polarity = sentiment.get_modifier()
            if polarity is None:
                polarity = '-'

            term_data[term_id] = (term_lemma,term_pos,term_span,polarity)
            for tok_id in term_span:
                term_for_token[tok_id] = term_id

        if log_on:
            print>>log_desc,' Number of terms loaded: '+str(len(term_data))
        ###########################

        ###########################
        # EXTRACTING ENTITIES FOR EACH TERM
        ###########################
        entity_for_term = {}
        for ent_obj in knaf_obj.get_entities():
            ent_type = ent_obj.get_type()
            for reference_obj in ent_obj.get_references():
                for span_obj in reference_obj:
                    for t_id in span_obj.get_span_ids():
                        entity_for_term[t_id] = ent_type
        if log_on:
            print>>log_desc,'Entities:'+str(entity_for_term)

        ###########################
        # EXTRACTING PROPERTIES FOR EACH TERM
        ###########################
        property_for_term = {}
        for prop_obj in knaf_obj.get_properties():
            prop_type = prop_obj.get_type()
            for reference_obj in prop_obj.get_references():
                for span_obj in reference_obj:
                    for t_id in span_obj.get_span_ids():
                        property_for_term[t_id] = prop_type
        if log_on:
            print>>log_desc,'Properties:'+str(property_for_term)

        ###########################
        # EXTRACTING CLASS FOR EACH TERM
        ###########################
        class_for_term_id = {}
        if include_class:
            for opinion in knaf_obj.get_opinions():
                opinion_id = opinion.get_id()

                ## opinion expression
                opinion_exp = opinion.get_expression()
                exp_type = ''
                if opinion_exp is not None:
                    exp_type = opinion_exp.get_polarity()
                exp_ids = _opinion_span_ids(opinion_exp)
                hol_ids = _opinion_span_ids(opinion.get_holder())
                tar_ids = _opinion_span_ids(opinion.get_target())

                ############################
                if accepted_opinions is not None:
                    if exp_type not in accepted_opinions:
                        # This opinion wont be considered
                        polarities_found_and_skipped.append(exp_type)
                        continue
                    #Get the mapping label
                    mapped_type = accepted_opinions[exp_type]
                else:
                    mapped_type = exp_type

                if log_on:
                    print>>log_desc,' Opinion',opinion_id
                    print>>log_desc,' Expression:'
                    print>>log_desc,' ids:',exp_ids
                    print>>log_desc,' terms:',[term_data[i][0] for i in exp_ids]
                _tag_bio_span(class_for_term_id,token_data,term_data,exp_ids,mapped_type)

                if log_on:
                    print>>log_desc,' Target:'
                    print>>log_desc,' ids:',tar_ids
                    print>>log_desc,' terms:',[term_data[i][0] for i in tar_ids]
                _tag_bio_span(class_for_term_id,token_data,term_data,tar_ids,'target')

                if log_on:
                    print>>log_desc,' Holder:'
                    print>>log_desc,' ids:',hol_ids
                    print>>log_desc,' terms:',[term_data[i][0] for i in hol_ids]
                _tag_bio_span(class_for_term_id,token_data,term_data,hol_ids,'holder')
        ##############

        ## WRITE TO THE OUTPUT
        # (the features from the MPQA subjectivity lexicon are disabled in this version)
        prev_sent = None
        for token_id in tokens_in_order:
            token,sentence_id,_ = token_data[token_id]

            #This is required for wrong KAF files that contain missing terms (tokens not linked with terms)
            term_id = term_for_token.get(token_id,None)
            if term_id is None:
                continue
            data = term_data.get(term_id,None)
            if data is None:
                continue

            term_lemma,term_pos,term_span,polarity = data
            entity = entity_for_term.get(term_id,'-')
            prop_value = property_for_term.get(term_id,'-')
            this_class = class_for_term_id.get(term_id,'O')

            ## Constituency features
            constituency_extractor = knaf_obj.get_constituency_extractor()
            feature_phrase = 'XXX'
            if constituency_extractor is not None:
                this_phrase, subsumed_together = constituency_extractor.get_deepest_phrase_for_termid(term_id)
                if this_phrase is not None:
                    feature_phrase = this_phrase
            ######################

            ### Expression from the domain lexicon
            polarity_from_domain = mapping_wid_polarity.get(token_id,'-')

            ## Polarity from the propagated lexicon
            polarity_from_propagation = propagated_lex.get(term_lemma,'-')

            ## Target from the training lexicon
            aspect_from_domain = mapping_wid_aspect.get(token_id,'-')

            ##############################################################################################
            ## FEATURE GENERATION!!!!
            ##############################################################################################
            labels = ['sentence_id','token_id','token','lemma', 'pos', 'term_id', 'pol/mod', 'poldomain', 'aspect_training']
            features = [ sentence_id, token_id, token, term_lemma, term_pos, term_id, polarity ,polarity_from_domain,aspect_from_domain]

            labels.extend(['entity','property','phrase_type','propagation_polarity','y'])
            features.extend([entity,prop_value,feature_phrase,polarity_from_propagation,this_class])
            ##############################################################################################

            if prev_sent is not None and sentence_id != prev_sent: print>>sys.stdout   #breakline
            print>>sys.stdout,(separator.join(features)).encode('utf-8')

            prev_sent=sentence_id
        print>>sys.stdout   #Last breakline required for crfsuite

        if log_on:
            print>>log_desc
    finally:
        ## Restoring, also when the extraction fails half way
        try:
            if restore_out is not None:
                # Put the real stdout back BEFORE closing, so a failing close cannot leave it redirected
                redirected_out, sys.stdout = sys.stdout, restore_out
                redirected_out.close()
        finally:
            if log_desc is not None:
                log_desc.close()

    return labels, separator, polarities_found_and_skipped


def _opinion_span_ids(opinion_part):
    """Return the term ids of an opinion component, or [] if the component or its span is missing."""
    if opinion_part is None:
        return []
    span = opinion_part.get_span()
    if span is None:
        return []
    return span.get_span_ids()


def _tag_bio_span(class_for_term_id,token_data,term_data,span_ids,label):
    """Store 'B-'+label for the first term (text order) of span_ids and 'I-'+label for the others."""
    if len(span_ids) == 0:
        return
    first_term_id = get_first_term_id(token_data,term_data,span_ids)
    for t_id in span_ids:
        prefix = 'B-' if t_id == first_term_id else 'I-'
        class_for_term_id[t_id] = prefix+label
|
376
|
+
|
@@ -0,0 +1,105 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
# Separator of field values.
separator = '\t'

# Field names of the input data: one name per column, separated by blanks.
# From file extract_feats_from_kaf
# print sentence_id+'\t'+token_id+'\t'+token+'\t'+term_id+'\t'+lemma+'\t'+pos+'\t'+entity_for_token+'\t'+property_for_token+'\t'+class_for_token

#9 wop140 competitor t141 competitor NN negative O

# Layouts used in other experiments, kept for reference. The last of them used
# to be assigned and immediately overwritten by the active layout below.
#fields = 'sentence_id token_id tok term_id lem pos pol train_pol y'
#fields = 'sentence_id token_id tok term_id lem pos pol y'
#fields = 'sentence_id token_id tok term_id lem pos polope polmpqa poltra y'
fields = 'sentence_id token_id tok term_id lem pos polmpqa y'
|
16
|
+
# Attribute templates.
|
17
|
+
|
18
|
+
# Active attribute templates: one single-attribute template per field and
# offset, for the previous (-1), current (0) and next (+1) token.
# Wider windows (offsets up to -4/+4) and the 'poltra'/'polope' fields were
# also tried; they are disabled.
templates = tuple(
    ((field_name, offset),)
    for offset in (-1, 0, 1)
    for field_name in ('tok', 'lem', 'pos', 'polmpqa')
)
|
35
|
+
|
36
|
+
|
37
|
+
# Alternative template set: single-attribute templates over a -1..+1 window,
# including the 'train_pol' and 'pol' fields.
templates1234 = tuple(
    ((field_name, offset),)
    for offset in (-1, 0, 1)
    for field_name in ('tok', 'pos', 'lem', 'train_pol', 'pol')
)
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
# Default template set: single-attribute templates over a -1..+1 window,
# followed by the pairs (previous, current) and (current, next) of every field.
templates_default = tuple(
    ((field_name, offset),)
    for offset in (-1, 0, 1)
    for field_name in ('tok', 'pos', 'lem', 'pol')
) + tuple(
    ((field_name, left), (field_name, left + 1))
    for left in (-1, 0)
    for field_name in ('tok', 'pos', 'lem', 'pol')
)
|
52
|
+
|
53
|
+
|
54
|
+
# Wide-window template set: tok/lem/pol/train_pol for the offsets -4..+5
# (the -5 row is disabled), plus 'pos' for the current token only.
templates2222 = tuple(
    ((field_name, offset),)
    for offset in range(-4, 6)
    for field_name in (
        ('tok', 'lem', 'pol', 'pos', 'train_pol') if offset == 0
        else ('tok', 'lem', 'pol', 'train_pol')
    )
)
|
67
|
+
|
68
|
+
# Wide-window template set: tok/lem/pol for the offsets -5..+5, plus 'pos'
# for the offsets -2..+2.
templates22 = tuple(
    ((field_name, offset),)
    for offset in range(-5, 6)
    for field_name in (
        ('tok', 'lem', 'pol', 'pos') if -2 <= offset <= 2
        else ('tok', 'lem', 'pol')
    )
)
|
81
|
+
|
82
|
+
import crfutils
|
83
|
+
|
84
|
+
def feature_extractor(X):
    """Fill the features of the sequence X in place.

    X is a sequence of items, each a mapping whose 'F' entry is a list of
    features. The module level ``templates`` are applied with
    crfutils.apply_templates and then, unless X is empty, '__BOS__' is added
    to the first item and '__EOS__' to the last one.
    """
    crfutils.apply_templates(X, templates)
    if not X:
        return

    # Begin/end of sequence markers are not produced by the templates
    first_item, last_item = X[0], X[-1]
    first_item['F'].append('__BOS__')
    last_item['F'].append('__EOS__')
|
91
|
+
|
92
|
+
|
93
|
+
|
94
|
+
def extract_features(inputfile,outputfile):
    """Run crfutils.main with feature_extractor, reading inputfile and writing outputfile.

    inputfile is opened for reading and outputfile is created or overwritten.
    The rows are split with the module level ``separator`` and named with
    ``fields``. Both files are closed even if the conversion fails.
    """
    with open(inputfile,'r') as fi:
        with open(outputfile,'w') as fo:
            crfutils.main(feature_extractor,fields=fields,sep=separator,fi=fi,fo=fo)
|
100
|
+
|
101
|
+
|
102
|
+
if __name__ == '__main__':
    # Script mode: no files are given, crfutils.main picks its own input and
    # output (presumably stdin/stdout -- confirm in crfutils)
    crfutils.main(
        feature_extractor,
        fields=fields,
        sep=separator
    )
|
104
|
+
|
105
|
+
|