opener-opinion-detector-base 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +101 -0
- data/bin/opinion-detector-base +19 -0
- data/core/annotation.cfg.erb +9 -0
- data/core/packages/KafNafParser-1.4.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
- data/core/python-scripts/LICENSE +339 -0
- data/core/python-scripts/README.md +226 -0
- data/core/python-scripts/classify_kaf_naf_file.py +499 -0
- data/core/python-scripts/cross_validation.py +634 -0
- data/core/python-scripts/generate_folds.py +134 -0
- data/core/python-scripts/models.cfg +10 -0
- data/core/python-scripts/my_templates/README +33 -0
- data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
- data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
- data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
- data/core/python-scripts/my_templates/templates_exp.txt +10 -0
- data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
- data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
- data/core/python-scripts/my_templates/templates_holder.txt +10 -0
- data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
- data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
- data/core/python-scripts/my_templates/templates_target.txt +10 -0
- data/core/python-scripts/run_all_experiments.sh +49 -0
- data/core/python-scripts/run_basic.py +20 -0
- data/core/python-scripts/run_experiment.sh +42 -0
- data/core/python-scripts/scripts/__init__.py +1 -0
- data/core/python-scripts/scripts/config_manager.py +314 -0
- data/core/python-scripts/scripts/crfutils.py +215 -0
- data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
- data/core/python-scripts/scripts/extract_features.py +376 -0
- data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
- data/core/python-scripts/scripts/lexicons.py +44 -0
- data/core/python-scripts/scripts/link_entities_distance.py +77 -0
- data/core/python-scripts/scripts/relation_classifier.py +250 -0
- data/core/python-scripts/train.py +566 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
- data/core/vendor/src/crfsuite/AUTHORS +1 -0
- data/core/vendor/src/crfsuite/COPYING +27 -0
- data/core/vendor/src/crfsuite/ChangeLog +103 -0
- data/core/vendor/src/crfsuite/INSTALL +236 -0
- data/core/vendor/src/crfsuite/Makefile.am +19 -0
- data/core/vendor/src/crfsuite/Makefile.in +783 -0
- data/core/vendor/src/crfsuite/README +183 -0
- data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
- data/core/vendor/src/crfsuite/autogen.sh +38 -0
- data/core/vendor/src/crfsuite/compile +143 -0
- data/core/vendor/src/crfsuite/config.guess +1502 -0
- data/core/vendor/src/crfsuite/config.h.in +198 -0
- data/core/vendor/src/crfsuite/config.sub +1714 -0
- data/core/vendor/src/crfsuite/configure +14273 -0
- data/core/vendor/src/crfsuite/configure.in +149 -0
- data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
- data/core/vendor/src/crfsuite/depcomp +630 -0
- data/core/vendor/src/crfsuite/example/chunking.py +49 -0
- data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
- data/core/vendor/src/crfsuite/example/ner.py +270 -0
- data/core/vendor/src/crfsuite/example/pos.py +78 -0
- data/core/vendor/src/crfsuite/example/template.py +88 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
- data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
- data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
- data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
- data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
- data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
- data/core/vendor/src/crfsuite/frontend/main.c +137 -0
- data/core/vendor/src/crfsuite/frontend/option.c +93 -0
- data/core/vendor/src/crfsuite/frontend/option.h +86 -0
- data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
- data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
- data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
- data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
- data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
- data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
- data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
- data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
- data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
- data/core/vendor/src/crfsuite/include/os.h +61 -0
- data/core/vendor/src/crfsuite/install-sh +520 -0
- data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
- data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
- data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
- data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
- data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
- data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
- data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
- data/core/vendor/src/crfsuite/missing +376 -0
- data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
- data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
- data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
- data/core/vendor/src/crfsuite/swig/export.i +32 -0
- data/core/vendor/src/crfsuite/swig/python/README +92 -0
- data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
- data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
- data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
- data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
- data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
- data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
- data/core/vendor/src/liblbfgs/AUTHORS +1 -0
- data/core/vendor/src/liblbfgs/COPYING +22 -0
- data/core/vendor/src/liblbfgs/ChangeLog +120 -0
- data/core/vendor/src/liblbfgs/INSTALL +231 -0
- data/core/vendor/src/liblbfgs/Makefile.am +10 -0
- data/core/vendor/src/liblbfgs/Makefile.in +638 -0
- data/core/vendor/src/liblbfgs/NEWS +0 -0
- data/core/vendor/src/liblbfgs/README +71 -0
- data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
- data/core/vendor/src/liblbfgs/autogen.sh +38 -0
- data/core/vendor/src/liblbfgs/config.guess +1411 -0
- data/core/vendor/src/liblbfgs/config.h.in +64 -0
- data/core/vendor/src/liblbfgs/config.sub +1500 -0
- data/core/vendor/src/liblbfgs/configure +21146 -0
- data/core/vendor/src/liblbfgs/configure.in +107 -0
- data/core/vendor/src/liblbfgs/depcomp +522 -0
- data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
- data/core/vendor/src/liblbfgs/install-sh +322 -0
- data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
- data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
- data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
- data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
- data/core/vendor/src/liblbfgs/missing +353 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
- data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
- data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
- data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
- data/core/vendor/src/svm_light/LICENSE.txt +59 -0
- data/core/vendor/src/svm_light/Makefile +105 -0
- data/core/vendor/src/svm_light/kernel.h +40 -0
- data/core/vendor/src/svm_light/svm_classify.c +197 -0
- data/core/vendor/src/svm_light/svm_common.c +985 -0
- data/core/vendor/src/svm_light/svm_common.h +301 -0
- data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
- data/core/vendor/src/svm_light/svm_learn.c +4147 -0
- data/core/vendor/src/svm_light/svm_learn.h +169 -0
- data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
- data/core/vendor/src/svm_light/svm_loqo.c +211 -0
- data/ext/hack/Rakefile +17 -0
- data/ext/hack/support.rb +88 -0
- data/lib/opener/opinion_detectors/base.rb +112 -0
- data/lib/opener/opinion_detectors/base/version.rb +7 -0
- data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
- data/lib/opener/opinion_detectors/de.rb +7 -0
- data/lib/opener/opinion_detectors/en.rb +7 -0
- data/lib/opener/opinion_detectors/it.rb +7 -0
- data/lib/opener/opinion_detectors/nl.rb +6 -0
- data/opener-opinion-detector-base.gemspec +35 -0
- data/pre_build_requirements.txt +3 -0
- metadata +374 -0
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
import sys
|
4
|
+
import os
|
5
|
+
import csv
|
6
|
+
from subprocess import Popen,PIPE
|
7
|
+
|
8
|
+
def create_lexicons(path_to_script, training_file,exp_filename, tar_filename):
    """Run the external lexicon-generation script and wait for it to finish.

    The script is started as::

        python <path_to_script> -exp_csv <exp_filename> -tar_csv <tar_filename> -l <training_file>

    Its standard output and standard error are written to ``log.out`` and
    ``log.err`` in the folder that contains *exp_filename*. When the process
    ends, a one-line summary with the folder and the exit code is written to
    stderr. Nothing is returned.
    """
    cmd = [
        'python', path_to_script,
        '-exp_csv', exp_filename,
        '-tar_csv', tar_filename,
        '-l', training_file,
    ]
    folder = os.path.dirname(exp_filename)

    # The context managers guarantee both log files are closed even when the
    # process cannot be started.
    with open(os.path.join(folder, 'log.out'), 'wb') as log_out, \
            open(os.path.join(folder, 'log.err'), 'wb') as log_err:
        # Pass the argument list directly instead of a joined shell string so
        # that paths containing spaces or shell metacharacters are handed to
        # the script unchanged.
        lexicon_generator = Popen(cmd, stdout=log_out, stderr=log_err)
        ret_code = lexicon_generator.wait()

    sys.stderr.write(' Lexicons created, on %s  ret code: %s\n' % (folder, ret_code))
|
27
|
+
|
28
|
+
|
29
|
+
def load_lexicon(lexicon_filename):
    """Load the domain lexicon from a ';'-separated CSV file.

    The first row is treated as a header and skipped. Only the first column
    of every other row is used: it has the form ``text#type`` and is split at
    the LAST '#', so the text itself may contain '#'.

    Differences from the previous implementation:
      * the file is always closed;
      * blank rows are skipped instead of raising ValueError;
      * an entry without any '#' is stored whole with an empty type (before,
        its last character was silently dropped from the key).
        NOTE(review): confirm that an empty type is the desired fallback.

    Returns a dict mapping text -> type, both as unicode strings.
    """
    # The Python 2 csv module wants a binary file, the Python 3 one a text file.
    if sys.version_info[0] >= 3:
        fd = open(lexicon_filename, 'r', encoding='utf-8', newline='')
    else:
        fd = open(lexicon_filename, 'rb')

    my_lexicon = {}
    with fd:
        lex_reader = csv.reader(fd, delimiter=';')
        next(lex_reader, None)  # skip the header row
        for row in lex_reader:
            if not row:
                continue
            text_type = row[0]
            if isinstance(text_type, bytes):
                # Python 2: csv yields byte strings, decode them once here.
                text_type = text_type.decode('utf-8')
            text, separator, my_type = text_type.rpartition('#')
            if not separator:
                text, my_type = text_type, ''
            my_lexicon[text] = my_type
    return my_lexicon
|
@@ -0,0 +1,77 @@
|
|
1
|
+
#####
|
2
|
+
import sys
|
3
|
+
import logging
|
4
|
+
from operator import itemgetter
|
5
|
+
|
6
|
+
|
7
|
+
def get_min(l):
    """Return the minimum position from a list of token ids.

    The position of an id is the integer formed by all of its digit
    characters, in order (for instance 'w12' -> 12).

    Returns None when the list is empty. Raises ValueError when an id
    contains no digit at all.
    """
    positions = [int(''.join(c for c in ele if c.isdigit())) for ele in l]
    if not positions:
        return None
    return min(positions)
|
17
|
+
|
18
|
+
#Returns the maximum position from a list of token ids
|
19
|
+
def get_max(l):
    """Return the maximum position from a list of token ids.

    The position of an id is the integer formed by all of its digit
    characters, in order (for instance 'w12' -> 12).

    Returns -1 when the list is empty. Raises ValueError when an id
    contains no digit at all.
    """
    positions = [int(''.join(c for c in ele if c.isdigit())) for ele in l]
    if not positions:
        return -1
    return max(positions)
|
29
|
+
|
30
|
+
|
31
|
+
## Gets the distance in number of tokens between two lists of ids
|
32
|
+
def get_distance(list1, list2):
    """Return the gap, in token positions, between two lists of token ids.

    Each list is reduced to its [min, max] position range with get_min and
    get_max. When one range lies entirely before the other, the result is
    the difference between the start of the later range and the end of the
    earlier one; otherwise (the ranges touch or overlap) it is 0.
    """
    first_1, last_1 = get_min(list1), get_max(list1)
    first_2, last_2 = get_min(list2), get_max(list2)

    if last_1 < first_2:
        # list1 ends before list2 starts
        return first_2 - last_1
    if last_2 < first_1:
        # list2 ends before list1 starts
        return first_1 - last_2
    return 0
|
45
|
+
|
46
|
+
def _closest_span(exp_ids, sentence_exp, candidates, sentence_for_token, weight_crossing_sentence):
    """Return the ids of the candidate closest to the expression, or [] if there are no candidates."""
    best_ids = []
    best_distance = None
    for cand_ids, _label in candidates:
        sentence_cand = int(sentence_for_token[cand_ids[0]])
        distance = get_distance(exp_ids, cand_ids)
        distance += weight_crossing_sentence * abs(sentence_exp - sentence_cand)
        # Strict '<' keeps the first candidate on ties, as the former
        # stable sort followed by [0] did.
        if best_distance is None or distance < best_distance:
            best_ids, best_distance = cand_ids, distance
    return best_ids


def link_entities_distance(expressions,targets,holders, sentence_for_token):
    """Link every expression to its nearest target and nearest holder.

    expressions        -- iterable of (token_ids, expression_type)
    targets, holders   -- iterables of (token_ids, label); the label is ignored
    sentence_for_token -- mapping from token id to a sentence number
                          (anything int() accepts)

    The distance between an expression and a candidate is the token gap
    given by get_distance plus 200 for every sentence that separates the
    first token of each span, so candidates in another sentence are
    strongly penalised.

    Returns a list with one tuple per expression:
    (expression_type, expression_ids, target_ids, holder_ids), where the
    target or holder ids are [] when no candidate exists.
    """
    weight_crossing_sentence = 200

    triples = []
    for exp_ids, type_exp in expressions:
        sentence_exp = int(sentence_for_token[exp_ids[0]])
        final_tar = _closest_span(exp_ids, sentence_exp, targets,
                                  sentence_for_token, weight_crossing_sentence)
        final_hol = _closest_span(exp_ids, sentence_exp, holders,
                                  sentence_for_token, weight_crossing_sentence)
        triples.append((type_exp, exp_ids, final_tar, final_hol))
    return triples
|
77
|
+
|
@@ -0,0 +1,250 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
from extract_feats_relations import *
|
4
|
+
from tempfile import NamedTemporaryFile
|
5
|
+
from subprocess import Popen, PIPE
|
6
|
+
from VUA_pylib.io import Cfeature_index
|
7
|
+
import os
|
8
|
+
|
9
|
+
# Configuration manager read by the functions of this module. It is None at
# import time; link_entities_svm stores the manager it receives here before
# calling the linking helpers.
config_manager = None
|
10
|
+
|
11
|
+
|
12
|
+
def link_exp_tar(expressions,targets, knaf_obj,use_dependencies=True,use_tokens=True, use_lemmas=True):
    """Assign exactly one target to every expression.

    expressions -- list of expression id lists
    targets     -- list of target id lists
    knaf_obj    -- parsed KAF/NAF document handed to extract_feats_exp_tar

    With no targets every expression gets []. With a single target every
    expression gets that target. Otherwise one example per
    (expression, target) pair is written to a temporary file, in the order

        exp1 --> tar1, exp1 --> tar2, ..., exp2 --> tar1, exp2 --> tar2, ...

    the file is scored with run_svm_classify, and for each expression the
    target with the highest score is kept (the first one on ties).

    Returns a list parallel to *expressions*.
    """
    if not targets:
        return [[] for _ in expressions]
    if len(targets) == 1:
        return [targets[0] for _ in expressions]

    feat_index_filename = config_manager.get_index_features_exp_tar_filename()
    feat_index = Cfeature_index()
    feat_index.load_from_file(feat_index_filename)

    examples_file = NamedTemporaryFile(delete=False)
    try:
        try:
            for exp_ids in expressions:
                for tar_ids in targets:
                    feats = extract_feats_exp_tar(exp_ids, tar_ids, knaf_obj,
                                                  use_dependencies=use_dependencies,
                                                  use_tokens=use_tokens,
                                                  use_lemmas=use_lemmas)
                    feat_index.encode_example_for_classification(feats, examples_file, my_class='0')
        finally:
            examples_file.close()

        model_file = config_manager.get_filename_model_exp_tar()
        results = run_svm_classify(examples_file.name, model_file)
    finally:
        # Remove the temporary examples file even when feature extraction
        # or the classifier fails.
        os.remove(examples_file.name)

    assigned_targets = []
    idx = 0  # runs from 0 to num_exp x num_tar over the classifier output
    for _exp in expressions:
        # Start from "nothing seen yet" rather than a numeric sentinel, so a
        # valid target index is always selected whatever the score range is.
        best_idx = None
        best_value = None
        for num_tar in range(len(targets)):
            value = results[idx]
            idx += 1
            if best_value is None or value > best_value:
                best_value = value
                best_idx = num_tar
        assigned_targets.append(targets[best_idx])
    return assigned_targets
|
76
|
+
|
77
|
+
def link_exp_tar_all(expressions,targets, knaf_obj,threshold, use_dependencies=True,use_tokens=True, use_lemmas=True):
    """Link every expression to ALL targets whose SVM score reaches the threshold.

    expressions -- list of (expression_ids, expression_type)
    targets     -- list of target id lists
    knaf_obj    -- parsed KAF/NAF document handed to extract_feats_exp_tar
    threshold   -- minimum score for an (expression, target) pair to be kept;
                   None selects the former built-in value of -0.75

    The previous implementation overwrote *threshold* with -0.75 just before
    using it, so the value passed by the caller never had any effect. The
    argument is now honoured (converted with float()).

    Returns a list of (expression_ids, expression_type, target_ids). An
    expression appears once per accepted target, or once with [] as target
    when no target is accepted or no targets exist.
    """
    if not targets:
        return [(exp_ids, exp_type, []) for exp_ids, exp_type in expressions]

    threshold = -0.75 if threshold is None else float(threshold)

    feat_index_filename = config_manager.get_index_features_exp_tar_filename()
    feat_index = Cfeature_index()
    feat_index.load_from_file(feat_index_filename)

    examples_file = NamedTemporaryFile(delete=False)
    try:
        try:
            # One example per pair, all targets of the first expression first.
            for exp_ids, exp_type in expressions:
                for tar_ids in targets:
                    feats = extract_feats_exp_tar(exp_ids, tar_ids, knaf_obj,
                                                  use_dependencies=use_dependencies,
                                                  use_tokens=use_tokens,
                                                  use_lemmas=use_lemmas)
                    feat_index.encode_example_for_classification(feats, examples_file, my_class='0')
        finally:
            examples_file.close()

        model_file = config_manager.get_filename_model_exp_tar()
        results = run_svm_classify(examples_file.name, model_file)
    finally:
        # Remove the temporary examples file even when something fails.
        os.remove(examples_file.name)

    pairs = []
    idx = 0
    for exp_ids, exp_type in expressions:
        at_least_one = False
        for tar_ids in targets:
            value = results[idx]
            idx += 1
            if value >= threshold:
                pairs.append((exp_ids, exp_type, tar_ids))
                at_least_one = True
        if not at_least_one:
            pairs.append((exp_ids, exp_type, []))
    return pairs
|
114
|
+
|
115
|
+
def link_exp_hol(expressions,holders, knaf_obj,threshold_hol,use_dependencies=True,use_tokens=True,use_lemmas=True):
    """Assign at most one holder to every expression.

    expressions   -- list of expression id lists
    holders       -- list of holder id lists
    knaf_obj      -- parsed KAF/NAF document handed to extract_feats_exp_hol
    threshold_hol -- minimum SVM score the best holder must reach

    One example per (expression, holder) pair is written to a temporary
    file, in the order

        exp1 --> hol1, exp1 --> hol2, ..., exp2 --> hol1, exp2 --> hol2, ...

    and scored with run_svm_classify. For each expression the best-scoring
    holder (the first one on ties) is kept when its score is at least
    *threshold_hol*; otherwise the expression gets [].

    The previous implementation started the search from a score of -1 and an
    index of -1, so when every score was <= -1 it compared -1 against the
    threshold and could return the LAST holder. The real best score is now
    always used.

    Returns a list parallel to *expressions*.
    """
    if not holders:
        return [[] for _ in expressions]

    feat_index_filename = config_manager.get_index_features_exp_hol_filename()
    feat_index = Cfeature_index()
    feat_index.load_from_file(feat_index_filename)

    examples_file = NamedTemporaryFile(delete=False)
    try:
        try:
            for exp_ids in expressions:
                for hol_ids in holders:
                    feats = extract_feats_exp_hol(exp_ids, hol_ids, knaf_obj,
                                                  use_dependencies=use_dependencies,
                                                  use_tokens=use_tokens,
                                                  use_lemmas=use_lemmas)
                    feat_index.encode_example_for_classification(feats, examples_file, my_class='0')
        finally:
            examples_file.close()

        model_file = config_manager.get_filename_model_exp_hol()
        results = run_svm_classify(examples_file.name, model_file)
    finally:
        # Remove the temporary examples file even when something fails.
        os.remove(examples_file.name)

    assigned_holders = []
    idx = 0  # runs from 0 to num_exp x num_hol over the classifier output
    for _exp in expressions:
        best_idx = None
        best_value = None
        for num_hol in range(len(holders)):
            value = results[idx]
            idx += 1
            if best_value is None or value > best_value:
                best_value = value
                best_idx = num_hol
        if best_value >= threshold_hol:
            assigned_holders.append(holders[best_idx])
        else:
            assigned_holders.append([])
    return assigned_holders
|
169
|
+
|
170
|
+
|
171
|
+
|
172
|
+
def run_svm_classify(example_file,model_file):
    """Score an examples file with the SVMlight classifier binary.

    Runs ``svm_classify example_file model_file output_file`` using the
    binary path given by config_manager.get_svm_classify_binary(), with a
    temporary file as output, and parses that output as one float per line.

    The process exits with status -1 (sys.exit) when the binary does not
    exist or when the classifier writes anything to its standard error.

    Returns the list of floats, in the order of the examples.
    """
    svmlight = config_manager.get_svm_classify_binary()
    if not os.path.exists(svmlight):
        sys.stderr.write('SVMlight learn not found on %s\n' % svmlight)
        sys.stderr.write('Check the config filename and make sure the path is correctly set\n')
        sys.stderr.write('[svmlight]\npath_to_binary_learn = yourpathtolocalsvmlightlearn\n')
        sys.exit(-1)

    # Only the name is needed: the classifier writes the file itself.
    tempout = NamedTemporaryFile(delete=False)
    tempout.close()
    try:
        # An argument list (no shell) keeps paths with spaces intact.
        cmd = [svmlight, example_file, model_file, tempout.name]
        svm_process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        # communicate() drains both pipes while waiting; calling wait() first
        # can block forever once the child fills a pipe buffer.
        _unused_out, str_err = svm_process.communicate()
        if len(str_err) != 0:
            sys.stderr.write('SVM light classify error ' + str_err + '\n')
            sys.exit(-1)

        with open(tempout.name, 'r') as fout:
            results = [float(line.strip()) for line in fout]
    finally:
        # Also reached on sys.exit, so the temporary output never lingers.
        os.remove(tempout.name)
    return results
|
202
|
+
|
203
|
+
|
204
|
+
|
205
|
+
def link_entities_svm(expressions, targets, holders, knaf_obj,this_config_manager):
    """Link expressions with targets and holders using the SVM relation models.

    expressions, targets, holders -- lists of (token_ids, type)
    knaf_obj             -- parsed KAF/NAF document; its map_tokens_to_terms
                            method converts token ids into term ids
    this_config_manager  -- configuration manager; it is installed as the
                            module-level config_manager while this function
                            runs and the global is reset to None afterwards,
                            also when an exception is raised

    Targets are linked with link_exp_tar_all (an expression may appear once
    per accepted target); a holder is then chosen with link_exp_hol for
    every resulting (expression, target) pair.

    Returns a list of (expression_type, expression_term_ids,
    target_term_ids, holder_term_ids).
    """
    global config_manager
    config_manager = this_config_manager
    try:
        all_exp_ids = [(knaf_obj.map_tokens_to_terms(exp_ids), exp_type)
                       for exp_ids, exp_type in expressions]
        all_tar_ids = [knaf_obj.map_tokens_to_terms(tar_ids)
                       for tar_ids, _tar_type in targets]
        all_hol_ids = [knaf_obj.map_tokens_to_terms(hol_ids)
                       for hol_ids, _hol_type in holders]

        svm_thres_exp_tar = config_manager.get_svm_threshold_exp_tar()
        use_deps_now = config_manager.get_use_dependencies()
        # A single setting switches both the token and the lemma features.
        use_tokens_lemmas = config_manager.get_use_training_lexicons()
        pairs_exp_tar = link_exp_tar_all(all_exp_ids, all_tar_ids, knaf_obj, svm_thres_exp_tar,
                                         use_dependencies=use_deps_now,
                                         use_tokens=use_tokens_lemmas,
                                         use_lemmas=use_tokens_lemmas)

        # One entry per pair, so the holders come back aligned with the pairs.
        sets_exp_ids = [exp_ids for exp_ids, _exp_type, _tar_ids in pairs_exp_tar]

        # The holders are calculated in the old fashion (best one per expression)
        svm_thres_exp_hol = config_manager.get_svm_threshold_exp_hol()
        assigned_holders = link_exp_hol(sets_exp_ids, all_hol_ids, knaf_obj, svm_thres_exp_hol,
                                        use_dependencies=use_deps_now,
                                        use_tokens=use_tokens_lemmas,
                                        use_lemmas=use_tokens_lemmas)

        results = []
        for index, (exp_ids, exp_type, tar_ids) in enumerate(pairs_exp_tar):
            results.append((exp_type, exp_ids, tar_ids, assigned_holders[index]))
        return results
    finally:
        config_manager = None
|
249
|
+
|
250
|
+
|
@@ -0,0 +1,566 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
import sys
|
4
|
+
import os
|
5
|
+
import logging
|
6
|
+
import shutil
|
7
|
+
import glob
|
8
|
+
from subprocess import Popen, PIPE
|
9
|
+
import cPickle
|
10
|
+
import time
|
11
|
+
import csv
|
12
|
+
from collections import defaultdict
|
13
|
+
|
14
|
+
|
15
|
+
from scripts import lexicons as lexicons_manager
|
16
|
+
from scripts.config_manager import Cconfig_manager, internal_config_filename
|
17
|
+
from scripts.extract_features import extract_features_from_kaf_naf_file
|
18
|
+
from scripts.crfutils import extract_features_to_crf
|
19
|
+
from scripts.extract_feats_relations import create_rel_exp_tar_training, create_rel_exp_hol_training
|
20
|
+
from VUA_pylib.io import Cfeature_file, Cfeature_index
|
21
|
+
from KafNafParserPy import KafNafParser
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
#Global configuration
|
26
|
+
my_config_manager = Cconfig_manager()
|
27
|
+
|
28
|
+
# Directory that contains this script, computed from its real path
# (symbolic links resolved).
__this_folder = os.path.dirname(os.path.realpath(__file__))
|
29
|
+
|
30
|
+
|
31
|
+
def save_obj_to_file(obj,filename):
    """Pickle obj into the file called filename.

    The file is opened in binary write mode, so an existing file is
    overwritten. The handle is closed even when pickling raises.
    """
    with open(filename,'wb') as fic:
        cPickle.dump(obj,fic)
|
35
|
+
|
36
|
+
def create_folders(config_filename):
    """Create a fresh output folder tree for a training run.

    Loads config_filename into the module-level my_config_manager, removes
    the configured output folder if it already exists, recreates it, copies
    the configuration file into it (named internal_config_filename) and then
    creates every working subfolder reported by the configuration manager.

    Raises whatever os.mkdir / shutil raise when a folder cannot be removed,
    created or copied into.
    """
    # Read configuration from the config file
    my_config_manager.set_current_folder(__this_folder)
    my_config_manager.set_config(config_filename)

    out_folder = my_config_manager.get_output_folder()
    logging.debug('Complete path to output folder: '+out_folder)

    # Always start from an empty output folder
    if os.path.exists(out_folder):
        shutil.rmtree(out_folder)
        logging.debug('Output folder exists and was removed')

    os.mkdir(out_folder)
    logging.debug('Created '+out_folder)

    # Keep a copy of the configuration next to the results
    my_cfg = os.path.join(out_folder,internal_config_filename)
    shutil.copyfile(config_filename,my_cfg)

    # Working folders, in creation order. Each getter is called right before
    # its mkdir, and 'Created' is only logged once the folder really exists.
    folder_getters = [
        my_config_manager.get_feature_folder_name,
        my_config_manager.get_crf_expression_folder,
        my_config_manager.get_crf_target_folder,
        my_config_manager.get_crf_holder_folder,
        my_config_manager.get_training_datasets_folder,
        my_config_manager.get_model_foldername,
        my_config_manager.get_folder_relation_classifier,
        my_config_manager.get_feature_template_folder_name,
    ]
    for get_folder in folder_getters:
        folder = get_folder()
        os.mkdir(folder)
        logging.debug('Created '+folder)

    # Copy template files (the template folder was the last one created above)
    my_config_manager.copy_feature_templates()

    # Folder for lexicons
    lexicons_folder = my_config_manager.get_lexicons_folder()
    os.mkdir(lexicons_folder)
    logging.debug('Created '+lexicons_folder)
|
100
|
+
|
101
|
+
def load_training_files():
    """Return the training file paths listed in the configured list file.

    The list file name comes from my_config_manager.get_file_training_list();
    a relative name is resolved against the folder of this script. Each
    non-blank line, stripped of surrounding whitespace, is one entry.

    If the list file cannot be read, an error is printed to stderr and the
    process exits with status -1.
    """
    file_training_files_cfg = my_config_manager.get_file_training_list()
    if os.path.isabs(file_training_files_cfg):
        path_to_file = file_training_files_cfg
    else:
        path_to_file = os.path.join(__this_folder,file_training_files_cfg)
    logging.debug('Reading training files from '+path_to_file)

    train_files = []
    try:
        with open(path_to_file,'r') as fic:
            for line in fic:
                train_file = line.strip()
                # A blank (e.g. trailing) line is not a file name; skip it
                # instead of returning an empty path to the caller
                if train_file:
                    train_files.append(train_file)
    except (IOError, OSError) as e:
        print>>sys.stderr,'Exception reading '+path_to_file,' -->'+str(e)
        sys.exit(-1)
    return train_files
|
119
|
+
|
120
|
+
|
121
|
+
|
122
|
+
def extract_all_features():
    """Extract features and relation training data from all training files.

    For every training file the KAF/NAF document is parsed, handed to
    extract_features_from_kaf_naf_file together with a per-file output name
    (<feature folder>/file#<index>#<basename>.feat) and log name, and the
    expression-target / expression-holder relation instances are appended to
    the two relation training files. Afterwards the feature labels returned
    by the extractor are written, space separated, to the feature
    description file.

    Exits with status -1 when the training list is empty.
    """
    train_files = load_training_files()
    logging.debug('Loaded '+str(len(train_files))+' files')
    if not train_files:
        # Nothing to extract: without this guard the function failed later
        # with an unrelated TypeError when writing the feature description
        print>>sys.stderr,'No training files found, nothing to extract'
        sys.exit(-1)

    feat_folder = my_config_manager.get_feature_folder_name()
    label_feats = None
    my_stdout, my_stderr = sys.stdout,sys.stderr

    rel_exp_tar_filename = my_config_manager.get_relation_exp_tar_training_filename()
    rel_exp_hol_filename = my_config_manager.get_relation_exp_hol_training_filename()

    ### LEXICON FROM THE DOMAIN
    expressions_lexicon = None
    targets_lexicon = None
    lang = None
    if my_config_manager.get_use_training_lexicons():
        # The language is guessed from the first training file
        lang = _language_of_first_file(train_files)
        expressions_lexicon, targets_lexicon = _prepare_domain_lexicons(lang)

    # NOTE(review): indentation was lost in the reviewed copy; this lookup is
    # treated as unconditional because the loop below always reads its result
    this_propagation_lexicon = my_config_manager.get_propagation_lexicon_name()
    if this_propagation_lexicon is not None and "$LANG" in this_propagation_lexicon:
        if lang is None:
            # Training lexicons are disabled, so the language was not read yet
            lang = _language_of_first_file(train_files)
        this_propagation_lexicon = this_propagation_lexicon.replace('$LANG',lang)

    print>>sys.stderr,'Propagated lexicon',this_propagation_lexicon

    ## Configuration for the relation classifier
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
    accepted_opinions = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()

    polarities_found_and_skipped = []
    try:
        with open(rel_exp_tar_filename,'w') as exp_tar_rel_fic:
            with open(rel_exp_hol_filename,'w') as exp_hol_rel_fic:
                for num_file, train_file in enumerate(train_files):
                    base_name = os.path.basename(train_file)
                    logging.debug('Extracting features '+base_name)
                    out_file = os.path.join(feat_folder,'file#'+str(num_file)+'#'+base_name+".feat")
                    err_file = out_file+'.log'

                    kaf_naf_obj = KafNafParser(train_file)

                    # Returns the labels for the features, the separator used and
                    # the polarity labels that were skipped for this file
                    label_feats, _separator, pols_skipped_this = extract_features_from_kaf_naf_file(
                        kaf_naf_obj,out_file,err_file,
                        accepted_opinions=accepted_opinions,
                        exp_lex=expressions_lexicon,
                        tar_lex=targets_lexicon,
                        propagation_lex_filename=this_propagation_lexicon)
                    polarities_found_and_skipped.extend(pols_skipped_this)

                    # Each file's relation instances are preceded by a '#<path>' header line
                    exp_tar_rel_fic.write('#'+train_file+'\n')
                    exp_hol_rel_fic.write('#'+train_file+'\n')
                    # SET valid_opinions to None to use all the possible opinions in the KAF file for extracting relations
                    create_rel_exp_tar_training(kaf_naf_obj, output=exp_tar_rel_fic, valid_opinions=accepted_opinions,use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,use_lemmas=use_toks_lems_now)
                    create_rel_exp_hol_training(kaf_naf_obj ,output=exp_hol_rel_fic, valid_opinions=accepted_opinions,use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,use_lemmas=use_toks_lems_now)
    finally:
        # Re-set the stdout and stderr
        # NOTE(review): presumably guards against the extractor replacing the
        # standard streams; nothing in this function redirects them
        sys.stdout,sys.stderr = my_stdout, my_stderr

    ## Show just for information how many instances have been skipped because
    ## the polarity of the opinion expression was not allowed
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1
    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: '+' '.join(accepted_opinions.keys())+'\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info+=' '+label+' :'+str(c)+'\n'
    info+='\n'
    logging.debug(info)

    # Save the feature labels in a file
    filename = my_config_manager.get_feature_desc_filename()
    with open(filename,'w') as fic:
        fic.write(' '.join(label_feats)+'\n')
    logging.debug('Description of features --> '+filename)


def _language_of_first_file(train_files):
    """Return the language reported by the first training file."""
    return KafNafParser(train_files[0]).get_language()


def _expand_lang(path, lang):
    """Replace the $LANG placeholder in path, leaving other paths untouched."""
    if "$LANG" in path:
        path = path.replace('$LANG',lang)
    return path


def _prepare_domain_lexicons(lang):
    """Create or copy the expression/target lexicons and return both, loaded."""
    expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
    target_lexicon_filename = my_config_manager.get_target_lexicon_filename()

    this_exp_lex = my_config_manager.get_use_this_expression_lexicon()
    this_tar_lex = my_config_manager.get_use_this_target_lexicon()

    if this_exp_lex is None or this_tar_lex is None:
        # NOTE(review): hard-coded path on the original developer's machine;
        # this branch can only work where that script exists
        path_to_lex_creator = '/home/izquierdo/opener_repos/opinion-domain-lexicon-acquisition/acquire_from_annotated_data.py'
        training_filename = my_config_manager.get_file_training_list()
        lexicons_manager.create_lexicons(path_to_lex_creator,training_filename,expression_lexicon_filename,target_lexicon_filename)

    # A lexicon named explicitly in the configuration replaces the created one
    if this_exp_lex is not None:
        shutil.copy(_expand_lang(this_exp_lex, lang), expression_lexicon_filename)

    if this_tar_lex is not None:
        shutil.copy(_expand_lang(this_tar_lex, lang), target_lexicon_filename)

    expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
    targets_lexicon = lexicons_manager.load_lexicon(target_lexicon_filename)
    return expressions_lexicon, targets_lexicon
|
245
|
+
|
246
|
+
|
247
|
+
|
248
|
+
def train_expression_classifier():
    """Build the CRF training set for opinion expressions and train the model.

    Every *.feat file in the feature folder is converted with
    extract_features_to_crf into the expression CRF folder; the converted
    files are concatenated into the expression training dataset, which is
    then passed to run_crfsuite. A file whose conversion fails is reported
    on stderr and left out of the dataset.
    """
    # Load the feature description
    with open(my_config_manager.get_feature_desc_filename()) as fic:
        fields = fic.read().strip()
    separator = '\t'
    feat_folder = my_config_manager.get_feature_folder_name()
    crf_folder = my_config_manager.get_crf_expression_folder()

    templates_exp = my_config_manager.get_templates_expr()
    # Only set the target class for the tokens of possible_classes
    # For others, it's set to O (out sequence)
    possible_classes = my_config_manager.get_possible_expression_values()

    # Create all the CRF files calling to the crfutils.extract_features_to_crf
    crf_out_files = []
    for feat_file in glob.glob(feat_folder+'/*.feat'):
        # Strip the 5-character '.feat' suffix
        base_name = os.path.basename(feat_file)[:-5]
        out_crf = os.path.join(crf_folder,base_name)
        logging.debug('Creating crf file in --> '+out_crf)

        try:
            extract_features_to_crf(feat_file,out_crf,fields,separator,templates_exp,possible_classes)
        except Exception:
            # Best effort: skip this file, but let Ctrl-C / sys.exit through
            print>>sys.stderr,'Failed conversion to tab-expression -> CRF: ',feat_file
        else:
            crf_out_files.append(out_crf)

    # Concatenate all the crf files just created
    training_dataset = my_config_manager.get_training_dataset_exp()
    with open(training_dataset,'w') as out_f:
        for crf_file in crf_out_files:
            with open(crf_file) as f:
                out_f.write(f.read())
    logging.debug('Created training data for crf, op.exp '+training_dataset)

    # Train the model
    crf_params = my_config_manager.get_crfsuite_params()
    model_file = my_config_manager.get_filename_model_expression()
    logging.debug('Training the classifier for opinion expressions (could take a while)')
    run_crfsuite(crf_params,training_dataset,model_file)
|
296
|
+
|
297
|
+
|
298
|
+
|
299
|
+
def train_target_classifier():
    """Build the CRF training set for opinion targets and train the model.

    Every *.feat file in the feature folder is converted with
    extract_features_to_crf (only class 'target') into the target CRF
    folder; the converted files are concatenated into the target training
    dataset, which is then passed to run_crfsuite. A file whose conversion
    fails is reported on stderr and left out of the dataset.
    """
    # Load the feature description
    with open(my_config_manager.get_feature_desc_filename()) as fic:
        fields = fic.read().strip()
    separator = '\t'
    feat_folder = my_config_manager.get_feature_folder_name()
    crf_folder = my_config_manager.get_crf_target_folder()

    templates_target = my_config_manager.get_templates_target()
    possible_classes = ['target']

    # Create all the CRF files calling to the crfutils.extract_features_to_crf
    crf_out_files = []
    for feat_file in glob.glob(feat_folder+'/*.feat'):
        # Strip the 5-character '.feat' suffix
        base_name = os.path.basename(feat_file)[:-5]
        out_crf = os.path.join(crf_folder,base_name)
        logging.debug('Creating crf file in --> '+out_crf)

        try:
            extract_features_to_crf(feat_file,out_crf,fields,separator,templates_target,possible_classes)
        except Exception:
            # Best effort: skip this file, but let Ctrl-C / sys.exit through
            print>>sys.stderr,'Failed conversion to tab-target-> CRF: ',feat_file
        else:
            crf_out_files.append(out_crf)

    # Concatenate all the crf files just created
    training_dataset = my_config_manager.get_training_dataset_target()
    with open(training_dataset,'w') as out_f:
        for crf_file in crf_out_files:
            with open(crf_file) as f:
                out_f.write(f.read())
    # The label used to read 'op.exp', copied from the expression trainer
    logging.debug('Created training data for crf, op.target '+training_dataset)

    # Train the model
    crf_params = my_config_manager.get_crfsuite_params()
    model_file = my_config_manager.get_filename_model_target()
    logging.debug('Training the classifier for opinion target (could take a while)')
    run_crfsuite(crf_params,training_dataset,model_file)
|
343
|
+
|
344
|
+
|
345
|
+
|
346
|
+
|
347
|
+
def train_holder_classifier():
    """Build the CRF training set for opinion holders and train the model.

    Every *.feat file in the feature folder is converted with
    extract_features_to_crf (only class 'holder') into the holder CRF
    folder; the converted files are concatenated into the holder training
    dataset, which is then passed to run_crfsuite. A file whose conversion
    fails is reported on stderr and left out of the dataset.
    """
    # Load the feature description
    with open(my_config_manager.get_feature_desc_filename()) as fic:
        fields = fic.read().strip()
    separator = '\t'
    feat_folder = my_config_manager.get_feature_folder_name()
    crf_folder = my_config_manager.get_crf_holder_folder()

    templates_holder = my_config_manager.get_templates_holder()
    possible_classes = ['holder']

    # Create all the CRF files calling to the crfutils.extract_features_to_crf
    crf_out_files = []
    for feat_file in glob.glob(feat_folder+'/*.feat'):
        # Strip the 5-character '.feat' suffix
        base_name = os.path.basename(feat_file)[:-5]
        out_crf = os.path.join(crf_folder,base_name)
        logging.debug('Creating crf file in --> '+out_crf)

        try:
            extract_features_to_crf(feat_file,out_crf,fields,separator,templates_holder,possible_classes)
        except Exception:
            # Best effort: skip this file, but let Ctrl-C / sys.exit through
            print>>sys.stderr,'Failed conversion to tab-holder -> CRF: ',feat_file
        else:
            crf_out_files.append(out_crf)

    # Concatenate all the crf files just created
    training_dataset = my_config_manager.get_training_dataset_holder()
    with open(training_dataset,'w') as out_f:
        for crf_file in crf_out_files:
            with open(crf_file) as f:
                out_f.write(f.read())
    # The label used to read 'op.exp', copied from the expression trainer
    logging.debug('Created training data for crf, op.holder '+training_dataset)

    # Train the model
    crf_params = my_config_manager.get_crfsuite_params()
    model_file = my_config_manager.get_filename_model_holder()
    logging.debug('Training the classifier for opinion holder (could take a while)')
    run_crfsuite(crf_params,training_dataset,model_file)
|
391
|
+
|
392
|
+
|
393
|
+
def run_crfsuite(crf_params,input_file,model_file):
    """Train a CRF model by running '<crfsuite> learn' in a shell.

    crf_params -- option string inserted verbatim after 'learn'
    input_file -- training dataset given to crfsuite
    model_file -- model path, passed with -m; the standard output of the
                  run is stored in model_file + '.log'

    Exits the process with status -1 when the configured crfsuite binary
    does not exist or when crfsuite writes anything to its standard error.
    """
    crfsuite = my_config_manager.get_crfsuite_binary()
    if not os.path.exists(crfsuite):
        print>>sys.stderr,'CRFsuite not found on',crfsuite
        print>>sys.stderr,'Check the config filename and make sure the path is correctly set'
        print>>sys.stderr,'[crfsuite]\npath_to_binary = yourpathtolocalcrfsuite'
        sys.exit(-1)

    # NOTE(review): the command is joined into one shell string, so paths
    # containing spaces or shell metacharacters are not supported
    cmd = [crfsuite,'learn',crf_params,'-m '+model_file,input_file]
    err_file = model_file+'.log'
    with open(err_file,'w') as err_fic:
        crf_process = Popen(' '.join(cmd), stdin=PIPE, stdout=err_fic, stderr=PIPE, shell=True)
        # communicate() drains stderr while waiting for the child; calling
        # wait() first can deadlock once the stderr pipe buffer fills up
        _unused_out, str_err = crf_process.communicate()

    if len(str_err) != 0:
        print>>sys.stderr,'CRF error!!: '+str_err
        sys.exit(-1)
    logging.debug('Crfsuite log '+err_file)
|
417
|
+
|
418
|
+
|
419
|
+
|
420
|
+
|
421
|
+
############################################
|
422
|
+
################ RELATION TRAINING #########
|
423
|
+
###########################################
|
424
|
+
|
425
|
+
def train_classifier_relation_exp_tar():
    """Train the SVMlight model for the relation (expression, target).

    Loads the relation training file, encodes it into the index-based
    training file, saves the feature index that was built while encoding
    and runs run_svmlight_learn on the encoded file.
    """
    # Load the human readable training file
    train_filename = my_config_manager.get_relation_exp_tar_training_filename()
    feature_file_obj = Cfeature_file(train_filename)

    # Convert it into index based feature file, for svm-light
    feature_index = Cfeature_index()
    feat_bin_filename = my_config_manager.get_rel_exp_tar_training_idx_filename()
    with open(feat_bin_filename,'w') as fic_out:
        feature_index.encode_feature_file_to_svm(feature_file_obj,fic_out)

    ## Save the feature index
    feat_index_filename = my_config_manager.get_index_features_exp_tar_filename()
    feature_index.save_to_file(feat_index_filename)

    # Train the model on the encoded file written above
    model = my_config_manager.get_filename_model_exp_tar()
    svm_opts = my_config_manager.get_svm_params()
    logging.debug('Training SVMlight classifier for RELATION(expression,target) in '+model+ '(could take a while)')
    run_svmlight_learn(feat_bin_filename,model,svm_opts)
|
452
|
+
###########################################
|
453
|
+
|
454
|
+
|
455
|
+
|
456
|
+
|
457
|
+
def train_classifier_relation_exp_hol():
    """Train the SVMlight model for the relation (expression, holder).

    Loads the relation training file, encodes it into the index-based
    training file, saves the feature index that was built while encoding
    and runs run_svmlight_learn on the encoded file.
    """
    # Load the human readable training file
    train_filename = my_config_manager.get_relation_exp_hol_training_filename()
    feature_file_obj = Cfeature_file(train_filename)

    # Convert it into index based feature file, for svm-light
    feature_index = Cfeature_index()
    feat_bin_filename = my_config_manager.get_rel_exp_hol_training_idx_filename()
    with open(feat_bin_filename,'w') as fic_out:
        feature_index.encode_feature_file_to_svm(feature_file_obj,fic_out)

    ## Save the feature index
    feat_index_filename = my_config_manager.get_index_features_exp_hol_filename()
    feature_index.save_to_file(feat_index_filename)

    # Train the model on the encoded file written above
    model = my_config_manager.get_filename_model_exp_hol()
    svm_opts = my_config_manager.get_svm_params()
    logging.debug('Training SVMlight classifier for RELATION(expression,holder) in '+model+ '(could take a while)')
    run_svmlight_learn(feat_bin_filename,model,svm_opts)
|
484
|
+
###########################################
|
485
|
+
|
486
|
+
|
487
|
+
def run_svmlight_learn(example_file,model_file,params):
    """Train an SVMlight model by running the configured learn binary in a shell.

    example_file -- encoded training examples
    model_file   -- output model path; the standard output of the run is
                    stored in model_file + '.log'
    params       -- option string inserted verbatim after the binary name

    Exits the process with status -1 when the configured binary does not
    exist or when it writes anything to its standard error.
    """
    svmlight = my_config_manager.get_svm_learn_binary()

    if not os.path.exists(svmlight):
        print>>sys.stderr,'SVMlight learn not found on',svmlight
        print>>sys.stderr,'Check the config filename and make sure the path is correctly set'
        print>>sys.stderr,'[svmlight]\npath_to_binary_learn = yourpathtolocalsvmlightlearn'
        sys.exit(-1)

    # NOTE(review): the command is joined into one shell string, so paths
    # containing spaces or shell metacharacters are not supported
    cmd = [svmlight,params,example_file,model_file]
    err_file = model_file+'.log'
    with open(err_file,'w') as err_fic:
        svm_process = Popen(' '.join(cmd),stdin=PIPE, stdout=err_fic, stderr=PIPE, shell=True)
        # communicate() drains stderr while waiting for the child; calling
        # wait() first can deadlock once the stderr pipe buffer fills up
        _unused_out, str_err = svm_process.communicate()

    if len(str_err) != 0:
        print>>sys.stderr,'SVM light error '+str_err
        sys.exit(-1)
    # A separating space was missing between the label and the path
    logging.debug('SVMlight learn log '+err_file)
|
510
|
+
|
511
|
+
def write_to_flag(msg,openas='a'):
    """Write 'msg --> <timestamp>' as one line of the configured flag file.

    msg    -- text to record
    openas -- mode used to open the flag file ('a' appends, 'w' restarts it)

    The timestamp is the local time formatted as %Y-%m-%dT%H:%M:%S%Z. The
    file is closed even if the write fails.
    """
    my_time = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
    with open(my_config_manager.get_flag_filename(),openas) as flag:
        flag.write(msg+' --> '+my_time+'\n')
|
516
|
+
|
517
|
+
def train_all(file_config):
    """Run the complete training pipeline described by file_config.

    Creates the output folders, then runs feature extraction, the three CRF
    trainers (expression, target, holder) and the two relation trainers in
    that order. A START line is written to the flag file before each step
    and a DONE line after it, so progress can be followed from outside.
    """
    # Check if the output folder exists or create it
    create_folders(file_config)
    write_to_flag('Beginning\n','w')

    # (label written to the flag file, step to run), in execution order.
    # The first step creates the *.feat files the CRF trainers read.
    steps = [
        ('extract features', extract_all_features),
        ('training expression classifier', train_expression_classifier),
        ('training target classifier', train_target_classifier),
        # The START line of this step used to say 'expression' by mistake
        ('training holder classifier', train_holder_classifier),
        ('training relation expression - target classifier', train_classifier_relation_exp_tar),
        ('training relation expression - holder classifier', train_classifier_relation_exp_hol),
    ]
    for label, run_step in steps:
        write_to_flag('START '+label)
        run_step()
        write_to_flag('DONE '+label+'\n')

    logging.debug('ALL TRAINING DONE')
    write_to_flag('FINISHED ')
|
559
|
+
|
560
|
+
|
561
|
+
if __name__ == '__main__':
    # Script entry point: the only argument is the training configuration file
    logging.basicConfig(stream=sys.stderr,format='%(asctime)s - %(levelname)s\n %(message)s', level=logging.DEBUG)
    if len(sys.argv) < 2:
        # Without this check a missing argument surfaced as a bare IndexError
        print>>sys.stderr,'Usage: '+sys.argv[0]+' config_file'
        sys.exit(-1)
    file_config = sys.argv[1]
    train_all(file_config)

    sys.exit(0)
|