opener-opinion-detector-base 2.0.0
- checksums.yaml +7 -0
- data/README.md +101 -0
- data/bin/opinion-detector-base +19 -0
- data/core/annotation.cfg.erb +9 -0
- data/core/packages/KafNafParser-1.4.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
- data/core/python-scripts/LICENSE +339 -0
- data/core/python-scripts/README.md +226 -0
- data/core/python-scripts/classify_kaf_naf_file.py +499 -0
- data/core/python-scripts/cross_validation.py +634 -0
- data/core/python-scripts/generate_folds.py +134 -0
- data/core/python-scripts/models.cfg +10 -0
- data/core/python-scripts/my_templates/README +33 -0
- data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
- data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
- data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
- data/core/python-scripts/my_templates/templates_exp.txt +10 -0
- data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
- data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
- data/core/python-scripts/my_templates/templates_holder.txt +10 -0
- data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
- data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
- data/core/python-scripts/my_templates/templates_target.txt +10 -0
- data/core/python-scripts/run_all_experiments.sh +49 -0
- data/core/python-scripts/run_basic.py +20 -0
- data/core/python-scripts/run_experiment.sh +42 -0
- data/core/python-scripts/scripts/__init__.py +1 -0
- data/core/python-scripts/scripts/config_manager.py +314 -0
- data/core/python-scripts/scripts/crfutils.py +215 -0
- data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
- data/core/python-scripts/scripts/extract_features.py +376 -0
- data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
- data/core/python-scripts/scripts/lexicons.py +44 -0
- data/core/python-scripts/scripts/link_entities_distance.py +77 -0
- data/core/python-scripts/scripts/relation_classifier.py +250 -0
- data/core/python-scripts/train.py +566 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
- data/core/vendor/src/crfsuite/AUTHORS +1 -0
- data/core/vendor/src/crfsuite/COPYING +27 -0
- data/core/vendor/src/crfsuite/ChangeLog +103 -0
- data/core/vendor/src/crfsuite/INSTALL +236 -0
- data/core/vendor/src/crfsuite/Makefile.am +19 -0
- data/core/vendor/src/crfsuite/Makefile.in +783 -0
- data/core/vendor/src/crfsuite/README +183 -0
- data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
- data/core/vendor/src/crfsuite/autogen.sh +38 -0
- data/core/vendor/src/crfsuite/compile +143 -0
- data/core/vendor/src/crfsuite/config.guess +1502 -0
- data/core/vendor/src/crfsuite/config.h.in +198 -0
- data/core/vendor/src/crfsuite/config.sub +1714 -0
- data/core/vendor/src/crfsuite/configure +14273 -0
- data/core/vendor/src/crfsuite/configure.in +149 -0
- data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
- data/core/vendor/src/crfsuite/depcomp +630 -0
- data/core/vendor/src/crfsuite/example/chunking.py +49 -0
- data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
- data/core/vendor/src/crfsuite/example/ner.py +270 -0
- data/core/vendor/src/crfsuite/example/pos.py +78 -0
- data/core/vendor/src/crfsuite/example/template.py +88 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
- data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
- data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
- data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
- data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
- data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
- data/core/vendor/src/crfsuite/frontend/main.c +137 -0
- data/core/vendor/src/crfsuite/frontend/option.c +93 -0
- data/core/vendor/src/crfsuite/frontend/option.h +86 -0
- data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
- data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
- data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
- data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
- data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
- data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
- data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
- data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
- data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
- data/core/vendor/src/crfsuite/include/os.h +61 -0
- data/core/vendor/src/crfsuite/install-sh +520 -0
- data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
- data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
- data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
- data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
- data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
- data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
- data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
- data/core/vendor/src/crfsuite/missing +376 -0
- data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
- data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
- data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
- data/core/vendor/src/crfsuite/swig/export.i +32 -0
- data/core/vendor/src/crfsuite/swig/python/README +92 -0
- data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
- data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
- data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
- data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
- data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
- data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
- data/core/vendor/src/liblbfgs/AUTHORS +1 -0
- data/core/vendor/src/liblbfgs/COPYING +22 -0
- data/core/vendor/src/liblbfgs/ChangeLog +120 -0
- data/core/vendor/src/liblbfgs/INSTALL +231 -0
- data/core/vendor/src/liblbfgs/Makefile.am +10 -0
- data/core/vendor/src/liblbfgs/Makefile.in +638 -0
- data/core/vendor/src/liblbfgs/NEWS +0 -0
- data/core/vendor/src/liblbfgs/README +71 -0
- data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
- data/core/vendor/src/liblbfgs/autogen.sh +38 -0
- data/core/vendor/src/liblbfgs/config.guess +1411 -0
- data/core/vendor/src/liblbfgs/config.h.in +64 -0
- data/core/vendor/src/liblbfgs/config.sub +1500 -0
- data/core/vendor/src/liblbfgs/configure +21146 -0
- data/core/vendor/src/liblbfgs/configure.in +107 -0
- data/core/vendor/src/liblbfgs/depcomp +522 -0
- data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
- data/core/vendor/src/liblbfgs/install-sh +322 -0
- data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
- data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
- data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
- data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
- data/core/vendor/src/liblbfgs/missing +353 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
- data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
- data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
- data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
- data/core/vendor/src/svm_light/LICENSE.txt +59 -0
- data/core/vendor/src/svm_light/Makefile +105 -0
- data/core/vendor/src/svm_light/kernel.h +40 -0
- data/core/vendor/src/svm_light/svm_classify.c +197 -0
- data/core/vendor/src/svm_light/svm_common.c +985 -0
- data/core/vendor/src/svm_light/svm_common.h +301 -0
- data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
- data/core/vendor/src/svm_light/svm_learn.c +4147 -0
- data/core/vendor/src/svm_light/svm_learn.h +169 -0
- data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
- data/core/vendor/src/svm_light/svm_loqo.c +211 -0
- data/ext/hack/Rakefile +17 -0
- data/ext/hack/support.rb +88 -0
- data/lib/opener/opinion_detectors/base.rb +112 -0
- data/lib/opener/opinion_detectors/base/version.rb +7 -0
- data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
- data/lib/opener/opinion_detectors/de.rb +7 -0
- data/lib/opener/opinion_detectors/en.rb +7 -0
- data/lib/opener/opinion_detectors/it.rb +7 -0
- data/lib/opener/opinion_detectors/nl.rb +6 -0
- data/opener-opinion-detector-base.gemspec +35 -0
- data/pre_build_requirements.txt +3 -0
- metadata +374 -0
data/core/python-scripts/README.md
@@ -0,0 +1,226 @@
#Opinion miner deluxe#

##Introduction##

Opinion miner based on machine learning that can be trained using a list of
KAF/NAF files. It is important to notice that the opinion miner module will not call
any external module to obtain features. It will read all the features from the input KAF/NAF file,
so you have to make sure that your input file contains all the required information in advance (tokens,
terms, polarities, constituents, entities, dependencies...).

The task is divided into 2 steps:
* Detection of opinion entities (holder, target and expression): using
Conditional Random Fields
* Opinion entity linking (expression-target and expression-holder): using
binary Support Vector Machines

In the next subsections, a brief explanation of the 2 steps is given.

###Opinion Entity detection###

The first step when extracting opinions from text is to determine which portions of text represent the different opinion entities:

- Opinion expressions: very nice, really ugly ...
- Opinion targets: the hotel, the rooms, the staff ...
- Opinion holders: I, our family, the manager ...

In order to do this, three different Conditional Random Fields (CRF) classifiers have been trained, using by default this set of features: tokens,
lemmas, part-of-speech tags, constituent labels and the polarity of words and entities. These classifiers detect the portions of text representing the different opinion
entities.


###Opinion Entity linking###

This step takes as input the opinion entities detected in the previous step, and links them to create the final opinions <expression/target/holder>.
In this case we have trained two binary Support Vector Machines (SVM), one that indicates the degree of association between a given target and a given expression,
and another one that gives the degree of linkage between a holder and an opinion expression. So, given the lists of expressions, targets and holders detected
by the CRF classifiers, the SVM models try to select the best candidate from the target list for each expression, and the best holder from the holder list, to create
the final opinion triple.

Considering a certain opinion expression and a target, these are the features used by default to represent this pair for the SVM engine:

1) Textual features: tokens and lemmas of the expression and the target
2) Distance features: features representing the relative distance of both elements in the text (normalized to a discrete list of possible values: far/medium/close for instance),
and whether both elements are in the same sentence or not (see the sketch after this list)
3) Dependency features: the dependency relations between the two elements in the text (the dependency path, and the dependency relations with the root of the sentence)
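
A minimal sketch of what the distance features in point 2 could look like (illustrative only, not part of the toolkit; the real feature extraction lives in `scripts/extract_feats_relations.py`, and the thresholds below are made up):

````python
# Illustrative only: bucket the raw token distance into a discrete value and
# add a same-sentence flag, as described for the distance features above.
def distance_features(exp_position, exp_sentence, tar_position, tar_sentence):
    distance = abs(exp_position - tar_position)
    if distance <= 3:        # thresholds are hypothetical, not the toolkit's values
        bucket = 'close'
    elif distance <= 10:
        bucket = 'medium'
    else:
        bucket = 'far'
    same = 'same_sentence' if exp_sentence == tar_sentence else 'different_sentence'
    return [bucket, same]

print(distance_features(12, 2, 15, 2))   # ['close', 'same_sentence']
````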

##Requirements##
This is the list of required libraries:
+ SVMLight: library for Support Vector Machines (http://svmlight.joachims.org/)
+ CRFsuite: library for Conditional Random Fields (http://www.chokkan.org/software/crfsuite/)
+ KafNafParserPy: library for parsing KAF or NAF files (https://github.com/cltl/KafNafParserPy)
+ VUA_pylib: library with functions used by the system (https://github.com/cltl/VUA_pylib)

To install SVMLight and CRFsuite please visit the corresponding webpages and follow the instructions given. For the last two Python libraries,
you only need to clone the repositories and make sure that both are in the Python path so Python is able to find them (the easiest way is
to modify the PYTHONPATH variable to include the path to these libraries if you don't want to modify your system files).

##Setting the opinion miner##

You first need to install all the requirements on your local machine and then create a configuration file like this one:

````shell
[general]
output_folder = feat

[crfsuite]
path_to_binary = crfsuite

[svmlight]
path_to_binary_learn = /home/izquierdo/tools/svm_light/svm_learn
path_to_binary_classify = /home/izquierdo/tools/svm_light/svm_classify
````

The `output_folder` variable is the folder where the trained models have been stored. The rest of the parameters are the local paths to your installation
of CRFsuite and SVMLight. This file is passed to the main script to detect opinions in a new KAF/NAF file:

````shell
cat my_file.kaf | classify_kaf_naf_file.py your_config_file.cfg
````

##Training your own models##

You first need to install all the requirements given above and then follow these steps:

1) Prepare the KAF/NAF files that will be used for training, with as many layers as possible (for the default configuration, preferably KAF
files with tokens, terms, polarities, entities, aspects, constituents and dependencies). A file with the complete path to each training KAF
file needs to be created (my_list_kafs.txt, for instance; see the example below)
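
A hypothetical my_list_kafs.txt could look like this (the paths are only illustrative; use the absolute paths of your own training files):

````shell
/home/user/corpora/hotel_reviews/review_001.kaf
/home/user/corpora/hotel_reviews/review_002.kaf
/home/user/corpora/hotel_reviews/review_003.kaf
````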

2) Create the feature template files or modify the existing ones in the folder `my_templates`

3) Prepare a configuration file (or modify the existing one, my_training.cfg) like this one:

````shell
[general]
output_folder = feat
filename_training_list = /home/izquierdo/data/MPQA/13jan2014/list.25

[feature_templates]
expression = my_templates/templates_exp.txt
holder = my_templates/templates_holder.txt
target = my_templates/templates_target.txt

[valid_opinions]
negative = sentiment-neg
positive = sentiment-pos

[crfsuite]
path_to_binary = /home/izquierdo/bin/crfsuite
parameters = -a lbfgs

[svmlight]
path_to_binary_learn = /home/izquierdo/tools/svm_light/svm_learn
path_to_binary_classify = /home/izquierdo/tools/svm_light/svm_classify
parameters = -c 0.1
````

The `output_folder` variable is where you want to store your new models (it will be used later for tagging new files), and `filename_training_list` is the file
you created with the paths to all your training KAF/NAF files (my_list_kafs.txt). The section feature_templates contains pointers to the feature template files
you want to use. The section valid_opinions allows you to specify which opinions from the training KAF files you want to use, and a mapping for the labels
used in the KAF files. So with this configuration:

````shell
[valid_opinions]
negative = sentiment-neg
positive = sentiment-pos
````

the opinion expressions classifier will be trained for two classes (negative and positive), and, for instance, all the opinion expressions with the label sentiment-neg in
your KAF files will be used as training instances for the negative class. This allows you to use different sets of labels for the opinion expressions; for instance,
you could use KAF files with different labels for the negative expressions, like sentiment-low-negative, sentiment-medium-negative and sentiment-high-negative. To train the
system considering all these instances as training material for the negative class, you would need to specify:

````shell
[valid_opinions]
negative = sentiment-low-negative;sentiment-medium-negative;sentiment-high-negative
positive = sentiment-pos
````

The rest of the sections in the config file (crfsuite and svmlight) indicate the paths to your local installation of these libraries and the parameters accepted
by them (check the webpages of the libraries for information about these parameters).

4) Once the previous step is completed, the training can be performed by calling the script train.py:

````shell
train.py my_modified_train.cfg
````

This will use the config file (my_modified_train.cfg) to train the system and will store all the models and the different intermediate files in the folder you set.


##How to add new features##
This section explains how to add new features to the system.

###Adding new features to the opinion entity detection (CRF)###

1) Modify the function that generates the features: `scripts/extract_features.py -> extract_features_from_kaf_naf_file(...)`

1.1) Modify the variable `features`, which is a list of features for each token

1.2) Modify the variable `labels`, which gives a name to each feature (the lengths of both lists must match), as shown in the sketch below
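
A minimal, self-contained illustration (simplified, hypothetical names rather than the actual code of `extract_features_from_kaf_naf_file`) of how a new feature is added to both lists:

````python
# Hypothetical illustration: every value appended to the per-token `features`
# list needs a matching entry in `labels`, and both lists must stay the same length.
token, lemma, pos = 'Nice', 'nice', 'ADJ'    # example values for a single token

features = [token, lemma, pos]               # existing feature values for this token
labels = ['token', 'lemma', 'pos']           # existing feature names

# Adding a new feature: append the value and its label.
features.append('yes' if token[0].isupper() else 'no')
labels.append('starts_uppercase')

assert len(features) == len(labels)
````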

2) With the previous step you can extract the features for a single token only. You also need to specify which features you want to use from the context,
and whether you want to use bigrams/trigrams. In order to do this, 3 different feature template files have to be filled in. These files are plain text files, and
the default files used can be found in the subfolder `my_templates`. A different feature template can be specified for each CRF classifier. The format
of these files is a set of lines like `1 token -2 -1 0`, where:

- The first number (1 here) is the length of the template, in this case unigram
- Then come 'n' labels that will be used (they must match the labels generated by the feature extractor)
- Then come the positions; in the case of bigrams or trigrams each position must be of the form n/m or n/m/p

An example with bigrams: `2 token token -2/-1 -1/0 0/1 1/2`, which would generate these templates:

````shell
(('token',-2),('token',-1))
(('token',-1),('token',0))
(('token',0),('token',1))
(('token',1),('token',2))
````

One more example, with trigrams: `3 token lemma pos -2/0/4 9/8/3`, which would generate:

````shell
(('token',-2),('lemma',0),('pos',4))
(('token',9),('lemma',8),('pos',3))
````
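
The expansion can be pictured with a small standalone sketch (illustrative only; the toolkit's own template handling lives in `scripts/crfutils.py`):

````python
# Illustrative sketch of how a template line expands into feature tuples.
def expand_template(line):
    fields = line.split()
    n = int(fields[0])              # template length: 1=unigram, 2=bigram, 3=trigram
    labels = fields[1:1 + n]        # the 'n' feature labels
    groups = fields[1 + n:]         # one "p1/p2/.../pn" position group per generated template
    templates = []
    for group in groups:
        offsets = [int(p) for p in group.split('/')]
        templates.append(tuple(zip(labels, offsets)))
    return templates

print(expand_template('2 token token -2/-1 -1/0 0/1 1/2'))
# [(('token', -2), ('token', -1)), (('token', -1), ('token', 0)),
#  (('token', 0), ('token', 1)), (('token', 1), ('token', 2))]
````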



###Adding new features to the opinion entity linking (SVM)###

You will need to modify the script `scripts/extract_feats_relations.py`. There is one function to extract the features from an opinion
expression and a target, for the SVM model expression-target, and another function with the same purpose for the SVM model expression-holder.
These functions are:

````shell
def extract_feats_exp_tar(exp_ids,tar_ids,knaf_obj):
    ...

def extract_feats_exp_hol(exp_ids,hol_ids,knaf_obj):
    ...
````

Both take as input a list of term identifiers for the expression and for the target/holder, and a KAF/NAF tree object representing the input file,
so there is no need to parse it again. These functions return a list of features for the expression, a list of features for the holder/target, and two
extra lists of features (for the expression and for the target/holder) that will be used later to establish features that represent a relation (like
the dependencies, or whether both are in the same sentence or not). In order to do this, there are two functions that take as input two sets of features
and generate these relation features:

````shell
def get_extra_feats_exp_tar(extra_e, extra_t):
    ...

def get_extra_feats_exp_hol(extra_e, extra_h):
    ...
````

The main reason for this is that the features for each expression, target and holder are extracted only once, but later, for instance, each target will act
as a positive example in one case (with its correct expression) and as a negative example for the rest of the possible expressions in the file. So the relation
features cannot be extracted in advance for a given expression/target pair but have to be computed for each pair we consider, and in order to do this we need
the two get_extra_feats functions indicated above.
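
This pairing can be sketched as follows (hypothetical data and names, only to illustrate why the relation features are recomputed per candidate pair):

````python
# Hypothetical sketch: each gold target forms a positive pair with its own
# expression and a negative pair with every other expression in the file,
# so pair-dependent (relation) features must be derived per candidate pair.
def build_candidate_pairs(gold_opinions):
    pairs = []
    expressions = [op['expression'] for op in gold_opinions]
    for op in gold_opinions:
        for exp in expressions:
            label = +1 if exp == op['expression'] else -1
            pairs.append((exp, op['target'], label))
    return pairs

gold = [{'expression': 'very nice', 'target': 'the hotel'},
        {'expression': 'really ugly', 'target': 'the rooms'}]
for exp, tar, label in build_candidate_pairs(gold):
    print((exp, tar, label))
````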

##Contact##
* Ruben Izquierdo
* Vrije University of Amsterdam
* ruben.izquierdobevia@vu.nl
data/core/python-scripts/classify_kaf_naf_file.py
@@ -0,0 +1,499 @@
#!/usr/bin/env python

import sys
import os
import csv
from tempfile import NamedTemporaryFile
from subprocess import Popen, PIPE
import logging
import cPickle
import argparse

from scripts import lexicons as lexicons_manager
from scripts.config_manager import Cconfig_manager, internal_config_filename
from scripts.extract_features import extract_features_from_kaf_naf_file
from scripts.crfutils import extract_features_to_crf
from scripts.link_entities_distance import link_entities_distance
from scripts.relation_classifier import link_entities_svm
from KafNafParserPy import *


DEBUG = 0

my_config_manager = Cconfig_manager()
__this_folder = os.path.dirname(os.path.realpath(__file__))
separator = '\t'
__desc = 'Deluxe opinion miner (CRF+SVM)'
__last_edited = '10jan2014'
__version = '2.0'

logging.basicConfig(stream=sys.stderr, format='%(asctime)s - %(levelname)s\n + %(message)s', level=logging.CRITICAL)

terms_for_token = None


def load_obj_from_file(filename):
    fic = open(filename, 'rb')
    obj = cPickle.load(fic)
    return obj


# Gets the output of crf and a list of token ids, and parses the B- or I- ...
# Output: [(['id0', 'id1', 'id2', 'id3'], 'holder'), (['id4', 'id5', 'id6'], 'target')]
def match_crfsuite_out(crfout, list_token_ids):
    matches = []
    inside = False
    current = []
    current_type = None
    num_token = 0
    for line in crfout.splitlines():
        if len(line) == 0:    #new sentence
            if inside:
                matches.append((current, current_type))
                current = []
                inside = False
        else:
            if line == 'O':
                if inside:
                    matches.append((current, current_type))
                    current = []
                    inside = False
            else:
                my_type = line[0]
                value = line[2:]
                if my_type == 'B':
                    if inside:
                        matches.append((current, current_type))
                    current = [list_token_ids[num_token]]
                    inside = True
                    current_type = value
                elif my_type == 'I':
                    if inside:
                        current.append(list_token_ids[num_token])
                    else:
                        current = [list_token_ids[num_token]]
                        current_type = value
                        inside = True
            num_token += 1
    if inside:
        matches.append((current, current_type))
    return matches


def extract_features(kaf_naf_obj):
    feat_file_desc = NamedTemporaryFile(delete=False)
    feat_file_desc.close()

    out_file = feat_file_desc.name
    err_file = out_file + '.log'

    expressions_lexicon = None
    targets_lexicon = None
    if my_config_manager.get_use_training_lexicons():
        expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
        target_lexicon_filename = my_config_manager.get_target_lexicon_filename()

        expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
        targets_lexicon = lexicons_manager.load_lexicon(target_lexicon_filename)

    #def extract_features_from_kaf_naf_file(knaf_obj,out_file=None,log_file=None,include_class=True,accepted_opinions=None, exp_lex= None):
    labels, separator, polarities_skipped = extract_features_from_kaf_naf_file(kaf_naf_obj, out_file, err_file, include_class=False, exp_lex=expressions_lexicon, tar_lex=targets_lexicon)
    return out_file, err_file


def convert_to_crf(input_file, templates):
    out_desc = NamedTemporaryFile(delete=False)
    out_desc.close()

    out_crf = out_desc.name

    ##Load description of features
    path_feat_desc = my_config_manager.get_feature_desc_filename()
    fic = open(path_feat_desc)
    fields = fic.read().strip()
    fic.close()
    ####

    extract_features_to_crf(input_file, out_crf, fields, separator, templates, possible_classes=None)
    return out_crf


def run_crfsuite_tag(input_file, model_file):
    crfsuite = my_config_manager.get_crfsuite_binary()
    cmd = [crfsuite]
    if not os.path.exists(crfsuite):
        print>>sys.stderr, 'CRFsuite not found on', crfsuite
        print>>sys.stderr, 'Check the config filename and make sure the path is correctly set'
        print>>sys.stderr, '[crfsuite]\npath_to_binary = yourpathtolocalcrfsuite'
        sys.exit(-1)

    cmd.append('tag')
    cmd.append('-m ' + model_file)
    cmd.append(input_file)

    crf_process = Popen(' '.join(cmd), stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True)
    crf_process.wait()
    output = crf_process.stdout.read()
    error = crf_process.stderr.read()
    return output, error

def detect_expressions(tab_feat_file, list_token_ids):
    #1) Convert to the correct CRF
    templates = my_config_manager.get_templates_expr()

    crf_exp_file = convert_to_crf(tab_feat_file, templates)
    logging.debug('File with crf format for EXPRESSIONS ' + crf_exp_file)
    if DEBUG:
        print>>sys.stderr, '#'*50
        print>>sys.stderr, 'CRF FEATURES EXPRESSION'
        f = open(crf_exp_file)
        print>>sys.stderr, f.read()
        f.close()
        print>>sys.stderr, '#'*50

    model_file = my_config_manager.get_filename_model_expression()
    output_crf, error_crf = run_crfsuite_tag(crf_exp_file, model_file)

    logging.debug('Expressions crf error: ' + error_crf)
    matches_exp = match_crfsuite_out(output_crf, list_token_ids)
    if DEBUG:
        print>>sys.stderr, '#'*50
        print>>sys.stderr, 'CRF output for EXPRESSION'
        print>>sys.stderr, 'Raw output CRF:', output_crf
        print>>sys.stderr, 'List token ids:', str(list_token_ids)
        print>>sys.stderr, 'MATCHES:', str(matches_exp)
        print>>sys.stderr, 'TEMP FILE:', crf_exp_file
        print>>sys.stderr, '#'*50

    logging.debug('Detector expressions out: ' + str(matches_exp))
    os.remove(crf_exp_file)
    return matches_exp


def detect_targets(tab_feat_file, list_token_ids):
    templates_target = my_config_manager.get_templates_target()

    crf_target_file = convert_to_crf(tab_feat_file, templates_target)
    logging.debug('File with crf format for TARGETS ' + crf_target_file)
    if DEBUG:
        print>>sys.stderr, '#'*50
        print>>sys.stderr, 'CRF FEATURES TARGETS'
        f = open(crf_target_file)
        print>>sys.stderr, f.read()
        f.close()
        print>>sys.stderr, '#'*50

    model_target_file = my_config_manager.get_filename_model_target()
    out_crf_target, error_crf = run_crfsuite_tag(crf_target_file, model_target_file)
    logging.debug('TARGETS crf error: ' + error_crf)

    matches_tar = match_crfsuite_out(out_crf_target, list_token_ids)

    if DEBUG:
        print>>sys.stderr, '#'*50
        print>>sys.stderr, 'CRF output for TARGETS'
        print>>sys.stderr, 'Raw output CRF:', out_crf_target
        print>>sys.stderr, 'List token ids:', str(list_token_ids)
        print>>sys.stderr, 'MATCHES:', str(matches_tar)
        print>>sys.stderr, '#'*50

    logging.debug('Detector targets out: ' + str(matches_tar))
    os.remove(crf_target_file)
    return matches_tar


def detect_holders(tab_feat_file, list_token_ids):
    templates_holder = my_config_manager.get_templates_holder()

    crf_holder_file = convert_to_crf(tab_feat_file, templates_holder)
    logging.debug('File with crf format for HOLDERS ' + crf_holder_file)
    if DEBUG:
        print>>sys.stderr, '#'*50
        print>>sys.stderr, 'CRF FEATURES HOLDERS'
        f = open(crf_holder_file)
        print>>sys.stderr, f.read()
        f.close()
        print>>sys.stderr, '#'*50

    model_holder_file = my_config_manager.get_filename_model_holder()
    out_crf_holder, error_crf = run_crfsuite_tag(crf_holder_file, model_holder_file)
    logging.debug('HOLDERS crf error: ' + error_crf)

    matches_holder = match_crfsuite_out(out_crf_holder, list_token_ids)

    if DEBUG:
        print>>sys.stderr, '#'*50
        print>>sys.stderr, 'CRF output for HOLDERS'
        print>>sys.stderr, 'Raw output CRF:', out_crf_holder
        print>>sys.stderr, 'List token ids:', str(list_token_ids)
        print>>sys.stderr, 'MATCHES:', str(matches_holder)
        print>>sys.stderr, '#'*50

    logging.debug('Detector HOLDERS out: ' + str(matches_holder))
    os.remove(crf_holder_file)
    return matches_holder


def map_tokens_to_terms(list_tokens, knaf_obj):
    global terms_for_token
    if terms_for_token is None:
        terms_for_token = {}
        for term in knaf_obj.get_terms():
            termid = term.get_id()
            token_ids = term.get_span().get_span_ids()
            for tokid in token_ids:
                if tokid not in terms_for_token:
                    terms_for_token[tokid] = [termid]
                else:
                    terms_for_token[tokid].append(termid)

    ret = set()
    for my_id in list_tokens:
        term_ids = terms_for_token[my_id]
        ret |= set(term_ids)
    return sorted(list(ret))

def add_opinions_to_knaf(triples, knaf_obj, text_for_tid, ids_used, map_to_terms=True, include_polarity_strength=True):
    num_opinion = 0
    for type_exp, span_exp, span_tar, span_hol in triples:
        #Map tokens to terms
        if map_to_terms:
            span_exp_terms = map_tokens_to_terms(span_exp, knaf_obj)
            span_tar_terms = map_tokens_to_terms(span_tar, knaf_obj)
            span_hol_terms = map_tokens_to_terms(span_hol, knaf_obj)
        else:
            span_hol_terms = span_hol
            span_tar_terms = span_tar
            span_exp_terms = span_exp

        ##Creating holder
        span_hol = Cspan()
        span_hol.create_from_ids(span_hol_terms)
        my_hol = Cholder()
        my_hol.set_span(span_hol)

        hol_text = ' '.join(text_for_tid[tid] for tid in span_hol_terms)
        my_hol.set_comment(hol_text)

        #Creating target
        span_tar = Cspan()
        span_tar.create_from_ids(span_tar_terms)
        my_tar = opinion_data.Ctarget()
        my_tar.set_span(span_tar)
        tar_text = ' '.join(text_for_tid[tid] for tid in span_tar_terms)
        my_tar.set_comment(tar_text)
        #########################

        ##Creating expression
        span_exp = Cspan()
        span_exp.create_from_ids(span_exp_terms)
        my_exp = Cexpression()
        my_exp.set_span(span_exp)
        my_exp.set_polarity(type_exp)
        if include_polarity_strength:
            my_exp.set_strength("1")
        exp_text = ' '.join(text_for_tid[tid] for tid in span_exp_terms)
        my_exp.set_comment(exp_text)
        #########################

        #To get the first possible ID not already used
        new_id = None
        while True:
            new_id = 'o' + str(num_opinion + 1)
            if new_id not in ids_used:
                ids_used.add(new_id)
                break
            else:
                num_opinion += 1
        new_opinion = Copinion(type=knaf_obj.get_type())
        new_opinion.set_id(new_id)
        if len(span_hol_terms) != 0:    #To avoid empty holders
            new_opinion.set_holder(my_hol)

        if len(span_tar_terms) != 0:    #To avoid empty targets
            new_opinion.set_target(my_tar)

        new_opinion.set_expression(my_exp)

        knaf_obj.add_opinion(new_opinion)


##
# input_file_stream can be a filename or a stream
# output_file_stream can be a filename or a stream
# The config file must be a string filename
def tag_file_with_opinions(input_file_stream, output_file_stream, model_folder, kaf_obj=None, remove_existing_opinions=True, include_polarity_strength=True, timestamp=True):

    config_filename = os.path.join(model_folder, internal_config_filename)
    if not os.path.exists(config_filename):
        print>>sys.stderr, 'Config file not found on:', config_filename
        sys.exit(-1)

    my_config_manager.set_current_folder(__this_folder)
    my_config_manager.set_config(config_filename)

    if kaf_obj is not None:
        knaf_obj = kaf_obj
    else:
        knaf_obj = KafNafParser(input_file_stream)

    #Create a temporary file
    out_feat_file, err_feat_file = extract_features(knaf_obj)
    if DEBUG:
        print>>sys.stderr, '#'*50
        print>>sys.stderr, 'FEATURE FILE'
        f = open(out_feat_file)
        print>>sys.stderr, f.read()
        f.close()
        print>>sys.stderr, '#'*50

    #get all the tokens in order
    list_token_ids = []
    text_for_wid = {}
    text_for_tid = {}
    sentence_for_token = {}
    for token_obj in knaf_obj.get_tokens():
        token = token_obj.get_text()
        s_id = token_obj.get_sent()
        w_id = token_obj.get_id()
        text_for_wid[w_id] = token

        list_token_ids.append(w_id)
        sentence_for_token[w_id] = s_id

    for term in knaf_obj.get_terms():
        tid = term.get_id()
        toks = [text_for_wid.get(wid, '') for wid in term.get_span().get_span_ids()]
        text_for_tid[tid] = ' '.join(toks)


    expressions = detect_expressions(out_feat_file, list_token_ids)
    targets = detect_targets(out_feat_file, list_token_ids)
    holders = detect_holders(out_feat_file, list_token_ids)

    os.remove(out_feat_file)
    os.remove(err_feat_file)

    if DEBUG:
        print>>sys.stderr, "Expressions detected:"
        for e in expressions:
            print>>sys.stderr, '\t', e, ' '.join([text_for_wid[wid] for wid in e[0]])
        print>>sys.stderr

        print>>sys.stderr, 'Targets detected'
        for t in targets:
            print>>sys.stderr, '\t', t, ' '.join([text_for_wid[wid] for wid in t[0]])
        print>>sys.stderr

        print>>sys.stderr, 'Holders', holders
        for h in holders:
            print>>sys.stderr, '\t', h, ' '.join([text_for_wid[wid] for wid in h[0]])
        print>>sys.stderr


    # Entity linker based on distances
    ####triples = link_entities_distance(expressions,targets,holders,sentence_for_token)

    triples = link_entities_svm(expressions, targets, holders, knaf_obj, my_config_manager)

    ids_used = set()
    if remove_existing_opinions:
        knaf_obj.remove_opinion_layer()
    else:
        for opi in knaf_obj.get_opinions():
            ids_used.add(opi.get_id())


    add_opinions_to_knaf(triples, knaf_obj, text_for_tid, ids_used, map_to_terms=False, include_polarity_strength=include_polarity_strength)

    #Adding linguistic processor
    my_lp = Clp()
    my_lp.set_name(__desc)
    my_lp.set_version(__last_edited + '_' + __version)
    if timestamp:
        my_lp.set_timestamp()    ##Set to the current date and time
    else:
        my_lp.set_timestamp('*')
    knaf_obj.add_linguistic_processor('opinions', my_lp)
    knaf_obj.dump(output_file_stream)


def obtain_predefined_model(lang, domain, just_show=False):
    #This function will read the models from the file models.cfg and will return
    #the model folder for the lang and domain
    # format of the file: 1 model per line: lang|domain|path_to_folder|description
    model_file = os.path.join(__this_folder, 'models.cfg')
    fic = open(model_file)
    use_this_model = None
    if just_show:
        print '#'*25
        print 'Models available'
        print '#'*25

    nm = 0
    for line in fic:
        if line[0] != '#':
            this_lang, this_domain, this_model, this_desc = line.strip().split('|')
            if just_show:
                print ' Model', nm
                print ' Lang:', this_lang
                print ' Domain:', this_domain
                print ' Folder:', this_model
                print ' Desc:', this_desc
                nm += 1
            else:
                if this_lang == lang and this_domain == domain:
                    use_this_model = this_model
                    break
    fic.close()
    if just_show:
        print '#'*25
    return use_this_model


if __name__ == '__main__':

    argument_parser = argparse.ArgumentParser(description='Detect opinion triples in a KAF/NAF file')
    group = argument_parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-m', dest='model_folder', help='Folder storing the trained models')
    group.add_argument('-d', dest='domain', help='The domain where the models were trained')
    group.add_argument('-show-models', dest='show_models', action='store_true', help='Show the models available and finish')

    argument_parser.add_argument('-keep-opinions', dest='keep_opinions', action='store_true', help='Keep the opinions from the input (by default they will be deleted)')
    argument_parser.add_argument('-no-time', dest='timestamp', action='store_false', help='Do not include the time in the timestamp (for testing)')
    arguments = argument_parser.parse_args()

    if arguments.show_models:
        obtain_predefined_model(None, None, just_show=True)
        sys.exit(0)

    knaf_obj = KafNafParser(sys.stdin)
    model_folder = None
    if arguments.model_folder is not None:
        model_folder = arguments.model_folder
    else:
        #Obtain the language
        lang = knaf_obj.get_language()
        model_folder = obtain_predefined_model(lang, arguments.domain)


    tag_file_with_opinions(None, sys.stdout, model_folder, kaf_obj=knaf_obj, remove_existing_opinions=(not arguments.keep_opinions), timestamp=arguments.timestamp)
    sys.exit(0)