opener-opinion-detector-base 2.0.0
- checksums.yaml +7 -0
- data/README.md +101 -0
- data/bin/opinion-detector-base +19 -0
- data/core/annotation.cfg.erb +9 -0
- data/core/packages/KafNafParser-1.4.tar.gz +0 -0
- data/core/packages/VUA_pylib-1.5.tar.gz +0 -0
- data/core/python-scripts/LICENSE +339 -0
- data/core/python-scripts/README.md +226 -0
- data/core/python-scripts/classify_kaf_naf_file.py +499 -0
- data/core/python-scripts/cross_validation.py +634 -0
- data/core/python-scripts/generate_folds.py +134 -0
- data/core/python-scripts/models.cfg +10 -0
- data/core/python-scripts/my_templates/README +33 -0
- data/core/python-scripts/my_templates/templates_exp.only0.txt +6 -0
- data/core/python-scripts/my_templates/templates_exp.pol0.txt +10 -0
- data/core/python-scripts/my_templates/templates_exp.red.txt +7 -0
- data/core/python-scripts/my_templates/templates_exp.txt +10 -0
- data/core/python-scripts/my_templates/templates_holder.only0.txt +11 -0
- data/core/python-scripts/my_templates/templates_holder.red.txt +9 -0
- data/core/python-scripts/my_templates/templates_holder.txt +10 -0
- data/core/python-scripts/my_templates/templates_target.only0.txt +11 -0
- data/core/python-scripts/my_templates/templates_target.red.txt +9 -0
- data/core/python-scripts/my_templates/templates_target.txt +10 -0
- data/core/python-scripts/run_all_experiments.sh +49 -0
- data/core/python-scripts/run_basic.py +20 -0
- data/core/python-scripts/run_experiment.sh +42 -0
- data/core/python-scripts/scripts/__init__.py +1 -0
- data/core/python-scripts/scripts/config_manager.py +314 -0
- data/core/python-scripts/scripts/crfutils.py +215 -0
- data/core/python-scripts/scripts/extract_feats_relations.py +295 -0
- data/core/python-scripts/scripts/extract_features.py +376 -0
- data/core/python-scripts/scripts/feats_to_crf.exp.py +105 -0
- data/core/python-scripts/scripts/lexicons.py +44 -0
- data/core/python-scripts/scripts/link_entities_distance.py +77 -0
- data/core/python-scripts/scripts/relation_classifier.py +250 -0
- data/core/python-scripts/train.py +566 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/SOURCES.txt +22 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/installed-files.txt +47 -0
- data/core/site-packages/pre_build/KafNafParser-1.4-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.py +390 -0
- data/core/site-packages/pre_build/KafNafParser/KafNafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.py +14 -0
- data/core/site-packages/pre_build/KafNafParser/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.py +125 -0
- data/core/site-packages/pre_build/KafNafParser/constituency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.py +52 -0
- data/core/site-packages/pre_build/KafNafParser/coreference_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.py +78 -0
- data/core/site-packages/pre_build/KafNafParser/dependency_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.py +59 -0
- data/core/site-packages/pre_build/KafNafParser/entity_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.py +41 -0
- data/core/site-packages/pre_build/KafNafParser/external_references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.py +2 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/__init__.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.py +205 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/constituency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.py +309 -0
- data/core/site-packages/pre_build/KafNafParser/feature_extractor/dependency.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.py +131 -0
- data/core/site-packages/pre_build/KafNafParser/features_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.py +127 -0
- data/core/site-packages/pre_build/KafNafParser/header_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.py +211 -0
- data/core/site-packages/pre_build/KafNafParser/opinion_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.py +23 -0
- data/core/site-packages/pre_build/KafNafParser/references_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.py +63 -0
- data/core/site-packages/pre_build/KafNafParser/span_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.py +111 -0
- data/core/site-packages/pre_build/KafNafParser/term_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.py +42 -0
- data/core/site-packages/pre_build/KafNafParser/term_sentiment_data.pyc +0 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.py +99 -0
- data/core/site-packages/pre_build/KafNafParser/text_data.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/SOURCES.txt +14 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/installed-files.txt +23 -0
- data/core/site-packages/pre_build/VUA_pylib-1.5-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/common/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.py +28 -0
- data/core/site-packages/pre_build/VUA_pylib/common/common.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.py +156 -0
- data/core/site-packages/pre_build/VUA_pylib/corpus_reader/google_web_nl.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.py +121 -0
- data/core/site-packages/pre_build/VUA_pylib/io_utils/feature_file.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.py +1 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/__init__.pyc +0 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.py +72 -0
- data/core/site-packages/pre_build/VUA_pylib/lexicon/lexicon.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/PKG-INFO +10 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/SOURCES.txt +7 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/dependency_links.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/installed-files.txt +11 -0
- data/core/site-packages/pre_build/VUKafParserPy-1.0-py2.7.egg-info/top_level.txt +1 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.py +165 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafDataObjectsMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.py +439 -0
- data/core/site-packages/pre_build/VUKafParserPy/KafParserMod.pyc +0 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.py +7 -0
- data/core/site-packages/pre_build/VUKafParserPy/__init__.pyc +0 -0
- data/core/vendor/src/crfsuite/AUTHORS +1 -0
- data/core/vendor/src/crfsuite/COPYING +27 -0
- data/core/vendor/src/crfsuite/ChangeLog +103 -0
- data/core/vendor/src/crfsuite/INSTALL +236 -0
- data/core/vendor/src/crfsuite/Makefile.am +19 -0
- data/core/vendor/src/crfsuite/Makefile.in +783 -0
- data/core/vendor/src/crfsuite/README +183 -0
- data/core/vendor/src/crfsuite/aclocal.m4 +9018 -0
- data/core/vendor/src/crfsuite/autogen.sh +38 -0
- data/core/vendor/src/crfsuite/compile +143 -0
- data/core/vendor/src/crfsuite/config.guess +1502 -0
- data/core/vendor/src/crfsuite/config.h.in +198 -0
- data/core/vendor/src/crfsuite/config.sub +1714 -0
- data/core/vendor/src/crfsuite/configure +14273 -0
- data/core/vendor/src/crfsuite/configure.in +149 -0
- data/core/vendor/src/crfsuite/crfsuite.sln +42 -0
- data/core/vendor/src/crfsuite/depcomp +630 -0
- data/core/vendor/src/crfsuite/example/chunking.py +49 -0
- data/core/vendor/src/crfsuite/example/crfutils.py +179 -0
- data/core/vendor/src/crfsuite/example/ner.py +270 -0
- data/core/vendor/src/crfsuite/example/pos.py +78 -0
- data/core/vendor/src/crfsuite/example/template.py +88 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.am +29 -0
- data/core/vendor/src/crfsuite/frontend/Makefile.in +640 -0
- data/core/vendor/src/crfsuite/frontend/dump.c +116 -0
- data/core/vendor/src/crfsuite/frontend/frontend.vcxproj +129 -0
- data/core/vendor/src/crfsuite/frontend/iwa.c +273 -0
- data/core/vendor/src/crfsuite/frontend/iwa.h +65 -0
- data/core/vendor/src/crfsuite/frontend/learn.c +439 -0
- data/core/vendor/src/crfsuite/frontend/main.c +137 -0
- data/core/vendor/src/crfsuite/frontend/option.c +93 -0
- data/core/vendor/src/crfsuite/frontend/option.h +86 -0
- data/core/vendor/src/crfsuite/frontend/readdata.h +38 -0
- data/core/vendor/src/crfsuite/frontend/reader.c +136 -0
- data/core/vendor/src/crfsuite/frontend/tag.c +427 -0
- data/core/vendor/src/crfsuite/genbinary.sh.in +15 -0
- data/core/vendor/src/crfsuite/include/Makefile.am +11 -0
- data/core/vendor/src/crfsuite/include/Makefile.in +461 -0
- data/core/vendor/src/crfsuite/include/crfsuite.h +1063 -0
- data/core/vendor/src/crfsuite/include/crfsuite.hpp +555 -0
- data/core/vendor/src/crfsuite/include/crfsuite_api.hpp +400 -0
- data/core/vendor/src/crfsuite/include/os.h +61 -0
- data/core/vendor/src/crfsuite/install-sh +520 -0
- data/core/vendor/src/crfsuite/lib/cqdb/COPYING +28 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.am +21 -0
- data/core/vendor/src/crfsuite/lib/cqdb/Makefile.in +549 -0
- data/core/vendor/src/crfsuite/lib/cqdb/cqdb.vcxproj +86 -0
- data/core/vendor/src/crfsuite/lib/cqdb/include/cqdb.h +524 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/cqdb.c +587 -0
- data/core/vendor/src/crfsuite/lib/cqdb/src/lookup3.c +976 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.am +46 -0
- data/core/vendor/src/crfsuite/lib/crf/Makefile.in +721 -0
- data/core/vendor/src/crfsuite/lib/crf/crf.vcxproj +216 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d.h +353 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_context.c +705 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_encode.c +943 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_feature.c +352 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_model.c +994 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crf1d_tag.c +550 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite.c +492 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_internal.h +236 -0
- data/core/vendor/src/crfsuite/lib/crf/src/crfsuite_train.c +272 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dataset.c +106 -0
- data/core/vendor/src/crfsuite/lib/crf/src/dictionary.c +118 -0
- data/core/vendor/src/crfsuite/lib/crf/src/holdout.c +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.c +91 -0
- data/core/vendor/src/crfsuite/lib/crf/src/logging.h +48 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.c +335 -0
- data/core/vendor/src/crfsuite/lib/crf/src/params.h +80 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.c +172 -0
- data/core/vendor/src/crfsuite/lib/crf/src/quark.h +46 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.c +1107 -0
- data/core/vendor/src/crfsuite/lib/crf/src/rumavl.h +160 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_arow.c +408 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_averaged_perceptron.c +242 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_l2sgd.c +507 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_lbfgs.c +338 -0
- data/core/vendor/src/crfsuite/lib/crf/src/train_passive_aggressive.c +435 -0
- data/core/vendor/src/crfsuite/lib/crf/src/vecmath.h +341 -0
- data/core/vendor/src/crfsuite/ltmain.sh +8413 -0
- data/core/vendor/src/crfsuite/missing +376 -0
- data/core/vendor/src/crfsuite/swig/Makefile.am +13 -0
- data/core/vendor/src/crfsuite/swig/Makefile.in +365 -0
- data/core/vendor/src/crfsuite/swig/crfsuite.cpp +2 -0
- data/core/vendor/src/crfsuite/swig/export.i +32 -0
- data/core/vendor/src/crfsuite/swig/python/README +92 -0
- data/core/vendor/src/crfsuite/swig/python/crfsuite.py +329 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.cpp +14355 -0
- data/core/vendor/src/crfsuite/swig/python/export_wrap.h +63 -0
- data/core/vendor/src/crfsuite/swig/python/prepare.sh +9 -0
- data/core/vendor/src/crfsuite/swig/python/sample_tag.py +52 -0
- data/core/vendor/src/crfsuite/swig/python/sample_train.py +68 -0
- data/core/vendor/src/crfsuite/swig/python/setup.py +44 -0
- data/core/vendor/src/crfsuite/win32/stdint.h +679 -0
- data/core/vendor/src/liblbfgs/AUTHORS +1 -0
- data/core/vendor/src/liblbfgs/COPYING +22 -0
- data/core/vendor/src/liblbfgs/ChangeLog +120 -0
- data/core/vendor/src/liblbfgs/INSTALL +231 -0
- data/core/vendor/src/liblbfgs/Makefile.am +10 -0
- data/core/vendor/src/liblbfgs/Makefile.in +638 -0
- data/core/vendor/src/liblbfgs/NEWS +0 -0
- data/core/vendor/src/liblbfgs/README +71 -0
- data/core/vendor/src/liblbfgs/aclocal.m4 +6985 -0
- data/core/vendor/src/liblbfgs/autogen.sh +38 -0
- data/core/vendor/src/liblbfgs/config.guess +1411 -0
- data/core/vendor/src/liblbfgs/config.h.in +64 -0
- data/core/vendor/src/liblbfgs/config.sub +1500 -0
- data/core/vendor/src/liblbfgs/configure +21146 -0
- data/core/vendor/src/liblbfgs/configure.in +107 -0
- data/core/vendor/src/liblbfgs/depcomp +522 -0
- data/core/vendor/src/liblbfgs/include/lbfgs.h +745 -0
- data/core/vendor/src/liblbfgs/install-sh +322 -0
- data/core/vendor/src/liblbfgs/lbfgs.sln +26 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.am +24 -0
- data/core/vendor/src/liblbfgs/lib/Makefile.in +499 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h +133 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h +294 -0
- data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h +298 -0
- data/core/vendor/src/liblbfgs/lib/lbfgs.c +1371 -0
- data/core/vendor/src/liblbfgs/lib/lib.vcxproj +95 -0
- data/core/vendor/src/liblbfgs/ltmain.sh +6426 -0
- data/core/vendor/src/liblbfgs/missing +353 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.am +15 -0
- data/core/vendor/src/liblbfgs/sample/Makefile.in +433 -0
- data/core/vendor/src/liblbfgs/sample/sample.c +81 -0
- data/core/vendor/src/liblbfgs/sample/sample.cpp +126 -0
- data/core/vendor/src/liblbfgs/sample/sample.vcxproj +105 -0
- data/core/vendor/src/svm_light/LICENSE.txt +59 -0
- data/core/vendor/src/svm_light/Makefile +105 -0
- data/core/vendor/src/svm_light/kernel.h +40 -0
- data/core/vendor/src/svm_light/svm_classify.c +197 -0
- data/core/vendor/src/svm_light/svm_common.c +985 -0
- data/core/vendor/src/svm_light/svm_common.h +301 -0
- data/core/vendor/src/svm_light/svm_hideo.c +1062 -0
- data/core/vendor/src/svm_light/svm_learn.c +4147 -0
- data/core/vendor/src/svm_light/svm_learn.h +169 -0
- data/core/vendor/src/svm_light/svm_learn_main.c +397 -0
- data/core/vendor/src/svm_light/svm_loqo.c +211 -0
- data/ext/hack/Rakefile +17 -0
- data/ext/hack/support.rb +88 -0
- data/lib/opener/opinion_detectors/base.rb +112 -0
- data/lib/opener/opinion_detectors/base/version.rb +7 -0
- data/lib/opener/opinion_detectors/configuration_creator.rb +86 -0
- data/lib/opener/opinion_detectors/de.rb +7 -0
- data/lib/opener/opinion_detectors/en.rb +7 -0
- data/lib/opener/opinion_detectors/it.rb +7 -0
- data/lib/opener/opinion_detectors/nl.rb +6 -0
- data/opener-opinion-detector-base.gemspec +35 -0
- data/pre_build_requirements.txt +3 -0
- metadata +374 -0
data/core/vendor/src/liblbfgs/lib/arithmetic_ansi.h
@@ -0,0 +1,133 @@
/*
 * ANSI C implementation of vector operations.
 *
 * Copyright (c) 2007-2010 Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id$ */

#include <stdlib.h>
#include <memory.h>

#if LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
#else
#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
#endif/*LBFGS_IEEE_FLOAT*/

inline static void* vecalloc(size_t size)
{
    void *memblock = malloc(size);
    if (memblock) {
        memset(memblock, 0, size);
    }
    return memblock;
}

inline static void vecfree(void *memblock)
{
    free(memblock);
}

inline static void vecset(lbfgsfloatval_t *x, const lbfgsfloatval_t c, const int n)
{
    int i;

    for (i = 0;i < n;++i) {
        x[i] = c;
    }
}

inline static void veccpy(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
{
    int i;

    for (i = 0;i < n;++i) {
        y[i] = x[i];
    }
}

inline static void vecncpy(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
{
    int i;

    for (i = 0;i < n;++i) {
        y[i] = -x[i];
    }
}

inline static void vecadd(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const lbfgsfloatval_t c, const int n)
{
    int i;

    for (i = 0;i < n;++i) {
        y[i] += c * x[i];
    }
}

inline static void vecdiff(lbfgsfloatval_t *z, const lbfgsfloatval_t *x, const lbfgsfloatval_t *y, const int n)
{
    int i;

    for (i = 0;i < n;++i) {
        z[i] = x[i] - y[i];
    }
}

inline static void vecscale(lbfgsfloatval_t *y, const lbfgsfloatval_t c, const int n)
{
    int i;

    for (i = 0;i < n;++i) {
        y[i] *= c;
    }
}

inline static void vecmul(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
{
    int i;

    for (i = 0;i < n;++i) {
        y[i] *= x[i];
    }
}

inline static void vecdot(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const lbfgsfloatval_t *y, const int n)
{
    int i;
    *s = 0.;
    for (i = 0;i < n;++i) {
        *s += x[i] * y[i];
    }
}

inline static void vec2norm(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const int n)
{
    vecdot(s, x, x, n);
    *s = (lbfgsfloatval_t)sqrt(*s);
}

inline static void vec2norminv(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const int n)
{
    vec2norm(s, x, n);
    *s = (lbfgsfloatval_t)(1.0 / *s);
}
data/core/vendor/src/liblbfgs/lib/arithmetic_sse_double.h
@@ -0,0 +1,294 @@
/*
 * SSE2 implementation of vector operations (64bit double).
 *
 * Copyright (c) 2007-2010 Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id$ */

#include <stdlib.h>
#ifndef __APPLE__
#include <malloc.h>
#endif
#include <memory.h>

#if 1400 <= _MSC_VER
#include <intrin.h>
#endif/*1400 <= _MSC_VER*/

#if HAVE_EMMINTRIN_H
#include <emmintrin.h>
#endif/*HAVE_EMMINTRIN_H*/

inline static void* vecalloc(size_t size)
{
#if defined(_MSC_VER)
    void *memblock = _aligned_malloc(size, 16);
#elif defined(__APPLE__) /* OS X always aligns on 16-byte boundaries */
    void *memblock = malloc(size);
#else
    void *memblock = NULL, *p = NULL;
    if (posix_memalign(&p, 16, size) == 0) {
        memblock = p;
    }
#endif
    if (memblock != NULL) {
        memset(memblock, 0, size);
    }
    return memblock;
}

inline static void vecfree(void *memblock)
{
#ifdef _MSC_VER
    _aligned_free(memblock);
#else
    free(memblock);
#endif
}

#define fsigndiff(x, y) \
    ((_mm_movemask_pd(_mm_set_pd(*(x), *(y))) + 1) & 0x002)

#define vecset(x, c, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 8) { \
        _mm_store_pd((x)+i  , XMM0); \
        _mm_store_pd((x)+i+2, XMM0); \
        _mm_store_pd((x)+i+4, XMM0); \
        _mm_store_pd((x)+i+6, XMM0); \
    } \
}

#define veccpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        _mm_store_pd((y)+i  , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
        _mm_store_pd((y)+i+4, XMM2); \
        _mm_store_pd((y)+i+6, XMM3); \
    } \
}

#define vecncpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_setzero_pd(); \
        __m128d XMM1 = _mm_setzero_pd(); \
        __m128d XMM2 = _mm_setzero_pd(); \
        __m128d XMM3 = _mm_setzero_pd(); \
        __m128d XMM4 = _mm_load_pd((x)+i  ); \
        __m128d XMM5 = _mm_load_pd((x)+i+2); \
        __m128d XMM6 = _mm_load_pd((x)+i+4); \
        __m128d XMM7 = _mm_load_pd((x)+i+6); \
        XMM0 = _mm_sub_pd(XMM0, XMM4); \
        XMM1 = _mm_sub_pd(XMM1, XMM5); \
        XMM2 = _mm_sub_pd(XMM2, XMM6); \
        XMM3 = _mm_sub_pd(XMM3, XMM7); \
        _mm_store_pd((y)+i  , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
        _mm_store_pd((y)+i+4, XMM2); \
        _mm_store_pd((y)+i+6, XMM3); \
    } \
}

#define vecadd(y, x, c, n) \
{ \
    int i; \
    __m128d XMM7 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 4) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((y)+i  ); \
        __m128d XMM3 = _mm_load_pd((y)+i+2); \
        XMM0 = _mm_mul_pd(XMM0, XMM7); \
        XMM1 = _mm_mul_pd(XMM1, XMM7); \
        XMM2 = _mm_add_pd(XMM2, XMM0); \
        XMM3 = _mm_add_pd(XMM3, XMM1); \
        _mm_store_pd((y)+i  , XMM2); \
        _mm_store_pd((y)+i+2, XMM3); \
    } \
}

#define vecdiff(z, x, y, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        __m128d XMM4 = _mm_load_pd((y)+i  ); \
        __m128d XMM5 = _mm_load_pd((y)+i+2); \
        __m128d XMM6 = _mm_load_pd((y)+i+4); \
        __m128d XMM7 = _mm_load_pd((y)+i+6); \
        XMM0 = _mm_sub_pd(XMM0, XMM4); \
        XMM1 = _mm_sub_pd(XMM1, XMM5); \
        XMM2 = _mm_sub_pd(XMM2, XMM6); \
        XMM3 = _mm_sub_pd(XMM3, XMM7); \
        _mm_store_pd((z)+i  , XMM0); \
        _mm_store_pd((z)+i+2, XMM1); \
        _mm_store_pd((z)+i+4, XMM2); \
        _mm_store_pd((z)+i+6, XMM3); \
    } \
}

#define vecscale(y, c, n) \
{ \
    int i; \
    __m128d XMM7 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 4) { \
        __m128d XMM0 = _mm_load_pd((y)+i  ); \
        __m128d XMM1 = _mm_load_pd((y)+i+2); \
        XMM0 = _mm_mul_pd(XMM0, XMM7); \
        XMM1 = _mm_mul_pd(XMM1, XMM7); \
        _mm_store_pd((y)+i  , XMM0); \
        _mm_store_pd((y)+i+2, XMM1); \
    } \
}

#define vecmul(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 8) { \
        __m128d XMM0 = _mm_load_pd((x)+i  ); \
        __m128d XMM1 = _mm_load_pd((x)+i+2); \
        __m128d XMM2 = _mm_load_pd((x)+i+4); \
        __m128d XMM3 = _mm_load_pd((x)+i+6); \
        __m128d XMM4 = _mm_load_pd((y)+i  ); \
        __m128d XMM5 = _mm_load_pd((y)+i+2); \
        __m128d XMM6 = _mm_load_pd((y)+i+4); \
        __m128d XMM7 = _mm_load_pd((y)+i+6); \
        XMM4 = _mm_mul_pd(XMM4, XMM0); \
        XMM5 = _mm_mul_pd(XMM5, XMM1); \
        XMM6 = _mm_mul_pd(XMM6, XMM2); \
        XMM7 = _mm_mul_pd(XMM7, XMM3); \
        _mm_store_pd((y)+i  , XMM4); \
        _mm_store_pd((y)+i+2, XMM5); \
        _mm_store_pd((y)+i+4, XMM6); \
        _mm_store_pd((y)+i+6, XMM7); \
    } \
}


#if 3 <= __SSE__ || defined(__SSE3__)
/*
    Horizontal add with haddps SSE3 instruction. The work register (rw)
    is unused.
 */
#define __horizontal_sum(r, rw) \
    r = _mm_hadd_ps(r, r); \
    r = _mm_hadd_ps(r, r);

#else
/*
    Horizontal add with SSE instruction. The work register (rw) is used.
 */
#define __horizontal_sum(r, rw) \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
    r = _mm_add_ps(r, rw); \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
    r = _mm_add_ps(r, rw);

#endif

#define vecdot(s, x, y, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i  ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = _mm_load_pd((y)+i  ); \
        XMM5 = _mm_load_pd((y)+i+2); \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    _mm_store_sd((s), XMM0); \
}

#define vec2norm(s, x, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i  ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = XMM2; \
        XMM5 = XMM3; \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM0 = _mm_sqrt_pd(XMM0); \
    _mm_store_sd((s), XMM0); \
}

#define vec2norminv(s, x, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_setzero_pd(); \
    __m128d XMM1 = _mm_setzero_pd(); \
    __m128d XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 4) { \
        XMM2 = _mm_load_pd((x)+i  ); \
        XMM3 = _mm_load_pd((x)+i+2); \
        XMM4 = XMM2; \
        XMM5 = XMM3; \
        XMM2 = _mm_mul_pd(XMM2, XMM4); \
        XMM3 = _mm_mul_pd(XMM3, XMM5); \
        XMM0 = _mm_add_pd(XMM0, XMM2); \
        XMM1 = _mm_add_pd(XMM1, XMM3); \
    } \
    XMM2 = _mm_set1_pd(1.0); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
    XMM0 = _mm_add_pd(XMM0, XMM1); \
    XMM0 = _mm_sqrt_pd(XMM0); \
    XMM2 = _mm_div_pd(XMM2, XMM0); \
    _mm_store_sd((s), XMM2); \
}
data/core/vendor/src/liblbfgs/lib/arithmetic_sse_float.h
@@ -0,0 +1,298 @@
/*
 * SSE/SSE3 implementation of vector operations (32bit float).
 *
 * Copyright (c) 2007-2010 Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id$ */

#include <stdlib.h>
#ifndef __APPLE__
#include <malloc.h>
#endif
#include <memory.h>

#if 1400 <= _MSC_VER
#include <intrin.h>
#endif/*_MSC_VER*/

#if HAVE_XMMINTRIN_H
#include <xmmintrin.h>
#endif/*HAVE_XMMINTRIN_H*/

#if LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
#else
#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
#endif/*LBFGS_IEEE_FLOAT*/

inline static void* vecalloc(size_t size)
{
#if defined(_MSC_VER)
    void *memblock = _aligned_malloc(size, 16);
#elif defined(__APPLE__) /* OS X always aligns on 16-byte boundaries */
    void *memblock = malloc(size);
#else
    void *memblock = NULL, *p = NULL;
    if (posix_memalign(&p, 16, size) == 0) {
        memblock = p;
    }
#endif
    if (memblock != NULL) {
        memset(memblock, 0, size);
    }
    return memblock;
}

inline static void vecfree(void *memblock)
{
    _aligned_free(memblock);
}

#define vecset(x, c, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_set_ps1(c); \
    for (i = 0;i < (n);i += 16) { \
        _mm_store_ps((x)+i   , XMM0); \
        _mm_store_ps((x)+i+ 4, XMM0); \
        _mm_store_ps((x)+i+ 8, XMM0); \
        _mm_store_ps((x)+i+12, XMM0); \
    } \
}

#define veccpy(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        _mm_store_ps((y)+i   , XMM0); \
        _mm_store_ps((y)+i+ 4, XMM1); \
        _mm_store_ps((y)+i+ 8, XMM2); \
        _mm_store_ps((y)+i+12, XMM3); \
    } \
}

#define vecncpy(y, x, n) \
{ \
    int i; \
    const uint32_t mask = 0x80000000; \
    __m128 XMM4 = _mm_load_ps1((float*)&mask); \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        XMM0 = _mm_xor_ps(XMM0, XMM4); \
        XMM1 = _mm_xor_ps(XMM1, XMM4); \
        XMM2 = _mm_xor_ps(XMM2, XMM4); \
        XMM3 = _mm_xor_ps(XMM3, XMM4); \
        _mm_store_ps((y)+i   , XMM0); \
        _mm_store_ps((y)+i+ 4, XMM1); \
        _mm_store_ps((y)+i+ 8, XMM2); \
        _mm_store_ps((y)+i+12, XMM3); \
    } \
}

#define vecadd(y, x, c, n) \
{ \
    int i; \
    __m128 XMM7 = _mm_set_ps1(c); \
    for (i = 0;i < (n);i += 8) { \
        __m128 XMM0 = _mm_load_ps((x)+i  ); \
        __m128 XMM1 = _mm_load_ps((x)+i+4); \
        __m128 XMM2 = _mm_load_ps((y)+i  ); \
        __m128 XMM3 = _mm_load_ps((y)+i+4); \
        XMM0 = _mm_mul_ps(XMM0, XMM7); \
        XMM1 = _mm_mul_ps(XMM1, XMM7); \
        XMM2 = _mm_add_ps(XMM2, XMM0); \
        XMM3 = _mm_add_ps(XMM3, XMM1); \
        _mm_store_ps((y)+i  , XMM2); \
        _mm_store_ps((y)+i+4, XMM3); \
    } \
}

#define vecdiff(z, x, y, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        __m128 XMM4 = _mm_load_ps((y)+i   ); \
        __m128 XMM5 = _mm_load_ps((y)+i+ 4); \
        __m128 XMM6 = _mm_load_ps((y)+i+ 8); \
        __m128 XMM7 = _mm_load_ps((y)+i+12); \
        XMM0 = _mm_sub_ps(XMM0, XMM4); \
        XMM1 = _mm_sub_ps(XMM1, XMM5); \
        XMM2 = _mm_sub_ps(XMM2, XMM6); \
        XMM3 = _mm_sub_ps(XMM3, XMM7); \
        _mm_store_ps((z)+i   , XMM0); \
        _mm_store_ps((z)+i+ 4, XMM1); \
        _mm_store_ps((z)+i+ 8, XMM2); \
        _mm_store_ps((z)+i+12, XMM3); \
    } \
}

#define vecscale(y, c, n) \
{ \
    int i; \
    __m128 XMM7 = _mm_set_ps1(c); \
    for (i = 0;i < (n);i += 8) { \
        __m128 XMM0 = _mm_load_ps((y)+i  ); \
        __m128 XMM1 = _mm_load_ps((y)+i+4); \
        XMM0 = _mm_mul_ps(XMM0, XMM7); \
        XMM1 = _mm_mul_ps(XMM1, XMM7); \
        _mm_store_ps((y)+i  , XMM0); \
        _mm_store_ps((y)+i+4, XMM1); \
    } \
}

#define vecmul(y, x, n) \
{ \
    int i; \
    for (i = 0;i < (n);i += 16) { \
        __m128 XMM0 = _mm_load_ps((x)+i   ); \
        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
        __m128 XMM3 = _mm_load_ps((x)+i+12); \
        __m128 XMM4 = _mm_load_ps((y)+i   ); \
        __m128 XMM5 = _mm_load_ps((y)+i+ 4); \
        __m128 XMM6 = _mm_load_ps((y)+i+ 8); \
        __m128 XMM7 = _mm_load_ps((y)+i+12); \
        XMM4 = _mm_mul_ps(XMM4, XMM0); \
        XMM5 = _mm_mul_ps(XMM5, XMM1); \
        XMM6 = _mm_mul_ps(XMM6, XMM2); \
        XMM7 = _mm_mul_ps(XMM7, XMM3); \
        _mm_store_ps((y)+i   , XMM4); \
        _mm_store_ps((y)+i+ 4, XMM5); \
        _mm_store_ps((y)+i+ 8, XMM6); \
        _mm_store_ps((y)+i+12, XMM7); \
    } \
}


#if 3 <= __SSE__ || defined(__SSE3__)
/*
    Horizontal add with haddps SSE3 instruction. The work register (rw)
    is unused.
 */
#define __horizontal_sum(r, rw) \
    r = _mm_hadd_ps(r, r); \
    r = _mm_hadd_ps(r, r);

#else
/*
    Horizontal add with SSE instruction. The work register (rw) is used.
 */
#define __horizontal_sum(r, rw) \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
    r = _mm_add_ps(r, rw); \
    rw = r; \
    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
    r = _mm_add_ps(r, rw);

#endif

#define vecdot(s, x, y, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3, XMM4, XMM5; \
    for (i = 0;i < (n);i += 8) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM4 = _mm_load_ps((y)+i  ); \
        XMM5 = _mm_load_ps((y)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM4); \
        XMM3 = _mm_mul_ps(XMM3, XMM5); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    _mm_store_ss((s), XMM0); \
}

#define vec2norm(s, x, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3; \
    for (i = 0;i < (n);i += 8) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM2); \
        XMM3 = _mm_mul_ps(XMM3, XMM3); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    XMM2 = XMM0; \
    XMM1 = _mm_rsqrt_ss(XMM0); \
    XMM3 = XMM1; \
    XMM1 = _mm_mul_ss(XMM1, XMM1); \
    XMM1 = _mm_mul_ss(XMM1, XMM3); \
    XMM1 = _mm_mul_ss(XMM1, XMM0); \
    XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); \
    XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
    XMM3 = _mm_add_ss(XMM3, XMM1); \
    XMM3 = _mm_mul_ss(XMM3, XMM2); \
    _mm_store_ss((s), XMM3); \
}

#define vec2norminv(s, x, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3; \
    for (i = 0;i < (n);i += 16) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM2); \
        XMM3 = _mm_mul_ps(XMM3, XMM3); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    XMM2 = XMM0; \
    XMM1 = _mm_rsqrt_ss(XMM0); \
    XMM3 = XMM1; \
    XMM1 = _mm_mul_ss(XMM1, XMM1); \
    XMM1 = _mm_mul_ss(XMM1, XMM3); \
    XMM1 = _mm_mul_ss(XMM1, XMM0); \
    XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); \
    XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
    XMM3 = _mm_add_ss(XMM3, XMM1); \
    _mm_store_ss((s), XMM3); \
}