fasttext 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +251 -0
- data/ext/fasttext/ext.cpp +291 -0
- data/ext/fasttext/extconf.rb +15 -0
- data/lib/fasttext.rb +41 -0
- data/lib/fasttext/classifier.rb +92 -0
- data/lib/fasttext/ext.bundle +0 -0
- data/lib/fasttext/model.rb +60 -0
- data/lib/fasttext/vectorizer.rb +58 -0
- data/lib/fasttext/version.rb +3 -0
- data/vendor/fastText/CMakeLists.txt +68 -0
- data/vendor/fastText/CODE_OF_CONDUCT.md +2 -0
- data/vendor/fastText/CONTRIBUTING.md +32 -0
- data/vendor/fastText/LICENSE +21 -0
- data/vendor/fastText/MANIFEST.in +5 -0
- data/vendor/fastText/Makefile +63 -0
- data/vendor/fastText/README.md +339 -0
- data/vendor/fastText/alignment/README.md +53 -0
- data/vendor/fastText/alignment/align.py +145 -0
- data/vendor/fastText/alignment/eval.py +60 -0
- data/vendor/fastText/alignment/example.sh +51 -0
- data/vendor/fastText/alignment/unsup_align.py +109 -0
- data/vendor/fastText/alignment/utils.py +154 -0
- data/vendor/fastText/classification-example.sh +41 -0
- data/vendor/fastText/classification-results.sh +94 -0
- data/vendor/fastText/crawl/README.md +26 -0
- data/vendor/fastText/crawl/dedup.cc +51 -0
- data/vendor/fastText/crawl/download_crawl.sh +57 -0
- data/vendor/fastText/crawl/filter_dedup.sh +13 -0
- data/vendor/fastText/crawl/filter_utf8.cc +105 -0
- data/vendor/fastText/crawl/process_wet_file.sh +30 -0
- data/vendor/fastText/docs/aligned-vectors.md +64 -0
- data/vendor/fastText/docs/api.md +6 -0
- data/vendor/fastText/docs/cheatsheet.md +66 -0
- data/vendor/fastText/docs/crawl-vectors.md +125 -0
- data/vendor/fastText/docs/dataset.md +6 -0
- data/vendor/fastText/docs/english-vectors.md +53 -0
- data/vendor/fastText/docs/faqs.md +63 -0
- data/vendor/fastText/docs/language-identification.md +47 -0
- data/vendor/fastText/docs/options.md +50 -0
- data/vendor/fastText/docs/pretrained-vectors.md +142 -0
- data/vendor/fastText/docs/python-module.md +314 -0
- data/vendor/fastText/docs/references.md +41 -0
- data/vendor/fastText/docs/supervised-models.md +54 -0
- data/vendor/fastText/docs/supervised-tutorial.md +349 -0
- data/vendor/fastText/docs/support.md +58 -0
- data/vendor/fastText/docs/unsupervised-tutorials.md +309 -0
- data/vendor/fastText/eval.py +95 -0
- data/vendor/fastText/get-wikimedia.sh +79 -0
- data/vendor/fastText/python/README.md +322 -0
- data/vendor/fastText/python/README.rst +406 -0
- data/vendor/fastText/python/benchmarks/README.rst +3 -0
- data/vendor/fastText/python/benchmarks/get_word_vector.py +49 -0
- data/vendor/fastText/python/doc/examples/FastTextEmbeddingBag.py +81 -0
- data/vendor/fastText/python/doc/examples/bin_to_vec.py +41 -0
- data/vendor/fastText/python/doc/examples/compute_accuracy.py +163 -0
- data/vendor/fastText/python/doc/examples/get_vocab.py +48 -0
- data/vendor/fastText/python/doc/examples/train_supervised.py +42 -0
- data/vendor/fastText/python/doc/examples/train_unsupervised.py +56 -0
- data/vendor/fastText/python/fasttext_module/fasttext/FastText.py +468 -0
- data/vendor/fastText/python/fasttext_module/fasttext/__init__.py +22 -0
- data/vendor/fastText/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc +388 -0
- data/vendor/fastText/python/fasttext_module/fasttext/tests/__init__.py +14 -0
- data/vendor/fastText/python/fasttext_module/fasttext/tests/test_configurations.py +239 -0
- data/vendor/fastText/python/fasttext_module/fasttext/tests/test_script.py +629 -0
- data/vendor/fastText/python/fasttext_module/fasttext/util/__init__.py +13 -0
- data/vendor/fastText/python/fasttext_module/fasttext/util/util.py +60 -0
- data/vendor/fastText/quantization-example.sh +40 -0
- data/vendor/fastText/runtests.py +60 -0
- data/vendor/fastText/scripts/kbcompletion/README.md +19 -0
- data/vendor/fastText/scripts/kbcompletion/data.sh +69 -0
- data/vendor/fastText/scripts/kbcompletion/eval.cpp +108 -0
- data/vendor/fastText/scripts/kbcompletion/fb15k.sh +49 -0
- data/vendor/fastText/scripts/kbcompletion/fb15k237.sh +45 -0
- data/vendor/fastText/scripts/kbcompletion/svo.sh +38 -0
- data/vendor/fastText/scripts/kbcompletion/wn18.sh +49 -0
- data/vendor/fastText/scripts/quantization/quantization-results.sh +43 -0
- data/vendor/fastText/setup.cfg +2 -0
- data/vendor/fastText/setup.py +203 -0
- data/vendor/fastText/src/args.cc +320 -0
- data/vendor/fastText/src/args.h +68 -0
- data/vendor/fastText/src/densematrix.cc +155 -0
- data/vendor/fastText/src/densematrix.h +75 -0
- data/vendor/fastText/src/dictionary.cc +540 -0
- data/vendor/fastText/src/dictionary.h +111 -0
- data/vendor/fastText/src/fasttext.cc +821 -0
- data/vendor/fastText/src/fasttext.h +191 -0
- data/vendor/fastText/src/loss.cc +346 -0
- data/vendor/fastText/src/loss.h +163 -0
- data/vendor/fastText/src/main.cc +435 -0
- data/vendor/fastText/src/matrix.cc +25 -0
- data/vendor/fastText/src/matrix.h +44 -0
- data/vendor/fastText/src/meter.cc +68 -0
- data/vendor/fastText/src/meter.h +69 -0
- data/vendor/fastText/src/model.cc +98 -0
- data/vendor/fastText/src/model.h +79 -0
- data/vendor/fastText/src/productquantizer.cc +251 -0
- data/vendor/fastText/src/productquantizer.h +63 -0
- data/vendor/fastText/src/quantmatrix.cc +117 -0
- data/vendor/fastText/src/quantmatrix.h +60 -0
- data/vendor/fastText/src/real.h +15 -0
- data/vendor/fastText/src/utils.cc +28 -0
- data/vendor/fastText/src/utils.h +43 -0
- data/vendor/fastText/src/vector.cc +97 -0
- data/vendor/fastText/src/vector.h +61 -0
- data/vendor/fastText/tests/fetch_test_data.sh +202 -0
- data/vendor/fastText/website/README.md +6 -0
- data/vendor/fastText/website/blog/2016-08-18-blog-post.md +42 -0
- data/vendor/fastText/website/blog/2017-05-02-blog-post.md +60 -0
- data/vendor/fastText/website/blog/2017-10-02-blog-post.md +90 -0
- data/vendor/fastText/website/blog/2019-06-25-blog-post.md +168 -0
- data/vendor/fastText/website/core/Footer.js +127 -0
- data/vendor/fastText/website/package.json +12 -0
- data/vendor/fastText/website/pages/en/index.js +286 -0
- data/vendor/fastText/website/sidebars.json +18 -0
- data/vendor/fastText/website/siteConfig.js +102 -0
- data/vendor/fastText/website/static/docs/en/html/annotated.html +115 -0
- data/vendor/fastText/website/static/docs/en/html/annotated_dup.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/args_8cc.html +113 -0
- data/vendor/fastText/website/static/docs/en/html/args_8h.html +134 -0
- data/vendor/fastText/website/static/docs/en/html/args_8h.js +14 -0
- data/vendor/fastText/website/static/docs/en/html/args_8h_source.html +139 -0
- data/vendor/fastText/website/static/docs/en/html/bc_s.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/bdwn.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/classes.html +121 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args-members.html +140 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args.html +753 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args.js +40 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary-members.html +148 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary.html +1266 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary.js +43 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText-members.html +145 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText.html +1149 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText.js +45 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix-members.html +123 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix.html +610 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix.js +23 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model-members.html +150 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model.html +1400 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model.js +48 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer-members.html +131 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.html +950 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.js +31 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix-members.html +122 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix.html +565 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix.js +22 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector-members.html +121 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector.html +542 -0
- data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector.js +21 -0
- data/vendor/fastText/website/static/docs/en/html/closed.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/dictionary_8cc.html +116 -0
- data/vendor/fastText/website/static/docs/en/html/dictionary_8h.html +142 -0
- data/vendor/fastText/website/static/docs/en/html/dictionary_8h.js +10 -0
- data/vendor/fastText/website/static/docs/en/html/dictionary_8h_source.html +127 -0
- data/vendor/fastText/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html +145 -0
- data/vendor/fastText/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.js +29 -0
- data/vendor/fastText/website/static/docs/en/html/doc.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/doxygen.css +1596 -0
- data/vendor/fastText/website/static/docs/en/html/doxygen.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/dynsections.js +97 -0
- data/vendor/fastText/website/static/docs/en/html/fasttext_8cc.html +119 -0
- data/vendor/fastText/website/static/docs/en/html/fasttext_8h.html +168 -0
- data/vendor/fastText/website/static/docs/en/html/fasttext_8h.js +6 -0
- data/vendor/fastText/website/static/docs/en/html/fasttext_8h_source.html +155 -0
- data/vendor/fastText/website/static/docs/en/html/favicon.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/files.html +125 -0
- data/vendor/fastText/website/static/docs/en/html/files.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/folderclosed.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/folderopen.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/functions.html +139 -0
- data/vendor/fastText/website/static/docs/en/html/functions_0x7e.html +112 -0
- data/vendor/fastText/website/static/docs/en/html/functions_b.html +115 -0
- data/vendor/fastText/website/static/docs/en/html/functions_c.html +143 -0
- data/vendor/fastText/website/static/docs/en/html/functions_d.html +135 -0
- data/vendor/fastText/website/static/docs/en/html/functions_dup.js +27 -0
- data/vendor/fastText/website/static/docs/en/html/functions_e.html +115 -0
- data/vendor/fastText/website/static/docs/en/html/functions_f.html +112 -0
- data/vendor/fastText/website/static/docs/en/html/functions_func.html +563 -0
- data/vendor/fastText/website/static/docs/en/html/functions_g.html +145 -0
- data/vendor/fastText/website/static/docs/en/html/functions_h.html +112 -0
- data/vendor/fastText/website/static/docs/en/html/functions_i.html +121 -0
- data/vendor/fastText/website/static/docs/en/html/functions_k.html +106 -0
- data/vendor/fastText/website/static/docs/en/html/functions_l.html +140 -0
- data/vendor/fastText/website/static/docs/en/html/functions_m.html +153 -0
- data/vendor/fastText/website/static/docs/en/html/functions_n.html +164 -0
- data/vendor/fastText/website/static/docs/en/html/functions_o.html +116 -0
- data/vendor/fastText/website/static/docs/en/html/functions_p.html +161 -0
- data/vendor/fastText/website/static/docs/en/html/functions_q.html +135 -0
- data/vendor/fastText/website/static/docs/en/html/functions_r.html +116 -0
- data/vendor/fastText/website/static/docs/en/html/functions_s.html +159 -0
- data/vendor/fastText/website/static/docs/en/html/functions_t.html +138 -0
- data/vendor/fastText/website/static/docs/en/html/functions_u.html +106 -0
- data/vendor/fastText/website/static/docs/en/html/functions_v.html +106 -0
- data/vendor/fastText/website/static/docs/en/html/functions_vars.html +486 -0
- data/vendor/fastText/website/static/docs/en/html/functions_w.html +124 -0
- data/vendor/fastText/website/static/docs/en/html/functions_z.html +104 -0
- data/vendor/fastText/website/static/docs/en/html/globals.html +170 -0
- data/vendor/fastText/website/static/docs/en/html/globals_defs.html +113 -0
- data/vendor/fastText/website/static/docs/en/html/globals_func.html +155 -0
- data/vendor/fastText/website/static/docs/en/html/index.html +100 -0
- data/vendor/fastText/website/static/docs/en/html/jquery.js +87 -0
- data/vendor/fastText/website/static/docs/en/html/main_8cc.html +582 -0
- data/vendor/fastText/website/static/docs/en/html/main_8cc.js +22 -0
- data/vendor/fastText/website/static/docs/en/html/matrix_8cc.html +114 -0
- data/vendor/fastText/website/static/docs/en/html/matrix_8h.html +121 -0
- data/vendor/fastText/website/static/docs/en/html/matrix_8h_source.html +123 -0
- data/vendor/fastText/website/static/docs/en/html/menu.js +26 -0
- data/vendor/fastText/website/static/docs/en/html/menudata.js +90 -0
- data/vendor/fastText/website/static/docs/en/html/model_8cc.html +113 -0
- data/vendor/fastText/website/static/docs/en/html/model_8h.html +183 -0
- data/vendor/fastText/website/static/docs/en/html/model_8h.js +8 -0
- data/vendor/fastText/website/static/docs/en/html/model_8h_source.html +139 -0
- data/vendor/fastText/website/static/docs/en/html/namespacefasttext.html +343 -0
- data/vendor/fastText/website/static/docs/en/html/namespacefasttext.js +13 -0
- data/vendor/fastText/website/static/docs/en/html/namespacefasttext_1_1utils.html +158 -0
- data/vendor/fastText/website/static/docs/en/html/namespacemembers.html +125 -0
- data/vendor/fastText/website/static/docs/en/html/namespacemembers_enum.html +107 -0
- data/vendor/fastText/website/static/docs/en/html/namespacemembers_func.html +110 -0
- data/vendor/fastText/website/static/docs/en/html/namespacemembers_type.html +104 -0
- data/vendor/fastText/website/static/docs/en/html/namespaces.html +106 -0
- data/vendor/fastText/website/static/docs/en/html/namespaces.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/nav_f.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/nav_g.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/nav_h.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/navtree.css +146 -0
- data/vendor/fastText/website/static/docs/en/html/navtree.js +517 -0
- data/vendor/fastText/website/static/docs/en/html/navtreedata.js +40 -0
- data/vendor/fastText/website/static/docs/en/html/navtreeindex0.js +253 -0
- data/vendor/fastText/website/static/docs/en/html/navtreeindex1.js +139 -0
- data/vendor/fastText/website/static/docs/en/html/open.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/productquantizer_8cc.html +118 -0
- data/vendor/fastText/website/static/docs/en/html/productquantizer_8cc.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/productquantizer_8h.html +124 -0
- data/vendor/fastText/website/static/docs/en/html/productquantizer_8h_source.html +133 -0
- data/vendor/fastText/website/static/docs/en/html/qmatrix_8cc.html +112 -0
- data/vendor/fastText/website/static/docs/en/html/qmatrix_8h.html +126 -0
- data/vendor/fastText/website/static/docs/en/html/qmatrix_8h_source.html +128 -0
- data/vendor/fastText/website/static/docs/en/html/real_8h.html +117 -0
- data/vendor/fastText/website/static/docs/en/html/real_8h.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/real_8h_source.html +103 -0
- data/vendor/fastText/website/static/docs/en/html/resize.js +114 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_0.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_0.js +17 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_1.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_1.js +8 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_10.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_10.js +10 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_11.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_11.js +25 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_12.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_12.js +15 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_13.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_13.js +7 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_14.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_14.js +7 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_15.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_15.js +11 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_16.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_16.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_17.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_17.js +7 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_2.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_2.js +17 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_3.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_3.js +17 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_4.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_4.js +10 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_5.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_5.js +12 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_6.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_6.js +18 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_7.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_7.js +8 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_8.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_8.js +11 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_9.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_9.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_a.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_a.js +17 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_b.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_b.js +27 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_c.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_c.js +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_d.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_d.js +9 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_e.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_e.js +35 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_f.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/all_f.js +16 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_0.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_0.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_1.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_1.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_2.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_2.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_3.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_3.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_4.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_4.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_5.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_5.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_6.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_6.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_7.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_7.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_8.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/classes_8.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/close.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/search/defines_0.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/defines_0.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/defines_1.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/defines_1.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/defines_2.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/defines_2.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/defines_3.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/defines_3.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/enums_0.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/enums_0.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/enums_1.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/enums_1.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/enums_2.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/enums_2.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_0.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_0.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_1.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_1.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_2.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_2.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_3.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_3.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_4.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_4.js +6 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_5.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/enumvalues_5.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_0.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_0.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_1.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_1.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_2.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_2.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_3.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_3.js +8 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_4.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_4.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_5.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_5.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_6.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_6.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_7.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_7.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_8.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/files_8.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_0.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_0.js +14 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_1.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_1.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_10.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_10.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_11.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_11.js +18 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_12.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_12.js +8 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_13.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_13.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_14.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_14.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_15.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_15.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_16.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_16.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_17.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_17.js +7 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_2.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_2.js +11 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_3.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_3.js +9 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_4.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_4.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_5.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_5.js +7 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_6.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_6.js +17 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_7.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_7.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_8.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_8.js +8 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_9.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_9.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_a.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_a.js +8 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_b.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_b.js +10 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_c.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_c.js +10 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_d.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_d.js +6 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_e.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_e.js +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_f.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/functions_f.js +6 -0
- data/vendor/fastText/website/static/docs/en/html/search/mag_sel.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/search/namespaces_0.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/namespaces_0.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/nomatches.html +12 -0
- data/vendor/fastText/website/static/docs/en/html/search/search.css +271 -0
- data/vendor/fastText/website/static/docs/en/html/search/search.js +791 -0
- data/vendor/fastText/website/static/docs/en/html/search/search_l.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/search/search_m.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/search/search_r.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/search/searchdata.js +42 -0
- data/vendor/fastText/website/static/docs/en/html/search/typedefs_0.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/typedefs_0.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/typedefs_1.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/typedefs_1.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_0.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_0.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_1.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_1.js +6 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_10.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_10.js +8 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_11.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_11.js +11 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_12.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_12.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_13.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_13.js +10 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_2.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_2.js +9 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_3.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_3.js +9 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_4.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_4.js +7 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_5.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_5.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_6.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_6.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_7.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_7.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_8.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_8.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_9.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_9.js +10 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_a.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_a.js +14 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_b.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_b.js +17 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_c.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_c.js +6 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_d.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_d.js +10 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_e.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_e.js +11 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_f.html +26 -0
- data/vendor/fastText/website/static/docs/en/html/search/variables_f.js +6 -0
- data/vendor/fastText/website/static/docs/en/html/splitbar.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node-members.html +108 -0
- data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node.html +194 -0
- data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node.js +8 -0
- data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry-members.html +107 -0
- data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry.html +178 -0
- data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry.js +7 -0
- data/vendor/fastText/website/static/docs/en/html/sync_off.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/sync_on.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/tab_a.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/tab_b.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/tab_h.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/tab_s.png +0 -0
- data/vendor/fastText/website/static/docs/en/html/tabs.css +1 -0
- data/vendor/fastText/website/static/docs/en/html/utils_8cc.html +121 -0
- data/vendor/fastText/website/static/docs/en/html/utils_8cc.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/utils_8h.html +122 -0
- data/vendor/fastText/website/static/docs/en/html/utils_8h.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/utils_8h_source.html +104 -0
- data/vendor/fastText/website/static/docs/en/html/vector_8cc.html +121 -0
- data/vendor/fastText/website/static/docs/en/html/vector_8cc.js +4 -0
- data/vendor/fastText/website/static/docs/en/html/vector_8h.html +126 -0
- data/vendor/fastText/website/static/docs/en/html/vector_8h.js +5 -0
- data/vendor/fastText/website/static/docs/en/html/vector_8h_source.html +120 -0
- data/vendor/fastText/website/static/fasttext.css +48 -0
- data/vendor/fastText/website/static/img/authors/armand_joulin.jpg +0 -0
- data/vendor/fastText/website/static/img/authors/christian_puhrsch.png +0 -0
- data/vendor/fastText/website/static/img/authors/edouard_grave.jpeg +0 -0
- data/vendor/fastText/website/static/img/authors/piotr_bojanowski.jpg +0 -0
- data/vendor/fastText/website/static/img/authors/tomas_mikolov.jpg +0 -0
- data/vendor/fastText/website/static/img/blog/2016-08-18-blog-post-img1.png +0 -0
- data/vendor/fastText/website/static/img/blog/2016-08-18-blog-post-img2.png +0 -0
- data/vendor/fastText/website/static/img/blog/2017-05-02-blog-post-img1.jpg +0 -0
- data/vendor/fastText/website/static/img/blog/2017-05-02-blog-post-img2.jpg +0 -0
- data/vendor/fastText/website/static/img/blog/2017-10-02-blog-post-img1.png +0 -0
- data/vendor/fastText/website/static/img/cbo_vs_skipgram.png +0 -0
- data/vendor/fastText/website/static/img/fasttext-icon-api.png +0 -0
- data/vendor/fastText/website/static/img/fasttext-icon-bg-web.png +0 -0
- data/vendor/fastText/website/static/img/fasttext-icon-color-square.png +0 -0
- data/vendor/fastText/website/static/img/fasttext-icon-color-web.png +0 -0
- data/vendor/fastText/website/static/img/fasttext-icon-faq.png +0 -0
- data/vendor/fastText/website/static/img/fasttext-icon-tutorial.png +0 -0
- data/vendor/fastText/website/static/img/fasttext-icon-white-web.png +0 -0
- data/vendor/fastText/website/static/img/fasttext-logo-color-web.png +0 -0
- data/vendor/fastText/website/static/img/fasttext-logo-white-web.png +0 -0
- data/vendor/fastText/website/static/img/logo-color.png +0 -0
- data/vendor/fastText/website/static/img/model-black.png +0 -0
- data/vendor/fastText/website/static/img/model-blue.png +0 -0
- data/vendor/fastText/website/static/img/model-red.png +0 -0
- data/vendor/fastText/website/static/img/ogimage.png +0 -0
- data/vendor/fastText/website/static/img/oss_logo.png +0 -0
- data/vendor/fastText/wikifil.pl +57 -0
- data/vendor/fastText/word-vector-example.sh +39 -0
- metadata +621 -0
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#
|
4
|
+
# Copyright (c) 2018-present, Facebook, Inc.
|
5
|
+
# All rights reserved.
|
6
|
+
#
|
7
|
+
# This source code is licensed under the license found in the
|
8
|
+
# LICENSE file in the root directory of this source tree.
|
9
|
+
|
10
|
+
import io
|
11
|
+
import numpy as np
|
12
|
+
import argparse
|
13
|
+
from utils import *
|
14
|
+
|
15
|
+
parser = argparse.ArgumentParser(description='Evaluation of word alignment')
|
16
|
+
parser.add_argument("--src_emb", type=str, default='', help="Load source embeddings")
|
17
|
+
parser.add_argument("--tgt_emb", type=str, default='', help="Load target embeddings")
|
18
|
+
parser.add_argument('--center', action='store_true', help='whether to center embeddings or not')
|
19
|
+
parser.add_argument("--src_mat", type=str, default='', help="Load source alignment matrix. If none given, the aligment matrix is the identity.")
|
20
|
+
parser.add_argument("--tgt_mat", type=str, default='', help="Load target alignment matrix. If none given, the aligment matrix is the identity.")
|
21
|
+
parser.add_argument("--dico_test", type=str, default='', help="test dictionary")
|
22
|
+
parser.add_argument("--maxload", type=int, default=200000)
|
23
|
+
parser.add_argument("--nomatch", action='store_true', help="no exact match in lexicon")
|
24
|
+
params = parser.parse_args()
|
25
|
+
|
26
|
+
|
27
|
+
###### SPECIFIC FUNCTIONS ######
|
28
|
+
# function specific to evaluation
|
29
|
+
# the rest of the functions are in utils.py
|
30
|
+
|
31
|
+
def load_transform(fname, d1=300, d2=300):
    """Read a d1 x d2 alignment matrix from a text file.

    Each non-empty line holds one matrix row as whitespace-separated numbers;
    only the first d2 values of a line are used. Rows missing from the file
    stay at zero.

    A leading two-token "rows cols" header, as written by save_matrix, is
    skipped. When d2 == 2 a two-token first line is only treated as a header
    if the file holds exactly d1 + 1 non-empty lines.

    :param fname: path of the matrix file (read as UTF-8, decode errors ignored)
    :param d1: number of rows of the returned matrix
    :param d2: number of columns of the returned matrix
    :return: numpy array of shape (d1, d2)
    """
    # The context manager closes the file even if parsing raises.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        rows = [line.split() for line in fin]
    rows = [tokens for tokens in rows if tokens]

    if rows and len(rows[0]) == 2 and (d2 != 2 or len(rows) == d1 + 1):
        rows = rows[1:]

    R = np.zeros([d1, d2])
    for i, tokens in enumerate(rows):
        R[i, :] = np.array(tokens[0:d2], dtype=float)
    return R
|
38
|
+
|
39
|
+
|
40
|
+
###### MAIN ######
|
41
|
+
|
42
|
+
# Announce the run configuration.
print("Evaluation of alignment on %s" % params.dico_test)
if params.nomatch:
    print("running without exact string matches")

# Target embeddings are loaded first, then source embeddings.
words_tgt, x_tgt = load_vectors(params.tgt_emb, maxload=params.maxload, center=params.center)
words_src, x_src = load_vectors(params.src_emb, maxload=params.maxload, center=params.center)

# Apply the optional alignment matrices; an empty path leaves the embeddings as loaded.
if params.tgt_mat:
    R_tgt = load_transform(params.tgt_mat)
    x_tgt = x_tgt.dot(R_tgt)
if params.src_mat:
    R_src = load_transform(params.src_mat)
    x_src = x_src.dot(R_src)

src2tgt, lexicon_size = load_lexicon(params.dico_test, words_src, words_tgt)

# Score the alignment with both retrieval criteria and report them with the coverage.
nnacc = compute_nn_accuracy(x_src, x_tgt, src2tgt, lexicon_size=lexicon_size)
cslsproc = compute_csls_accuracy(x_src, x_tgt, src2tgt, lexicon_size=lexicon_size)
coverage = len(src2tgt) / lexicon_size
print("NN = %.4f - CSLS = %.4f - Coverage = %.4f" % (nnacc, cslsproc, coverage))
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Usage: example.sh [source-lang [target-lang]]   (defaults: en es)
# Downloads the dictionaries and wiki embeddings that are not yet present in
# data/, runs align.py, then evaluates the aligned vectors with eval.py.

set -e
s=${1:-en}
t=${2:-es}
echo "Example based on the ${s}->${t} alignment"

# mkdir -p succeeds when the directory already exists, so no prior test is needed.
mkdir -p data
mkdir -p res

dico_train=data/${s}-${t}.0-5000.txt
if [ ! -f "${dico_train}" ]; then
  DICO=$(basename -- "${dico_train}")
  wget -c "https://dl.fbaipublicfiles.com/arrival/dictionaries/${DICO}" -P data/
fi

dico_test=data/${s}-${t}.5000-6500.txt
if [ ! -f "${dico_test}" ]; then
  DICO=$(basename -- "${dico_test}")
  wget -c "https://dl.fbaipublicfiles.com/arrival/dictionaries/${DICO}" -P data/
fi

src_emb=data/wiki.${s}.vec
if [ ! -f "${src_emb}" ]; then
  EMB=$(basename -- "${src_emb}")
  wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/${EMB}" -P data/
fi

tgt_emb=data/wiki.${t}.vec
if [ ! -f "${tgt_emb}" ]; then
  EMB=$(basename -- "${tgt_emb}")
  wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/${EMB}" -P data/
fi

output=res/wiki.${s}-${t}.vec

python3 align.py --src_emb "${src_emb}" --tgt_emb "${tgt_emb}" \
  --dico_train "${dico_train}" --dico_test "${dico_test}" --output "${output}" \
  --lr 25 --niter 10
python3 eval.py --src_emb "${output}" --tgt_emb "${tgt_emb}" \
  --dico_test "${dico_test}"
|
@@ -0,0 +1,109 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# Copyright (c) 2018-present, Facebook, Inc.
|
3
|
+
# All rights reserved.
|
4
|
+
#
|
5
|
+
# This source code is licensed under the MIT license found in the
|
6
|
+
# LICENSE file in the root directory of this source tree.
|
7
|
+
|
8
|
+
import codecs, sys, time, math, argparse, ot
|
9
|
+
import numpy as np
|
10
|
+
from utils import *
|
11
|
+
|
12
|
+
parser = argparse.ArgumentParser(description='Wasserstein Procrustes for Embedding Alignment')
|
13
|
+
parser.add_argument('--model_src', type=str, help='Path to source word embeddings')
|
14
|
+
parser.add_argument('--model_tgt', type=str, help='Path to target word embeddings')
|
15
|
+
parser.add_argument('--lexicon', type=str, help='Path to the evaluation lexicon')
|
16
|
+
parser.add_argument('--output_src', default='', type=str, help='Path to save the aligned source embeddings')
|
17
|
+
parser.add_argument('--output_tgt', default='', type=str, help='Path to save the aligned target embeddings')
|
18
|
+
parser.add_argument('--seed', default=1111, type=int, help='Random number generator seed')
|
19
|
+
parser.add_argument('--nepoch', default=5, type=int, help='Number of epochs')
|
20
|
+
parser.add_argument('--niter', default=5000, type=int, help='Initial number of iterations')
|
21
|
+
parser.add_argument('--bsz', default=500, type=int, help='Initial batch size')
|
22
|
+
parser.add_argument('--lr', default=500., type=float, help='Learning rate')
|
23
|
+
parser.add_argument('--nmax', default=20000, type=int, help='Vocabulary size for learning the alignment')
|
24
|
+
parser.add_argument('--reg', default=0.05, type=float, help='Regularization parameter for sinkhorn')
|
25
|
+
args = parser.parse_args()
|
26
|
+
|
27
|
+
|
28
|
+
def objective(X, Y, R, n=5000):
    """Monitoring value printed after each epoch of align().

    Uses the first n rows of X and Y: a transport plan P is computed by
    ot.sinkhorn on the cost -(X R) Y^T with uniform unit weights and a
    regularization of 0.025, and 1000 * ||X R - P Y|| / n is returned.

    :param X: source embeddings, one row per word
    :param Y: target embeddings, one row per word
    :param R: mapping applied to X on the right
    :param n: maximum number of leading rows used
    :return: scalar objective value
    """
    # Never request more rows than exist, otherwise the weight vectors passed
    # to sinkhorn would not match the cost matrix.
    n = min(n, X.shape[0], Y.shape[0])
    Xn, Yn = X[:n], Y[:n]
    XR = np.dot(Xn, R)  # computed once, used for both the cost and the residual
    C = -np.dot(XR, Yn.T)
    P = ot.sinkhorn(np.ones(n), np.ones(n), C, 0.025, stopThr=1e-3)
    return 1000 * np.linalg.norm(XR - np.dot(P, Yn)) / n
|
33
|
+
|
34
|
+
|
35
|
+
def sqrt_eig(x):
    """Return U * sqrt(S) * V^T for the reduced SVD x = U * S * V^T."""
    left, singular, right_t = np.linalg.svd(x, full_matrices=False)
    # Scaling the rows of V^T equals multiplying by diag(sqrt(S)) on the left.
    scaled = np.sqrt(singular)[:, np.newaxis] * right_t
    return np.dot(left, scaled)
|
38
|
+
|
39
|
+
|
40
|
+
def align(X, Y, R, lr=10., bsz=200, nepoch=5, niter=1000,
          nmax=10000, reg=0.05, verbose=True):
    """Refine the mapping R with stochastic updates on random mini-batches.

    For every iteration a batch of rows is drawn independently from the first
    nmax rows of X and of Y, a transport plan is computed with ot.sinkhorn,
    a gradient step is taken on R and R is projected back onto orthogonal
    matrices through its SVD. After each epoch the batch size is doubled and
    the number of iterations is divided by four.

    :param X: source embeddings, one row per word
    :param Y: target embeddings, one row per word
    :param R: initial mapping; it is updated in place by the first step, so
              pass a copy if the initial value must be kept
    :param lr: learning rate
    :param bsz: initial batch size
    :param nepoch: number of epochs
    :param niter: initial number of iterations per epoch
    :param nmax: number of leading rows of X and Y to sample from
    :param reg: regularization parameter passed to ot.sinkhorn
    :param verbose: print the objective after every epoch
    :return: the refined mapping
    """
    # Sampling indices beyond the available rows would raise; restrict the
    # sampling range to what both matrices actually contain.
    nmax = min(nmax, X.shape[0], Y.shape[0])
    for epoch in range(1, nepoch + 1):
        # The doubled batch size may outgrow nmax; a permutation of nmax items
        # cannot yield more than nmax rows, so size the weights accordingly.
        batch = min(bsz, nmax)
        for _it in range(1, niter + 1):
            # sample mini-batch
            xt = X[np.random.permutation(nmax)[:batch], :]
            yt = Y[np.random.permutation(nmax)[:batch], :]
            # compute OT on minibatch
            C = -np.dot(np.dot(xt, R), yt.T)
            P = ot.sinkhorn(np.ones(batch), np.ones(batch), C, reg, stopThr=1e-3)
            # compute gradient
            G = - np.dot(xt.T, np.dot(P, yt))
            R -= lr / batch * G
            # project on orthogonal matrices
            U, s, VT = np.linalg.svd(R)
            R = np.dot(U, VT)
        bsz *= 2
        niter //= 4
        if verbose:
            print("epoch: %d obj: %.3f" % (epoch, objective(X, Y, R)))
    return R
|
61
|
+
|
62
|
+
|
63
|
+
def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
    """Compute an initial mapping from X and Y.

    A coupling matrix is iterated niter times using ot.sinkhorn with an
    averaging step of 2 / (2 + it); the residual norm is printed and the
    transposed Procrustes solution between the coupled X and Y is returned.

    :param X: source embeddings (n rows)
    :param Y: target embeddings (the same n rows are assumed)
    :param niter: number of averaging iterations
    :param reg: regularization parameter passed to ot.sinkhorn
    :param apply_sqrt: replace X and Y by sqrt_eig(X), sqrt_eig(Y) first
    :return: transposed result of procrustes(P X, Y)
    """
    n, d = X.shape
    if apply_sqrt:
        X, Y = sqrt_eig(X), sqrt_eig(Y)

    # Gram matrices; the target one is rescaled to the norm of the source one.
    gram_x = X.dot(X.T)
    gram_y = Y.dot(Y.T)
    gram_y *= np.linalg.norm(gram_x) / np.linalg.norm(gram_y)
    gram_x_sq = gram_x.dot(gram_x)
    gram_y_sq = gram_y.dot(gram_y)

    weights = np.ones(n)
    P = np.ones([n, n]) / float(n)
    it = 1
    while it <= niter:
        G = np.dot(P, gram_x_sq) + np.dot(gram_y_sq, P) - 2 * np.dot(gram_y, np.dot(P, gram_x))
        q = ot.sinkhorn(np.ones(n), weights, G, reg, stopThr=1e-3)
        alpha = 2.0 / float(2.0 + it)
        P = alpha * q + (1.0 - alpha) * P
        it += 1

    obj = np.linalg.norm(np.dot(P, gram_x) - np.dot(gram_y, P))
    print(obj)
    return procrustes(np.dot(P, X), Y).T
|
79
|
+
|
80
|
+
|
81
|
+
print("\n*** Wasserstein Procrustes ***\n")

np.random.seed(args.seed)

# Load both embedding sets (normalized and centered) and the evaluation lexicon.
maxload = 200000
w_src, x_src = load_vectors(args.model_src, maxload, norm=True, center=True)
w_tgt, x_tgt = load_vectors(args.model_tgt, maxload, norm=True, center=True)
src2trg, _ = load_lexicon(args.lexicon, w_src, w_tgt)

# Step 1: initial mapping from the first 2500 rows of each embedding matrix.
print("\nComputing initial mapping with convex relaxation...")
t0 = time.time()
R0 = convex_init(x_src[:2500], x_tgt[:2500], reg=args.reg, apply_sqrt=True)
elapsed = math.floor(time.time() - t0)
print("Done [%03d sec]" % elapsed)

# Step 2: refine a copy of the initial mapping.
print("\nComputing mapping with Wasserstein Procrustes...")
t0 = time.time()
align_options = dict(bsz=args.bsz, lr=args.lr, niter=args.niter,
                     nepoch=args.nepoch, reg=args.reg, nmax=args.nmax)
R = align(x_src, x_tgt, R0.copy(), **align_options)
elapsed = math.floor(time.time() - t0)
print("Done [%03d sec]" % elapsed)

acc = compute_nn_accuracy(x_src, np.dot(x_tgt, R.T), src2trg)
print("\nPrecision@1: %.3f\n" % acc)

# Optionally save row-normalized embeddings; the target side is mapped with R^T.
if args.output_src:
    x_src = x_src / np.linalg.norm(x_src, ord=2, axis=1).reshape([-1, 1])
    save_vectors(args.output_src, x_src, w_src)
if args.output_tgt:
    x_tgt = x_tgt / np.linalg.norm(x_tgt, ord=2, axis=1).reshape([-1, 1])
    save_vectors(args.output_tgt, np.dot(x_tgt, R.T), w_tgt)
|
@@ -0,0 +1,154 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# Copyright (c) 2018-present, Facebook, Inc.
|
3
|
+
# All rights reserved.
|
4
|
+
#
|
5
|
+
# This source code is licensed under the license found in the
|
6
|
+
# LICENSE file in the root directory of this source tree.
|
7
|
+
|
8
|
+
import io
|
9
|
+
import numpy as np
|
10
|
+
import collections
|
11
|
+
|
12
|
+
|
13
|
+
def load_vectors(fname, maxload=200000, norm=True, center=False, verbose=True):
    """Load word vectors from a text file with a "count dim" header line.

    Every following line is "word v1 v2 ... vd", separated by single spaces.

    :param fname: path of the vector file (read as UTF-8, decode errors ignored)
    :param maxload: maximum number of vectors to read; values <= 0 read all
    :param norm: divide every row by its L2 norm (plus 1e-8)
    :param center: subtract the column means, then normalize the rows again
    :param verbose: print progress messages
    :return: (words, x) where words is a list of strings and x an array with
             one row per word
    """
    if verbose:
        print("Loading vectors from %s" % fname)
    words = []
    # The context manager closes the file even if a line fails to parse.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        if maxload > 0:
            n = min(n, maxload)
        x = np.zeros([n, d])
        for i, line in enumerate(fin):
            if i >= n:
                break
            tokens = line.rstrip().split(' ')
            words.append(tokens[0])
            x[i, :] = np.array(tokens[1:], dtype=float)
    # If the header announced more rows than the file holds, drop the unused
    # zero rows so that x and words stay the same length and the zero rows do
    # not distort the centering mean.
    x = x[:len(words)]
    if norm:
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if center:
        x -= x.mean(axis=0)[np.newaxis, :]
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if verbose:
        print("%d word vectors loaded" % (len(words)))
    return words, x
|
37
|
+
|
38
|
+
|
39
|
+
def idx(words):
    """Map every word to the position of its first occurrence in words."""
    first_position = {}
    for position, word in enumerate(words):
        # setdefault keeps the earliest index when a word is repeated.
        first_position.setdefault(word, position)
    return first_position
|
45
|
+
|
46
|
+
|
47
|
+
def save_vectors(fname, x, words):
    """Write word vectors to a UTF-8 text file.

    The first line is "n d"; each following line is a word followed by its d
    values formatted with four decimals, separated by single spaces.

    :param fname: output path (overwritten)
    :param x: array of shape (n, d)
    :param words: sequence with at least n words, aligned with the rows of x
    """
    n, d = x.shape
    # The context manager closes (and flushes) the file even if a row fails.
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            values = " ".join("%.4f" % a for a in x[i, :])
            fout.write(words[i] + " " + values + "\n")
|
54
|
+
|
55
|
+
|
56
|
+
def save_matrix(fname, x):
    """Write a matrix to a UTF-8 text file.

    The first line is "n d"; each following line holds one row with values
    formatted with four decimals, separated by single spaces.

    :param fname: output path (overwritten)
    :param x: array of shape (n, d)
    """
    n, d = x.shape
    # The context manager closes (and flushes) the file even if a row fails.
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            fout.write(" ".join("%.4f" % a for a in x[i, :]) + "\n")
|
63
|
+
|
64
|
+
|
65
|
+
def procrustes(X_src, Y_tgt):
    """Return U * V^T for the SVD of Y_tgt^T * X_src (an orthogonal matrix)."""
    cross = Y_tgt.T.dot(X_src)
    left, _singular, right_t = np.linalg.svd(cross)
    return left.dot(right_t)
|
68
|
+
|
69
|
+
|
70
|
+
def select_vectors_from_pairs(x_src, y_tgt, pairs):
    """Gather the rows named by (source index, target index) pairs.

    :param x_src: source matrix
    :param y_tgt: target matrix
    :param pairs: sequence of (i, j) index pairs
    :return: (x, y) float arrays where row k is x_src[i_k] and y_tgt[j_k]
    """
    dim = x_src.shape[1]
    x = np.zeros([len(pairs), dim])
    y = np.zeros([len(pairs), dim])
    for row, (i, j) in enumerate(pairs):
        x[row, :] = x_src[i, :]
        y[row, :] = y_tgt[j, :]
    return x, y
|
80
|
+
|
81
|
+
|
82
|
+
def load_lexicon(filename, words_src, words_tgt, verbose=True):
|
83
|
+
f = io.open(filename, 'r', encoding='utf-8')
|
84
|
+
lexicon = collections.defaultdict(set)
|
85
|
+
idx_src , idx_tgt = idx(words_src), idx(words_tgt)
|
86
|
+
vocab = set()
|
87
|
+
for line in f:
|
88
|
+
word_src, word_tgt = line.split()
|
89
|
+
if word_src in idx_src and word_tgt in idx_tgt:
|
90
|
+
lexicon[idx_src[word_src]].add(idx_tgt[word_tgt])
|
91
|
+
vocab.add(word_src)
|
92
|
+
if verbose:
|
93
|
+
coverage = len(lexicon) / float(len(vocab))
|
94
|
+
print("Coverage of source vocab: %.4f" % (coverage))
|
95
|
+
return lexicon, float(len(vocab))
|
96
|
+
|
97
|
+
|
98
|
+
def load_pairs(filename, idx_src, idx_tgt, verbose=True):
    """Load training pairs as (source index, target index) tuples.

    Each non-blank line of the file holds a source word and a target word
    separated by whitespace (spaces or tabs, as in load_lexicon). Pairs with a
    word missing from either index are skipped but still counted in the total.

    :param filename: path of the dictionary file (UTF-8)
    :param idx_src: mapping from source word to index
    :param idx_tgt: mapping from target word to index
    :param verbose: print how many pairs were found and the coverage
    :return: list of (source index, target index) tuples in file order
    """
    pairs = []
    tot = 0
    # The context manager closes the file even if a line is malformed.
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank lines carry no pair and are not counted.
                continue
            a, b = tokens
            tot += 1
            if a in idx_src and b in idx_tgt:
                pairs.append((idx_src[a], idx_tgt[b]))
    if verbose:
        # An empty file has no coverage; avoid dividing by zero.
        coverage = (1.0 * len(pairs)) / tot if tot else 0.0
        print("Found pairs for training: %d - Total pairs in file: %d - Coverage of pairs: %.4f" % (len(pairs), tot, coverage))
    return pairs
|
111
|
+
|
112
|
+
|
113
|
+
def compute_nn_accuracy(x_src, x_tgt, lexicon, bsz=100, lexicon_size=-1):
    """Nearest-neighbour retrieval accuracy over a lexicon.

    Both matrices are row-normalized in place. A source word counts as correct
    when the target row with the highest dot product is one of its
    translations in the lexicon.

    :param x_src: source embeddings (modified in place)
    :param x_tgt: target embeddings (modified in place)
    :param lexicon: mapping from source index to a set of target indices
    :param bsz: number of source words scored per batch
    :param lexicon_size: denominator of the accuracy; a negative value means
                         len(lexicon)
    :return: number of correct source words divided by lexicon_size
    """
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    queries = list(lexicon.keys())

    x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
    x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8

    hits = 0.0
    for start in range(0, len(queries), bsz):
        batch = queries[start:start + bsz]
        # One column per query; the row index of the maximum is the prediction.
        best = np.dot(x_tgt, x_src[batch].T).argmax(axis=0)
        for query, predicted in zip(batch, best):
            if predicted in lexicon[query]:
                hits += 1.0
    return hits / lexicon_size
|
128
|
+
|
129
|
+
|
130
|
+
def compute_csls_accuracy(x_src, x_tgt, lexicon, lexicon_size=-1, k=10, bsz=1024):
    """Retrieval accuracy over a lexicon with a neighbourhood-corrected score.

    Both matrices are row-normalized in place. The score of a (source, target)
    pair is twice their dot product minus the mean of the k largest dot
    products between that target row and all source rows. A source word
    counts as correct when its best-scoring target is one of its translations.

    :param x_src: source embeddings (modified in place)
    :param x_tgt: target embeddings (modified in place)
    :param lexicon: mapping from source index to a set of target indices
    :param lexicon_size: denominator of the accuracy; a negative value means
                         len(lexicon)
    :param k: number of neighbours averaged for the correction; capped at the
              number of source rows
    :param bsz: number of target rows processed per batch
    :return: number of correct source words divided by lexicon_size
    """
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    idx_src = list(lexicon.keys())

    x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
    x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8

    # np.partition raises when asked for more neighbours than there are source
    # rows, so small embedding sets need the neighbourhood size capped.
    k = min(k, x_src.shape[0])

    sr = x_src[idx_src]
    sc = np.dot(sr, x_tgt.T)
    similarities = 2 * sc
    sc2 = np.zeros(x_tgt.shape[0])
    for i in range(0, x_tgt.shape[0], bsz):
        j = min(i + bsz, x_tgt.shape[0])
        sc_batch = np.dot(x_tgt[i:j, :], x_src.T)
        # The k largest values of each row, in no particular order.
        dotprod = np.partition(sc_batch, -k, axis=1)[:, -k:]
        sc2[i:j] = np.mean(dotprod, axis=1)
    similarities -= sc2[np.newaxis, :]

    nn = np.argmax(similarities, axis=1).tolist()
    correct = 0.0
    # Row pos of similarities belongs to source index idx_src[pos].
    for pos, src in enumerate(idx_src):
        if nn[pos] in lexicon[src]:
            correct += 1.0
    return correct / lexicon_size
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
#
|
3
|
+
# Copyright (c) 2016-present, Facebook, Inc.
|
4
|
+
# All rights reserved.
|
5
|
+
#
|
6
|
+
# This source code is licensed under the MIT license found in the
|
7
|
+
# LICENSE file in the root directory of this source tree.
|
8
|
+
#
|
9
|
+
|
10
|
+
myshuf() {
|
11
|
+
perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
|
12
|
+
}
|
13
|
+
|
14
|
+
# Filter stdin to stdout: lowercase the text, prefix every line with
# "__label__", put spaces around ' . , ( ) ! ?, drop double quotes, replace
# "<br />", ";" and ":" by a space, squeeze repeated spaces and finally
# shuffle the lines with myshuf.
normalize_text() {
  tr '[:upper:]' '[:lower:]' \
    | sed -e 's/^/__label__/g' \
      -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
      -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
      -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' \
    | tr -s " " \
    | myshuf
}
|
20
|
+
|
21
|
+
# Output directory for models and predictions, and directory for the dataset.
RESULTDIR=result
DATADIR=data

mkdir -p "${RESULTDIR}"
mkdir -p "${DATADIR}"

# Download and preprocess the dataset once; the training file is the marker.
if [ ! -f "${DATADIR}/dbpedia.train" ]
then
  # Stop on a failed download or extraction instead of going on to train on
  # missing data.
  wget -c "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz" -O "${DATADIR}/dbpedia_csv.tar.gz" || exit 1
  tar -xzvf "${DATADIR}/dbpedia_csv.tar.gz" -C "${DATADIR}" || exit 1
  # Redirect the CSV into the filter directly; if the input file is missing
  # the output file is not created.
  normalize_text < "${DATADIR}/dbpedia_csv/train.csv" > "${DATADIR}/dbpedia.train" || exit 1
  normalize_text < "${DATADIR}/dbpedia_csv/test.csv" > "${DATADIR}/dbpedia.test" || exit 1
fi

# Build the fasttext binary used below.
make || exit 1

./fasttext supervised -input "${DATADIR}/dbpedia.train" -output "${RESULTDIR}/dbpedia" -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4

./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"

./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
|
@@ -0,0 +1,94 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
#
|
3
|
+
# Copyright (c) 2016-present, Facebook, Inc.
|
4
|
+
# All rights reserved.
|
5
|
+
#
|
6
|
+
# This source code is licensed under the MIT license found in the
|
7
|
+
# LICENSE file in the root directory of this source tree.
|
8
|
+
#
|
9
|
+
|
10
|
+
# This script produces the results from Table 1 in the following paper:
|
11
|
+
# Bag of Tricks for Efficient Text Classification, arXiv 1607.01759, 2016
|
12
|
+
|
13
|
+
myshuf() {
|
14
|
+
perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
|
15
|
+
}
|
16
|
+
|
17
|
+
# Filter stdin to stdout: lowercase the text, prefix every line with
# "__label__", put spaces around ' . , ( ) ! ?, drop double quotes, replace
# "<br />", ";" and ":" by a space, squeeze repeated spaces and finally
# shuffle the lines with myshuf.
normalize_text() {
  tr '[:upper:]' '[:lower:]' \
    | sed -e 's/^/__label__/g' \
      -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
      -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
      -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' \
    | tr -s " " \
    | myshuf
}
|
23
|
+
|
24
|
+
# Dataset names; ID and LR below are parallel arrays indexed the same way.
DATASET=(
  ag_news
  sogou_news
  dbpedia
  yelp_review_polarity
  yelp_review_full
  yahoo_answers
  amazon_review_full
  amazon_review_polarity
)

# Google Drive file ids of the dataset archives.
ID=(
  0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
  0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
  0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
  0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
  0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
  0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
  0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
)

# These learning rates were chosen by validation on a subset of the training set.
LR=( 0.25 0.5 0.5 0.1 0.1 0.1 0.05 0.05 )

RESULTDIR=result
DATADIR=data

mkdir -p "${RESULTDIR}"
mkdir -p "${DATADIR}"

# Private scratch files for the two-step download below, removed on exit;
# fixed names under /tmp could be pre-created or overwritten by other users.
COOKIES=$(mktemp) || exit 1
INTERMEZZO=$(mktemp) || exit 1
trap 'rm -f -- "${COOKIES}" "${INTERMEZZO}"' EXIT

# Small datasets first

for i in {0..0}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz" || exit 1
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}" || exit 1
    normalize_text < "${DATADIR}/${DATASET[i]}_csv/train.csv" > "${DATADIR}/${DATASET[i]}.train" || exit 1
    normalize_text < "${DATADIR}/${DATASET[i]}_csv/test.csv" > "${DATADIR}/${DATASET[i]}.test" || exit 1
  fi
done

# Large datasets require a bit more work due to the extra request page

for i in {1..7}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    # First request: store the cookies and the intermediate confirmation page.
    curl -c "${COOKIES}" "https://drive.google.com/uc?export=download&id=${ID[i]}" > "${INTERMEZZO}" || exit 1
    # Second request: follow the download link found in that page. The link is
    # HTML-escaped, so "&amp;" has to be turned back into "&" to get a valid URL.
    curl -L -b "${COOKIES}" "https://drive.google.com$(grep -Po 'uc-download-link" [^>]* href="\K[^"]*' "${INTERMEZZO}" | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz" || exit 1
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}" || exit 1
    normalize_text < "${DATADIR}/${DATASET[i]}_csv/train.csv" > "${DATADIR}/${DATASET[i]}.train" || exit 1
    normalize_text < "${DATADIR}/${DATASET[i]}_csv/test.csv" > "${DATADIR}/${DATASET[i]}.test" || exit 1
  fi
done

# Build the fasttext binary used below.
make || exit 1

for i in {0..7}
do
  echo "Working on dataset ${DATASET[i]}"
  ./fasttext supervised -input "${DATADIR}/${DATASET[i]}.train" \
    -output "${RESULTDIR}/${DATASET[i]}" -dim 10 -lr "${LR[i]}" -wordNgrams 2 \
    -minCount 1 -bucket 10000000 -epoch 5 -thread 4 > /dev/null
  ./fasttext test "${RESULTDIR}/${DATASET[i]}.bin" \
    "${DATADIR}/${DATASET[i]}.test"
done
|