fasttext 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (478) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/README.md +20 -1
  4. data/lib/fasttext.rb +3 -0
  5. data/lib/fasttext/classifier.rb +12 -4
  6. data/lib/fasttext/vectorizer.rb +1 -1
  7. data/lib/fasttext/version.rb +1 -1
  8. metadata +4 -473
  9. data/lib/fasttext/ext.bundle +0 -0
  10. data/vendor/fastText/CMakeLists.txt +0 -68
  11. data/vendor/fastText/CODE_OF_CONDUCT.md +0 -2
  12. data/vendor/fastText/CONTRIBUTING.md +0 -32
  13. data/vendor/fastText/MANIFEST.in +0 -5
  14. data/vendor/fastText/Makefile +0 -63
  15. data/vendor/fastText/alignment/README.md +0 -53
  16. data/vendor/fastText/alignment/align.py +0 -145
  17. data/vendor/fastText/alignment/eval.py +0 -60
  18. data/vendor/fastText/alignment/example.sh +0 -51
  19. data/vendor/fastText/alignment/unsup_align.py +0 -109
  20. data/vendor/fastText/alignment/utils.py +0 -154
  21. data/vendor/fastText/classification-example.sh +0 -41
  22. data/vendor/fastText/classification-results.sh +0 -94
  23. data/vendor/fastText/crawl/README.md +0 -26
  24. data/vendor/fastText/crawl/dedup.cc +0 -51
  25. data/vendor/fastText/crawl/download_crawl.sh +0 -57
  26. data/vendor/fastText/crawl/filter_dedup.sh +0 -13
  27. data/vendor/fastText/crawl/filter_utf8.cc +0 -105
  28. data/vendor/fastText/crawl/process_wet_file.sh +0 -30
  29. data/vendor/fastText/docs/aligned-vectors.md +0 -64
  30. data/vendor/fastText/docs/api.md +0 -6
  31. data/vendor/fastText/docs/cheatsheet.md +0 -66
  32. data/vendor/fastText/docs/crawl-vectors.md +0 -125
  33. data/vendor/fastText/docs/dataset.md +0 -6
  34. data/vendor/fastText/docs/english-vectors.md +0 -53
  35. data/vendor/fastText/docs/faqs.md +0 -63
  36. data/vendor/fastText/docs/language-identification.md +0 -47
  37. data/vendor/fastText/docs/options.md +0 -50
  38. data/vendor/fastText/docs/pretrained-vectors.md +0 -142
  39. data/vendor/fastText/docs/python-module.md +0 -314
  40. data/vendor/fastText/docs/references.md +0 -41
  41. data/vendor/fastText/docs/supervised-models.md +0 -54
  42. data/vendor/fastText/docs/supervised-tutorial.md +0 -349
  43. data/vendor/fastText/docs/support.md +0 -58
  44. data/vendor/fastText/docs/unsupervised-tutorials.md +0 -309
  45. data/vendor/fastText/eval.py +0 -95
  46. data/vendor/fastText/get-wikimedia.sh +0 -79
  47. data/vendor/fastText/python/README.md +0 -322
  48. data/vendor/fastText/python/README.rst +0 -406
  49. data/vendor/fastText/python/benchmarks/README.rst +0 -3
  50. data/vendor/fastText/python/benchmarks/get_word_vector.py +0 -49
  51. data/vendor/fastText/python/doc/examples/FastTextEmbeddingBag.py +0 -81
  52. data/vendor/fastText/python/doc/examples/bin_to_vec.py +0 -41
  53. data/vendor/fastText/python/doc/examples/compute_accuracy.py +0 -163
  54. data/vendor/fastText/python/doc/examples/get_vocab.py +0 -48
  55. data/vendor/fastText/python/doc/examples/train_supervised.py +0 -42
  56. data/vendor/fastText/python/doc/examples/train_unsupervised.py +0 -56
  57. data/vendor/fastText/python/fasttext_module/fasttext/FastText.py +0 -468
  58. data/vendor/fastText/python/fasttext_module/fasttext/__init__.py +0 -22
  59. data/vendor/fastText/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc +0 -388
  60. data/vendor/fastText/python/fasttext_module/fasttext/tests/__init__.py +0 -14
  61. data/vendor/fastText/python/fasttext_module/fasttext/tests/test_configurations.py +0 -239
  62. data/vendor/fastText/python/fasttext_module/fasttext/tests/test_script.py +0 -629
  63. data/vendor/fastText/python/fasttext_module/fasttext/util/__init__.py +0 -13
  64. data/vendor/fastText/python/fasttext_module/fasttext/util/util.py +0 -60
  65. data/vendor/fastText/quantization-example.sh +0 -40
  66. data/vendor/fastText/runtests.py +0 -60
  67. data/vendor/fastText/scripts/kbcompletion/README.md +0 -19
  68. data/vendor/fastText/scripts/kbcompletion/data.sh +0 -69
  69. data/vendor/fastText/scripts/kbcompletion/eval.cpp +0 -108
  70. data/vendor/fastText/scripts/kbcompletion/fb15k.sh +0 -49
  71. data/vendor/fastText/scripts/kbcompletion/fb15k237.sh +0 -45
  72. data/vendor/fastText/scripts/kbcompletion/svo.sh +0 -38
  73. data/vendor/fastText/scripts/kbcompletion/wn18.sh +0 -49
  74. data/vendor/fastText/scripts/quantization/quantization-results.sh +0 -43
  75. data/vendor/fastText/setup.cfg +0 -2
  76. data/vendor/fastText/setup.py +0 -203
  77. data/vendor/fastText/tests/fetch_test_data.sh +0 -202
  78. data/vendor/fastText/website/README.md +0 -6
  79. data/vendor/fastText/website/blog/2016-08-18-blog-post.md +0 -42
  80. data/vendor/fastText/website/blog/2017-05-02-blog-post.md +0 -60
  81. data/vendor/fastText/website/blog/2017-10-02-blog-post.md +0 -90
  82. data/vendor/fastText/website/blog/2019-06-25-blog-post.md +0 -168
  83. data/vendor/fastText/website/core/Footer.js +0 -127
  84. data/vendor/fastText/website/package.json +0 -12
  85. data/vendor/fastText/website/pages/en/index.js +0 -286
  86. data/vendor/fastText/website/sidebars.json +0 -18
  87. data/vendor/fastText/website/siteConfig.js +0 -102
  88. data/vendor/fastText/website/static/docs/en/html/annotated.html +0 -115
  89. data/vendor/fastText/website/static/docs/en/html/annotated_dup.js +0 -4
  90. data/vendor/fastText/website/static/docs/en/html/args_8cc.html +0 -113
  91. data/vendor/fastText/website/static/docs/en/html/args_8h.html +0 -134
  92. data/vendor/fastText/website/static/docs/en/html/args_8h.js +0 -14
  93. data/vendor/fastText/website/static/docs/en/html/args_8h_source.html +0 -139
  94. data/vendor/fastText/website/static/docs/en/html/bc_s.png +0 -0
  95. data/vendor/fastText/website/static/docs/en/html/bdwn.png +0 -0
  96. data/vendor/fastText/website/static/docs/en/html/classes.html +0 -121
  97. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args-members.html +0 -140
  98. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args.html +0 -753
  99. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args.js +0 -40
  100. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary-members.html +0 -148
  101. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary.html +0 -1266
  102. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary.js +0 -43
  103. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText-members.html +0 -145
  104. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText.html +0 -1149
  105. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText.js +0 -45
  106. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix-members.html +0 -123
  107. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix.html +0 -610
  108. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix.js +0 -23
  109. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model-members.html +0 -150
  110. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model.html +0 -1400
  111. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model.js +0 -48
  112. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer-members.html +0 -131
  113. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.html +0 -950
  114. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.js +0 -31
  115. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix-members.html +0 -122
  116. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix.html +0 -565
  117. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix.js +0 -22
  118. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector-members.html +0 -121
  119. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector.html +0 -542
  120. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector.js +0 -21
  121. data/vendor/fastText/website/static/docs/en/html/closed.png +0 -0
  122. data/vendor/fastText/website/static/docs/en/html/dictionary_8cc.html +0 -116
  123. data/vendor/fastText/website/static/docs/en/html/dictionary_8h.html +0 -142
  124. data/vendor/fastText/website/static/docs/en/html/dictionary_8h.js +0 -10
  125. data/vendor/fastText/website/static/docs/en/html/dictionary_8h_source.html +0 -127
  126. data/vendor/fastText/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html +0 -145
  127. data/vendor/fastText/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.js +0 -29
  128. data/vendor/fastText/website/static/docs/en/html/doc.png +0 -0
  129. data/vendor/fastText/website/static/docs/en/html/doxygen.css +0 -1596
  130. data/vendor/fastText/website/static/docs/en/html/doxygen.png +0 -0
  131. data/vendor/fastText/website/static/docs/en/html/dynsections.js +0 -97
  132. data/vendor/fastText/website/static/docs/en/html/fasttext_8cc.html +0 -119
  133. data/vendor/fastText/website/static/docs/en/html/fasttext_8h.html +0 -168
  134. data/vendor/fastText/website/static/docs/en/html/fasttext_8h.js +0 -6
  135. data/vendor/fastText/website/static/docs/en/html/fasttext_8h_source.html +0 -155
  136. data/vendor/fastText/website/static/docs/en/html/favicon.png +0 -0
  137. data/vendor/fastText/website/static/docs/en/html/files.html +0 -125
  138. data/vendor/fastText/website/static/docs/en/html/files.js +0 -4
  139. data/vendor/fastText/website/static/docs/en/html/folderclosed.png +0 -0
  140. data/vendor/fastText/website/static/docs/en/html/folderopen.png +0 -0
  141. data/vendor/fastText/website/static/docs/en/html/functions.html +0 -139
  142. data/vendor/fastText/website/static/docs/en/html/functions_0x7e.html +0 -112
  143. data/vendor/fastText/website/static/docs/en/html/functions_b.html +0 -115
  144. data/vendor/fastText/website/static/docs/en/html/functions_c.html +0 -143
  145. data/vendor/fastText/website/static/docs/en/html/functions_d.html +0 -135
  146. data/vendor/fastText/website/static/docs/en/html/functions_dup.js +0 -27
  147. data/vendor/fastText/website/static/docs/en/html/functions_e.html +0 -115
  148. data/vendor/fastText/website/static/docs/en/html/functions_f.html +0 -112
  149. data/vendor/fastText/website/static/docs/en/html/functions_func.html +0 -563
  150. data/vendor/fastText/website/static/docs/en/html/functions_g.html +0 -145
  151. data/vendor/fastText/website/static/docs/en/html/functions_h.html +0 -112
  152. data/vendor/fastText/website/static/docs/en/html/functions_i.html +0 -121
  153. data/vendor/fastText/website/static/docs/en/html/functions_k.html +0 -106
  154. data/vendor/fastText/website/static/docs/en/html/functions_l.html +0 -140
  155. data/vendor/fastText/website/static/docs/en/html/functions_m.html +0 -153
  156. data/vendor/fastText/website/static/docs/en/html/functions_n.html +0 -164
  157. data/vendor/fastText/website/static/docs/en/html/functions_o.html +0 -116
  158. data/vendor/fastText/website/static/docs/en/html/functions_p.html +0 -161
  159. data/vendor/fastText/website/static/docs/en/html/functions_q.html +0 -135
  160. data/vendor/fastText/website/static/docs/en/html/functions_r.html +0 -116
  161. data/vendor/fastText/website/static/docs/en/html/functions_s.html +0 -159
  162. data/vendor/fastText/website/static/docs/en/html/functions_t.html +0 -138
  163. data/vendor/fastText/website/static/docs/en/html/functions_u.html +0 -106
  164. data/vendor/fastText/website/static/docs/en/html/functions_v.html +0 -106
  165. data/vendor/fastText/website/static/docs/en/html/functions_vars.html +0 -486
  166. data/vendor/fastText/website/static/docs/en/html/functions_w.html +0 -124
  167. data/vendor/fastText/website/static/docs/en/html/functions_z.html +0 -104
  168. data/vendor/fastText/website/static/docs/en/html/globals.html +0 -170
  169. data/vendor/fastText/website/static/docs/en/html/globals_defs.html +0 -113
  170. data/vendor/fastText/website/static/docs/en/html/globals_func.html +0 -155
  171. data/vendor/fastText/website/static/docs/en/html/index.html +0 -100
  172. data/vendor/fastText/website/static/docs/en/html/jquery.js +0 -87
  173. data/vendor/fastText/website/static/docs/en/html/main_8cc.html +0 -582
  174. data/vendor/fastText/website/static/docs/en/html/main_8cc.js +0 -22
  175. data/vendor/fastText/website/static/docs/en/html/matrix_8cc.html +0 -114
  176. data/vendor/fastText/website/static/docs/en/html/matrix_8h.html +0 -121
  177. data/vendor/fastText/website/static/docs/en/html/matrix_8h_source.html +0 -123
  178. data/vendor/fastText/website/static/docs/en/html/menu.js +0 -26
  179. data/vendor/fastText/website/static/docs/en/html/menudata.js +0 -90
  180. data/vendor/fastText/website/static/docs/en/html/model_8cc.html +0 -113
  181. data/vendor/fastText/website/static/docs/en/html/model_8h.html +0 -183
  182. data/vendor/fastText/website/static/docs/en/html/model_8h.js +0 -8
  183. data/vendor/fastText/website/static/docs/en/html/model_8h_source.html +0 -139
  184. data/vendor/fastText/website/static/docs/en/html/namespacefasttext.html +0 -343
  185. data/vendor/fastText/website/static/docs/en/html/namespacefasttext.js +0 -13
  186. data/vendor/fastText/website/static/docs/en/html/namespacefasttext_1_1utils.html +0 -158
  187. data/vendor/fastText/website/static/docs/en/html/namespacemembers.html +0 -125
  188. data/vendor/fastText/website/static/docs/en/html/namespacemembers_enum.html +0 -107
  189. data/vendor/fastText/website/static/docs/en/html/namespacemembers_func.html +0 -110
  190. data/vendor/fastText/website/static/docs/en/html/namespacemembers_type.html +0 -104
  191. data/vendor/fastText/website/static/docs/en/html/namespaces.html +0 -106
  192. data/vendor/fastText/website/static/docs/en/html/namespaces.js +0 -4
  193. data/vendor/fastText/website/static/docs/en/html/nav_f.png +0 -0
  194. data/vendor/fastText/website/static/docs/en/html/nav_g.png +0 -0
  195. data/vendor/fastText/website/static/docs/en/html/nav_h.png +0 -0
  196. data/vendor/fastText/website/static/docs/en/html/navtree.css +0 -146
  197. data/vendor/fastText/website/static/docs/en/html/navtree.js +0 -517
  198. data/vendor/fastText/website/static/docs/en/html/navtreedata.js +0 -40
  199. data/vendor/fastText/website/static/docs/en/html/navtreeindex0.js +0 -253
  200. data/vendor/fastText/website/static/docs/en/html/navtreeindex1.js +0 -139
  201. data/vendor/fastText/website/static/docs/en/html/open.png +0 -0
  202. data/vendor/fastText/website/static/docs/en/html/productquantizer_8cc.html +0 -118
  203. data/vendor/fastText/website/static/docs/en/html/productquantizer_8cc.js +0 -4
  204. data/vendor/fastText/website/static/docs/en/html/productquantizer_8h.html +0 -124
  205. data/vendor/fastText/website/static/docs/en/html/productquantizer_8h_source.html +0 -133
  206. data/vendor/fastText/website/static/docs/en/html/qmatrix_8cc.html +0 -112
  207. data/vendor/fastText/website/static/docs/en/html/qmatrix_8h.html +0 -126
  208. data/vendor/fastText/website/static/docs/en/html/qmatrix_8h_source.html +0 -128
  209. data/vendor/fastText/website/static/docs/en/html/real_8h.html +0 -117
  210. data/vendor/fastText/website/static/docs/en/html/real_8h.js +0 -4
  211. data/vendor/fastText/website/static/docs/en/html/real_8h_source.html +0 -103
  212. data/vendor/fastText/website/static/docs/en/html/resize.js +0 -114
  213. data/vendor/fastText/website/static/docs/en/html/search/all_0.html +0 -26
  214. data/vendor/fastText/website/static/docs/en/html/search/all_0.js +0 -17
  215. data/vendor/fastText/website/static/docs/en/html/search/all_1.html +0 -26
  216. data/vendor/fastText/website/static/docs/en/html/search/all_1.js +0 -8
  217. data/vendor/fastText/website/static/docs/en/html/search/all_10.html +0 -26
  218. data/vendor/fastText/website/static/docs/en/html/search/all_10.js +0 -10
  219. data/vendor/fastText/website/static/docs/en/html/search/all_11.html +0 -26
  220. data/vendor/fastText/website/static/docs/en/html/search/all_11.js +0 -25
  221. data/vendor/fastText/website/static/docs/en/html/search/all_12.html +0 -26
  222. data/vendor/fastText/website/static/docs/en/html/search/all_12.js +0 -15
  223. data/vendor/fastText/website/static/docs/en/html/search/all_13.html +0 -26
  224. data/vendor/fastText/website/static/docs/en/html/search/all_13.js +0 -7
  225. data/vendor/fastText/website/static/docs/en/html/search/all_14.html +0 -26
  226. data/vendor/fastText/website/static/docs/en/html/search/all_14.js +0 -7
  227. data/vendor/fastText/website/static/docs/en/html/search/all_15.html +0 -26
  228. data/vendor/fastText/website/static/docs/en/html/search/all_15.js +0 -11
  229. data/vendor/fastText/website/static/docs/en/html/search/all_16.html +0 -26
  230. data/vendor/fastText/website/static/docs/en/html/search/all_16.js +0 -4
  231. data/vendor/fastText/website/static/docs/en/html/search/all_17.html +0 -26
  232. data/vendor/fastText/website/static/docs/en/html/search/all_17.js +0 -7
  233. data/vendor/fastText/website/static/docs/en/html/search/all_2.html +0 -26
  234. data/vendor/fastText/website/static/docs/en/html/search/all_2.js +0 -17
  235. data/vendor/fastText/website/static/docs/en/html/search/all_3.html +0 -26
  236. data/vendor/fastText/website/static/docs/en/html/search/all_3.js +0 -17
  237. data/vendor/fastText/website/static/docs/en/html/search/all_4.html +0 -26
  238. data/vendor/fastText/website/static/docs/en/html/search/all_4.js +0 -10
  239. data/vendor/fastText/website/static/docs/en/html/search/all_5.html +0 -26
  240. data/vendor/fastText/website/static/docs/en/html/search/all_5.js +0 -12
  241. data/vendor/fastText/website/static/docs/en/html/search/all_6.html +0 -26
  242. data/vendor/fastText/website/static/docs/en/html/search/all_6.js +0 -18
  243. data/vendor/fastText/website/static/docs/en/html/search/all_7.html +0 -26
  244. data/vendor/fastText/website/static/docs/en/html/search/all_7.js +0 -8
  245. data/vendor/fastText/website/static/docs/en/html/search/all_8.html +0 -26
  246. data/vendor/fastText/website/static/docs/en/html/search/all_8.js +0 -11
  247. data/vendor/fastText/website/static/docs/en/html/search/all_9.html +0 -26
  248. data/vendor/fastText/website/static/docs/en/html/search/all_9.js +0 -5
  249. data/vendor/fastText/website/static/docs/en/html/search/all_a.html +0 -26
  250. data/vendor/fastText/website/static/docs/en/html/search/all_a.js +0 -17
  251. data/vendor/fastText/website/static/docs/en/html/search/all_b.html +0 -26
  252. data/vendor/fastText/website/static/docs/en/html/search/all_b.js +0 -27
  253. data/vendor/fastText/website/static/docs/en/html/search/all_c.html +0 -26
  254. data/vendor/fastText/website/static/docs/en/html/search/all_c.js +0 -26
  255. data/vendor/fastText/website/static/docs/en/html/search/all_d.html +0 -26
  256. data/vendor/fastText/website/static/docs/en/html/search/all_d.js +0 -9
  257. data/vendor/fastText/website/static/docs/en/html/search/all_e.html +0 -26
  258. data/vendor/fastText/website/static/docs/en/html/search/all_e.js +0 -35
  259. data/vendor/fastText/website/static/docs/en/html/search/all_f.html +0 -26
  260. data/vendor/fastText/website/static/docs/en/html/search/all_f.js +0 -16
  261. data/vendor/fastText/website/static/docs/en/html/search/classes_0.html +0 -26
  262. data/vendor/fastText/website/static/docs/en/html/search/classes_0.js +0 -4
  263. data/vendor/fastText/website/static/docs/en/html/search/classes_1.html +0 -26
  264. data/vendor/fastText/website/static/docs/en/html/search/classes_1.js +0 -4
  265. data/vendor/fastText/website/static/docs/en/html/search/classes_2.html +0 -26
  266. data/vendor/fastText/website/static/docs/en/html/search/classes_2.js +0 -4
  267. data/vendor/fastText/website/static/docs/en/html/search/classes_3.html +0 -26
  268. data/vendor/fastText/website/static/docs/en/html/search/classes_3.js +0 -4
  269. data/vendor/fastText/website/static/docs/en/html/search/classes_4.html +0 -26
  270. data/vendor/fastText/website/static/docs/en/html/search/classes_4.js +0 -5
  271. data/vendor/fastText/website/static/docs/en/html/search/classes_5.html +0 -26
  272. data/vendor/fastText/website/static/docs/en/html/search/classes_5.js +0 -4
  273. data/vendor/fastText/website/static/docs/en/html/search/classes_6.html +0 -26
  274. data/vendor/fastText/website/static/docs/en/html/search/classes_6.js +0 -4
  275. data/vendor/fastText/website/static/docs/en/html/search/classes_7.html +0 -26
  276. data/vendor/fastText/website/static/docs/en/html/search/classes_7.js +0 -4
  277. data/vendor/fastText/website/static/docs/en/html/search/classes_8.html +0 -26
  278. data/vendor/fastText/website/static/docs/en/html/search/classes_8.js +0 -4
  279. data/vendor/fastText/website/static/docs/en/html/search/close.png +0 -0
  280. data/vendor/fastText/website/static/docs/en/html/search/defines_0.html +0 -26
  281. data/vendor/fastText/website/static/docs/en/html/search/defines_0.js +0 -5
  282. data/vendor/fastText/website/static/docs/en/html/search/defines_1.html +0 -26
  283. data/vendor/fastText/website/static/docs/en/html/search/defines_1.js +0 -4
  284. data/vendor/fastText/website/static/docs/en/html/search/defines_2.html +0 -26
  285. data/vendor/fastText/website/static/docs/en/html/search/defines_2.js +0 -4
  286. data/vendor/fastText/website/static/docs/en/html/search/defines_3.html +0 -26
  287. data/vendor/fastText/website/static/docs/en/html/search/defines_3.js +0 -4
  288. data/vendor/fastText/website/static/docs/en/html/search/enums_0.html +0 -26
  289. data/vendor/fastText/website/static/docs/en/html/search/enums_0.js +0 -4
  290. data/vendor/fastText/website/static/docs/en/html/search/enums_1.html +0 -26
  291. data/vendor/fastText/website/static/docs/en/html/search/enums_1.js +0 -4
  292. data/vendor/fastText/website/static/docs/en/html/search/enums_2.html +0 -26
  293. data/vendor/fastText/website/static/docs/en/html/search/enums_2.js +0 -4
  294. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_0.html +0 -26
  295. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_0.js +0 -4
  296. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_1.html +0 -26
  297. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_1.js +0 -4
  298. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_2.html +0 -26
  299. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_2.js +0 -4
  300. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_3.html +0 -26
  301. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_3.js +0 -4
  302. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_4.html +0 -26
  303. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_4.js +0 -6
  304. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_5.html +0 -26
  305. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_5.js +0 -4
  306. data/vendor/fastText/website/static/docs/en/html/search/files_0.html +0 -26
  307. data/vendor/fastText/website/static/docs/en/html/search/files_0.js +0 -5
  308. data/vendor/fastText/website/static/docs/en/html/search/files_1.html +0 -26
  309. data/vendor/fastText/website/static/docs/en/html/search/files_1.js +0 -5
  310. data/vendor/fastText/website/static/docs/en/html/search/files_2.html +0 -26
  311. data/vendor/fastText/website/static/docs/en/html/search/files_2.js +0 -5
  312. data/vendor/fastText/website/static/docs/en/html/search/files_3.html +0 -26
  313. data/vendor/fastText/website/static/docs/en/html/search/files_3.js +0 -8
  314. data/vendor/fastText/website/static/docs/en/html/search/files_4.html +0 -26
  315. data/vendor/fastText/website/static/docs/en/html/search/files_4.js +0 -5
  316. data/vendor/fastText/website/static/docs/en/html/search/files_5.html +0 -26
  317. data/vendor/fastText/website/static/docs/en/html/search/files_5.js +0 -5
  318. data/vendor/fastText/website/static/docs/en/html/search/files_6.html +0 -26
  319. data/vendor/fastText/website/static/docs/en/html/search/files_6.js +0 -4
  320. data/vendor/fastText/website/static/docs/en/html/search/files_7.html +0 -26
  321. data/vendor/fastText/website/static/docs/en/html/search/files_7.js +0 -5
  322. data/vendor/fastText/website/static/docs/en/html/search/files_8.html +0 -26
  323. data/vendor/fastText/website/static/docs/en/html/search/files_8.js +0 -5
  324. data/vendor/fastText/website/static/docs/en/html/search/functions_0.html +0 -26
  325. data/vendor/fastText/website/static/docs/en/html/search/functions_0.js +0 -14
  326. data/vendor/fastText/website/static/docs/en/html/search/functions_1.html +0 -26
  327. data/vendor/fastText/website/static/docs/en/html/search/functions_1.js +0 -5
  328. data/vendor/fastText/website/static/docs/en/html/search/functions_10.html +0 -26
  329. data/vendor/fastText/website/static/docs/en/html/search/functions_10.js +0 -5
  330. data/vendor/fastText/website/static/docs/en/html/search/functions_11.html +0 -26
  331. data/vendor/fastText/website/static/docs/en/html/search/functions_11.js +0 -18
  332. data/vendor/fastText/website/static/docs/en/html/search/functions_12.html +0 -26
  333. data/vendor/fastText/website/static/docs/en/html/search/functions_12.js +0 -8
  334. data/vendor/fastText/website/static/docs/en/html/search/functions_13.html +0 -26
  335. data/vendor/fastText/website/static/docs/en/html/search/functions_13.js +0 -5
  336. data/vendor/fastText/website/static/docs/en/html/search/functions_14.html +0 -26
  337. data/vendor/fastText/website/static/docs/en/html/search/functions_14.js +0 -4
  338. data/vendor/fastText/website/static/docs/en/html/search/functions_15.html +0 -26
  339. data/vendor/fastText/website/static/docs/en/html/search/functions_15.js +0 -4
  340. data/vendor/fastText/website/static/docs/en/html/search/functions_16.html +0 -26
  341. data/vendor/fastText/website/static/docs/en/html/search/functions_16.js +0 -4
  342. data/vendor/fastText/website/static/docs/en/html/search/functions_17.html +0 -26
  343. data/vendor/fastText/website/static/docs/en/html/search/functions_17.js +0 -7
  344. data/vendor/fastText/website/static/docs/en/html/search/functions_2.html +0 -26
  345. data/vendor/fastText/website/static/docs/en/html/search/functions_2.js +0 -11
  346. data/vendor/fastText/website/static/docs/en/html/search/functions_3.html +0 -26
  347. data/vendor/fastText/website/static/docs/en/html/search/functions_3.js +0 -9
  348. data/vendor/fastText/website/static/docs/en/html/search/functions_4.html +0 -26
  349. data/vendor/fastText/website/static/docs/en/html/search/functions_4.js +0 -4
  350. data/vendor/fastText/website/static/docs/en/html/search/functions_5.html +0 -26
  351. data/vendor/fastText/website/static/docs/en/html/search/functions_5.js +0 -7
  352. data/vendor/fastText/website/static/docs/en/html/search/functions_6.html +0 -26
  353. data/vendor/fastText/website/static/docs/en/html/search/functions_6.js +0 -17
  354. data/vendor/fastText/website/static/docs/en/html/search/functions_7.html +0 -26
  355. data/vendor/fastText/website/static/docs/en/html/search/functions_7.js +0 -5
  356. data/vendor/fastText/website/static/docs/en/html/search/functions_8.html +0 -26
  357. data/vendor/fastText/website/static/docs/en/html/search/functions_8.js +0 -8
  358. data/vendor/fastText/website/static/docs/en/html/search/functions_9.html +0 -26
  359. data/vendor/fastText/website/static/docs/en/html/search/functions_9.js +0 -4
  360. data/vendor/fastText/website/static/docs/en/html/search/functions_a.html +0 -26
  361. data/vendor/fastText/website/static/docs/en/html/search/functions_a.js +0 -8
  362. data/vendor/fastText/website/static/docs/en/html/search/functions_b.html +0 -26
  363. data/vendor/fastText/website/static/docs/en/html/search/functions_b.js +0 -10
  364. data/vendor/fastText/website/static/docs/en/html/search/functions_c.html +0 -26
  365. data/vendor/fastText/website/static/docs/en/html/search/functions_c.js +0 -10
  366. data/vendor/fastText/website/static/docs/en/html/search/functions_d.html +0 -26
  367. data/vendor/fastText/website/static/docs/en/html/search/functions_d.js +0 -6
  368. data/vendor/fastText/website/static/docs/en/html/search/functions_e.html +0 -26
  369. data/vendor/fastText/website/static/docs/en/html/search/functions_e.js +0 -26
  370. data/vendor/fastText/website/static/docs/en/html/search/functions_f.html +0 -26
  371. data/vendor/fastText/website/static/docs/en/html/search/functions_f.js +0 -6
  372. data/vendor/fastText/website/static/docs/en/html/search/mag_sel.png +0 -0
  373. data/vendor/fastText/website/static/docs/en/html/search/namespaces_0.html +0 -26
  374. data/vendor/fastText/website/static/docs/en/html/search/namespaces_0.js +0 -5
  375. data/vendor/fastText/website/static/docs/en/html/search/nomatches.html +0 -12
  376. data/vendor/fastText/website/static/docs/en/html/search/search.css +0 -271
  377. data/vendor/fastText/website/static/docs/en/html/search/search.js +0 -791
  378. data/vendor/fastText/website/static/docs/en/html/search/search_l.png +0 -0
  379. data/vendor/fastText/website/static/docs/en/html/search/search_m.png +0 -0
  380. data/vendor/fastText/website/static/docs/en/html/search/search_r.png +0 -0
  381. data/vendor/fastText/website/static/docs/en/html/search/searchdata.js +0 -42
  382. data/vendor/fastText/website/static/docs/en/html/search/typedefs_0.html +0 -26
  383. data/vendor/fastText/website/static/docs/en/html/search/typedefs_0.js +0 -4
  384. data/vendor/fastText/website/static/docs/en/html/search/typedefs_1.html +0 -26
  385. data/vendor/fastText/website/static/docs/en/html/search/typedefs_1.js +0 -4
  386. data/vendor/fastText/website/static/docs/en/html/search/variables_0.html +0 -26
  387. data/vendor/fastText/website/static/docs/en/html/search/variables_0.js +0 -4
  388. data/vendor/fastText/website/static/docs/en/html/search/variables_1.html +0 -26
  389. data/vendor/fastText/website/static/docs/en/html/search/variables_1.js +0 -6
  390. data/vendor/fastText/website/static/docs/en/html/search/variables_10.html +0 -26
  391. data/vendor/fastText/website/static/docs/en/html/search/variables_10.js +0 -8
  392. data/vendor/fastText/website/static/docs/en/html/search/variables_11.html +0 -26
  393. data/vendor/fastText/website/static/docs/en/html/search/variables_11.js +0 -11
  394. data/vendor/fastText/website/static/docs/en/html/search/variables_12.html +0 -26
  395. data/vendor/fastText/website/static/docs/en/html/search/variables_12.js +0 -4
  396. data/vendor/fastText/website/static/docs/en/html/search/variables_13.html +0 -26
  397. data/vendor/fastText/website/static/docs/en/html/search/variables_13.js +0 -10
  398. data/vendor/fastText/website/static/docs/en/html/search/variables_2.html +0 -26
  399. data/vendor/fastText/website/static/docs/en/html/search/variables_2.js +0 -9
  400. data/vendor/fastText/website/static/docs/en/html/search/variables_3.html +0 -26
  401. data/vendor/fastText/website/static/docs/en/html/search/variables_3.js +0 -9
  402. data/vendor/fastText/website/static/docs/en/html/search/variables_4.html +0 -26
  403. data/vendor/fastText/website/static/docs/en/html/search/variables_4.js +0 -7
  404. data/vendor/fastText/website/static/docs/en/html/search/variables_5.html +0 -26
  405. data/vendor/fastText/website/static/docs/en/html/search/variables_5.js +0 -4
  406. data/vendor/fastText/website/static/docs/en/html/search/variables_6.html +0 -26
  407. data/vendor/fastText/website/static/docs/en/html/search/variables_6.js +0 -5
  408. data/vendor/fastText/website/static/docs/en/html/search/variables_7.html +0 -26
  409. data/vendor/fastText/website/static/docs/en/html/search/variables_7.js +0 -5
  410. data/vendor/fastText/website/static/docs/en/html/search/variables_8.html +0 -26
  411. data/vendor/fastText/website/static/docs/en/html/search/variables_8.js +0 -4
  412. data/vendor/fastText/website/static/docs/en/html/search/variables_9.html +0 -26
  413. data/vendor/fastText/website/static/docs/en/html/search/variables_9.js +0 -10
  414. data/vendor/fastText/website/static/docs/en/html/search/variables_a.html +0 -26
  415. data/vendor/fastText/website/static/docs/en/html/search/variables_a.js +0 -14
  416. data/vendor/fastText/website/static/docs/en/html/search/variables_b.html +0 -26
  417. data/vendor/fastText/website/static/docs/en/html/search/variables_b.js +0 -17
  418. data/vendor/fastText/website/static/docs/en/html/search/variables_c.html +0 -26
  419. data/vendor/fastText/website/static/docs/en/html/search/variables_c.js +0 -6
  420. data/vendor/fastText/website/static/docs/en/html/search/variables_d.html +0 -26
  421. data/vendor/fastText/website/static/docs/en/html/search/variables_d.js +0 -10
  422. data/vendor/fastText/website/static/docs/en/html/search/variables_e.html +0 -26
  423. data/vendor/fastText/website/static/docs/en/html/search/variables_e.js +0 -11
  424. data/vendor/fastText/website/static/docs/en/html/search/variables_f.html +0 -26
  425. data/vendor/fastText/website/static/docs/en/html/search/variables_f.js +0 -6
  426. data/vendor/fastText/website/static/docs/en/html/splitbar.png +0 -0
  427. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node-members.html +0 -108
  428. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node.html +0 -194
  429. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node.js +0 -8
  430. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry-members.html +0 -107
  431. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry.html +0 -178
  432. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry.js +0 -7
  433. data/vendor/fastText/website/static/docs/en/html/sync_off.png +0 -0
  434. data/vendor/fastText/website/static/docs/en/html/sync_on.png +0 -0
  435. data/vendor/fastText/website/static/docs/en/html/tab_a.png +0 -0
  436. data/vendor/fastText/website/static/docs/en/html/tab_b.png +0 -0
  437. data/vendor/fastText/website/static/docs/en/html/tab_h.png +0 -0
  438. data/vendor/fastText/website/static/docs/en/html/tab_s.png +0 -0
  439. data/vendor/fastText/website/static/docs/en/html/tabs.css +0 -1
  440. data/vendor/fastText/website/static/docs/en/html/utils_8cc.html +0 -121
  441. data/vendor/fastText/website/static/docs/en/html/utils_8cc.js +0 -5
  442. data/vendor/fastText/website/static/docs/en/html/utils_8h.html +0 -122
  443. data/vendor/fastText/website/static/docs/en/html/utils_8h.js +0 -5
  444. data/vendor/fastText/website/static/docs/en/html/utils_8h_source.html +0 -104
  445. data/vendor/fastText/website/static/docs/en/html/vector_8cc.html +0 -121
  446. data/vendor/fastText/website/static/docs/en/html/vector_8cc.js +0 -4
  447. data/vendor/fastText/website/static/docs/en/html/vector_8h.html +0 -126
  448. data/vendor/fastText/website/static/docs/en/html/vector_8h.js +0 -5
  449. data/vendor/fastText/website/static/docs/en/html/vector_8h_source.html +0 -120
  450. data/vendor/fastText/website/static/fasttext.css +0 -48
  451. data/vendor/fastText/website/static/img/authors/armand_joulin.jpg +0 -0
  452. data/vendor/fastText/website/static/img/authors/christian_puhrsch.png +0 -0
  453. data/vendor/fastText/website/static/img/authors/edouard_grave.jpeg +0 -0
  454. data/vendor/fastText/website/static/img/authors/piotr_bojanowski.jpg +0 -0
  455. data/vendor/fastText/website/static/img/authors/tomas_mikolov.jpg +0 -0
  456. data/vendor/fastText/website/static/img/blog/2016-08-18-blog-post-img1.png +0 -0
  457. data/vendor/fastText/website/static/img/blog/2016-08-18-blog-post-img2.png +0 -0
  458. data/vendor/fastText/website/static/img/blog/2017-05-02-blog-post-img1.jpg +0 -0
  459. data/vendor/fastText/website/static/img/blog/2017-05-02-blog-post-img2.jpg +0 -0
  460. data/vendor/fastText/website/static/img/blog/2017-10-02-blog-post-img1.png +0 -0
  461. data/vendor/fastText/website/static/img/cbo_vs_skipgram.png +0 -0
  462. data/vendor/fastText/website/static/img/fasttext-icon-api.png +0 -0
  463. data/vendor/fastText/website/static/img/fasttext-icon-bg-web.png +0 -0
  464. data/vendor/fastText/website/static/img/fasttext-icon-color-square.png +0 -0
  465. data/vendor/fastText/website/static/img/fasttext-icon-color-web.png +0 -0
  466. data/vendor/fastText/website/static/img/fasttext-icon-faq.png +0 -0
  467. data/vendor/fastText/website/static/img/fasttext-icon-tutorial.png +0 -0
  468. data/vendor/fastText/website/static/img/fasttext-icon-white-web.png +0 -0
  469. data/vendor/fastText/website/static/img/fasttext-logo-color-web.png +0 -0
  470. data/vendor/fastText/website/static/img/fasttext-logo-white-web.png +0 -0
  471. data/vendor/fastText/website/static/img/logo-color.png +0 -0
  472. data/vendor/fastText/website/static/img/model-black.png +0 -0
  473. data/vendor/fastText/website/static/img/model-blue.png +0 -0
  474. data/vendor/fastText/website/static/img/model-red.png +0 -0
  475. data/vendor/fastText/website/static/img/ogimage.png +0 -0
  476. data/vendor/fastText/website/static/img/oss_logo.png +0 -0
  477. data/vendor/fastText/wikifil.pl +0 -57
  478. data/vendor/fastText/word-vector-example.sh +0 -39
@@ -1,109 +0,0 @@
1
- #!/usr/bin/env python3
2
- # Copyright (c) 2018-present, Facebook, Inc.
3
- # All rights reserved.
4
- #
5
- # This source code is licensed under the MIT license found in the
6
- # LICENSE file in the root directory of this source tree.
7
-
8
- import codecs, sys, time, math, argparse, ot
9
- import numpy as np
10
- from utils import *
11
-
12
- parser = argparse.ArgumentParser(description='Wasserstein Procrustes for Embedding Alignment')
13
- parser.add_argument('--model_src', type=str, help='Path to source word embeddings')
14
- parser.add_argument('--model_tgt', type=str, help='Path to target word embeddings')
15
- parser.add_argument('--lexicon', type=str, help='Path to the evaluation lexicon')
16
- parser.add_argument('--output_src', default='', type=str, help='Path to save the aligned source embeddings')
17
- parser.add_argument('--output_tgt', default='', type=str, help='Path to save the aligned target embeddings')
18
- parser.add_argument('--seed', default=1111, type=int, help='Random number generator seed')
19
- parser.add_argument('--nepoch', default=5, type=int, help='Number of epochs')
20
- parser.add_argument('--niter', default=5000, type=int, help='Initial number of iterations')
21
- parser.add_argument('--bsz', default=500, type=int, help='Initial batch size')
22
- parser.add_argument('--lr', default=500., type=float, help='Learning rate')
23
- parser.add_argument('--nmax', default=20000, type=int, help='Vocabulary size for learning the alignment')
24
- parser.add_argument('--reg', default=0.05, type=float, help='Regularization parameter for sinkhorn')
25
- args = parser.parse_args()
26
-
27
-
28
- def objective(X, Y, R, n=5000):
29
- Xn, Yn = X[:n], Y[:n]
30
- C = -np.dot(np.dot(Xn, R), Yn.T)
31
- P = ot.sinkhorn(np.ones(n), np.ones(n), C, 0.025, stopThr=1e-3)
32
- return 1000 * np.linalg.norm(np.dot(Xn, R) - np.dot(P, Yn)) / n
33
-
34
-
35
- def sqrt_eig(x):
36
- U, s, VT = np.linalg.svd(x, full_matrices=False)
37
- return np.dot(U, np.dot(np.diag(np.sqrt(s)), VT))
38
-
39
-
40
- def align(X, Y, R, lr=10., bsz=200, nepoch=5, niter=1000,
41
- nmax=10000, reg=0.05, verbose=True):
42
- for epoch in range(1, nepoch + 1):
43
- for _it in range(1, niter + 1):
44
- # sample mini-batch
45
- xt = X[np.random.permutation(nmax)[:bsz], :]
46
- yt = Y[np.random.permutation(nmax)[:bsz], :]
47
- # compute OT on minibatch
48
- C = -np.dot(np.dot(xt, R), yt.T)
49
- P = ot.sinkhorn(np.ones(bsz), np.ones(bsz), C, reg, stopThr=1e-3)
50
- # compute gradient
51
- G = - np.dot(xt.T, np.dot(P, yt))
52
- R -= lr / bsz * G
53
- # project on orthogonal matrices
54
- U, s, VT = np.linalg.svd(R)
55
- R = np.dot(U, VT)
56
- bsz *= 2
57
- niter //= 4
58
- if verbose:
59
- print("epoch: %d obj: %.3f" % (epoch, objective(X, Y, R)))
60
- return R
61
-
62
-
63
- def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
64
- n, d = X.shape
65
- if apply_sqrt:
66
- X, Y = sqrt_eig(X), sqrt_eig(Y)
67
- K_X, K_Y = np.dot(X, X.T), np.dot(Y, Y.T)
68
- K_Y *= np.linalg.norm(K_X) / np.linalg.norm(K_Y)
69
- K2_X, K2_Y = np.dot(K_X, K_X), np.dot(K_Y, K_Y)
70
- P = np.ones([n, n]) / float(n)
71
- for it in range(1, niter + 1):
72
- G = np.dot(P, K2_X) + np.dot(K2_Y, P) - 2 * np.dot(K_Y, np.dot(P, K_X))
73
- q = ot.sinkhorn(np.ones(n), np.ones(n), G, reg, stopThr=1e-3)
74
- alpha = 2.0 / float(2.0 + it)
75
- P = alpha * q + (1.0 - alpha) * P
76
- obj = np.linalg.norm(np.dot(P, K_X) - np.dot(K_Y, P))
77
- print(obj)
78
- return procrustes(np.dot(P, X), Y).T
79
-
80
-
81
- print("\n*** Wasserstein Procrustes ***\n")
82
-
83
- np.random.seed(args.seed)
84
-
85
- maxload = 200000
86
- w_src, x_src = load_vectors(args.model_src, maxload, norm=True, center=True)
87
- w_tgt, x_tgt = load_vectors(args.model_tgt, maxload, norm=True, center=True)
88
- src2trg, _ = load_lexicon(args.lexicon, w_src, w_tgt)
89
-
90
- print("\nComputing initial mapping with convex relaxation...")
91
- t0 = time.time()
92
- R0 = convex_init(x_src[:2500], x_tgt[:2500], reg=args.reg, apply_sqrt=True)
93
- print("Done [%03d sec]" % math.floor(time.time() - t0))
94
-
95
- print("\nComputing mapping with Wasserstein Procrustes...")
96
- t0 = time.time()
97
- R = align(x_src, x_tgt, R0.copy(), bsz=args.bsz, lr=args.lr, niter=args.niter,
98
- nepoch=args.nepoch, reg=args.reg, nmax=args.nmax)
99
- print("Done [%03d sec]" % math.floor(time.time() - t0))
100
-
101
- acc = compute_nn_accuracy(x_src, np.dot(x_tgt, R.T), src2trg)
102
- print("\nPrecision@1: %.3f\n" % acc)
103
-
104
- if args.output_src != '':
105
- x_src = x_src / np.linalg.norm(x_src, 2, 1).reshape([-1, 1])
106
- save_vectors(args.output_src, x_src, w_src)
107
- if args.output_tgt != '':
108
- x_tgt = x_tgt / np.linalg.norm(x_tgt, 2, 1).reshape([-1, 1])
109
- save_vectors(args.output_tgt, np.dot(x_tgt, R.T), w_tgt)
@@ -1,154 +0,0 @@
1
- #!/usr/bin/env python3
2
- # Copyright (c) 2018-present, Facebook, Inc.
3
- # All rights reserved.
4
- #
5
- # This source code is licensed under the license found in the
6
- # LICENSE file in the root directory of this source tree.
7
-
8
- import io
9
- import numpy as np
10
- import collections
11
-
12
-
13
- def load_vectors(fname, maxload=200000, norm=True, center=False, verbose=True):
14
- if verbose:
15
- print("Loading vectors from %s" % fname)
16
- fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
17
- n, d = map(int, fin.readline().split())
18
- if maxload > 0:
19
- n = min(n, maxload)
20
- x = np.zeros([n, d])
21
- words = []
22
- for i, line in enumerate(fin):
23
- if i >= n:
24
- break
25
- tokens = line.rstrip().split(' ')
26
- words.append(tokens[0])
27
- v = np.array(tokens[1:], dtype=float)
28
- x[i, :] = v
29
- if norm:
30
- x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
31
- if center:
32
- x -= x.mean(axis=0)[np.newaxis, :]
33
- x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
34
- if verbose:
35
- print("%d word vectors loaded" % (len(words)))
36
- return words, x
37
-
38
-
39
- def idx(words):
40
- w2i = {}
41
- for i, w in enumerate(words):
42
- if w not in w2i:
43
- w2i[w] = i
44
- return w2i
45
-
46
-
47
- def save_vectors(fname, x, words):
48
- n, d = x.shape
49
- fout = io.open(fname, 'w', encoding='utf-8')
50
- fout.write(u"%d %d\n" % (n, d))
51
- for i in range(n):
52
- fout.write(words[i] + " " + " ".join(map(lambda a: "%.4f" % a, x[i, :])) + "\n")
53
- fout.close()
54
-
55
-
56
- def save_matrix(fname, x):
57
- n, d = x.shape
58
- fout = io.open(fname, 'w', encoding='utf-8')
59
- fout.write(u"%d %d\n" % (n, d))
60
- for i in range(n):
61
- fout.write(" ".join(map(lambda a: "%.4f" % a, x[i, :])) + "\n")
62
- fout.close()
63
-
64
-
65
- def procrustes(X_src, Y_tgt):
66
- U, s, V = np.linalg.svd(np.dot(Y_tgt.T, X_src))
67
- return np.dot(U, V)
68
-
69
-
70
- def select_vectors_from_pairs(x_src, y_tgt, pairs):
71
- n = len(pairs)
72
- d = x_src.shape[1]
73
- x = np.zeros([n, d])
74
- y = np.zeros([n, d])
75
- for k, ij in enumerate(pairs):
76
- i, j = ij
77
- x[k, :] = x_src[i, :]
78
- y[k, :] = y_tgt[j, :]
79
- return x, y
80
-
81
-
82
- def load_lexicon(filename, words_src, words_tgt, verbose=True):
83
- f = io.open(filename, 'r', encoding='utf-8')
84
- lexicon = collections.defaultdict(set)
85
- idx_src , idx_tgt = idx(words_src), idx(words_tgt)
86
- vocab = set()
87
- for line in f:
88
- word_src, word_tgt = line.split()
89
- if word_src in idx_src and word_tgt in idx_tgt:
90
- lexicon[idx_src[word_src]].add(idx_tgt[word_tgt])
91
- vocab.add(word_src)
92
- if verbose:
93
- coverage = len(lexicon) / float(len(vocab))
94
- print("Coverage of source vocab: %.4f" % (coverage))
95
- return lexicon, float(len(vocab))
96
-
97
-
98
- def load_pairs(filename, idx_src, idx_tgt, verbose=True):
99
- f = io.open(filename, 'r', encoding='utf-8')
100
- pairs = []
101
- tot = 0
102
- for line in f:
103
- a, b = line.rstrip().split(' ')
104
- tot += 1
105
- if a in idx_src and b in idx_tgt:
106
- pairs.append((idx_src[a], idx_tgt[b]))
107
- if verbose:
108
- coverage = (1.0 * len(pairs)) / tot
109
- print("Found pairs for training: %d - Total pairs in file: %d - Coverage of pairs: %.4f" % (len(pairs), tot, coverage))
110
- return pairs
111
-
112
-
113
- def compute_nn_accuracy(x_src, x_tgt, lexicon, bsz=100, lexicon_size=-1):
114
- if lexicon_size < 0:
115
- lexicon_size = len(lexicon)
116
- idx_src = list(lexicon.keys())
117
- acc = 0.0
118
- x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
119
- x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8
120
- for i in range(0, len(idx_src), bsz):
121
- e = min(i + bsz, len(idx_src))
122
- scores = np.dot(x_tgt, x_src[idx_src[i:e]].T)
123
- pred = scores.argmax(axis=0)
124
- for j in range(i, e):
125
- if pred[j - i] in lexicon[idx_src[j]]:
126
- acc += 1.0
127
- return acc / lexicon_size
128
-
129
-
130
- def compute_csls_accuracy(x_src, x_tgt, lexicon, lexicon_size=-1, k=10, bsz=1024):
131
- if lexicon_size < 0:
132
- lexicon_size = len(lexicon)
133
- idx_src = list(lexicon.keys())
134
-
135
- x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
136
- x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8
137
-
138
- sr = x_src[list(idx_src)]
139
- sc = np.dot(sr, x_tgt.T)
140
- similarities = 2 * sc
141
- sc2 = np.zeros(x_tgt.shape[0])
142
- for i in range(0, x_tgt.shape[0], bsz):
143
- j = min(i + bsz, x_tgt.shape[0])
144
- sc_batch = np.dot(x_tgt[i:j, :], x_src.T)
145
- dotprod = np.partition(sc_batch, -k, axis=1)[:, -k:]
146
- sc2[i:j] = np.mean(dotprod, axis=1)
147
- similarities -= sc2[np.newaxis, :]
148
-
149
- nn = np.argmax(similarities, axis=1).tolist()
150
- correct = 0.0
151
- for k in range(0, len(lexicon)):
152
- if nn[k] in lexicon[idx_src[k]]:
153
- correct += 1.0
154
- return correct / lexicon_size
@@ -1,41 +0,0 @@
1
- #!/usr/bin/env bash
2
- #
3
- # Copyright (c) 2016-present, Facebook, Inc.
4
- # All rights reserved.
5
- #
6
- # This source code is licensed under the MIT license found in the
7
- # LICENSE file in the root directory of this source tree.
8
- #
9
-
10
- myshuf() {
11
- perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
12
- }
13
-
14
- normalize_text() {
15
- tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
16
- sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
17
- -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
18
- -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
19
- }
20
-
21
- RESULTDIR=result
22
- DATADIR=data
23
-
24
- mkdir -p "${RESULTDIR}"
25
- mkdir -p "${DATADIR}"
26
-
27
- if [ ! -f "${DATADIR}/dbpedia.train" ]
28
- then
29
- wget -c "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz" -O "${DATADIR}/dbpedia_csv.tar.gz"
30
- tar -xzvf "${DATADIR}/dbpedia_csv.tar.gz" -C "${DATADIR}"
31
- cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "${DATADIR}/dbpedia.train"
32
- cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "${DATADIR}/dbpedia.test"
33
- fi
34
-
35
- make
36
-
37
- ./fasttext supervised -input "${DATADIR}/dbpedia.train" -output "${RESULTDIR}/dbpedia" -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4
38
-
39
- ./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
40
-
41
- ./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
@@ -1,94 +0,0 @@
1
- #!/usr/bin/env bash
2
- #
3
- # Copyright (c) 2016-present, Facebook, Inc.
4
- # All rights reserved.
5
- #
6
- # This source code is licensed under the MIT license found in the
7
- # LICENSE file in the root directory of this source tree.
8
- #
9
-
10
- # This script produces the results from Table 1 in the following paper:
11
- # Bag of Tricks for Efficient Text Classification, arXiv 1607.01759, 2016
12
-
13
- myshuf() {
14
- perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
15
- }
16
-
17
- normalize_text() {
18
- tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
19
- sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
20
- -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
21
- -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
22
- }
23
-
24
- DATASET=(
25
- ag_news
26
- sogou_news
27
- dbpedia
28
- yelp_review_polarity
29
- yelp_review_full
30
- yahoo_answers
31
- amazon_review_full
32
- amazon_review_polarity
33
- )
34
-
35
- ID=(
36
- 0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
37
- 0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
38
- 0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
39
- 0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
40
- 0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
41
- 0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
42
- 0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
43
- 0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
44
- )
45
-
46
- # These learning rates were chosen by validation on a subset of the training set.
47
- LR=( 0.25 0.5 0.5 0.1 0.1 0.1 0.05 0.05 )
48
-
49
- RESULTDIR=result
50
- DATADIR=data
51
-
52
- mkdir -p "${RESULTDIR}"
53
- mkdir -p "${DATADIR}"
54
-
55
- # Small datasets first
56
-
57
- for i in {0..0}
58
- do
59
- echo "Downloading dataset ${DATASET[i]}"
60
- if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
61
- then
62
- wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
63
- tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
64
- cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
65
- cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
66
- fi
67
- done
68
-
69
- # Large datasets require a bit more work due to the extra request page
70
-
71
- for i in {1..7}
72
- do
73
- echo "Downloading dataset ${DATASET[i]}"
74
- if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
75
- then
76
- curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
77
- curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
78
- tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
79
- cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
80
- cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
81
- fi
82
- done
83
-
84
- make
85
-
86
- for i in {0..7}
87
- do
88
- echo "Working on dataset ${DATASET[i]}"
89
- ./fasttext supervised -input "${DATADIR}/${DATASET[i]}.train" \
90
- -output "${RESULTDIR}/${DATASET[i]}" -dim 10 -lr "${LR[i]}" -wordNgrams 2 \
91
- -minCount 1 -bucket 10000000 -epoch 5 -thread 4 > /dev/null
92
- ./fasttext test "${RESULTDIR}/${DATASET[i]}.bin" \
93
- "${DATADIR}/${DATASET[i]}.test"
94
- done
@@ -1,26 +0,0 @@
1
- ## Preprocessing Common Crawl
2
-
3
- This code downloads, preprocesses and splits per language the data from [Common Crawl](http://commoncrawl.org/).
4
-
5
- This script uses the scripts and language identifier of [1].
6
-
7
- This code inherits its requirements form [fastText](https://github.com/facebookresearch/fastText).
8
-
9
- Set the variable WET_PATHS_URL to the crawl you want to process.
10
- Please also set the variables NUM_LANGID and NUM_DEDUP in `download_crawl.sh` according to the capacity of your machine.
11
- Langid processes are mostly limited by CPU usage, while dedup processes are likely to be limited by RAM usage (each use 2GB of RAM).
12
-
13
- ### Reference
14
-
15
- If you use this code, please cite:
16
-
17
- [1] E. Grave*, P. Bojanowski*, P. Gupta, A. Joulin, T. Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
18
-
19
- ```
20
- @inproceedings{grave2018learning,
21
- title={Learning Word Vectors for 157 Languages},
22
- author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
23
- booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
24
- year={2018}
25
- }
26
- ```
@@ -1,51 +0,0 @@
1
- // Copyright (c) 2018-present, Facebook, Inc.
2
- // All rights reserved.
3
- //
4
- // This source code is licensed under the MIT license found in the
5
- // LICENSE file in the root directory of this source tree.
6
-
7
- #include <cstdint>
8
- #include <iostream>
9
- #include <fstream>
10
- #include <string>
11
- #include <vector>
12
-
13
- uint64_t fnv1a_64(uint8_t *data, size_t sz, uint64_t h=14695981039346656037ull)
14
- {
15
- for (size_t i = 0; i < sz; i++, data++) {
16
- h ^= uint64_t(*data);
17
- h *= 1099511628211ull;
18
- }
19
- return h;
20
- }
21
-
22
- int main(int argc, char** argv)
23
- {
24
- uint64_t init_values[] = {
25
- 14695981039346656037ull,
26
- 9425296925403859339ull,
27
- 13716263814064014149ull,
28
- 3525492407291847033ull,
29
- 8607404175481815707ull,
30
- 9818874561736458749ull,
31
- 10026508429719773353ull,
32
- 3560712257386009938ull
33
- };
34
- size_t n = 1ull<<34, num_hashes = 2;
35
- std::vector<bool> seen(n);
36
-
37
- std::ios_base::sync_with_stdio(false);
38
-
39
- for (std::string line; std::getline(std::cin, line);) {
40
- bool b = true;
41
- for (size_t i = 0; i < num_hashes; i++) {
42
- uint64_t h = fnv1a_64((uint8_t*) line.data(), line.length(), init_values[i]) % n;
43
- b = b && seen[h];
44
- seen[h] = true;
45
- }
46
- if (!b) {
47
- std::cout << line << std::endl;
48
- }
49
- }
50
- return 0;
51
- }