fasttext 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (478) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/README.md +20 -1
  4. data/lib/fasttext.rb +3 -0
  5. data/lib/fasttext/classifier.rb +12 -4
  6. data/lib/fasttext/vectorizer.rb +1 -1
  7. data/lib/fasttext/version.rb +1 -1
  8. metadata +4 -473
  9. data/lib/fasttext/ext.bundle +0 -0
  10. data/vendor/fastText/CMakeLists.txt +0 -68
  11. data/vendor/fastText/CODE_OF_CONDUCT.md +0 -2
  12. data/vendor/fastText/CONTRIBUTING.md +0 -32
  13. data/vendor/fastText/MANIFEST.in +0 -5
  14. data/vendor/fastText/Makefile +0 -63
  15. data/vendor/fastText/alignment/README.md +0 -53
  16. data/vendor/fastText/alignment/align.py +0 -145
  17. data/vendor/fastText/alignment/eval.py +0 -60
  18. data/vendor/fastText/alignment/example.sh +0 -51
  19. data/vendor/fastText/alignment/unsup_align.py +0 -109
  20. data/vendor/fastText/alignment/utils.py +0 -154
  21. data/vendor/fastText/classification-example.sh +0 -41
  22. data/vendor/fastText/classification-results.sh +0 -94
  23. data/vendor/fastText/crawl/README.md +0 -26
  24. data/vendor/fastText/crawl/dedup.cc +0 -51
  25. data/vendor/fastText/crawl/download_crawl.sh +0 -57
  26. data/vendor/fastText/crawl/filter_dedup.sh +0 -13
  27. data/vendor/fastText/crawl/filter_utf8.cc +0 -105
  28. data/vendor/fastText/crawl/process_wet_file.sh +0 -30
  29. data/vendor/fastText/docs/aligned-vectors.md +0 -64
  30. data/vendor/fastText/docs/api.md +0 -6
  31. data/vendor/fastText/docs/cheatsheet.md +0 -66
  32. data/vendor/fastText/docs/crawl-vectors.md +0 -125
  33. data/vendor/fastText/docs/dataset.md +0 -6
  34. data/vendor/fastText/docs/english-vectors.md +0 -53
  35. data/vendor/fastText/docs/faqs.md +0 -63
  36. data/vendor/fastText/docs/language-identification.md +0 -47
  37. data/vendor/fastText/docs/options.md +0 -50
  38. data/vendor/fastText/docs/pretrained-vectors.md +0 -142
  39. data/vendor/fastText/docs/python-module.md +0 -314
  40. data/vendor/fastText/docs/references.md +0 -41
  41. data/vendor/fastText/docs/supervised-models.md +0 -54
  42. data/vendor/fastText/docs/supervised-tutorial.md +0 -349
  43. data/vendor/fastText/docs/support.md +0 -58
  44. data/vendor/fastText/docs/unsupervised-tutorials.md +0 -309
  45. data/vendor/fastText/eval.py +0 -95
  46. data/vendor/fastText/get-wikimedia.sh +0 -79
  47. data/vendor/fastText/python/README.md +0 -322
  48. data/vendor/fastText/python/README.rst +0 -406
  49. data/vendor/fastText/python/benchmarks/README.rst +0 -3
  50. data/vendor/fastText/python/benchmarks/get_word_vector.py +0 -49
  51. data/vendor/fastText/python/doc/examples/FastTextEmbeddingBag.py +0 -81
  52. data/vendor/fastText/python/doc/examples/bin_to_vec.py +0 -41
  53. data/vendor/fastText/python/doc/examples/compute_accuracy.py +0 -163
  54. data/vendor/fastText/python/doc/examples/get_vocab.py +0 -48
  55. data/vendor/fastText/python/doc/examples/train_supervised.py +0 -42
  56. data/vendor/fastText/python/doc/examples/train_unsupervised.py +0 -56
  57. data/vendor/fastText/python/fasttext_module/fasttext/FastText.py +0 -468
  58. data/vendor/fastText/python/fasttext_module/fasttext/__init__.py +0 -22
  59. data/vendor/fastText/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc +0 -388
  60. data/vendor/fastText/python/fasttext_module/fasttext/tests/__init__.py +0 -14
  61. data/vendor/fastText/python/fasttext_module/fasttext/tests/test_configurations.py +0 -239
  62. data/vendor/fastText/python/fasttext_module/fasttext/tests/test_script.py +0 -629
  63. data/vendor/fastText/python/fasttext_module/fasttext/util/__init__.py +0 -13
  64. data/vendor/fastText/python/fasttext_module/fasttext/util/util.py +0 -60
  65. data/vendor/fastText/quantization-example.sh +0 -40
  66. data/vendor/fastText/runtests.py +0 -60
  67. data/vendor/fastText/scripts/kbcompletion/README.md +0 -19
  68. data/vendor/fastText/scripts/kbcompletion/data.sh +0 -69
  69. data/vendor/fastText/scripts/kbcompletion/eval.cpp +0 -108
  70. data/vendor/fastText/scripts/kbcompletion/fb15k.sh +0 -49
  71. data/vendor/fastText/scripts/kbcompletion/fb15k237.sh +0 -45
  72. data/vendor/fastText/scripts/kbcompletion/svo.sh +0 -38
  73. data/vendor/fastText/scripts/kbcompletion/wn18.sh +0 -49
  74. data/vendor/fastText/scripts/quantization/quantization-results.sh +0 -43
  75. data/vendor/fastText/setup.cfg +0 -2
  76. data/vendor/fastText/setup.py +0 -203
  77. data/vendor/fastText/tests/fetch_test_data.sh +0 -202
  78. data/vendor/fastText/website/README.md +0 -6
  79. data/vendor/fastText/website/blog/2016-08-18-blog-post.md +0 -42
  80. data/vendor/fastText/website/blog/2017-05-02-blog-post.md +0 -60
  81. data/vendor/fastText/website/blog/2017-10-02-blog-post.md +0 -90
  82. data/vendor/fastText/website/blog/2019-06-25-blog-post.md +0 -168
  83. data/vendor/fastText/website/core/Footer.js +0 -127
  84. data/vendor/fastText/website/package.json +0 -12
  85. data/vendor/fastText/website/pages/en/index.js +0 -286
  86. data/vendor/fastText/website/sidebars.json +0 -18
  87. data/vendor/fastText/website/siteConfig.js +0 -102
  88. data/vendor/fastText/website/static/docs/en/html/annotated.html +0 -115
  89. data/vendor/fastText/website/static/docs/en/html/annotated_dup.js +0 -4
  90. data/vendor/fastText/website/static/docs/en/html/args_8cc.html +0 -113
  91. data/vendor/fastText/website/static/docs/en/html/args_8h.html +0 -134
  92. data/vendor/fastText/website/static/docs/en/html/args_8h.js +0 -14
  93. data/vendor/fastText/website/static/docs/en/html/args_8h_source.html +0 -139
  94. data/vendor/fastText/website/static/docs/en/html/bc_s.png +0 -0
  95. data/vendor/fastText/website/static/docs/en/html/bdwn.png +0 -0
  96. data/vendor/fastText/website/static/docs/en/html/classes.html +0 -121
  97. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args-members.html +0 -140
  98. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args.html +0 -753
  99. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args.js +0 -40
  100. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary-members.html +0 -148
  101. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary.html +0 -1266
  102. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary.js +0 -43
  103. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText-members.html +0 -145
  104. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText.html +0 -1149
  105. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText.js +0 -45
  106. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix-members.html +0 -123
  107. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix.html +0 -610
  108. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix.js +0 -23
  109. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model-members.html +0 -150
  110. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model.html +0 -1400
  111. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model.js +0 -48
  112. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer-members.html +0 -131
  113. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.html +0 -950
  114. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.js +0 -31
  115. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix-members.html +0 -122
  116. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix.html +0 -565
  117. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix.js +0 -22
  118. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector-members.html +0 -121
  119. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector.html +0 -542
  120. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector.js +0 -21
  121. data/vendor/fastText/website/static/docs/en/html/closed.png +0 -0
  122. data/vendor/fastText/website/static/docs/en/html/dictionary_8cc.html +0 -116
  123. data/vendor/fastText/website/static/docs/en/html/dictionary_8h.html +0 -142
  124. data/vendor/fastText/website/static/docs/en/html/dictionary_8h.js +0 -10
  125. data/vendor/fastText/website/static/docs/en/html/dictionary_8h_source.html +0 -127
  126. data/vendor/fastText/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html +0 -145
  127. data/vendor/fastText/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.js +0 -29
  128. data/vendor/fastText/website/static/docs/en/html/doc.png +0 -0
  129. data/vendor/fastText/website/static/docs/en/html/doxygen.css +0 -1596
  130. data/vendor/fastText/website/static/docs/en/html/doxygen.png +0 -0
  131. data/vendor/fastText/website/static/docs/en/html/dynsections.js +0 -97
  132. data/vendor/fastText/website/static/docs/en/html/fasttext_8cc.html +0 -119
  133. data/vendor/fastText/website/static/docs/en/html/fasttext_8h.html +0 -168
  134. data/vendor/fastText/website/static/docs/en/html/fasttext_8h.js +0 -6
  135. data/vendor/fastText/website/static/docs/en/html/fasttext_8h_source.html +0 -155
  136. data/vendor/fastText/website/static/docs/en/html/favicon.png +0 -0
  137. data/vendor/fastText/website/static/docs/en/html/files.html +0 -125
  138. data/vendor/fastText/website/static/docs/en/html/files.js +0 -4
  139. data/vendor/fastText/website/static/docs/en/html/folderclosed.png +0 -0
  140. data/vendor/fastText/website/static/docs/en/html/folderopen.png +0 -0
  141. data/vendor/fastText/website/static/docs/en/html/functions.html +0 -139
  142. data/vendor/fastText/website/static/docs/en/html/functions_0x7e.html +0 -112
  143. data/vendor/fastText/website/static/docs/en/html/functions_b.html +0 -115
  144. data/vendor/fastText/website/static/docs/en/html/functions_c.html +0 -143
  145. data/vendor/fastText/website/static/docs/en/html/functions_d.html +0 -135
  146. data/vendor/fastText/website/static/docs/en/html/functions_dup.js +0 -27
  147. data/vendor/fastText/website/static/docs/en/html/functions_e.html +0 -115
  148. data/vendor/fastText/website/static/docs/en/html/functions_f.html +0 -112
  149. data/vendor/fastText/website/static/docs/en/html/functions_func.html +0 -563
  150. data/vendor/fastText/website/static/docs/en/html/functions_g.html +0 -145
  151. data/vendor/fastText/website/static/docs/en/html/functions_h.html +0 -112
  152. data/vendor/fastText/website/static/docs/en/html/functions_i.html +0 -121
  153. data/vendor/fastText/website/static/docs/en/html/functions_k.html +0 -106
  154. data/vendor/fastText/website/static/docs/en/html/functions_l.html +0 -140
  155. data/vendor/fastText/website/static/docs/en/html/functions_m.html +0 -153
  156. data/vendor/fastText/website/static/docs/en/html/functions_n.html +0 -164
  157. data/vendor/fastText/website/static/docs/en/html/functions_o.html +0 -116
  158. data/vendor/fastText/website/static/docs/en/html/functions_p.html +0 -161
  159. data/vendor/fastText/website/static/docs/en/html/functions_q.html +0 -135
  160. data/vendor/fastText/website/static/docs/en/html/functions_r.html +0 -116
  161. data/vendor/fastText/website/static/docs/en/html/functions_s.html +0 -159
  162. data/vendor/fastText/website/static/docs/en/html/functions_t.html +0 -138
  163. data/vendor/fastText/website/static/docs/en/html/functions_u.html +0 -106
  164. data/vendor/fastText/website/static/docs/en/html/functions_v.html +0 -106
  165. data/vendor/fastText/website/static/docs/en/html/functions_vars.html +0 -486
  166. data/vendor/fastText/website/static/docs/en/html/functions_w.html +0 -124
  167. data/vendor/fastText/website/static/docs/en/html/functions_z.html +0 -104
  168. data/vendor/fastText/website/static/docs/en/html/globals.html +0 -170
  169. data/vendor/fastText/website/static/docs/en/html/globals_defs.html +0 -113
  170. data/vendor/fastText/website/static/docs/en/html/globals_func.html +0 -155
  171. data/vendor/fastText/website/static/docs/en/html/index.html +0 -100
  172. data/vendor/fastText/website/static/docs/en/html/jquery.js +0 -87
  173. data/vendor/fastText/website/static/docs/en/html/main_8cc.html +0 -582
  174. data/vendor/fastText/website/static/docs/en/html/main_8cc.js +0 -22
  175. data/vendor/fastText/website/static/docs/en/html/matrix_8cc.html +0 -114
  176. data/vendor/fastText/website/static/docs/en/html/matrix_8h.html +0 -121
  177. data/vendor/fastText/website/static/docs/en/html/matrix_8h_source.html +0 -123
  178. data/vendor/fastText/website/static/docs/en/html/menu.js +0 -26
  179. data/vendor/fastText/website/static/docs/en/html/menudata.js +0 -90
  180. data/vendor/fastText/website/static/docs/en/html/model_8cc.html +0 -113
  181. data/vendor/fastText/website/static/docs/en/html/model_8h.html +0 -183
  182. data/vendor/fastText/website/static/docs/en/html/model_8h.js +0 -8
  183. data/vendor/fastText/website/static/docs/en/html/model_8h_source.html +0 -139
  184. data/vendor/fastText/website/static/docs/en/html/namespacefasttext.html +0 -343
  185. data/vendor/fastText/website/static/docs/en/html/namespacefasttext.js +0 -13
  186. data/vendor/fastText/website/static/docs/en/html/namespacefasttext_1_1utils.html +0 -158
  187. data/vendor/fastText/website/static/docs/en/html/namespacemembers.html +0 -125
  188. data/vendor/fastText/website/static/docs/en/html/namespacemembers_enum.html +0 -107
  189. data/vendor/fastText/website/static/docs/en/html/namespacemembers_func.html +0 -110
  190. data/vendor/fastText/website/static/docs/en/html/namespacemembers_type.html +0 -104
  191. data/vendor/fastText/website/static/docs/en/html/namespaces.html +0 -106
  192. data/vendor/fastText/website/static/docs/en/html/namespaces.js +0 -4
  193. data/vendor/fastText/website/static/docs/en/html/nav_f.png +0 -0
  194. data/vendor/fastText/website/static/docs/en/html/nav_g.png +0 -0
  195. data/vendor/fastText/website/static/docs/en/html/nav_h.png +0 -0
  196. data/vendor/fastText/website/static/docs/en/html/navtree.css +0 -146
  197. data/vendor/fastText/website/static/docs/en/html/navtree.js +0 -517
  198. data/vendor/fastText/website/static/docs/en/html/navtreedata.js +0 -40
  199. data/vendor/fastText/website/static/docs/en/html/navtreeindex0.js +0 -253
  200. data/vendor/fastText/website/static/docs/en/html/navtreeindex1.js +0 -139
  201. data/vendor/fastText/website/static/docs/en/html/open.png +0 -0
  202. data/vendor/fastText/website/static/docs/en/html/productquantizer_8cc.html +0 -118
  203. data/vendor/fastText/website/static/docs/en/html/productquantizer_8cc.js +0 -4
  204. data/vendor/fastText/website/static/docs/en/html/productquantizer_8h.html +0 -124
  205. data/vendor/fastText/website/static/docs/en/html/productquantizer_8h_source.html +0 -133
  206. data/vendor/fastText/website/static/docs/en/html/qmatrix_8cc.html +0 -112
  207. data/vendor/fastText/website/static/docs/en/html/qmatrix_8h.html +0 -126
  208. data/vendor/fastText/website/static/docs/en/html/qmatrix_8h_source.html +0 -128
  209. data/vendor/fastText/website/static/docs/en/html/real_8h.html +0 -117
  210. data/vendor/fastText/website/static/docs/en/html/real_8h.js +0 -4
  211. data/vendor/fastText/website/static/docs/en/html/real_8h_source.html +0 -103
  212. data/vendor/fastText/website/static/docs/en/html/resize.js +0 -114
  213. data/vendor/fastText/website/static/docs/en/html/search/all_0.html +0 -26
  214. data/vendor/fastText/website/static/docs/en/html/search/all_0.js +0 -17
  215. data/vendor/fastText/website/static/docs/en/html/search/all_1.html +0 -26
  216. data/vendor/fastText/website/static/docs/en/html/search/all_1.js +0 -8
  217. data/vendor/fastText/website/static/docs/en/html/search/all_10.html +0 -26
  218. data/vendor/fastText/website/static/docs/en/html/search/all_10.js +0 -10
  219. data/vendor/fastText/website/static/docs/en/html/search/all_11.html +0 -26
  220. data/vendor/fastText/website/static/docs/en/html/search/all_11.js +0 -25
  221. data/vendor/fastText/website/static/docs/en/html/search/all_12.html +0 -26
  222. data/vendor/fastText/website/static/docs/en/html/search/all_12.js +0 -15
  223. data/vendor/fastText/website/static/docs/en/html/search/all_13.html +0 -26
  224. data/vendor/fastText/website/static/docs/en/html/search/all_13.js +0 -7
  225. data/vendor/fastText/website/static/docs/en/html/search/all_14.html +0 -26
  226. data/vendor/fastText/website/static/docs/en/html/search/all_14.js +0 -7
  227. data/vendor/fastText/website/static/docs/en/html/search/all_15.html +0 -26
  228. data/vendor/fastText/website/static/docs/en/html/search/all_15.js +0 -11
  229. data/vendor/fastText/website/static/docs/en/html/search/all_16.html +0 -26
  230. data/vendor/fastText/website/static/docs/en/html/search/all_16.js +0 -4
  231. data/vendor/fastText/website/static/docs/en/html/search/all_17.html +0 -26
  232. data/vendor/fastText/website/static/docs/en/html/search/all_17.js +0 -7
  233. data/vendor/fastText/website/static/docs/en/html/search/all_2.html +0 -26
  234. data/vendor/fastText/website/static/docs/en/html/search/all_2.js +0 -17
  235. data/vendor/fastText/website/static/docs/en/html/search/all_3.html +0 -26
  236. data/vendor/fastText/website/static/docs/en/html/search/all_3.js +0 -17
  237. data/vendor/fastText/website/static/docs/en/html/search/all_4.html +0 -26
  238. data/vendor/fastText/website/static/docs/en/html/search/all_4.js +0 -10
  239. data/vendor/fastText/website/static/docs/en/html/search/all_5.html +0 -26
  240. data/vendor/fastText/website/static/docs/en/html/search/all_5.js +0 -12
  241. data/vendor/fastText/website/static/docs/en/html/search/all_6.html +0 -26
  242. data/vendor/fastText/website/static/docs/en/html/search/all_6.js +0 -18
  243. data/vendor/fastText/website/static/docs/en/html/search/all_7.html +0 -26
  244. data/vendor/fastText/website/static/docs/en/html/search/all_7.js +0 -8
  245. data/vendor/fastText/website/static/docs/en/html/search/all_8.html +0 -26
  246. data/vendor/fastText/website/static/docs/en/html/search/all_8.js +0 -11
  247. data/vendor/fastText/website/static/docs/en/html/search/all_9.html +0 -26
  248. data/vendor/fastText/website/static/docs/en/html/search/all_9.js +0 -5
  249. data/vendor/fastText/website/static/docs/en/html/search/all_a.html +0 -26
  250. data/vendor/fastText/website/static/docs/en/html/search/all_a.js +0 -17
  251. data/vendor/fastText/website/static/docs/en/html/search/all_b.html +0 -26
  252. data/vendor/fastText/website/static/docs/en/html/search/all_b.js +0 -27
  253. data/vendor/fastText/website/static/docs/en/html/search/all_c.html +0 -26
  254. data/vendor/fastText/website/static/docs/en/html/search/all_c.js +0 -26
  255. data/vendor/fastText/website/static/docs/en/html/search/all_d.html +0 -26
  256. data/vendor/fastText/website/static/docs/en/html/search/all_d.js +0 -9
  257. data/vendor/fastText/website/static/docs/en/html/search/all_e.html +0 -26
  258. data/vendor/fastText/website/static/docs/en/html/search/all_e.js +0 -35
  259. data/vendor/fastText/website/static/docs/en/html/search/all_f.html +0 -26
  260. data/vendor/fastText/website/static/docs/en/html/search/all_f.js +0 -16
  261. data/vendor/fastText/website/static/docs/en/html/search/classes_0.html +0 -26
  262. data/vendor/fastText/website/static/docs/en/html/search/classes_0.js +0 -4
  263. data/vendor/fastText/website/static/docs/en/html/search/classes_1.html +0 -26
  264. data/vendor/fastText/website/static/docs/en/html/search/classes_1.js +0 -4
  265. data/vendor/fastText/website/static/docs/en/html/search/classes_2.html +0 -26
  266. data/vendor/fastText/website/static/docs/en/html/search/classes_2.js +0 -4
  267. data/vendor/fastText/website/static/docs/en/html/search/classes_3.html +0 -26
  268. data/vendor/fastText/website/static/docs/en/html/search/classes_3.js +0 -4
  269. data/vendor/fastText/website/static/docs/en/html/search/classes_4.html +0 -26
  270. data/vendor/fastText/website/static/docs/en/html/search/classes_4.js +0 -5
  271. data/vendor/fastText/website/static/docs/en/html/search/classes_5.html +0 -26
  272. data/vendor/fastText/website/static/docs/en/html/search/classes_5.js +0 -4
  273. data/vendor/fastText/website/static/docs/en/html/search/classes_6.html +0 -26
  274. data/vendor/fastText/website/static/docs/en/html/search/classes_6.js +0 -4
  275. data/vendor/fastText/website/static/docs/en/html/search/classes_7.html +0 -26
  276. data/vendor/fastText/website/static/docs/en/html/search/classes_7.js +0 -4
  277. data/vendor/fastText/website/static/docs/en/html/search/classes_8.html +0 -26
  278. data/vendor/fastText/website/static/docs/en/html/search/classes_8.js +0 -4
  279. data/vendor/fastText/website/static/docs/en/html/search/close.png +0 -0
  280. data/vendor/fastText/website/static/docs/en/html/search/defines_0.html +0 -26
  281. data/vendor/fastText/website/static/docs/en/html/search/defines_0.js +0 -5
  282. data/vendor/fastText/website/static/docs/en/html/search/defines_1.html +0 -26
  283. data/vendor/fastText/website/static/docs/en/html/search/defines_1.js +0 -4
  284. data/vendor/fastText/website/static/docs/en/html/search/defines_2.html +0 -26
  285. data/vendor/fastText/website/static/docs/en/html/search/defines_2.js +0 -4
  286. data/vendor/fastText/website/static/docs/en/html/search/defines_3.html +0 -26
  287. data/vendor/fastText/website/static/docs/en/html/search/defines_3.js +0 -4
  288. data/vendor/fastText/website/static/docs/en/html/search/enums_0.html +0 -26
  289. data/vendor/fastText/website/static/docs/en/html/search/enums_0.js +0 -4
  290. data/vendor/fastText/website/static/docs/en/html/search/enums_1.html +0 -26
  291. data/vendor/fastText/website/static/docs/en/html/search/enums_1.js +0 -4
  292. data/vendor/fastText/website/static/docs/en/html/search/enums_2.html +0 -26
  293. data/vendor/fastText/website/static/docs/en/html/search/enums_2.js +0 -4
  294. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_0.html +0 -26
  295. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_0.js +0 -4
  296. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_1.html +0 -26
  297. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_1.js +0 -4
  298. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_2.html +0 -26
  299. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_2.js +0 -4
  300. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_3.html +0 -26
  301. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_3.js +0 -4
  302. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_4.html +0 -26
  303. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_4.js +0 -6
  304. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_5.html +0 -26
  305. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_5.js +0 -4
  306. data/vendor/fastText/website/static/docs/en/html/search/files_0.html +0 -26
  307. data/vendor/fastText/website/static/docs/en/html/search/files_0.js +0 -5
  308. data/vendor/fastText/website/static/docs/en/html/search/files_1.html +0 -26
  309. data/vendor/fastText/website/static/docs/en/html/search/files_1.js +0 -5
  310. data/vendor/fastText/website/static/docs/en/html/search/files_2.html +0 -26
  311. data/vendor/fastText/website/static/docs/en/html/search/files_2.js +0 -5
  312. data/vendor/fastText/website/static/docs/en/html/search/files_3.html +0 -26
  313. data/vendor/fastText/website/static/docs/en/html/search/files_3.js +0 -8
  314. data/vendor/fastText/website/static/docs/en/html/search/files_4.html +0 -26
  315. data/vendor/fastText/website/static/docs/en/html/search/files_4.js +0 -5
  316. data/vendor/fastText/website/static/docs/en/html/search/files_5.html +0 -26
  317. data/vendor/fastText/website/static/docs/en/html/search/files_5.js +0 -5
  318. data/vendor/fastText/website/static/docs/en/html/search/files_6.html +0 -26
  319. data/vendor/fastText/website/static/docs/en/html/search/files_6.js +0 -4
  320. data/vendor/fastText/website/static/docs/en/html/search/files_7.html +0 -26
  321. data/vendor/fastText/website/static/docs/en/html/search/files_7.js +0 -5
  322. data/vendor/fastText/website/static/docs/en/html/search/files_8.html +0 -26
  323. data/vendor/fastText/website/static/docs/en/html/search/files_8.js +0 -5
  324. data/vendor/fastText/website/static/docs/en/html/search/functions_0.html +0 -26
  325. data/vendor/fastText/website/static/docs/en/html/search/functions_0.js +0 -14
  326. data/vendor/fastText/website/static/docs/en/html/search/functions_1.html +0 -26
  327. data/vendor/fastText/website/static/docs/en/html/search/functions_1.js +0 -5
  328. data/vendor/fastText/website/static/docs/en/html/search/functions_10.html +0 -26
  329. data/vendor/fastText/website/static/docs/en/html/search/functions_10.js +0 -5
  330. data/vendor/fastText/website/static/docs/en/html/search/functions_11.html +0 -26
  331. data/vendor/fastText/website/static/docs/en/html/search/functions_11.js +0 -18
  332. data/vendor/fastText/website/static/docs/en/html/search/functions_12.html +0 -26
  333. data/vendor/fastText/website/static/docs/en/html/search/functions_12.js +0 -8
  334. data/vendor/fastText/website/static/docs/en/html/search/functions_13.html +0 -26
  335. data/vendor/fastText/website/static/docs/en/html/search/functions_13.js +0 -5
  336. data/vendor/fastText/website/static/docs/en/html/search/functions_14.html +0 -26
  337. data/vendor/fastText/website/static/docs/en/html/search/functions_14.js +0 -4
  338. data/vendor/fastText/website/static/docs/en/html/search/functions_15.html +0 -26
  339. data/vendor/fastText/website/static/docs/en/html/search/functions_15.js +0 -4
  340. data/vendor/fastText/website/static/docs/en/html/search/functions_16.html +0 -26
  341. data/vendor/fastText/website/static/docs/en/html/search/functions_16.js +0 -4
  342. data/vendor/fastText/website/static/docs/en/html/search/functions_17.html +0 -26
  343. data/vendor/fastText/website/static/docs/en/html/search/functions_17.js +0 -7
  344. data/vendor/fastText/website/static/docs/en/html/search/functions_2.html +0 -26
  345. data/vendor/fastText/website/static/docs/en/html/search/functions_2.js +0 -11
  346. data/vendor/fastText/website/static/docs/en/html/search/functions_3.html +0 -26
  347. data/vendor/fastText/website/static/docs/en/html/search/functions_3.js +0 -9
  348. data/vendor/fastText/website/static/docs/en/html/search/functions_4.html +0 -26
  349. data/vendor/fastText/website/static/docs/en/html/search/functions_4.js +0 -4
  350. data/vendor/fastText/website/static/docs/en/html/search/functions_5.html +0 -26
  351. data/vendor/fastText/website/static/docs/en/html/search/functions_5.js +0 -7
  352. data/vendor/fastText/website/static/docs/en/html/search/functions_6.html +0 -26
  353. data/vendor/fastText/website/static/docs/en/html/search/functions_6.js +0 -17
  354. data/vendor/fastText/website/static/docs/en/html/search/functions_7.html +0 -26
  355. data/vendor/fastText/website/static/docs/en/html/search/functions_7.js +0 -5
  356. data/vendor/fastText/website/static/docs/en/html/search/functions_8.html +0 -26
  357. data/vendor/fastText/website/static/docs/en/html/search/functions_8.js +0 -8
  358. data/vendor/fastText/website/static/docs/en/html/search/functions_9.html +0 -26
  359. data/vendor/fastText/website/static/docs/en/html/search/functions_9.js +0 -4
  360. data/vendor/fastText/website/static/docs/en/html/search/functions_a.html +0 -26
  361. data/vendor/fastText/website/static/docs/en/html/search/functions_a.js +0 -8
  362. data/vendor/fastText/website/static/docs/en/html/search/functions_b.html +0 -26
  363. data/vendor/fastText/website/static/docs/en/html/search/functions_b.js +0 -10
  364. data/vendor/fastText/website/static/docs/en/html/search/functions_c.html +0 -26
  365. data/vendor/fastText/website/static/docs/en/html/search/functions_c.js +0 -10
  366. data/vendor/fastText/website/static/docs/en/html/search/functions_d.html +0 -26
  367. data/vendor/fastText/website/static/docs/en/html/search/functions_d.js +0 -6
  368. data/vendor/fastText/website/static/docs/en/html/search/functions_e.html +0 -26
  369. data/vendor/fastText/website/static/docs/en/html/search/functions_e.js +0 -26
  370. data/vendor/fastText/website/static/docs/en/html/search/functions_f.html +0 -26
  371. data/vendor/fastText/website/static/docs/en/html/search/functions_f.js +0 -6
  372. data/vendor/fastText/website/static/docs/en/html/search/mag_sel.png +0 -0
  373. data/vendor/fastText/website/static/docs/en/html/search/namespaces_0.html +0 -26
  374. data/vendor/fastText/website/static/docs/en/html/search/namespaces_0.js +0 -5
  375. data/vendor/fastText/website/static/docs/en/html/search/nomatches.html +0 -12
  376. data/vendor/fastText/website/static/docs/en/html/search/search.css +0 -271
  377. data/vendor/fastText/website/static/docs/en/html/search/search.js +0 -791
  378. data/vendor/fastText/website/static/docs/en/html/search/search_l.png +0 -0
  379. data/vendor/fastText/website/static/docs/en/html/search/search_m.png +0 -0
  380. data/vendor/fastText/website/static/docs/en/html/search/search_r.png +0 -0
  381. data/vendor/fastText/website/static/docs/en/html/search/searchdata.js +0 -42
  382. data/vendor/fastText/website/static/docs/en/html/search/typedefs_0.html +0 -26
  383. data/vendor/fastText/website/static/docs/en/html/search/typedefs_0.js +0 -4
  384. data/vendor/fastText/website/static/docs/en/html/search/typedefs_1.html +0 -26
  385. data/vendor/fastText/website/static/docs/en/html/search/typedefs_1.js +0 -4
  386. data/vendor/fastText/website/static/docs/en/html/search/variables_0.html +0 -26
  387. data/vendor/fastText/website/static/docs/en/html/search/variables_0.js +0 -4
  388. data/vendor/fastText/website/static/docs/en/html/search/variables_1.html +0 -26
  389. data/vendor/fastText/website/static/docs/en/html/search/variables_1.js +0 -6
  390. data/vendor/fastText/website/static/docs/en/html/search/variables_10.html +0 -26
  391. data/vendor/fastText/website/static/docs/en/html/search/variables_10.js +0 -8
  392. data/vendor/fastText/website/static/docs/en/html/search/variables_11.html +0 -26
  393. data/vendor/fastText/website/static/docs/en/html/search/variables_11.js +0 -11
  394. data/vendor/fastText/website/static/docs/en/html/search/variables_12.html +0 -26
  395. data/vendor/fastText/website/static/docs/en/html/search/variables_12.js +0 -4
  396. data/vendor/fastText/website/static/docs/en/html/search/variables_13.html +0 -26
  397. data/vendor/fastText/website/static/docs/en/html/search/variables_13.js +0 -10
  398. data/vendor/fastText/website/static/docs/en/html/search/variables_2.html +0 -26
  399. data/vendor/fastText/website/static/docs/en/html/search/variables_2.js +0 -9
  400. data/vendor/fastText/website/static/docs/en/html/search/variables_3.html +0 -26
  401. data/vendor/fastText/website/static/docs/en/html/search/variables_3.js +0 -9
  402. data/vendor/fastText/website/static/docs/en/html/search/variables_4.html +0 -26
  403. data/vendor/fastText/website/static/docs/en/html/search/variables_4.js +0 -7
  404. data/vendor/fastText/website/static/docs/en/html/search/variables_5.html +0 -26
  405. data/vendor/fastText/website/static/docs/en/html/search/variables_5.js +0 -4
  406. data/vendor/fastText/website/static/docs/en/html/search/variables_6.html +0 -26
  407. data/vendor/fastText/website/static/docs/en/html/search/variables_6.js +0 -5
  408. data/vendor/fastText/website/static/docs/en/html/search/variables_7.html +0 -26
  409. data/vendor/fastText/website/static/docs/en/html/search/variables_7.js +0 -5
  410. data/vendor/fastText/website/static/docs/en/html/search/variables_8.html +0 -26
  411. data/vendor/fastText/website/static/docs/en/html/search/variables_8.js +0 -4
  412. data/vendor/fastText/website/static/docs/en/html/search/variables_9.html +0 -26
  413. data/vendor/fastText/website/static/docs/en/html/search/variables_9.js +0 -10
  414. data/vendor/fastText/website/static/docs/en/html/search/variables_a.html +0 -26
  415. data/vendor/fastText/website/static/docs/en/html/search/variables_a.js +0 -14
  416. data/vendor/fastText/website/static/docs/en/html/search/variables_b.html +0 -26
  417. data/vendor/fastText/website/static/docs/en/html/search/variables_b.js +0 -17
  418. data/vendor/fastText/website/static/docs/en/html/search/variables_c.html +0 -26
  419. data/vendor/fastText/website/static/docs/en/html/search/variables_c.js +0 -6
  420. data/vendor/fastText/website/static/docs/en/html/search/variables_d.html +0 -26
  421. data/vendor/fastText/website/static/docs/en/html/search/variables_d.js +0 -10
  422. data/vendor/fastText/website/static/docs/en/html/search/variables_e.html +0 -26
  423. data/vendor/fastText/website/static/docs/en/html/search/variables_e.js +0 -11
  424. data/vendor/fastText/website/static/docs/en/html/search/variables_f.html +0 -26
  425. data/vendor/fastText/website/static/docs/en/html/search/variables_f.js +0 -6
  426. data/vendor/fastText/website/static/docs/en/html/splitbar.png +0 -0
  427. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node-members.html +0 -108
  428. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node.html +0 -194
  429. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node.js +0 -8
  430. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry-members.html +0 -107
  431. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry.html +0 -178
  432. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry.js +0 -7
  433. data/vendor/fastText/website/static/docs/en/html/sync_off.png +0 -0
  434. data/vendor/fastText/website/static/docs/en/html/sync_on.png +0 -0
  435. data/vendor/fastText/website/static/docs/en/html/tab_a.png +0 -0
  436. data/vendor/fastText/website/static/docs/en/html/tab_b.png +0 -0
  437. data/vendor/fastText/website/static/docs/en/html/tab_h.png +0 -0
  438. data/vendor/fastText/website/static/docs/en/html/tab_s.png +0 -0
  439. data/vendor/fastText/website/static/docs/en/html/tabs.css +0 -1
  440. data/vendor/fastText/website/static/docs/en/html/utils_8cc.html +0 -121
  441. data/vendor/fastText/website/static/docs/en/html/utils_8cc.js +0 -5
  442. data/vendor/fastText/website/static/docs/en/html/utils_8h.html +0 -122
  443. data/vendor/fastText/website/static/docs/en/html/utils_8h.js +0 -5
  444. data/vendor/fastText/website/static/docs/en/html/utils_8h_source.html +0 -104
  445. data/vendor/fastText/website/static/docs/en/html/vector_8cc.html +0 -121
  446. data/vendor/fastText/website/static/docs/en/html/vector_8cc.js +0 -4
  447. data/vendor/fastText/website/static/docs/en/html/vector_8h.html +0 -126
  448. data/vendor/fastText/website/static/docs/en/html/vector_8h.js +0 -5
  449. data/vendor/fastText/website/static/docs/en/html/vector_8h_source.html +0 -120
  450. data/vendor/fastText/website/static/fasttext.css +0 -48
  451. data/vendor/fastText/website/static/img/authors/armand_joulin.jpg +0 -0
  452. data/vendor/fastText/website/static/img/authors/christian_puhrsch.png +0 -0
  453. data/vendor/fastText/website/static/img/authors/edouard_grave.jpeg +0 -0
  454. data/vendor/fastText/website/static/img/authors/piotr_bojanowski.jpg +0 -0
  455. data/vendor/fastText/website/static/img/authors/tomas_mikolov.jpg +0 -0
  456. data/vendor/fastText/website/static/img/blog/2016-08-18-blog-post-img1.png +0 -0
  457. data/vendor/fastText/website/static/img/blog/2016-08-18-blog-post-img2.png +0 -0
  458. data/vendor/fastText/website/static/img/blog/2017-05-02-blog-post-img1.jpg +0 -0
  459. data/vendor/fastText/website/static/img/blog/2017-05-02-blog-post-img2.jpg +0 -0
  460. data/vendor/fastText/website/static/img/blog/2017-10-02-blog-post-img1.png +0 -0
  461. data/vendor/fastText/website/static/img/cbo_vs_skipgram.png +0 -0
  462. data/vendor/fastText/website/static/img/fasttext-icon-api.png +0 -0
  463. data/vendor/fastText/website/static/img/fasttext-icon-bg-web.png +0 -0
  464. data/vendor/fastText/website/static/img/fasttext-icon-color-square.png +0 -0
  465. data/vendor/fastText/website/static/img/fasttext-icon-color-web.png +0 -0
  466. data/vendor/fastText/website/static/img/fasttext-icon-faq.png +0 -0
  467. data/vendor/fastText/website/static/img/fasttext-icon-tutorial.png +0 -0
  468. data/vendor/fastText/website/static/img/fasttext-icon-white-web.png +0 -0
  469. data/vendor/fastText/website/static/img/fasttext-logo-color-web.png +0 -0
  470. data/vendor/fastText/website/static/img/fasttext-logo-white-web.png +0 -0
  471. data/vendor/fastText/website/static/img/logo-color.png +0 -0
  472. data/vendor/fastText/website/static/img/model-black.png +0 -0
  473. data/vendor/fastText/website/static/img/model-blue.png +0 -0
  474. data/vendor/fastText/website/static/img/model-red.png +0 -0
  475. data/vendor/fastText/website/static/img/ogimage.png +0 -0
  476. data/vendor/fastText/website/static/img/oss_logo.png +0 -0
  477. data/vendor/fastText/wikifil.pl +0 -57
  478. data/vendor/fastText/word-vector-example.sh +0 -39
@@ -1,109 +0,0 @@
1
- #!/usr/bin/env python3
2
- # Copyright (c) 2018-present, Facebook, Inc.
3
- # All rights reserved.
4
- #
5
- # This source code is licensed under the MIT license found in the
6
- # LICENSE file in the root directory of this source tree.
7
-
8
- import codecs, sys, time, math, argparse, ot
9
- import numpy as np
10
- from utils import *
11
-
12
# Command-line interface. Every option is declared in one table so that the
# flag, its type, its default and its help text can be read side by side.
parser = argparse.ArgumentParser(description='Wasserstein Procrustes for Embedding Alignment')
_OPTIONS = (
    ('--model_src', dict(type=str, help='Path to source word embeddings')),
    ('--model_tgt', dict(type=str, help='Path to target word embeddings')),
    ('--lexicon', dict(type=str, help='Path to the evaluation lexicon')),
    ('--output_src', dict(default='', type=str, help='Path to save the aligned source embeddings')),
    ('--output_tgt', dict(default='', type=str, help='Path to save the aligned target embeddings')),
    ('--seed', dict(default=1111, type=int, help='Random number generator seed')),
    ('--nepoch', dict(default=5, type=int, help='Number of epochs')),
    ('--niter', dict(default=5000, type=int, help='Initial number of iterations')),
    ('--bsz', dict(default=500, type=int, help='Initial batch size')),
    ('--lr', dict(default=500., type=float, help='Learning rate')),
    ('--nmax', dict(default=20000, type=int, help='Vocabulary size for learning the alignment')),
    ('--reg', dict(default=0.05, type=float, help='Regularization parameter for sinkhorn')),
)
for _flag, _settings in _OPTIONS:
    parser.add_argument(_flag, **_settings)
args = parser.parse_args()
26
-
27
-
28
def objective(X, Y, R, n=5000):
    """Score the mapping ``R`` on the leading rows of ``X`` and ``Y``.

    The first ``n`` rows of each matrix are coupled with ``ot.sinkhorn``
    (uniform all-ones marginals, regularisation 0.025) using the cost
    ``-(X R) Y^T``; the score is the norm of ``X R - P Y`` scaled by
    ``1000 / n``.

    Args:
        X: source embeddings, one row per word.
        Y: target embeddings, one row per word.
        R: mapping applied to the rows of ``X`` as ``X.dot(R)``.
        n: maximum number of leading rows to evaluate on.

    Returns:
        The scaled residual norm as a float.
    """
    # Clamp to the rows that actually exist: X[:n] / Y[:n] silently return
    # fewer rows for small inputs, and the length-n marginals would then no
    # longer match the cost matrix.
    n = min(n, len(X), len(Y))
    Xn, Yn = X[:n], Y[:n]
    # The mapped source rows are needed for both the cost and the residual.
    mapped = np.dot(Xn, R)
    C = -np.dot(mapped, Yn.T)
    P = ot.sinkhorn(np.ones(n), np.ones(n), C, 0.025, stopThr=1e-3)
    return 1000 * np.linalg.norm(mapped - np.dot(P, Yn)) / n
33
-
34
-
35
def sqrt_eig(x):
    """Rebuild ``x`` from its thin SVD with each singular value square-rooted.

    Returns ``U . diag(sqrt(s)) . VT`` where ``U, s, VT`` is the reduced
    singular value decomposition of ``x``.
    """
    left, singular_values, right_t = np.linalg.svd(x, full_matrices=False)
    root_scale = np.diag(np.sqrt(singular_values))
    return np.dot(left, np.dot(root_scale, right_t))
38
-
39
-
40
def align(X, Y, R, lr=10., bsz=200, nepoch=5, niter=1000,
          nmax=10000, reg=0.05, verbose=True):
    """Refine the mapping ``R`` with mini-batch transport steps.

    Every step draws ``bsz`` rows from the first ``nmax`` rows of ``X`` and,
    independently, of ``Y``; couples them with ``ot.sinkhorn``; moves ``R``
    along ``x^T P y``; and replaces ``R`` by ``U . VT`` from its SVD. After
    each epoch the batch size doubles and the step count is divided by four.

    Note: the very first update is applied in place (``R -= ...``), so the
    array passed in is modified; pass a copy to keep the original.

    Returns:
        The final mapping matrix.
    """
    for epoch in range(1, nepoch + 1):
        for _ in range(niter):
            # independent random batches from the nmax leading rows
            src_rows = np.random.permutation(nmax)[:bsz]
            xt = X[src_rows, :]
            tgt_rows = np.random.permutation(nmax)[:bsz]
            yt = Y[tgt_rows, :]
            # transport plan between the two batches
            cost = -np.dot(np.dot(xt, R), yt.T)
            plan = ot.sinkhorn(np.ones(bsz), np.ones(bsz), cost, reg, stopThr=1e-3)
            # descent step on R
            grad = - np.dot(xt.T, np.dot(plan, yt))
            R -= lr / bsz * grad
            # snap back onto the orthogonal matrices
            left, _, right_t = np.linalg.svd(R)
            R = np.dot(left, right_t)
        # schedule for the next epoch: larger batches, fewer steps
        bsz *= 2
        niter //= 4
        if verbose:
            print("epoch: %d obj: %.3f" % (epoch, objective(X, Y, R)))
    return R
61
-
62
-
63
def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
    """Compute an initial mapping from a relaxed matching of ``X`` and ``Y``.

    Builds the Gram matrices of both inputs (optionally after ``sqrt_eig``),
    rescales the target Gram matrix to the norm of the source one, then
    iterates ``niter`` times: a Sinkhorn plan is computed for the current
    cost and averaged into the running plan with weight ``2 / (2 + it)``.
    The final residual ``||P K_X - K_Y P||`` is printed, and the transposed
    Procrustes solution between ``P X`` and ``Y`` is returned.
    """
    n, _ = X.shape
    if apply_sqrt:
        X, Y = sqrt_eig(X), sqrt_eig(Y)
    gram_x = np.dot(X, X.T)
    gram_y = np.dot(Y, Y.T)
    # bring both Gram matrices to the same overall scale
    gram_y *= np.linalg.norm(gram_x) / np.linalg.norm(gram_y)
    gram_x_sq = np.dot(gram_x, gram_x)
    gram_y_sq = np.dot(gram_y, gram_y)
    plan = np.ones([n, n]) / float(n)
    for step in range(1, niter + 1):
        cost = np.dot(plan, gram_x_sq) + np.dot(gram_y_sq, plan) - 2 * np.dot(gram_y, np.dot(plan, gram_x))
        candidate = ot.sinkhorn(np.ones(n), np.ones(n), cost, reg, stopThr=1e-3)
        weight = 2.0 / float(2.0 + step)
        plan = weight * candidate + (1.0 - weight) * plan
    residual = np.linalg.norm(np.dot(plan, gram_x) - np.dot(gram_y, plan))
    print(residual)
    return procrustes(np.dot(plan, X), Y).T
79
-
80
-
81
# ---------------------------------------------------------------------------
# Script entry: load both embedding sets, initialise the mapping, refine it,
# report precision@1 on the lexicon and optionally save the aligned vectors.
# ---------------------------------------------------------------------------
print("\n*** Wasserstein Procrustes ***\n")

np.random.seed(args.seed)

maxload = 200000
w_src, x_src = load_vectors(args.model_src, maxload, norm=True, center=True)
w_tgt, x_tgt = load_vectors(args.model_tgt, maxload, norm=True, center=True)
src2trg, _ = load_lexicon(args.lexicon, w_src, w_tgt)

# Stage 1: initial mapping, computed on the 2500 leading rows of each set.
print("\nComputing initial mapping with convex relaxation...")
t0 = time.time()
R0 = convex_init(x_src[:2500], x_tgt[:2500], reg=args.reg, apply_sqrt=True)
elapsed = time.time() - t0
print("Done [%03d sec]" % math.floor(elapsed))

# Stage 2: stochastic refinement; align() works on a copy so R0 is kept.
print("\nComputing mapping with Wasserstein Procrustes...")
t0 = time.time()
R = align(x_src, x_tgt, R0.copy(), bsz=args.bsz, lr=args.lr, niter=args.niter,
          nepoch=args.nepoch, reg=args.reg, nmax=args.nmax)
elapsed = time.time() - t0
print("Done [%03d sec]" % math.floor(elapsed))

acc = compute_nn_accuracy(x_src, np.dot(x_tgt, R.T), src2trg)
print("\nPrecision@1: %.3f\n" % acc)

# Rows are rescaled to unit L2 norm right before being written out.
if args.output_src != '':
    x_src = x_src / np.linalg.norm(x_src, 2, 1)[:, np.newaxis]
    save_vectors(args.output_src, x_src, w_src)
if args.output_tgt != '':
    x_tgt = x_tgt / np.linalg.norm(x_tgt, 2, 1)[:, np.newaxis]
    save_vectors(args.output_tgt, np.dot(x_tgt, R.T), w_tgt)
@@ -1,154 +0,0 @@
1
- #!/usr/bin/env python3
2
- # Copyright (c) 2018-present, Facebook, Inc.
3
- # All rights reserved.
4
- #
5
- # This source code is licensed under the license found in the
6
- # LICENSE file in the root directory of this source tree.
7
-
8
- import io
9
- import numpy as np
10
- import collections
11
-
12
-
13
def load_vectors(fname, maxload=200000, norm=True, center=False, verbose=True):
    """Read word vectors from a text file.

    The first line holds ``<count> <dim>``; every following line holds a word
    followed by ``dim`` space-separated floats.

    Args:
        fname: path of the vector file (decoded as UTF-8, bad bytes ignored).
        maxload: maximum number of vectors to read; values <= 0 read them all.
        norm: divide every row by its L2 norm (plus 1e-8).
        center: subtract the column means, then renormalise the rows.
        verbose: print progress messages.

    Returns:
        ``(words, x)``: the list of words and the matching array with one
        row per word.
    """
    if verbose:
        print("Loading vectors from %s" % fname)
    # Context manager so the handle is released even if parsing fails.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        if maxload > 0:
            n = min(n, maxload)
        x = np.zeros([n, d])
        words = []
        for i, line in enumerate(fin):
            if i >= n:
                break
            tokens = line.rstrip().split(' ')
            words.append(tokens[0])
            x[i, :] = np.array(tokens[1:], dtype=float)
    # The header may promise more rows than the file holds; drop the unfilled
    # rows so that x and words stay aligned (and the zero rows do not skew
    # the column means used for centering).
    x = x[:len(words)]
    if norm:
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if center:
        x -= x.mean(axis=0)[np.newaxis, :]
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if verbose:
        print("%d word vectors loaded" % (len(words)))
    return words, x
37
-
38
-
39
def idx(words):
    """Map each distinct word to the position of its first occurrence."""
    first_seen = {}
    for position, word in enumerate(words):
        # setdefault keeps the earliest index when a word repeats
        first_seen.setdefault(word, position)
    return first_seen
45
-
46
-
47
def save_vectors(fname, x, words):
    """Write word vectors to ``fname`` in the text format ``load_vectors`` reads.

    The first line is ``<rows> <dim>``; each following line is the word and
    its coordinates formatted with four decimals, separated by spaces.

    Args:
        fname: output path (written as UTF-8).
        x: 2-D array with one row per word.
        words: sequence of words; ``words[i]`` labels row ``i``.
    """
    n, d = x.shape
    # Context manager so the file is closed even if a write raises.
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            fout.write(words[i] + " " + " ".join("%.4f" % a for a in x[i, :]) + "\n")
54
-
55
-
56
def save_matrix(fname, x):
    """Write a 2-D array to ``fname`` as text.

    The first line is ``<rows> <cols>``; each following line holds one row
    with every value formatted with four decimals, separated by spaces.
    """
    n, d = x.shape
    # Context manager so the file is closed even if a write raises.
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            fout.write(" ".join("%.4f" % a for a in x[i, :]) + "\n")
63
-
64
-
65
def procrustes(X_src, Y_tgt):
    """Return ``U . Vt`` from the SVD of the cross-product ``Y_tgt^T . X_src``."""
    cross = np.dot(Y_tgt.T, X_src)
    left, _, right_t = np.linalg.svd(cross)
    return np.dot(left, right_t)
68
-
69
-
70
def select_vectors_from_pairs(x_src, y_tgt, pairs):
    """Gather the rows named by ``pairs`` into two aligned arrays.

    For the k-th pair ``(i, j)``, row k of the first result is ``x_src[i]``
    and row k of the second is ``y_tgt[j]``. Both results are float arrays
    of shape ``(len(pairs), x_src.shape[1])``.
    """
    count, dim = len(pairs), x_src.shape[1]
    picked_src = np.zeros([count, dim])
    picked_tgt = np.zeros([count, dim])
    for row, (src_id, tgt_id) in enumerate(pairs):
        picked_src[row, :] = x_src[src_id, :]
        picked_tgt[row, :] = y_tgt[tgt_id, :]
    return picked_src, picked_tgt
80
-
81
-
82
def load_lexicon(filename, words_src, words_tgt, verbose=True):
    """Load a bilingual lexicon as index sets.

    Each line of ``filename`` holds a source word and a target word separated
    by whitespace. A pair is kept only when both words are present in the
    given vocabularies.

    Args:
        filename: path of the lexicon file (UTF-8).
        words_src: source vocabulary; positions become source indices.
        words_tgt: target vocabulary; positions become target indices.
        verbose: print the share of lexicon source words that were found.

    Returns:
        ``(lexicon, size)``: ``lexicon`` maps a source index to the set of
        accepted target indices, and ``size`` is the number of distinct
        source words in the file, as a float.
    """
    lexicon = collections.defaultdict(set)
    idx_src, idx_tgt = idx(words_src), idx(words_tgt)
    vocab = set()
    # Context manager: the original never closed the lexicon file.
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            word_src, word_tgt = line.split()
            if word_src in idx_src and word_tgt in idx_tgt:
                lexicon[idx_src[word_src]].add(idx_tgt[word_tgt])
            vocab.add(word_src)
    if verbose:
        # An empty lexicon file would otherwise raise ZeroDivisionError here.
        coverage = len(lexicon) / float(len(vocab)) if vocab else 0.0
        print("Coverage of source vocab: %.4f" % (coverage))
    return lexicon, float(len(vocab))
96
-
97
-
98
def load_pairs(filename, idx_src, idx_tgt, verbose=True):
    """Load training pairs as ``(source index, target index)`` tuples.

    Each line of ``filename`` holds two words separated by a single space.
    A pair is kept only when the first word is a key of ``idx_src`` and the
    second a key of ``idx_tgt``.

    Args:
        filename: path of the pairs file (UTF-8).
        idx_src: mapping from source word to index.
        idx_tgt: mapping from target word to index.
        verbose: print how many pairs were found and the coverage ratio.

    Returns:
        The list of index pairs, in file order.
    """
    pairs = []
    tot = 0
    # Context manager: the original never closed the pairs file.
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            a, b = line.rstrip().split(' ')
            tot += 1
            if a in idx_src and b in idx_tgt:
                pairs.append((idx_src[a], idx_tgt[b]))
    if verbose:
        # An empty file would otherwise raise ZeroDivisionError here.
        coverage = (1.0 * len(pairs)) / tot if tot else 0.0
        print("Found pairs for training: %d - Total pairs in file: %d - Coverage of pairs: %.4f" % (len(pairs), tot, coverage))
    return pairs
111
-
112
-
113
def compute_nn_accuracy(x_src, x_tgt, lexicon, bsz=100, lexicon_size=-1):
    """Precision@1 of nearest-neighbour retrieval under cosine similarity.

    For every source index in ``lexicon`` the target row with the highest
    dot product (after L2-normalising both sides) is taken as the
    prediction; it counts as correct when it belongs to the accepted set
    ``lexicon[source index]``.

    Args:
        x_src: source embeddings, one row per word.
        x_tgt: target embeddings, one row per word.
        lexicon: mapping from source index to a collection of accepted
            target indices.
        bsz: number of source words scored per batch.
        lexicon_size: denominator of the accuracy; negative values mean
            ``len(lexicon)``.

    Returns:
        Number of correct predictions divided by ``lexicon_size``.
    """
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    idx_src = list(lexicon.keys())
    acc = 0.0
    # Normalise into new arrays: the original used in-place `/=`, which
    # silently rescaled the caller's embeddings and failed on integer arrays.
    x_src = x_src / (np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8)
    x_tgt = x_tgt / (np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8)
    for i in range(0, len(idx_src), bsz):
        e = min(i + bsz, len(idx_src))
        # scores[t, s] = similarity of target row t with the s-th batch word
        scores = np.dot(x_tgt, x_src[idx_src[i:e]].T)
        pred = scores.argmax(axis=0)
        for j in range(i, e):
            if pred[j - i] in lexicon[idx_src[j]]:
                acc += 1.0
    return acc / lexicon_size
128
-
129
-
130
def compute_csls_accuracy(x_src, x_tgt, lexicon, lexicon_size=-1, k=10, bsz=1024):
    """Precision@1 of retrieval with a neighbourhood-corrected similarity.

    The score of a (source, target) pair is twice their cosine similarity
    minus the mean similarity between that target and its ``k`` most similar
    source rows. The best-scoring target of every lexicon source index is
    the prediction; it counts as correct when it belongs to
    ``lexicon[source index]``.

    Args:
        x_src: source embeddings, one row per word.
        x_tgt: target embeddings, one row per word.
        lexicon: mapping from source index to a collection of accepted
            target indices.
        lexicon_size: denominator of the accuracy; negative values mean
            ``len(lexicon)``.
        k: neighbourhood size used for the correction term.
        bsz: number of target rows processed per batch when computing the
            correction term.

    Returns:
        Number of correct predictions divided by ``lexicon_size``.
    """
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    idx_src = list(lexicon.keys())

    # Normalise into new arrays: the original used in-place `/=`, which
    # silently rescaled the caller's embeddings and failed on integer arrays.
    x_src = x_src / (np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8)
    x_tgt = x_tgt / (np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8)

    sr = x_src[list(idx_src)]
    sc = np.dot(sr, x_tgt.T)
    similarities = 2 * sc
    # sc2[t]: mean similarity of target t to its k most similar source rows
    sc2 = np.zeros(x_tgt.shape[0])
    for i in range(0, x_tgt.shape[0], bsz):
        j = min(i + bsz, x_tgt.shape[0])
        sc_batch = np.dot(x_tgt[i:j, :], x_src.T)
        dotprod = np.partition(sc_batch, -k, axis=1)[:, -k:]
        sc2[i:j] = np.mean(dotprod, axis=1)
    similarities -= sc2[np.newaxis, :]

    nn = np.argmax(similarities, axis=1).tolist()
    correct = 0.0
    # `row` rather than `k`: the original loop variable shadowed parameter k.
    for row in range(0, len(lexicon)):
        if nn[row] in lexicon[idx_src[row]]:
            correct += 1.0
    return correct / lexicon_size
@@ -1,41 +0,0 @@
1
- #!/usr/bin/env bash
2
- #
3
- # Copyright (c) 2016-present, Facebook, Inc.
4
- # All rights reserved.
5
- #
6
- # This source code is licensed under the MIT license found in the
7
- # LICENSE file in the root directory of this source tree.
8
- #
9
-
10
# Print the input lines in random order, using Perl's List::Util::shuffle.
# Input comes from the files given as arguments, or from stdin when there
# are none.
myshuf() {
  perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@"
}
13
-
14
# Filter stdin into fastText training lines:
#   1. lowercase everything,
#   2. prefix every line with "__label__",
#   3. pad or drop punctuation so that tokens are space-separated,
#   4. squeeze runs of spaces,
#   5. shuffle the lines.
normalize_text() {
  tr '[:upper:]' '[:lower:]' \
    | sed -e 's/^/__label__/g' \
    | sed -e "s/'/ ' /g" \
          -e 's/"//g' \
          -e 's/\./ \. /g' \
          -e 's/<br \/>/ /g' \
          -e 's/,/ , /g' \
          -e 's/(/ ( /g' \
          -e 's/)/ ) /g' \
          -e 's/\!/ \! /g' \
          -e 's/\?/ \? /g' \
          -e 's/\;/ /g' \
          -e 's/\:/ /g' \
    | tr -s " " \
    | myshuf
}
20
-
21
# Output locations, relative to the current directory.
RESULTDIR=result
DATADIR=data

mkdir -p "${RESULTDIR}"
mkdir -p "${DATADIR}"

# Fetch and preprocess DBpedia once; later runs reuse the prepared files.
if [ ! -f "${DATADIR}/dbpedia.train" ]; then
  wget -c "https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz" -O "${DATADIR}/dbpedia_csv.tar.gz"
  tar -xzvf "${DATADIR}/dbpedia_csv.tar.gz" -C "${DATADIR}"
  for split in train test; do
    normalize_text < "${DATADIR}/dbpedia_csv/${split}.csv" > "${DATADIR}/dbpedia.${split}"
  done
fi

# Build the fasttext binary in the current directory.
make

# Train, evaluate, then write the predictions for the test set.
./fasttext supervised -input "${DATADIR}/dbpedia.train" -output "${RESULTDIR}/dbpedia" -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4

./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"

./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
@@ -1,94 +0,0 @@
1
- #!/usr/bin/env bash
2
- #
3
- # Copyright (c) 2016-present, Facebook, Inc.
4
- # All rights reserved.
5
- #
6
- # This source code is licensed under the MIT license found in the
7
- # LICENSE file in the root directory of this source tree.
8
- #
9
-
10
- # This script produces the results from Table 1 in the following paper:
11
- # Bag of Tricks for Efficient Text Classification, arXiv 1607.01759, 2016
12
-
13
# Print the input lines in random order, using Perl's List::Util::shuffle.
# Input comes from the files given as arguments, or from stdin when there
# are none.
myshuf() {
  perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@"
}
16
-
17
# Filter stdin into fastText training lines:
#   1. lowercase everything,
#   2. prefix every line with "__label__",
#   3. pad or drop punctuation so that tokens are space-separated,
#   4. squeeze runs of spaces,
#   5. shuffle the lines.
normalize_text() {
  tr '[:upper:]' '[:lower:]' \
    | sed -e 's/^/__label__/g' \
    | sed -e "s/'/ ' /g" \
          -e 's/"//g' \
          -e 's/\./ \. /g' \
          -e 's/<br \/>/ /g' \
          -e 's/,/ , /g' \
          -e 's/(/ ( /g' \
          -e 's/)/ ) /g' \
          -e 's/\!/ \! /g' \
          -e 's/\?/ \? /g' \
          -e 's/\;/ /g' \
          -e 's/\:/ /g' \
    | tr -s " " \
    | myshuf
}
23
-
24
# The three arrays below are parallel: entry i of ID and LR belongs to
# entry i of DATASET.

# Dataset names; also used as file-name stems under DATADIR and RESULTDIR.
DATASET=(
  ag_news
  sogou_news
  dbpedia
  yelp_review_polarity
  yelp_review_full
  yahoo_answers
  amazon_review_full
  amazon_review_polarity
)

# Google Drive file id of each dataset archive.
ID=(
  0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
  0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
  0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
  0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
  0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
  0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
  0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
)

# Per-dataset learning rates, chosen by validation on a subset of the
# training set.
LR=( 0.25 0.5 0.5 0.1 0.1 0.1 0.05 0.05 )

# Output locations, relative to the current directory.
RESULTDIR=result
DATADIR=data

mkdir -p "${RESULTDIR}"
mkdir -p "${DATADIR}"
54
-
55
- # Small datasets first
56
-
57
# Index 0 only: this archive is fetched with one direct wget request.
for i in {0..0}; do
  echo "Downloading dataset ${DATASET[i]}"
  # Skip datasets whose training file has already been prepared.
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]; then
    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
    for split in train test; do
      normalize_text < "${DATADIR}/${DATASET[i]}_csv/${split}.csv" > "${DATADIR}/${DATASET[i]}.${split}"
    done
  fi
done
68
-
69
- # Large datasets require a bit more work due to the extra request page
70
-
71
# Indices 1-7: the first request returns an intermediate HTML page, so the
# real download link is scraped from that page and fetched with the cookies
# set by the first request.
for i in {1..7}
do
  echo "Downloading dataset ${DATASET[i]}"
  # Skip datasets whose training file has already been prepared.
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    # Private scratch directory instead of the fixed /tmp/cookies and
    # /tmp/intermezzo.html: those names are predictable and would be
    # clobbered by another run of this script on the same machine.
    scratch="$(mktemp -d)"
    curl -c "${scratch}/cookies" "https://drive.google.com/uc?export=download&id=${ID[i]}" > "${scratch}/intermezzo.html"
    # Extract the href of the download link and decode "&amp;" back to "&".
    link="$(grep -Po 'uc-download-link" [^>]* href="\K[^"]*' "${scratch}/intermezzo.html" | sed 's/\&amp;/\&/g')"
    curl -L -b "${scratch}/cookies" "https://drive.google.com${link}" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
    rm -rf "${scratch}"
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
  fi
done
83
-
84
# Build the fasttext binary in the current directory.
make

# Train one classifier per dataset with its own learning rate, then print
# the test metrics. Training output is discarded; only the test report
# reaches the terminal.
for i in {0..7}; do
  echo "Working on dataset ${DATASET[i]}"
  train_file="${DATADIR}/${DATASET[i]}.train"
  test_file="${DATADIR}/${DATASET[i]}.test"
  model_stem="${RESULTDIR}/${DATASET[i]}"
  ./fasttext supervised -input "${train_file}" \
    -output "${model_stem}" -dim 10 -lr "${LR[i]}" -wordNgrams 2 \
    -minCount 1 -bucket 10000000 -epoch 5 -thread 4 > /dev/null
  ./fasttext test "${model_stem}.bin" "${test_file}"
done
@@ -1,26 +0,0 @@
1
- ## Preprocessing Common Crawl
2
-
3
- This code downloads, preprocesses and splits per language the data from [Common Crawl](http://commoncrawl.org/).
4
-
5
- This script uses the scripts and language identifier of [1].
6
-
7
- This code inherits its requirements from [fastText](https://github.com/facebookresearch/fastText).
8
-
9
- Set the variable WET_PATHS_URL to the crawl you want to process.
10
- Please also set the variables NUM_LANGID and NUM_DEDUP in `download_crawl.sh` according to the capacity of your machine.
11
- Langid processes are mostly limited by CPU usage, while dedup processes are likely to be limited by RAM usage (each uses 2GB of RAM).
12
-
13
- ### Reference
14
-
15
- If you use this code, please cite:
16
-
17
- [1] E. Grave*, P. Bojanowski*, P. Gupta, A. Joulin, T. Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
18
-
19
- ```
20
- @inproceedings{grave2018learning,
21
- title={Learning Word Vectors for 157 Languages},
22
- author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
23
- booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
24
- year={2018}
25
- }
26
- ```
@@ -1,51 +0,0 @@
1
- // Copyright (c) 2018-present, Facebook, Inc.
2
- // All rights reserved.
3
- //
4
- // This source code is licensed under the MIT license found in the
5
- // LICENSE file in the root directory of this source tree.
6
-
7
- #include <cstdint>
8
- #include <iostream>
9
- #include <fstream>
10
- #include <string>
11
- #include <vector>
12
-
13
// 64-bit FNV-1a hash of a byte buffer.
//
//   data  first byte of the buffer; it is never dereferenced when sz is 0.
//         Taken as pointer-to-const because the buffer is only read, so
//         callers no longer need to cast constness away.
//   sz    number of bytes to hash.
//   h     initial hash state. The default is the FNV 64-bit offset basis;
//         passing a different value gives a differently seeded hash.
//
// Returns the hash state after all sz bytes have been mixed in.
uint64_t fnv1a_64(const uint8_t *data, size_t sz, uint64_t h=14695981039346656037ull)
{
  for (size_t i = 0; i < sz; i++) {
    h ^= uint64_t(data[i]);
    h *= 1099511628211ull;  // FNV 64-bit prime; wraps modulo 2^64
  }
  return h;
}
21
-
22
- int main(int argc, char** argv)
23
- {
24
- uint64_t init_values[] = {
25
- 14695981039346656037ull,
26
- 9425296925403859339ull,
27
- 13716263814064014149ull,
28
- 3525492407291847033ull,
29
- 8607404175481815707ull,
30
- 9818874561736458749ull,
31
- 10026508429719773353ull,
32
- 3560712257386009938ull
33
- };
34
- size_t n = 1ull<<34, num_hashes = 2;
35
- std::vector<bool> seen(n);
36
-
37
- std::ios_base::sync_with_stdio(false);
38
-
39
- for (std::string line; std::getline(std::cin, line);) {
40
- bool b = true;
41
- for (size_t i = 0; i < num_hashes; i++) {
42
- uint64_t h = fnv1a_64((uint8_t*) line.data(), line.length(), init_values[i]) % n;
43
- b = b && seen[h];
44
- seen[h] = true;
45
- }
46
- if (!b) {
47
- std::cout << line << std::endl;
48
- }
49
- }
50
- return 0;
51
- }