fasttext 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (510) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +251 -0
  5. data/ext/fasttext/ext.cpp +291 -0
  6. data/ext/fasttext/extconf.rb +15 -0
  7. data/lib/fasttext.rb +41 -0
  8. data/lib/fasttext/classifier.rb +92 -0
  9. data/lib/fasttext/ext.bundle +0 -0
  10. data/lib/fasttext/model.rb +60 -0
  11. data/lib/fasttext/vectorizer.rb +58 -0
  12. data/lib/fasttext/version.rb +3 -0
  13. data/vendor/fastText/CMakeLists.txt +68 -0
  14. data/vendor/fastText/CODE_OF_CONDUCT.md +2 -0
  15. data/vendor/fastText/CONTRIBUTING.md +32 -0
  16. data/vendor/fastText/LICENSE +21 -0
  17. data/vendor/fastText/MANIFEST.in +5 -0
  18. data/vendor/fastText/Makefile +63 -0
  19. data/vendor/fastText/README.md +339 -0
  20. data/vendor/fastText/alignment/README.md +53 -0
  21. data/vendor/fastText/alignment/align.py +145 -0
  22. data/vendor/fastText/alignment/eval.py +60 -0
  23. data/vendor/fastText/alignment/example.sh +51 -0
  24. data/vendor/fastText/alignment/unsup_align.py +109 -0
  25. data/vendor/fastText/alignment/utils.py +154 -0
  26. data/vendor/fastText/classification-example.sh +41 -0
  27. data/vendor/fastText/classification-results.sh +94 -0
  28. data/vendor/fastText/crawl/README.md +26 -0
  29. data/vendor/fastText/crawl/dedup.cc +51 -0
  30. data/vendor/fastText/crawl/download_crawl.sh +57 -0
  31. data/vendor/fastText/crawl/filter_dedup.sh +13 -0
  32. data/vendor/fastText/crawl/filter_utf8.cc +105 -0
  33. data/vendor/fastText/crawl/process_wet_file.sh +30 -0
  34. data/vendor/fastText/docs/aligned-vectors.md +64 -0
  35. data/vendor/fastText/docs/api.md +6 -0
  36. data/vendor/fastText/docs/cheatsheet.md +66 -0
  37. data/vendor/fastText/docs/crawl-vectors.md +125 -0
  38. data/vendor/fastText/docs/dataset.md +6 -0
  39. data/vendor/fastText/docs/english-vectors.md +53 -0
  40. data/vendor/fastText/docs/faqs.md +63 -0
  41. data/vendor/fastText/docs/language-identification.md +47 -0
  42. data/vendor/fastText/docs/options.md +50 -0
  43. data/vendor/fastText/docs/pretrained-vectors.md +142 -0
  44. data/vendor/fastText/docs/python-module.md +314 -0
  45. data/vendor/fastText/docs/references.md +41 -0
  46. data/vendor/fastText/docs/supervised-models.md +54 -0
  47. data/vendor/fastText/docs/supervised-tutorial.md +349 -0
  48. data/vendor/fastText/docs/support.md +58 -0
  49. data/vendor/fastText/docs/unsupervised-tutorials.md +309 -0
  50. data/vendor/fastText/eval.py +95 -0
  51. data/vendor/fastText/get-wikimedia.sh +79 -0
  52. data/vendor/fastText/python/README.md +322 -0
  53. data/vendor/fastText/python/README.rst +406 -0
  54. data/vendor/fastText/python/benchmarks/README.rst +3 -0
  55. data/vendor/fastText/python/benchmarks/get_word_vector.py +49 -0
  56. data/vendor/fastText/python/doc/examples/FastTextEmbeddingBag.py +81 -0
  57. data/vendor/fastText/python/doc/examples/bin_to_vec.py +41 -0
  58. data/vendor/fastText/python/doc/examples/compute_accuracy.py +163 -0
  59. data/vendor/fastText/python/doc/examples/get_vocab.py +48 -0
  60. data/vendor/fastText/python/doc/examples/train_supervised.py +42 -0
  61. data/vendor/fastText/python/doc/examples/train_unsupervised.py +56 -0
  62. data/vendor/fastText/python/fasttext_module/fasttext/FastText.py +468 -0
  63. data/vendor/fastText/python/fasttext_module/fasttext/__init__.py +22 -0
  64. data/vendor/fastText/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc +388 -0
  65. data/vendor/fastText/python/fasttext_module/fasttext/tests/__init__.py +14 -0
  66. data/vendor/fastText/python/fasttext_module/fasttext/tests/test_configurations.py +239 -0
  67. data/vendor/fastText/python/fasttext_module/fasttext/tests/test_script.py +629 -0
  68. data/vendor/fastText/python/fasttext_module/fasttext/util/__init__.py +13 -0
  69. data/vendor/fastText/python/fasttext_module/fasttext/util/util.py +60 -0
  70. data/vendor/fastText/quantization-example.sh +40 -0
  71. data/vendor/fastText/runtests.py +60 -0
  72. data/vendor/fastText/scripts/kbcompletion/README.md +19 -0
  73. data/vendor/fastText/scripts/kbcompletion/data.sh +69 -0
  74. data/vendor/fastText/scripts/kbcompletion/eval.cpp +108 -0
  75. data/vendor/fastText/scripts/kbcompletion/fb15k.sh +49 -0
  76. data/vendor/fastText/scripts/kbcompletion/fb15k237.sh +45 -0
  77. data/vendor/fastText/scripts/kbcompletion/svo.sh +38 -0
  78. data/vendor/fastText/scripts/kbcompletion/wn18.sh +49 -0
  79. data/vendor/fastText/scripts/quantization/quantization-results.sh +43 -0
  80. data/vendor/fastText/setup.cfg +2 -0
  81. data/vendor/fastText/setup.py +203 -0
  82. data/vendor/fastText/src/args.cc +320 -0
  83. data/vendor/fastText/src/args.h +68 -0
  84. data/vendor/fastText/src/densematrix.cc +155 -0
  85. data/vendor/fastText/src/densematrix.h +75 -0
  86. data/vendor/fastText/src/dictionary.cc +540 -0
  87. data/vendor/fastText/src/dictionary.h +111 -0
  88. data/vendor/fastText/src/fasttext.cc +821 -0
  89. data/vendor/fastText/src/fasttext.h +191 -0
  90. data/vendor/fastText/src/loss.cc +346 -0
  91. data/vendor/fastText/src/loss.h +163 -0
  92. data/vendor/fastText/src/main.cc +435 -0
  93. data/vendor/fastText/src/matrix.cc +25 -0
  94. data/vendor/fastText/src/matrix.h +44 -0
  95. data/vendor/fastText/src/meter.cc +68 -0
  96. data/vendor/fastText/src/meter.h +69 -0
  97. data/vendor/fastText/src/model.cc +98 -0
  98. data/vendor/fastText/src/model.h +79 -0
  99. data/vendor/fastText/src/productquantizer.cc +251 -0
  100. data/vendor/fastText/src/productquantizer.h +63 -0
  101. data/vendor/fastText/src/quantmatrix.cc +117 -0
  102. data/vendor/fastText/src/quantmatrix.h +60 -0
  103. data/vendor/fastText/src/real.h +15 -0
  104. data/vendor/fastText/src/utils.cc +28 -0
  105. data/vendor/fastText/src/utils.h +43 -0
  106. data/vendor/fastText/src/vector.cc +97 -0
  107. data/vendor/fastText/src/vector.h +61 -0
  108. data/vendor/fastText/tests/fetch_test_data.sh +202 -0
  109. data/vendor/fastText/website/README.md +6 -0
  110. data/vendor/fastText/website/blog/2016-08-18-blog-post.md +42 -0
  111. data/vendor/fastText/website/blog/2017-05-02-blog-post.md +60 -0
  112. data/vendor/fastText/website/blog/2017-10-02-blog-post.md +90 -0
  113. data/vendor/fastText/website/blog/2019-06-25-blog-post.md +168 -0
  114. data/vendor/fastText/website/core/Footer.js +127 -0
  115. data/vendor/fastText/website/package.json +12 -0
  116. data/vendor/fastText/website/pages/en/index.js +286 -0
  117. data/vendor/fastText/website/sidebars.json +18 -0
  118. data/vendor/fastText/website/siteConfig.js +102 -0
  119. data/vendor/fastText/website/static/docs/en/html/annotated.html +115 -0
  120. data/vendor/fastText/website/static/docs/en/html/annotated_dup.js +4 -0
  121. data/vendor/fastText/website/static/docs/en/html/args_8cc.html +113 -0
  122. data/vendor/fastText/website/static/docs/en/html/args_8h.html +134 -0
  123. data/vendor/fastText/website/static/docs/en/html/args_8h.js +14 -0
  124. data/vendor/fastText/website/static/docs/en/html/args_8h_source.html +139 -0
  125. data/vendor/fastText/website/static/docs/en/html/bc_s.png +0 -0
  126. data/vendor/fastText/website/static/docs/en/html/bdwn.png +0 -0
  127. data/vendor/fastText/website/static/docs/en/html/classes.html +121 -0
  128. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args-members.html +140 -0
  129. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args.html +753 -0
  130. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args.js +40 -0
  131. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary-members.html +148 -0
  132. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary.html +1266 -0
  133. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary.js +43 -0
  134. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText-members.html +145 -0
  135. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText.html +1149 -0
  136. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText.js +45 -0
  137. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix-members.html +123 -0
  138. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix.html +610 -0
  139. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix.js +23 -0
  140. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model-members.html +150 -0
  141. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model.html +1400 -0
  142. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model.js +48 -0
  143. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer-members.html +131 -0
  144. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.html +950 -0
  145. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.js +31 -0
  146. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix-members.html +122 -0
  147. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix.html +565 -0
  148. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix.js +22 -0
  149. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector-members.html +121 -0
  150. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector.html +542 -0
  151. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector.js +21 -0
  152. data/vendor/fastText/website/static/docs/en/html/closed.png +0 -0
  153. data/vendor/fastText/website/static/docs/en/html/dictionary_8cc.html +116 -0
  154. data/vendor/fastText/website/static/docs/en/html/dictionary_8h.html +142 -0
  155. data/vendor/fastText/website/static/docs/en/html/dictionary_8h.js +10 -0
  156. data/vendor/fastText/website/static/docs/en/html/dictionary_8h_source.html +127 -0
  157. data/vendor/fastText/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html +145 -0
  158. data/vendor/fastText/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.js +29 -0
  159. data/vendor/fastText/website/static/docs/en/html/doc.png +0 -0
  160. data/vendor/fastText/website/static/docs/en/html/doxygen.css +1596 -0
  161. data/vendor/fastText/website/static/docs/en/html/doxygen.png +0 -0
  162. data/vendor/fastText/website/static/docs/en/html/dynsections.js +97 -0
  163. data/vendor/fastText/website/static/docs/en/html/fasttext_8cc.html +119 -0
  164. data/vendor/fastText/website/static/docs/en/html/fasttext_8h.html +168 -0
  165. data/vendor/fastText/website/static/docs/en/html/fasttext_8h.js +6 -0
  166. data/vendor/fastText/website/static/docs/en/html/fasttext_8h_source.html +155 -0
  167. data/vendor/fastText/website/static/docs/en/html/favicon.png +0 -0
  168. data/vendor/fastText/website/static/docs/en/html/files.html +125 -0
  169. data/vendor/fastText/website/static/docs/en/html/files.js +4 -0
  170. data/vendor/fastText/website/static/docs/en/html/folderclosed.png +0 -0
  171. data/vendor/fastText/website/static/docs/en/html/folderopen.png +0 -0
  172. data/vendor/fastText/website/static/docs/en/html/functions.html +139 -0
  173. data/vendor/fastText/website/static/docs/en/html/functions_0x7e.html +112 -0
  174. data/vendor/fastText/website/static/docs/en/html/functions_b.html +115 -0
  175. data/vendor/fastText/website/static/docs/en/html/functions_c.html +143 -0
  176. data/vendor/fastText/website/static/docs/en/html/functions_d.html +135 -0
  177. data/vendor/fastText/website/static/docs/en/html/functions_dup.js +27 -0
  178. data/vendor/fastText/website/static/docs/en/html/functions_e.html +115 -0
  179. data/vendor/fastText/website/static/docs/en/html/functions_f.html +112 -0
  180. data/vendor/fastText/website/static/docs/en/html/functions_func.html +563 -0
  181. data/vendor/fastText/website/static/docs/en/html/functions_g.html +145 -0
  182. data/vendor/fastText/website/static/docs/en/html/functions_h.html +112 -0
  183. data/vendor/fastText/website/static/docs/en/html/functions_i.html +121 -0
  184. data/vendor/fastText/website/static/docs/en/html/functions_k.html +106 -0
  185. data/vendor/fastText/website/static/docs/en/html/functions_l.html +140 -0
  186. data/vendor/fastText/website/static/docs/en/html/functions_m.html +153 -0
  187. data/vendor/fastText/website/static/docs/en/html/functions_n.html +164 -0
  188. data/vendor/fastText/website/static/docs/en/html/functions_o.html +116 -0
  189. data/vendor/fastText/website/static/docs/en/html/functions_p.html +161 -0
  190. data/vendor/fastText/website/static/docs/en/html/functions_q.html +135 -0
  191. data/vendor/fastText/website/static/docs/en/html/functions_r.html +116 -0
  192. data/vendor/fastText/website/static/docs/en/html/functions_s.html +159 -0
  193. data/vendor/fastText/website/static/docs/en/html/functions_t.html +138 -0
  194. data/vendor/fastText/website/static/docs/en/html/functions_u.html +106 -0
  195. data/vendor/fastText/website/static/docs/en/html/functions_v.html +106 -0
  196. data/vendor/fastText/website/static/docs/en/html/functions_vars.html +486 -0
  197. data/vendor/fastText/website/static/docs/en/html/functions_w.html +124 -0
  198. data/vendor/fastText/website/static/docs/en/html/functions_z.html +104 -0
  199. data/vendor/fastText/website/static/docs/en/html/globals.html +170 -0
  200. data/vendor/fastText/website/static/docs/en/html/globals_defs.html +113 -0
  201. data/vendor/fastText/website/static/docs/en/html/globals_func.html +155 -0
  202. data/vendor/fastText/website/static/docs/en/html/index.html +100 -0
  203. data/vendor/fastText/website/static/docs/en/html/jquery.js +87 -0
  204. data/vendor/fastText/website/static/docs/en/html/main_8cc.html +582 -0
  205. data/vendor/fastText/website/static/docs/en/html/main_8cc.js +22 -0
  206. data/vendor/fastText/website/static/docs/en/html/matrix_8cc.html +114 -0
  207. data/vendor/fastText/website/static/docs/en/html/matrix_8h.html +121 -0
  208. data/vendor/fastText/website/static/docs/en/html/matrix_8h_source.html +123 -0
  209. data/vendor/fastText/website/static/docs/en/html/menu.js +26 -0
  210. data/vendor/fastText/website/static/docs/en/html/menudata.js +90 -0
  211. data/vendor/fastText/website/static/docs/en/html/model_8cc.html +113 -0
  212. data/vendor/fastText/website/static/docs/en/html/model_8h.html +183 -0
  213. data/vendor/fastText/website/static/docs/en/html/model_8h.js +8 -0
  214. data/vendor/fastText/website/static/docs/en/html/model_8h_source.html +139 -0
  215. data/vendor/fastText/website/static/docs/en/html/namespacefasttext.html +343 -0
  216. data/vendor/fastText/website/static/docs/en/html/namespacefasttext.js +13 -0
  217. data/vendor/fastText/website/static/docs/en/html/namespacefasttext_1_1utils.html +158 -0
  218. data/vendor/fastText/website/static/docs/en/html/namespacemembers.html +125 -0
  219. data/vendor/fastText/website/static/docs/en/html/namespacemembers_enum.html +107 -0
  220. data/vendor/fastText/website/static/docs/en/html/namespacemembers_func.html +110 -0
  221. data/vendor/fastText/website/static/docs/en/html/namespacemembers_type.html +104 -0
  222. data/vendor/fastText/website/static/docs/en/html/namespaces.html +106 -0
  223. data/vendor/fastText/website/static/docs/en/html/namespaces.js +4 -0
  224. data/vendor/fastText/website/static/docs/en/html/nav_f.png +0 -0
  225. data/vendor/fastText/website/static/docs/en/html/nav_g.png +0 -0
  226. data/vendor/fastText/website/static/docs/en/html/nav_h.png +0 -0
  227. data/vendor/fastText/website/static/docs/en/html/navtree.css +146 -0
  228. data/vendor/fastText/website/static/docs/en/html/navtree.js +517 -0
  229. data/vendor/fastText/website/static/docs/en/html/navtreedata.js +40 -0
  230. data/vendor/fastText/website/static/docs/en/html/navtreeindex0.js +253 -0
  231. data/vendor/fastText/website/static/docs/en/html/navtreeindex1.js +139 -0
  232. data/vendor/fastText/website/static/docs/en/html/open.png +0 -0
  233. data/vendor/fastText/website/static/docs/en/html/productquantizer_8cc.html +118 -0
  234. data/vendor/fastText/website/static/docs/en/html/productquantizer_8cc.js +4 -0
  235. data/vendor/fastText/website/static/docs/en/html/productquantizer_8h.html +124 -0
  236. data/vendor/fastText/website/static/docs/en/html/productquantizer_8h_source.html +133 -0
  237. data/vendor/fastText/website/static/docs/en/html/qmatrix_8cc.html +112 -0
  238. data/vendor/fastText/website/static/docs/en/html/qmatrix_8h.html +126 -0
  239. data/vendor/fastText/website/static/docs/en/html/qmatrix_8h_source.html +128 -0
  240. data/vendor/fastText/website/static/docs/en/html/real_8h.html +117 -0
  241. data/vendor/fastText/website/static/docs/en/html/real_8h.js +4 -0
  242. data/vendor/fastText/website/static/docs/en/html/real_8h_source.html +103 -0
  243. data/vendor/fastText/website/static/docs/en/html/resize.js +114 -0
  244. data/vendor/fastText/website/static/docs/en/html/search/all_0.html +26 -0
  245. data/vendor/fastText/website/static/docs/en/html/search/all_0.js +17 -0
  246. data/vendor/fastText/website/static/docs/en/html/search/all_1.html +26 -0
  247. data/vendor/fastText/website/static/docs/en/html/search/all_1.js +8 -0
  248. data/vendor/fastText/website/static/docs/en/html/search/all_10.html +26 -0
  249. data/vendor/fastText/website/static/docs/en/html/search/all_10.js +10 -0
  250. data/vendor/fastText/website/static/docs/en/html/search/all_11.html +26 -0
  251. data/vendor/fastText/website/static/docs/en/html/search/all_11.js +25 -0
  252. data/vendor/fastText/website/static/docs/en/html/search/all_12.html +26 -0
  253. data/vendor/fastText/website/static/docs/en/html/search/all_12.js +15 -0
  254. data/vendor/fastText/website/static/docs/en/html/search/all_13.html +26 -0
  255. data/vendor/fastText/website/static/docs/en/html/search/all_13.js +7 -0
  256. data/vendor/fastText/website/static/docs/en/html/search/all_14.html +26 -0
  257. data/vendor/fastText/website/static/docs/en/html/search/all_14.js +7 -0
  258. data/vendor/fastText/website/static/docs/en/html/search/all_15.html +26 -0
  259. data/vendor/fastText/website/static/docs/en/html/search/all_15.js +11 -0
  260. data/vendor/fastText/website/static/docs/en/html/search/all_16.html +26 -0
  261. data/vendor/fastText/website/static/docs/en/html/search/all_16.js +4 -0
  262. data/vendor/fastText/website/static/docs/en/html/search/all_17.html +26 -0
  263. data/vendor/fastText/website/static/docs/en/html/search/all_17.js +7 -0
  264. data/vendor/fastText/website/static/docs/en/html/search/all_2.html +26 -0
  265. data/vendor/fastText/website/static/docs/en/html/search/all_2.js +17 -0
  266. data/vendor/fastText/website/static/docs/en/html/search/all_3.html +26 -0
  267. data/vendor/fastText/website/static/docs/en/html/search/all_3.js +17 -0
  268. data/vendor/fastText/website/static/docs/en/html/search/all_4.html +26 -0
  269. data/vendor/fastText/website/static/docs/en/html/search/all_4.js +10 -0
  270. data/vendor/fastText/website/static/docs/en/html/search/all_5.html +26 -0
  271. data/vendor/fastText/website/static/docs/en/html/search/all_5.js +12 -0
  272. data/vendor/fastText/website/static/docs/en/html/search/all_6.html +26 -0
  273. data/vendor/fastText/website/static/docs/en/html/search/all_6.js +18 -0
  274. data/vendor/fastText/website/static/docs/en/html/search/all_7.html +26 -0
  275. data/vendor/fastText/website/static/docs/en/html/search/all_7.js +8 -0
  276. data/vendor/fastText/website/static/docs/en/html/search/all_8.html +26 -0
  277. data/vendor/fastText/website/static/docs/en/html/search/all_8.js +11 -0
  278. data/vendor/fastText/website/static/docs/en/html/search/all_9.html +26 -0
  279. data/vendor/fastText/website/static/docs/en/html/search/all_9.js +5 -0
  280. data/vendor/fastText/website/static/docs/en/html/search/all_a.html +26 -0
  281. data/vendor/fastText/website/static/docs/en/html/search/all_a.js +17 -0
  282. data/vendor/fastText/website/static/docs/en/html/search/all_b.html +26 -0
  283. data/vendor/fastText/website/static/docs/en/html/search/all_b.js +27 -0
  284. data/vendor/fastText/website/static/docs/en/html/search/all_c.html +26 -0
  285. data/vendor/fastText/website/static/docs/en/html/search/all_c.js +26 -0
  286. data/vendor/fastText/website/static/docs/en/html/search/all_d.html +26 -0
  287. data/vendor/fastText/website/static/docs/en/html/search/all_d.js +9 -0
  288. data/vendor/fastText/website/static/docs/en/html/search/all_e.html +26 -0
  289. data/vendor/fastText/website/static/docs/en/html/search/all_e.js +35 -0
  290. data/vendor/fastText/website/static/docs/en/html/search/all_f.html +26 -0
  291. data/vendor/fastText/website/static/docs/en/html/search/all_f.js +16 -0
  292. data/vendor/fastText/website/static/docs/en/html/search/classes_0.html +26 -0
  293. data/vendor/fastText/website/static/docs/en/html/search/classes_0.js +4 -0
  294. data/vendor/fastText/website/static/docs/en/html/search/classes_1.html +26 -0
  295. data/vendor/fastText/website/static/docs/en/html/search/classes_1.js +4 -0
  296. data/vendor/fastText/website/static/docs/en/html/search/classes_2.html +26 -0
  297. data/vendor/fastText/website/static/docs/en/html/search/classes_2.js +4 -0
  298. data/vendor/fastText/website/static/docs/en/html/search/classes_3.html +26 -0
  299. data/vendor/fastText/website/static/docs/en/html/search/classes_3.js +4 -0
  300. data/vendor/fastText/website/static/docs/en/html/search/classes_4.html +26 -0
  301. data/vendor/fastText/website/static/docs/en/html/search/classes_4.js +5 -0
  302. data/vendor/fastText/website/static/docs/en/html/search/classes_5.html +26 -0
  303. data/vendor/fastText/website/static/docs/en/html/search/classes_5.js +4 -0
  304. data/vendor/fastText/website/static/docs/en/html/search/classes_6.html +26 -0
  305. data/vendor/fastText/website/static/docs/en/html/search/classes_6.js +4 -0
  306. data/vendor/fastText/website/static/docs/en/html/search/classes_7.html +26 -0
  307. data/vendor/fastText/website/static/docs/en/html/search/classes_7.js +4 -0
  308. data/vendor/fastText/website/static/docs/en/html/search/classes_8.html +26 -0
  309. data/vendor/fastText/website/static/docs/en/html/search/classes_8.js +4 -0
  310. data/vendor/fastText/website/static/docs/en/html/search/close.png +0 -0
  311. data/vendor/fastText/website/static/docs/en/html/search/defines_0.html +26 -0
  312. data/vendor/fastText/website/static/docs/en/html/search/defines_0.js +5 -0
  313. data/vendor/fastText/website/static/docs/en/html/search/defines_1.html +26 -0
  314. data/vendor/fastText/website/static/docs/en/html/search/defines_1.js +4 -0
  315. data/vendor/fastText/website/static/docs/en/html/search/defines_2.html +26 -0
  316. data/vendor/fastText/website/static/docs/en/html/search/defines_2.js +4 -0
  317. data/vendor/fastText/website/static/docs/en/html/search/defines_3.html +26 -0
  318. data/vendor/fastText/website/static/docs/en/html/search/defines_3.js +4 -0
  319. data/vendor/fastText/website/static/docs/en/html/search/enums_0.html +26 -0
  320. data/vendor/fastText/website/static/docs/en/html/search/enums_0.js +4 -0
  321. data/vendor/fastText/website/static/docs/en/html/search/enums_1.html +26 -0
  322. data/vendor/fastText/website/static/docs/en/html/search/enums_1.js +4 -0
  323. data/vendor/fastText/website/static/docs/en/html/search/enums_2.html +26 -0
  324. data/vendor/fastText/website/static/docs/en/html/search/enums_2.js +4 -0
  325. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_0.html +26 -0
  326. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_0.js +4 -0
  327. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_1.html +26 -0
  328. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_1.js +4 -0
  329. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_2.html +26 -0
  330. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_2.js +4 -0
  331. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_3.html +26 -0
  332. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_3.js +4 -0
  333. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_4.html +26 -0
  334. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_4.js +6 -0
  335. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_5.html +26 -0
  336. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_5.js +4 -0
  337. data/vendor/fastText/website/static/docs/en/html/search/files_0.html +26 -0
  338. data/vendor/fastText/website/static/docs/en/html/search/files_0.js +5 -0
  339. data/vendor/fastText/website/static/docs/en/html/search/files_1.html +26 -0
  340. data/vendor/fastText/website/static/docs/en/html/search/files_1.js +5 -0
  341. data/vendor/fastText/website/static/docs/en/html/search/files_2.html +26 -0
  342. data/vendor/fastText/website/static/docs/en/html/search/files_2.js +5 -0
  343. data/vendor/fastText/website/static/docs/en/html/search/files_3.html +26 -0
  344. data/vendor/fastText/website/static/docs/en/html/search/files_3.js +8 -0
  345. data/vendor/fastText/website/static/docs/en/html/search/files_4.html +26 -0
  346. data/vendor/fastText/website/static/docs/en/html/search/files_4.js +5 -0
  347. data/vendor/fastText/website/static/docs/en/html/search/files_5.html +26 -0
  348. data/vendor/fastText/website/static/docs/en/html/search/files_5.js +5 -0
  349. data/vendor/fastText/website/static/docs/en/html/search/files_6.html +26 -0
  350. data/vendor/fastText/website/static/docs/en/html/search/files_6.js +4 -0
  351. data/vendor/fastText/website/static/docs/en/html/search/files_7.html +26 -0
  352. data/vendor/fastText/website/static/docs/en/html/search/files_7.js +5 -0
  353. data/vendor/fastText/website/static/docs/en/html/search/files_8.html +26 -0
  354. data/vendor/fastText/website/static/docs/en/html/search/files_8.js +5 -0
  355. data/vendor/fastText/website/static/docs/en/html/search/functions_0.html +26 -0
  356. data/vendor/fastText/website/static/docs/en/html/search/functions_0.js +14 -0
  357. data/vendor/fastText/website/static/docs/en/html/search/functions_1.html +26 -0
  358. data/vendor/fastText/website/static/docs/en/html/search/functions_1.js +5 -0
  359. data/vendor/fastText/website/static/docs/en/html/search/functions_10.html +26 -0
  360. data/vendor/fastText/website/static/docs/en/html/search/functions_10.js +5 -0
  361. data/vendor/fastText/website/static/docs/en/html/search/functions_11.html +26 -0
  362. data/vendor/fastText/website/static/docs/en/html/search/functions_11.js +18 -0
  363. data/vendor/fastText/website/static/docs/en/html/search/functions_12.html +26 -0
  364. data/vendor/fastText/website/static/docs/en/html/search/functions_12.js +8 -0
  365. data/vendor/fastText/website/static/docs/en/html/search/functions_13.html +26 -0
  366. data/vendor/fastText/website/static/docs/en/html/search/functions_13.js +5 -0
  367. data/vendor/fastText/website/static/docs/en/html/search/functions_14.html +26 -0
  368. data/vendor/fastText/website/static/docs/en/html/search/functions_14.js +4 -0
  369. data/vendor/fastText/website/static/docs/en/html/search/functions_15.html +26 -0
  370. data/vendor/fastText/website/static/docs/en/html/search/functions_15.js +4 -0
  371. data/vendor/fastText/website/static/docs/en/html/search/functions_16.html +26 -0
  372. data/vendor/fastText/website/static/docs/en/html/search/functions_16.js +4 -0
  373. data/vendor/fastText/website/static/docs/en/html/search/functions_17.html +26 -0
  374. data/vendor/fastText/website/static/docs/en/html/search/functions_17.js +7 -0
  375. data/vendor/fastText/website/static/docs/en/html/search/functions_2.html +26 -0
  376. data/vendor/fastText/website/static/docs/en/html/search/functions_2.js +11 -0
  377. data/vendor/fastText/website/static/docs/en/html/search/functions_3.html +26 -0
  378. data/vendor/fastText/website/static/docs/en/html/search/functions_3.js +9 -0
  379. data/vendor/fastText/website/static/docs/en/html/search/functions_4.html +26 -0
  380. data/vendor/fastText/website/static/docs/en/html/search/functions_4.js +4 -0
  381. data/vendor/fastText/website/static/docs/en/html/search/functions_5.html +26 -0
  382. data/vendor/fastText/website/static/docs/en/html/search/functions_5.js +7 -0
  383. data/vendor/fastText/website/static/docs/en/html/search/functions_6.html +26 -0
  384. data/vendor/fastText/website/static/docs/en/html/search/functions_6.js +17 -0
  385. data/vendor/fastText/website/static/docs/en/html/search/functions_7.html +26 -0
  386. data/vendor/fastText/website/static/docs/en/html/search/functions_7.js +5 -0
  387. data/vendor/fastText/website/static/docs/en/html/search/functions_8.html +26 -0
  388. data/vendor/fastText/website/static/docs/en/html/search/functions_8.js +8 -0
  389. data/vendor/fastText/website/static/docs/en/html/search/functions_9.html +26 -0
  390. data/vendor/fastText/website/static/docs/en/html/search/functions_9.js +4 -0
  391. data/vendor/fastText/website/static/docs/en/html/search/functions_a.html +26 -0
  392. data/vendor/fastText/website/static/docs/en/html/search/functions_a.js +8 -0
  393. data/vendor/fastText/website/static/docs/en/html/search/functions_b.html +26 -0
  394. data/vendor/fastText/website/static/docs/en/html/search/functions_b.js +10 -0
  395. data/vendor/fastText/website/static/docs/en/html/search/functions_c.html +26 -0
  396. data/vendor/fastText/website/static/docs/en/html/search/functions_c.js +10 -0
  397. data/vendor/fastText/website/static/docs/en/html/search/functions_d.html +26 -0
  398. data/vendor/fastText/website/static/docs/en/html/search/functions_d.js +6 -0
  399. data/vendor/fastText/website/static/docs/en/html/search/functions_e.html +26 -0
  400. data/vendor/fastText/website/static/docs/en/html/search/functions_e.js +26 -0
  401. data/vendor/fastText/website/static/docs/en/html/search/functions_f.html +26 -0
  402. data/vendor/fastText/website/static/docs/en/html/search/functions_f.js +6 -0
  403. data/vendor/fastText/website/static/docs/en/html/search/mag_sel.png +0 -0
  404. data/vendor/fastText/website/static/docs/en/html/search/namespaces_0.html +26 -0
  405. data/vendor/fastText/website/static/docs/en/html/search/namespaces_0.js +5 -0
  406. data/vendor/fastText/website/static/docs/en/html/search/nomatches.html +12 -0
  407. data/vendor/fastText/website/static/docs/en/html/search/search.css +271 -0
  408. data/vendor/fastText/website/static/docs/en/html/search/search.js +791 -0
  409. data/vendor/fastText/website/static/docs/en/html/search/search_l.png +0 -0
  410. data/vendor/fastText/website/static/docs/en/html/search/search_m.png +0 -0
  411. data/vendor/fastText/website/static/docs/en/html/search/search_r.png +0 -0
  412. data/vendor/fastText/website/static/docs/en/html/search/searchdata.js +42 -0
  413. data/vendor/fastText/website/static/docs/en/html/search/typedefs_0.html +26 -0
  414. data/vendor/fastText/website/static/docs/en/html/search/typedefs_0.js +4 -0
  415. data/vendor/fastText/website/static/docs/en/html/search/typedefs_1.html +26 -0
  416. data/vendor/fastText/website/static/docs/en/html/search/typedefs_1.js +4 -0
  417. data/vendor/fastText/website/static/docs/en/html/search/variables_0.html +26 -0
  418. data/vendor/fastText/website/static/docs/en/html/search/variables_0.js +4 -0
  419. data/vendor/fastText/website/static/docs/en/html/search/variables_1.html +26 -0
  420. data/vendor/fastText/website/static/docs/en/html/search/variables_1.js +6 -0
  421. data/vendor/fastText/website/static/docs/en/html/search/variables_10.html +26 -0
  422. data/vendor/fastText/website/static/docs/en/html/search/variables_10.js +8 -0
  423. data/vendor/fastText/website/static/docs/en/html/search/variables_11.html +26 -0
  424. data/vendor/fastText/website/static/docs/en/html/search/variables_11.js +11 -0
  425. data/vendor/fastText/website/static/docs/en/html/search/variables_12.html +26 -0
  426. data/vendor/fastText/website/static/docs/en/html/search/variables_12.js +4 -0
  427. data/vendor/fastText/website/static/docs/en/html/search/variables_13.html +26 -0
  428. data/vendor/fastText/website/static/docs/en/html/search/variables_13.js +10 -0
  429. data/vendor/fastText/website/static/docs/en/html/search/variables_2.html +26 -0
  430. data/vendor/fastText/website/static/docs/en/html/search/variables_2.js +9 -0
  431. data/vendor/fastText/website/static/docs/en/html/search/variables_3.html +26 -0
  432. data/vendor/fastText/website/static/docs/en/html/search/variables_3.js +9 -0
  433. data/vendor/fastText/website/static/docs/en/html/search/variables_4.html +26 -0
  434. data/vendor/fastText/website/static/docs/en/html/search/variables_4.js +7 -0
  435. data/vendor/fastText/website/static/docs/en/html/search/variables_5.html +26 -0
  436. data/vendor/fastText/website/static/docs/en/html/search/variables_5.js +4 -0
  437. data/vendor/fastText/website/static/docs/en/html/search/variables_6.html +26 -0
  438. data/vendor/fastText/website/static/docs/en/html/search/variables_6.js +5 -0
  439. data/vendor/fastText/website/static/docs/en/html/search/variables_7.html +26 -0
  440. data/vendor/fastText/website/static/docs/en/html/search/variables_7.js +5 -0
  441. data/vendor/fastText/website/static/docs/en/html/search/variables_8.html +26 -0
  442. data/vendor/fastText/website/static/docs/en/html/search/variables_8.js +4 -0
  443. data/vendor/fastText/website/static/docs/en/html/search/variables_9.html +26 -0
  444. data/vendor/fastText/website/static/docs/en/html/search/variables_9.js +10 -0
  445. data/vendor/fastText/website/static/docs/en/html/search/variables_a.html +26 -0
  446. data/vendor/fastText/website/static/docs/en/html/search/variables_a.js +14 -0
  447. data/vendor/fastText/website/static/docs/en/html/search/variables_b.html +26 -0
  448. data/vendor/fastText/website/static/docs/en/html/search/variables_b.js +17 -0
  449. data/vendor/fastText/website/static/docs/en/html/search/variables_c.html +26 -0
  450. data/vendor/fastText/website/static/docs/en/html/search/variables_c.js +6 -0
  451. data/vendor/fastText/website/static/docs/en/html/search/variables_d.html +26 -0
  452. data/vendor/fastText/website/static/docs/en/html/search/variables_d.js +10 -0
  453. data/vendor/fastText/website/static/docs/en/html/search/variables_e.html +26 -0
  454. data/vendor/fastText/website/static/docs/en/html/search/variables_e.js +11 -0
  455. data/vendor/fastText/website/static/docs/en/html/search/variables_f.html +26 -0
  456. data/vendor/fastText/website/static/docs/en/html/search/variables_f.js +6 -0
  457. data/vendor/fastText/website/static/docs/en/html/splitbar.png +0 -0
  458. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node-members.html +108 -0
  459. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node.html +194 -0
  460. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node.js +8 -0
  461. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry-members.html +107 -0
  462. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry.html +178 -0
  463. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry.js +7 -0
  464. data/vendor/fastText/website/static/docs/en/html/sync_off.png +0 -0
  465. data/vendor/fastText/website/static/docs/en/html/sync_on.png +0 -0
  466. data/vendor/fastText/website/static/docs/en/html/tab_a.png +0 -0
  467. data/vendor/fastText/website/static/docs/en/html/tab_b.png +0 -0
  468. data/vendor/fastText/website/static/docs/en/html/tab_h.png +0 -0
  469. data/vendor/fastText/website/static/docs/en/html/tab_s.png +0 -0
  470. data/vendor/fastText/website/static/docs/en/html/tabs.css +1 -0
  471. data/vendor/fastText/website/static/docs/en/html/utils_8cc.html +121 -0
  472. data/vendor/fastText/website/static/docs/en/html/utils_8cc.js +5 -0
  473. data/vendor/fastText/website/static/docs/en/html/utils_8h.html +122 -0
  474. data/vendor/fastText/website/static/docs/en/html/utils_8h.js +5 -0
  475. data/vendor/fastText/website/static/docs/en/html/utils_8h_source.html +104 -0
  476. data/vendor/fastText/website/static/docs/en/html/vector_8cc.html +121 -0
  477. data/vendor/fastText/website/static/docs/en/html/vector_8cc.js +4 -0
  478. data/vendor/fastText/website/static/docs/en/html/vector_8h.html +126 -0
  479. data/vendor/fastText/website/static/docs/en/html/vector_8h.js +5 -0
  480. data/vendor/fastText/website/static/docs/en/html/vector_8h_source.html +120 -0
  481. data/vendor/fastText/website/static/fasttext.css +48 -0
  482. data/vendor/fastText/website/static/img/authors/armand_joulin.jpg +0 -0
  483. data/vendor/fastText/website/static/img/authors/christian_puhrsch.png +0 -0
  484. data/vendor/fastText/website/static/img/authors/edouard_grave.jpeg +0 -0
  485. data/vendor/fastText/website/static/img/authors/piotr_bojanowski.jpg +0 -0
  486. data/vendor/fastText/website/static/img/authors/tomas_mikolov.jpg +0 -0
  487. data/vendor/fastText/website/static/img/blog/2016-08-18-blog-post-img1.png +0 -0
  488. data/vendor/fastText/website/static/img/blog/2016-08-18-blog-post-img2.png +0 -0
  489. data/vendor/fastText/website/static/img/blog/2017-05-02-blog-post-img1.jpg +0 -0
  490. data/vendor/fastText/website/static/img/blog/2017-05-02-blog-post-img2.jpg +0 -0
  491. data/vendor/fastText/website/static/img/blog/2017-10-02-blog-post-img1.png +0 -0
  492. data/vendor/fastText/website/static/img/cbo_vs_skipgram.png +0 -0
  493. data/vendor/fastText/website/static/img/fasttext-icon-api.png +0 -0
  494. data/vendor/fastText/website/static/img/fasttext-icon-bg-web.png +0 -0
  495. data/vendor/fastText/website/static/img/fasttext-icon-color-square.png +0 -0
  496. data/vendor/fastText/website/static/img/fasttext-icon-color-web.png +0 -0
  497. data/vendor/fastText/website/static/img/fasttext-icon-faq.png +0 -0
  498. data/vendor/fastText/website/static/img/fasttext-icon-tutorial.png +0 -0
  499. data/vendor/fastText/website/static/img/fasttext-icon-white-web.png +0 -0
  500. data/vendor/fastText/website/static/img/fasttext-logo-color-web.png +0 -0
  501. data/vendor/fastText/website/static/img/fasttext-logo-white-web.png +0 -0
  502. data/vendor/fastText/website/static/img/logo-color.png +0 -0
  503. data/vendor/fastText/website/static/img/model-black.png +0 -0
  504. data/vendor/fastText/website/static/img/model-blue.png +0 -0
  505. data/vendor/fastText/website/static/img/model-red.png +0 -0
  506. data/vendor/fastText/website/static/img/ogimage.png +0 -0
  507. data/vendor/fastText/website/static/img/oss_logo.png +0 -0
  508. data/vendor/fastText/wikifil.pl +57 -0
  509. data/vendor/fastText/word-vector-example.sh +39 -0
  510. metadata +621 -0
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Copyright (c) 2016-present, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ */
8
+
9
+ #pragma once
10
+
11
+ #include <istream>
12
+ #include <memory>
13
+ #include <ostream>
14
+ #include <random>
15
+ #include <string>
16
+ #include <unordered_map>
17
+ #include <vector>
18
+
19
+ #include "args.h"
20
+ #include "real.h"
21
+
22
+ namespace fasttext {
23
+
24
// 32-bit signed integer alias exposed as the dictionary id type.
using id_type = int32_t;

// Category of a dictionary entry: either a word or a label.
enum class entry_type : int8_t { word = 0, label = 1 };

// A single dictionary record.
struct entry {
  std::string word; // text of the token
  int64_t count; // counter stored with the token
  entry_type type; // whether the token is a word or a label
  std::vector<int32_t> subwords; // ids stored for the token's subwords
};
33
+
34
+ class Dictionary {
35
+ protected:
36
+ static const int32_t MAX_VOCAB_SIZE = 30000000;
37
+ static const int32_t MAX_LINE_SIZE = 1024;
38
+
39
+ int32_t find(const std::string&) const;
40
+ int32_t find(const std::string&, uint32_t h) const;
41
+ void initTableDiscard();
42
+ void initNgrams();
43
+ void reset(std::istream&) const;
44
+ void pushHash(std::vector<int32_t>&, int32_t) const;
45
+ void addSubwords(std::vector<int32_t>&, const std::string&, int32_t) const;
46
+
47
+ std::shared_ptr<Args> args_;
48
+ std::vector<int32_t> word2int_;
49
+ std::vector<entry> words_;
50
+
51
+ std::vector<real> pdiscard_;
52
+ int32_t size_;
53
+ int32_t nwords_;
54
+ int32_t nlabels_;
55
+ int64_t ntokens_;
56
+
57
+ int64_t pruneidx_size_;
58
+ std::unordered_map<int32_t, int32_t> pruneidx_;
59
+ void addWordNgrams(
60
+ std::vector<int32_t>& line,
61
+ const std::vector<int32_t>& hashes,
62
+ int32_t n) const;
63
+
64
+ public:
65
+ static const std::string EOS;
66
+ static const std::string BOW;
67
+ static const std::string EOW;
68
+
69
+ explicit Dictionary(std::shared_ptr<Args>);
70
+ explicit Dictionary(std::shared_ptr<Args>, std::istream&);
71
+ int32_t nwords() const;
72
+ int32_t nlabels() const;
73
+ int64_t ntokens() const;
74
+ int32_t getId(const std::string&) const;
75
+ int32_t getId(const std::string&, uint32_t h) const;
76
+ entry_type getType(int32_t) const;
77
+ entry_type getType(const std::string&) const;
78
+ bool discard(int32_t, real) const;
79
+ std::string getWord(int32_t) const;
80
+ const std::vector<int32_t>& getSubwords(int32_t) const;
81
+ const std::vector<int32_t> getSubwords(const std::string&) const;
82
+ void getSubwords(
83
+ const std::string&,
84
+ std::vector<int32_t>&,
85
+ std::vector<std::string>&) const;
86
+ void computeSubwords(
87
+ const std::string&,
88
+ std::vector<int32_t>&,
89
+ std::vector<std::string>* substrings = nullptr) const;
90
+ uint32_t hash(const std::string& str) const;
91
+ void add(const std::string&);
92
+ bool readWord(std::istream&, std::string&) const;
93
+ void readFromFile(std::istream&);
94
+ std::string getLabel(int32_t) const;
95
+ void save(std::ostream&) const;
96
+ void load(std::istream&);
97
+ std::vector<int64_t> getCounts(entry_type) const;
98
+ int32_t getLine(std::istream&, std::vector<int32_t>&, std::vector<int32_t>&)
99
+ const;
100
+ int32_t getLine(std::istream&, std::vector<int32_t>&, std::minstd_rand&)
101
+ const;
102
+ void threshold(int64_t, int64_t);
103
+ void prune(std::vector<int32_t>&);
104
+ bool isPruned() {
105
+ return pruneidx_size_ >= 0;
106
+ }
107
+ void dump(std::ostream&) const;
108
+ void init();
109
+ };
110
+
111
+ } // namespace fasttext
@@ -0,0 +1,821 @@
1
+ /**
2
+ * Copyright (c) 2016-present, Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under the MIT license found in the
6
+ * LICENSE file in the root directory of this source tree.
7
+ */
8
+
9
+ #include "fasttext.h"
10
+ #include "loss.h"
11
+ #include "quantmatrix.h"
12
+
13
+ #include <algorithm>
14
+ #include <iomanip>
15
+ #include <iostream>
16
+ #include <numeric>
17
+ #include <sstream>
18
+ #include <stdexcept>
19
+ #include <string>
20
+ #include <thread>
21
+ #include <vector>
22
+
23
+ namespace fasttext {
24
+
25
// Highest model-file version this code accepts; also the version it writes.
constexpr int32_t FASTTEXT_VERSION{12}; /* Version 1b */
// Magic number written at the start of every binary model file.
constexpr int32_t FASTTEXT_FILEFORMAT_MAGIC_INT32{793712314};
27
+
28
+ bool comparePairs(
29
+ const std::pair<real, std::string>& l,
30
+ const std::pair<real, std::string>& r);
31
+
32
+ std::shared_ptr<Loss> FastText::createLoss(std::shared_ptr<Matrix>& output) {
33
+ loss_name lossName = args_->loss;
34
+ switch (lossName) {
35
+ case loss_name::hs:
36
+ return std::make_shared<HierarchicalSoftmaxLoss>(
37
+ output, getTargetCounts());
38
+ case loss_name::ns:
39
+ return std::make_shared<NegativeSamplingLoss>(
40
+ output, args_->neg, getTargetCounts());
41
+ case loss_name::softmax:
42
+ return std::make_shared<SoftmaxLoss>(output);
43
+ case loss_name::ova:
44
+ return std::make_shared<OneVsAllLoss>(output);
45
+ default:
46
+ throw std::runtime_error("Unknown loss");
47
+ }
48
+ }
49
+
50
+ FastText::FastText() : quant_(false), wordVectors_(nullptr) {}
51
+
52
+ void FastText::addInputVector(Vector& vec, int32_t ind) const {
53
+ vec.addRow(*input_, ind);
54
+ }
55
+
56
+ std::shared_ptr<const Dictionary> FastText::getDictionary() const {
57
+ return dict_;
58
+ }
59
+
60
+ const Args FastText::getArgs() const {
61
+ return *args_.get();
62
+ }
63
+
64
+ std::shared_ptr<const DenseMatrix> FastText::getInputMatrix() const {
65
+ if (quant_) {
66
+ throw std::runtime_error("Can't export quantized matrix");
67
+ }
68
+ assert(input_.get());
69
+ return std::dynamic_pointer_cast<DenseMatrix>(input_);
70
+ }
71
+
72
+ std::shared_ptr<const DenseMatrix> FastText::getOutputMatrix() const {
73
+ if (quant_ && args_->qout) {
74
+ throw std::runtime_error("Can't export quantized matrix");
75
+ }
76
+ assert(output_.get());
77
+ return std::dynamic_pointer_cast<DenseMatrix>(output_);
78
+ }
79
+
80
+ int32_t FastText::getWordId(const std::string& word) const {
81
+ return dict_->getId(word);
82
+ }
83
+
84
+ int32_t FastText::getSubwordId(const std::string& subword) const {
85
+ int32_t h = dict_->hash(subword) % args_->bucket;
86
+ return dict_->nwords() + h;
87
+ }
88
+
89
+ void FastText::getWordVector(Vector& vec, const std::string& word) const {
90
+ const std::vector<int32_t>& ngrams = dict_->getSubwords(word);
91
+ vec.zero();
92
+ for (int i = 0; i < ngrams.size(); i++) {
93
+ addInputVector(vec, ngrams[i]);
94
+ }
95
+ if (ngrams.size() > 0) {
96
+ vec.mul(1.0 / ngrams.size());
97
+ }
98
+ }
99
+
100
+ void FastText::getVector(Vector& vec, const std::string& word) const {
101
+ getWordVector(vec, word);
102
+ }
103
+
104
+ void FastText::getSubwordVector(Vector& vec, const std::string& subword) const {
105
+ vec.zero();
106
+ int32_t h = dict_->hash(subword) % args_->bucket;
107
+ h = h + dict_->nwords();
108
+ addInputVector(vec, h);
109
+ }
110
+
111
+ void FastText::saveVectors(const std::string& filename) {
112
+ std::ofstream ofs(filename);
113
+ if (!ofs.is_open()) {
114
+ throw std::invalid_argument(
115
+ filename + " cannot be opened for saving vectors!");
116
+ }
117
+ ofs << dict_->nwords() << " " << args_->dim << std::endl;
118
+ Vector vec(args_->dim);
119
+ for (int32_t i = 0; i < dict_->nwords(); i++) {
120
+ std::string word = dict_->getWord(i);
121
+ getWordVector(vec, word);
122
+ ofs << word << " " << vec << std::endl;
123
+ }
124
+ ofs.close();
125
+ }
126
+
127
+ void FastText::saveVectors() {
128
+ saveVectors(args_->output + ".vec");
129
+ }
130
+
131
+ void FastText::saveOutput(const std::string& filename) {
132
+ std::ofstream ofs(filename);
133
+ if (!ofs.is_open()) {
134
+ throw std::invalid_argument(
135
+ filename + " cannot be opened for saving vectors!");
136
+ }
137
+ if (quant_) {
138
+ throw std::invalid_argument(
139
+ "Option -saveOutput is not supported for quantized models.");
140
+ }
141
+ int32_t n =
142
+ (args_->model == model_name::sup) ? dict_->nlabels() : dict_->nwords();
143
+ ofs << n << " " << args_->dim << std::endl;
144
+ Vector vec(args_->dim);
145
+ for (int32_t i = 0; i < n; i++) {
146
+ std::string word = (args_->model == model_name::sup) ? dict_->getLabel(i)
147
+ : dict_->getWord(i);
148
+ vec.zero();
149
+ vec.addRow(*output_, i);
150
+ ofs << word << " " << vec << std::endl;
151
+ }
152
+ ofs.close();
153
+ }
154
+
155
+ void FastText::saveOutput() {
156
+ saveOutput(args_->output + ".output");
157
+ }
158
+
159
+ bool FastText::checkModel(std::istream& in) {
160
+ int32_t magic;
161
+ in.read((char*)&(magic), sizeof(int32_t));
162
+ if (magic != FASTTEXT_FILEFORMAT_MAGIC_INT32) {
163
+ return false;
164
+ }
165
+ in.read((char*)&(version), sizeof(int32_t));
166
+ if (version > FASTTEXT_VERSION) {
167
+ return false;
168
+ }
169
+ return true;
170
+ }
171
+
172
+ void FastText::signModel(std::ostream& out) {
173
+ const int32_t magic = FASTTEXT_FILEFORMAT_MAGIC_INT32;
174
+ const int32_t version = FASTTEXT_VERSION;
175
+ out.write((char*)&(magic), sizeof(int32_t));
176
+ out.write((char*)&(version), sizeof(int32_t));
177
+ }
178
+
179
+ void FastText::saveModel() {
180
+ std::string fn(args_->output);
181
+ if (quant_) {
182
+ fn += ".ftz";
183
+ } else {
184
+ fn += ".bin";
185
+ }
186
+ saveModel(fn);
187
+ }
188
+
189
+ void FastText::saveModel(const std::string& filename) {
190
+ std::ofstream ofs(filename, std::ofstream::binary);
191
+ if (!ofs.is_open()) {
192
+ throw std::invalid_argument(filename + " cannot be opened for saving!");
193
+ }
194
+ signModel(ofs);
195
+ args_->save(ofs);
196
+ dict_->save(ofs);
197
+
198
+ ofs.write((char*)&(quant_), sizeof(bool));
199
+ input_->save(ofs);
200
+
201
+ ofs.write((char*)&(args_->qout), sizeof(bool));
202
+ output_->save(ofs);
203
+
204
+ ofs.close();
205
+ }
206
+
207
+ void FastText::loadModel(const std::string& filename) {
208
+ std::ifstream ifs(filename, std::ifstream::binary);
209
+ if (!ifs.is_open()) {
210
+ throw std::invalid_argument(filename + " cannot be opened for loading!");
211
+ }
212
+ if (!checkModel(ifs)) {
213
+ throw std::invalid_argument(filename + " has wrong file format!");
214
+ }
215
+ loadModel(ifs);
216
+ ifs.close();
217
+ }
218
+
219
+ std::vector<int64_t> FastText::getTargetCounts() const {
220
+ if (args_->model == model_name::sup) {
221
+ return dict_->getCounts(entry_type::label);
222
+ } else {
223
+ return dict_->getCounts(entry_type::word);
224
+ }
225
+ }
226
+
227
+ void FastText::loadModel(std::istream& in) {
228
+ args_ = std::make_shared<Args>();
229
+ input_ = std::make_shared<DenseMatrix>();
230
+ output_ = std::make_shared<DenseMatrix>();
231
+ args_->load(in);
232
+ if (version == 11 && args_->model == model_name::sup) {
233
+ // backward compatibility: old supervised models do not use char ngrams.
234
+ args_->maxn = 0;
235
+ }
236
+ dict_ = std::make_shared<Dictionary>(args_, in);
237
+
238
+ bool quant_input;
239
+ in.read((char*)&quant_input, sizeof(bool));
240
+ if (quant_input) {
241
+ quant_ = true;
242
+ input_ = std::make_shared<QuantMatrix>();
243
+ }
244
+ input_->load(in);
245
+
246
+ if (!quant_input && dict_->isPruned()) {
247
+ throw std::invalid_argument(
248
+ "Invalid model file.\n"
249
+ "Please download the updated model from www.fasttext.cc.\n"
250
+ "See issue #332 on Github for more information.\n");
251
+ }
252
+
253
+ in.read((char*)&args_->qout, sizeof(bool));
254
+ if (quant_ && args_->qout) {
255
+ output_ = std::make_shared<QuantMatrix>();
256
+ }
257
+ output_->load(in);
258
+
259
+ auto loss = createLoss(output_);
260
+ bool normalizeGradient = (args_->model == model_name::sup);
261
+ model_ = std::make_shared<Model>(input_, output_, loss, normalizeGradient);
262
+ }
263
+
264
+ void FastText::printInfo(real progress, real loss, std::ostream& log_stream) {
265
+ std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
266
+ double t =
267
+ std::chrono::duration_cast<std::chrono::duration<double>>(end - start_)
268
+ .count();
269
+ double lr = args_->lr * (1.0 - progress);
270
+ double wst = 0;
271
+
272
+ int64_t eta = 2592000; // Default to one month in seconds (720 * 3600)
273
+
274
+ if (progress > 0 && t >= 0) {
275
+ progress = progress * 100;
276
+ eta = t * (100 - progress) / progress;
277
+ wst = double(tokenCount_) / t / args_->thread;
278
+ }
279
+ int32_t etah = eta / 3600;
280
+ int32_t etam = (eta % 3600) / 60;
281
+
282
+ log_stream << std::fixed;
283
+ log_stream << "Progress: ";
284
+ log_stream << std::setprecision(1) << std::setw(5) << progress << "%";
285
+ log_stream << " words/sec/thread: " << std::setw(7) << int64_t(wst);
286
+ log_stream << " lr: " << std::setw(9) << std::setprecision(6) << lr;
287
+ log_stream << " loss: " << std::setw(9) << std::setprecision(6) << loss;
288
+ log_stream << " ETA: " << std::setw(3) << etah;
289
+ log_stream << "h" << std::setw(2) << etam << "m";
290
+ log_stream << std::flush;
291
+ }
292
+
293
+ std::vector<int32_t> FastText::selectEmbeddings(int32_t cutoff) const {
294
+ std::shared_ptr<DenseMatrix> input =
295
+ std::dynamic_pointer_cast<DenseMatrix>(input_);
296
+ Vector norms(input->size(0));
297
+ input->l2NormRow(norms);
298
+ std::vector<int32_t> idx(input->size(0), 0);
299
+ std::iota(idx.begin(), idx.end(), 0);
300
+ auto eosid = dict_->getId(Dictionary::EOS);
301
+ std::sort(idx.begin(), idx.end(), [&norms, eosid](size_t i1, size_t i2) {
302
+ return eosid == i1 || (eosid != i2 && norms[i1] > norms[i2]);
303
+ });
304
+ idx.erase(idx.begin() + cutoff, idx.end());
305
+ return idx;
306
+ }
307
+
308
+ void FastText::quantize(const Args& qargs) {
309
+ if (args_->model != model_name::sup) {
310
+ throw std::invalid_argument(
311
+ "For now we only support quantization of supervised models");
312
+ }
313
+ args_->input = qargs.input;
314
+ args_->qout = qargs.qout;
315
+ args_->output = qargs.output;
316
+ std::shared_ptr<DenseMatrix> input =
317
+ std::dynamic_pointer_cast<DenseMatrix>(input_);
318
+ std::shared_ptr<DenseMatrix> output =
319
+ std::dynamic_pointer_cast<DenseMatrix>(output_);
320
+ bool normalizeGradient = (args_->model == model_name::sup);
321
+
322
+ if (qargs.cutoff > 0 && qargs.cutoff < input->size(0)) {
323
+ auto idx = selectEmbeddings(qargs.cutoff);
324
+ dict_->prune(idx);
325
+ std::shared_ptr<DenseMatrix> ninput =
326
+ std::make_shared<DenseMatrix>(idx.size(), args_->dim);
327
+ for (auto i = 0; i < idx.size(); i++) {
328
+ for (auto j = 0; j < args_->dim; j++) {
329
+ ninput->at(i, j) = input->at(idx[i], j);
330
+ }
331
+ }
332
+ input = ninput;
333
+ if (qargs.retrain) {
334
+ args_->epoch = qargs.epoch;
335
+ args_->lr = qargs.lr;
336
+ args_->thread = qargs.thread;
337
+ args_->verbose = qargs.verbose;
338
+ auto loss = createLoss(output_);
339
+ model_ = std::make_shared<Model>(input, output, loss, normalizeGradient);
340
+ startThreads();
341
+ }
342
+ }
343
+
344
+ input_ = std::make_shared<QuantMatrix>(
345
+ std::move(*(input.get())), qargs.dsub, qargs.qnorm);
346
+
347
+ if (args_->qout) {
348
+ output_ = std::make_shared<QuantMatrix>(
349
+ std::move(*(output.get())), 2, qargs.qnorm);
350
+ }
351
+
352
+ quant_ = true;
353
+ auto loss = createLoss(output_);
354
+ model_ = std::make_shared<Model>(input_, output_, loss, normalizeGradient);
355
+ }
356
+
357
+ void FastText::supervised(
358
+ Model::State& state,
359
+ real lr,
360
+ const std::vector<int32_t>& line,
361
+ const std::vector<int32_t>& labels) {
362
+ if (labels.size() == 0 || line.size() == 0) {
363
+ return;
364
+ }
365
+ if (args_->loss == loss_name::ova) {
366
+ model_->update(line, labels, Model::kAllLabelsAsTarget, lr, state);
367
+ } else {
368
+ std::uniform_int_distribution<> uniform(0, labels.size() - 1);
369
+ int32_t i = uniform(state.rng);
370
+ model_->update(line, labels, i, lr, state);
371
+ }
372
+ }
373
+
374
+ void FastText::cbow(
375
+ Model::State& state,
376
+ real lr,
377
+ const std::vector<int32_t>& line) {
378
+ std::vector<int32_t> bow;
379
+ std::uniform_int_distribution<> uniform(1, args_->ws);
380
+ for (int32_t w = 0; w < line.size(); w++) {
381
+ int32_t boundary = uniform(state.rng);
382
+ bow.clear();
383
+ for (int32_t c = -boundary; c <= boundary; c++) {
384
+ if (c != 0 && w + c >= 0 && w + c < line.size()) {
385
+ const std::vector<int32_t>& ngrams = dict_->getSubwords(line[w + c]);
386
+ bow.insert(bow.end(), ngrams.cbegin(), ngrams.cend());
387
+ }
388
+ }
389
+ model_->update(bow, line, w, lr, state);
390
+ }
391
+ }
392
+
393
+ void FastText::skipgram(
394
+ Model::State& state,
395
+ real lr,
396
+ const std::vector<int32_t>& line) {
397
+ std::uniform_int_distribution<> uniform(1, args_->ws);
398
+ for (int32_t w = 0; w < line.size(); w++) {
399
+ int32_t boundary = uniform(state.rng);
400
+ const std::vector<int32_t>& ngrams = dict_->getSubwords(line[w]);
401
+ for (int32_t c = -boundary; c <= boundary; c++) {
402
+ if (c != 0 && w + c >= 0 && w + c < line.size()) {
403
+ model_->update(ngrams, line, w + c, lr, state);
404
+ }
405
+ }
406
+ }
407
+ }
408
+
409
+ std::tuple<int64_t, double, double>
410
+ FastText::test(std::istream& in, int32_t k, real threshold) {
411
+ Meter meter;
412
+ test(in, k, threshold, meter);
413
+
414
+ return std::tuple<int64_t, double, double>(
415
+ meter.nexamples(), meter.precision(), meter.recall());
416
+ }
417
+
418
+ void FastText::test(std::istream& in, int32_t k, real threshold, Meter& meter)
419
+ const {
420
+ std::vector<int32_t> line;
421
+ std::vector<int32_t> labels;
422
+ Predictions predictions;
423
+
424
+ while (in.peek() != EOF) {
425
+ line.clear();
426
+ labels.clear();
427
+ dict_->getLine(in, line, labels);
428
+
429
+ if (!labels.empty() && !line.empty()) {
430
+ predictions.clear();
431
+ predict(k, line, predictions, threshold);
432
+ meter.log(labels, predictions);
433
+ }
434
+ }
435
+ }
436
+
437
+ void FastText::predict(
438
+ int32_t k,
439
+ const std::vector<int32_t>& words,
440
+ Predictions& predictions,
441
+ real threshold) const {
442
+ if (words.empty()) {
443
+ return;
444
+ }
445
+ Model::State state(args_->dim, dict_->nlabels(), 0);
446
+ if (args_->model != model_name::sup) {
447
+ throw std::invalid_argument("Model needs to be supervised for prediction!");
448
+ }
449
+ model_->predict(words, k, threshold, predictions, state);
450
+ }
451
+
452
+ bool FastText::predictLine(
453
+ std::istream& in,
454
+ std::vector<std::pair<real, std::string>>& predictions,
455
+ int32_t k,
456
+ real threshold) const {
457
+ predictions.clear();
458
+ if (in.peek() == EOF) {
459
+ return false;
460
+ }
461
+
462
+ std::vector<int32_t> words, labels;
463
+ dict_->getLine(in, words, labels);
464
+ Predictions linePredictions;
465
+ predict(k, words, linePredictions, threshold);
466
+ for (const auto& p : linePredictions) {
467
+ predictions.push_back(
468
+ std::make_pair(std::exp(p.first), dict_->getLabel(p.second)));
469
+ }
470
+
471
+ return true;
472
+ }
473
+
474
+ void FastText::getSentenceVector(std::istream& in, fasttext::Vector& svec) {
475
+ svec.zero();
476
+ if (args_->model == model_name::sup) {
477
+ std::vector<int32_t> line, labels;
478
+ dict_->getLine(in, line, labels);
479
+ for (int32_t i = 0; i < line.size(); i++) {
480
+ addInputVector(svec, line[i]);
481
+ }
482
+ if (!line.empty()) {
483
+ svec.mul(1.0 / line.size());
484
+ }
485
+ } else {
486
+ Vector vec(args_->dim);
487
+ std::string sentence;
488
+ std::getline(in, sentence);
489
+ std::istringstream iss(sentence);
490
+ std::string word;
491
+ int32_t count = 0;
492
+ while (iss >> word) {
493
+ getWordVector(vec, word);
494
+ real norm = vec.norm();
495
+ if (norm > 0) {
496
+ vec.mul(1.0 / norm);
497
+ svec.addVector(vec);
498
+ count++;
499
+ }
500
+ }
501
+ if (count > 0) {
502
+ svec.mul(1.0 / count);
503
+ }
504
+ }
505
+ }
506
+
507
+ std::vector<std::pair<std::string, Vector>> FastText::getNgramVectors(
508
+ const std::string& word) const {
509
+ std::vector<std::pair<std::string, Vector>> result;
510
+ std::vector<int32_t> ngrams;
511
+ std::vector<std::string> substrings;
512
+ dict_->getSubwords(word, ngrams, substrings);
513
+ assert(ngrams.size() <= substrings.size());
514
+ for (int32_t i = 0; i < ngrams.size(); i++) {
515
+ Vector vec(args_->dim);
516
+ if (ngrams[i] >= 0) {
517
+ vec.addRow(*input_, ngrams[i]);
518
+ }
519
+ result.push_back(std::make_pair(substrings[i], std::move(vec)));
520
+ }
521
+ return result;
522
+ }
523
+
524
+ // deprecated. use getNgramVectors instead
525
+ void FastText::ngramVectors(std::string word) {
526
+ std::vector<std::pair<std::string, Vector>> ngramVectors =
527
+ getNgramVectors(word);
528
+
529
+ for (const auto& ngramVector : ngramVectors) {
530
+ std::cout << ngramVector.first << " " << ngramVector.second << std::endl;
531
+ }
532
+ }
533
+
534
+ void FastText::precomputeWordVectors(DenseMatrix& wordVectors) {
535
+ Vector vec(args_->dim);
536
+ wordVectors.zero();
537
+ for (int32_t i = 0; i < dict_->nwords(); i++) {
538
+ std::string word = dict_->getWord(i);
539
+ getWordVector(vec, word);
540
+ real norm = vec.norm();
541
+ if (norm > 0) {
542
+ wordVectors.addVectorToRow(vec, i, 1.0 / norm);
543
+ }
544
+ }
545
+ }
546
+
547
+ void FastText::lazyComputeWordVectors() {
548
+ if (!wordVectors_) {
549
+ wordVectors_ = std::unique_ptr<DenseMatrix>(
550
+ new DenseMatrix(dict_->nwords(), args_->dim));
551
+ precomputeWordVectors(*wordVectors_);
552
+ }
553
+ }
554
+
555
+ std::vector<std::pair<real, std::string>> FastText::getNN(
556
+ const std::string& word,
557
+ int32_t k) {
558
+ Vector query(args_->dim);
559
+
560
+ getWordVector(query, word);
561
+
562
+ lazyComputeWordVectors();
563
+ assert(wordVectors_);
564
+ return getNN(*wordVectors_, query, k, {word});
565
+ }
566
+
567
+ std::vector<std::pair<real, std::string>> FastText::getNN(
568
+ const DenseMatrix& wordVectors,
569
+ const Vector& query,
570
+ int32_t k,
571
+ const std::set<std::string>& banSet) {
572
+ std::vector<std::pair<real, std::string>> heap;
573
+
574
+ real queryNorm = query.norm();
575
+ if (std::abs(queryNorm) < 1e-8) {
576
+ queryNorm = 1;
577
+ }
578
+
579
+ for (int32_t i = 0; i < dict_->nwords(); i++) {
580
+ std::string word = dict_->getWord(i);
581
+ if (banSet.find(word) == banSet.end()) {
582
+ real dp = wordVectors.dotRow(query, i);
583
+ real similarity = dp / queryNorm;
584
+
585
+ if (heap.size() == k && similarity < heap.front().first) {
586
+ continue;
587
+ }
588
+ heap.push_back(std::make_pair(similarity, word));
589
+ std::push_heap(heap.begin(), heap.end(), comparePairs);
590
+ if (heap.size() > k) {
591
+ std::pop_heap(heap.begin(), heap.end(), comparePairs);
592
+ heap.pop_back();
593
+ }
594
+ }
595
+ }
596
+ std::sort_heap(heap.begin(), heap.end(), comparePairs);
597
+
598
+ return heap;
599
+ }
600
+
601
+ // deprecated. use getNN instead
602
+ void FastText::findNN(
603
+ const DenseMatrix& wordVectors,
604
+ const Vector& query,
605
+ int32_t k,
606
+ const std::set<std::string>& banSet,
607
+ std::vector<std::pair<real, std::string>>& results) {
608
+ results.clear();
609
+ results = getNN(wordVectors, query, k, banSet);
610
+ }
611
+
612
+ std::vector<std::pair<real, std::string>> FastText::getAnalogies(
613
+ int32_t k,
614
+ const std::string& wordA,
615
+ const std::string& wordB,
616
+ const std::string& wordC) {
617
+ Vector query = Vector(args_->dim);
618
+ query.zero();
619
+
620
+ Vector buffer(args_->dim);
621
+ getWordVector(buffer, wordA);
622
+ query.addVector(buffer, 1.0 / (buffer.norm() + 1e-8));
623
+ getWordVector(buffer, wordB);
624
+ query.addVector(buffer, -1.0 / (buffer.norm() + 1e-8));
625
+ getWordVector(buffer, wordC);
626
+ query.addVector(buffer, 1.0 / (buffer.norm() + 1e-8));
627
+
628
+ lazyComputeWordVectors();
629
+ assert(wordVectors_);
630
+ return getNN(*wordVectors_, query, k, {wordA, wordB, wordC});
631
+ }
632
+
633
+ // deprecated, use getAnalogies instead
634
+ void FastText::analogies(int32_t k) {
635
+ std::string prompt("Query triplet (A - B + C)? ");
636
+ std::string wordA, wordB, wordC;
637
+ std::cout << prompt;
638
+ while (true) {
639
+ std::cin >> wordA;
640
+ std::cin >> wordB;
641
+ std::cin >> wordC;
642
+ auto results = getAnalogies(k, wordA, wordB, wordC);
643
+
644
+ for (auto& pair : results) {
645
+ std::cout << pair.second << " " << pair.first << std::endl;
646
+ }
647
+ std::cout << prompt;
648
+ }
649
+ }
650
+
651
+ void FastText::trainThread(int32_t threadId) {
652
+ std::ifstream ifs(args_->input);
653
+ utils::seek(ifs, threadId * utils::size(ifs) / args_->thread);
654
+
655
+ Model::State state(args_->dim, output_->size(0), threadId);
656
+
657
+ const int64_t ntokens = dict_->ntokens();
658
+ int64_t localTokenCount = 0;
659
+ std::vector<int32_t> line, labels;
660
+ while (tokenCount_ < args_->epoch * ntokens) {
661
+ real progress = real(tokenCount_) / (args_->epoch * ntokens);
662
+ real lr = args_->lr * (1.0 - progress);
663
+ if (args_->model == model_name::sup) {
664
+ localTokenCount += dict_->getLine(ifs, line, labels);
665
+ supervised(state, lr, line, labels);
666
+ } else if (args_->model == model_name::cbow) {
667
+ localTokenCount += dict_->getLine(ifs, line, state.rng);
668
+ cbow(state, lr, line);
669
+ } else if (args_->model == model_name::sg) {
670
+ localTokenCount += dict_->getLine(ifs, line, state.rng);
671
+ skipgram(state, lr, line);
672
+ }
673
+ if (localTokenCount > args_->lrUpdateRate) {
674
+ tokenCount_ += localTokenCount;
675
+ localTokenCount = 0;
676
+ if (threadId == 0 && args_->verbose > 1)
677
+ loss_ = state.getLoss();
678
+ }
679
+ }
680
+ if (threadId == 0)
681
+ loss_ = state.getLoss();
682
+ ifs.close();
683
+ }
684
+
685
+ std::shared_ptr<Matrix> FastText::getInputMatrixFromFile(
686
+ const std::string& filename) const {
687
+ std::ifstream in(filename);
688
+ std::vector<std::string> words;
689
+ std::shared_ptr<DenseMatrix> mat; // temp. matrix for pretrained vectors
690
+ int64_t n, dim;
691
+ if (!in.is_open()) {
692
+ throw std::invalid_argument(filename + " cannot be opened for loading!");
693
+ }
694
+ in >> n >> dim;
695
+ if (dim != args_->dim) {
696
+ throw std::invalid_argument(
697
+ "Dimension of pretrained vectors (" + std::to_string(dim) +
698
+ ") does not match dimension (" + std::to_string(args_->dim) + ")!");
699
+ }
700
+ mat = std::make_shared<DenseMatrix>(n, dim);
701
+ for (size_t i = 0; i < n; i++) {
702
+ std::string word;
703
+ in >> word;
704
+ words.push_back(word);
705
+ dict_->add(word);
706
+ for (size_t j = 0; j < dim; j++) {
707
+ in >> mat->at(i, j);
708
+ }
709
+ }
710
+ in.close();
711
+
712
+ dict_->threshold(1, 0);
713
+ dict_->init();
714
+ std::shared_ptr<DenseMatrix> input = std::make_shared<DenseMatrix>(
715
+ dict_->nwords() + args_->bucket, args_->dim);
716
+ input->uniform(1.0 / args_->dim);
717
+
718
+ for (size_t i = 0; i < n; i++) {
719
+ int32_t idx = dict_->getId(words[i]);
720
+ if (idx < 0 || idx >= dict_->nwords()) {
721
+ continue;
722
+ }
723
+ for (size_t j = 0; j < dim; j++) {
724
+ input->at(idx, j) = mat->at(i, j);
725
+ }
726
+ }
727
+ return input;
728
+ }
729
+
730
+ void FastText::loadVectors(const std::string& filename) {
731
+ input_ = getInputMatrixFromFile(filename);
732
+ }
733
+
734
+ std::shared_ptr<Matrix> FastText::createRandomMatrix() const {
735
+ std::shared_ptr<DenseMatrix> input = std::make_shared<DenseMatrix>(
736
+ dict_->nwords() + args_->bucket, args_->dim);
737
+ input->uniform(1.0 / args_->dim);
738
+
739
+ return input;
740
+ }
741
+
742
+ std::shared_ptr<Matrix> FastText::createTrainOutputMatrix() const {
743
+ int64_t m =
744
+ (args_->model == model_name::sup) ? dict_->nlabels() : dict_->nwords();
745
+ std::shared_ptr<DenseMatrix> output =
746
+ std::make_shared<DenseMatrix>(m, args_->dim);
747
+ output->zero();
748
+
749
+ return output;
750
+ }
751
+
752
+ void FastText::train(const Args& args) {
753
+ args_ = std::make_shared<Args>(args);
754
+ dict_ = std::make_shared<Dictionary>(args_);
755
+ if (args_->input == "-") {
756
+ // manage expectations
757
+ throw std::invalid_argument("Cannot use stdin for training!");
758
+ }
759
+ std::ifstream ifs(args_->input);
760
+ if (!ifs.is_open()) {
761
+ throw std::invalid_argument(
762
+ args_->input + " cannot be opened for training!");
763
+ }
764
+ dict_->readFromFile(ifs);
765
+ ifs.close();
766
+
767
+ if (!args_->pretrainedVectors.empty()) {
768
+ input_ = getInputMatrixFromFile(args_->pretrainedVectors);
769
+ } else {
770
+ input_ = createRandomMatrix();
771
+ }
772
+ output_ = createTrainOutputMatrix();
773
+ auto loss = createLoss(output_);
774
+ bool normalizeGradient = (args_->model == model_name::sup);
775
+ model_ = std::make_shared<Model>(input_, output_, loss, normalizeGradient);
776
+ startThreads();
777
+ }
778
+
779
+ void FastText::startThreads() {
780
+ start_ = std::chrono::steady_clock::now();
781
+ tokenCount_ = 0;
782
+ loss_ = -1;
783
+ std::vector<std::thread> threads;
784
+ for (int32_t i = 0; i < args_->thread; i++) {
785
+ threads.push_back(std::thread([=]() { trainThread(i); }));
786
+ }
787
+ const int64_t ntokens = dict_->ntokens();
788
+ // Same condition as trainThread
789
+ while (tokenCount_ < args_->epoch * ntokens) {
790
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
791
+ if (loss_ >= 0 && args_->verbose > 1) {
792
+ real progress = real(tokenCount_) / (args_->epoch * ntokens);
793
+ std::cerr << "\r";
794
+ printInfo(progress, loss_, std::cerr);
795
+ }
796
+ }
797
+ for (int32_t i = 0; i < args_->thread; i++) {
798
+ threads[i].join();
799
+ }
800
+ if (args_->verbose > 0) {
801
+ std::cerr << "\r";
802
+ printInfo(1.0, loss_, std::cerr);
803
+ std::cerr << std::endl;
804
+ }
805
+ }
806
+
807
+ int FastText::getDimension() const {
808
+ return args_->dim;
809
+ }
810
+
811
+ bool FastText::isQuant() const {
812
+ return quant_;
813
+ }
814
+
815
+ bool comparePairs(
816
+ const std::pair<real, std::string>& l,
817
+ const std::pair<real, std::string>& r) {
818
+ return l.first > r.first;
819
+ }
820
+
821
+ } // namespace fasttext