fasttext 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (510) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE.txt +22 -0
  4. data/README.md +251 -0
  5. data/ext/fasttext/ext.cpp +291 -0
  6. data/ext/fasttext/extconf.rb +15 -0
  7. data/lib/fasttext.rb +41 -0
  8. data/lib/fasttext/classifier.rb +92 -0
  9. data/lib/fasttext/ext.bundle +0 -0
  10. data/lib/fasttext/model.rb +60 -0
  11. data/lib/fasttext/vectorizer.rb +58 -0
  12. data/lib/fasttext/version.rb +3 -0
  13. data/vendor/fastText/CMakeLists.txt +68 -0
  14. data/vendor/fastText/CODE_OF_CONDUCT.md +2 -0
  15. data/vendor/fastText/CONTRIBUTING.md +32 -0
  16. data/vendor/fastText/LICENSE +21 -0
  17. data/vendor/fastText/MANIFEST.in +5 -0
  18. data/vendor/fastText/Makefile +63 -0
  19. data/vendor/fastText/README.md +339 -0
  20. data/vendor/fastText/alignment/README.md +53 -0
  21. data/vendor/fastText/alignment/align.py +145 -0
  22. data/vendor/fastText/alignment/eval.py +60 -0
  23. data/vendor/fastText/alignment/example.sh +51 -0
  24. data/vendor/fastText/alignment/unsup_align.py +109 -0
  25. data/vendor/fastText/alignment/utils.py +154 -0
  26. data/vendor/fastText/classification-example.sh +41 -0
  27. data/vendor/fastText/classification-results.sh +94 -0
  28. data/vendor/fastText/crawl/README.md +26 -0
  29. data/vendor/fastText/crawl/dedup.cc +51 -0
  30. data/vendor/fastText/crawl/download_crawl.sh +57 -0
  31. data/vendor/fastText/crawl/filter_dedup.sh +13 -0
  32. data/vendor/fastText/crawl/filter_utf8.cc +105 -0
  33. data/vendor/fastText/crawl/process_wet_file.sh +30 -0
  34. data/vendor/fastText/docs/aligned-vectors.md +64 -0
  35. data/vendor/fastText/docs/api.md +6 -0
  36. data/vendor/fastText/docs/cheatsheet.md +66 -0
  37. data/vendor/fastText/docs/crawl-vectors.md +125 -0
  38. data/vendor/fastText/docs/dataset.md +6 -0
  39. data/vendor/fastText/docs/english-vectors.md +53 -0
  40. data/vendor/fastText/docs/faqs.md +63 -0
  41. data/vendor/fastText/docs/language-identification.md +47 -0
  42. data/vendor/fastText/docs/options.md +50 -0
  43. data/vendor/fastText/docs/pretrained-vectors.md +142 -0
  44. data/vendor/fastText/docs/python-module.md +314 -0
  45. data/vendor/fastText/docs/references.md +41 -0
  46. data/vendor/fastText/docs/supervised-models.md +54 -0
  47. data/vendor/fastText/docs/supervised-tutorial.md +349 -0
  48. data/vendor/fastText/docs/support.md +58 -0
  49. data/vendor/fastText/docs/unsupervised-tutorials.md +309 -0
  50. data/vendor/fastText/eval.py +95 -0
  51. data/vendor/fastText/get-wikimedia.sh +79 -0
  52. data/vendor/fastText/python/README.md +322 -0
  53. data/vendor/fastText/python/README.rst +406 -0
  54. data/vendor/fastText/python/benchmarks/README.rst +3 -0
  55. data/vendor/fastText/python/benchmarks/get_word_vector.py +49 -0
  56. data/vendor/fastText/python/doc/examples/FastTextEmbeddingBag.py +81 -0
  57. data/vendor/fastText/python/doc/examples/bin_to_vec.py +41 -0
  58. data/vendor/fastText/python/doc/examples/compute_accuracy.py +163 -0
  59. data/vendor/fastText/python/doc/examples/get_vocab.py +48 -0
  60. data/vendor/fastText/python/doc/examples/train_supervised.py +42 -0
  61. data/vendor/fastText/python/doc/examples/train_unsupervised.py +56 -0
  62. data/vendor/fastText/python/fasttext_module/fasttext/FastText.py +468 -0
  63. data/vendor/fastText/python/fasttext_module/fasttext/__init__.py +22 -0
  64. data/vendor/fastText/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc +388 -0
  65. data/vendor/fastText/python/fasttext_module/fasttext/tests/__init__.py +14 -0
  66. data/vendor/fastText/python/fasttext_module/fasttext/tests/test_configurations.py +239 -0
  67. data/vendor/fastText/python/fasttext_module/fasttext/tests/test_script.py +629 -0
  68. data/vendor/fastText/python/fasttext_module/fasttext/util/__init__.py +13 -0
  69. data/vendor/fastText/python/fasttext_module/fasttext/util/util.py +60 -0
  70. data/vendor/fastText/quantization-example.sh +40 -0
  71. data/vendor/fastText/runtests.py +60 -0
  72. data/vendor/fastText/scripts/kbcompletion/README.md +19 -0
  73. data/vendor/fastText/scripts/kbcompletion/data.sh +69 -0
  74. data/vendor/fastText/scripts/kbcompletion/eval.cpp +108 -0
  75. data/vendor/fastText/scripts/kbcompletion/fb15k.sh +49 -0
  76. data/vendor/fastText/scripts/kbcompletion/fb15k237.sh +45 -0
  77. data/vendor/fastText/scripts/kbcompletion/svo.sh +38 -0
  78. data/vendor/fastText/scripts/kbcompletion/wn18.sh +49 -0
  79. data/vendor/fastText/scripts/quantization/quantization-results.sh +43 -0
  80. data/vendor/fastText/setup.cfg +2 -0
  81. data/vendor/fastText/setup.py +203 -0
  82. data/vendor/fastText/src/args.cc +320 -0
  83. data/vendor/fastText/src/args.h +68 -0
  84. data/vendor/fastText/src/densematrix.cc +155 -0
  85. data/vendor/fastText/src/densematrix.h +75 -0
  86. data/vendor/fastText/src/dictionary.cc +540 -0
  87. data/vendor/fastText/src/dictionary.h +111 -0
  88. data/vendor/fastText/src/fasttext.cc +821 -0
  89. data/vendor/fastText/src/fasttext.h +191 -0
  90. data/vendor/fastText/src/loss.cc +346 -0
  91. data/vendor/fastText/src/loss.h +163 -0
  92. data/vendor/fastText/src/main.cc +435 -0
  93. data/vendor/fastText/src/matrix.cc +25 -0
  94. data/vendor/fastText/src/matrix.h +44 -0
  95. data/vendor/fastText/src/meter.cc +68 -0
  96. data/vendor/fastText/src/meter.h +69 -0
  97. data/vendor/fastText/src/model.cc +98 -0
  98. data/vendor/fastText/src/model.h +79 -0
  99. data/vendor/fastText/src/productquantizer.cc +251 -0
  100. data/vendor/fastText/src/productquantizer.h +63 -0
  101. data/vendor/fastText/src/quantmatrix.cc +117 -0
  102. data/vendor/fastText/src/quantmatrix.h +60 -0
  103. data/vendor/fastText/src/real.h +15 -0
  104. data/vendor/fastText/src/utils.cc +28 -0
  105. data/vendor/fastText/src/utils.h +43 -0
  106. data/vendor/fastText/src/vector.cc +97 -0
  107. data/vendor/fastText/src/vector.h +61 -0
  108. data/vendor/fastText/tests/fetch_test_data.sh +202 -0
  109. data/vendor/fastText/website/README.md +6 -0
  110. data/vendor/fastText/website/blog/2016-08-18-blog-post.md +42 -0
  111. data/vendor/fastText/website/blog/2017-05-02-blog-post.md +60 -0
  112. data/vendor/fastText/website/blog/2017-10-02-blog-post.md +90 -0
  113. data/vendor/fastText/website/blog/2019-06-25-blog-post.md +168 -0
  114. data/vendor/fastText/website/core/Footer.js +127 -0
  115. data/vendor/fastText/website/package.json +12 -0
  116. data/vendor/fastText/website/pages/en/index.js +286 -0
  117. data/vendor/fastText/website/sidebars.json +18 -0
  118. data/vendor/fastText/website/siteConfig.js +102 -0
  119. data/vendor/fastText/website/static/docs/en/html/annotated.html +115 -0
  120. data/vendor/fastText/website/static/docs/en/html/annotated_dup.js +4 -0
  121. data/vendor/fastText/website/static/docs/en/html/args_8cc.html +113 -0
  122. data/vendor/fastText/website/static/docs/en/html/args_8h.html +134 -0
  123. data/vendor/fastText/website/static/docs/en/html/args_8h.js +14 -0
  124. data/vendor/fastText/website/static/docs/en/html/args_8h_source.html +139 -0
  125. data/vendor/fastText/website/static/docs/en/html/bc_s.png +0 -0
  126. data/vendor/fastText/website/static/docs/en/html/bdwn.png +0 -0
  127. data/vendor/fastText/website/static/docs/en/html/classes.html +121 -0
  128. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args-members.html +140 -0
  129. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args.html +753 -0
  130. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Args.js +40 -0
  131. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary-members.html +148 -0
  132. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary.html +1266 -0
  133. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Dictionary.js +43 -0
  134. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText-members.html +145 -0
  135. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText.html +1149 -0
  136. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1FastText.js +45 -0
  137. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix-members.html +123 -0
  138. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix.html +610 -0
  139. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Matrix.js +23 -0
  140. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model-members.html +150 -0
  141. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model.html +1400 -0
  142. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Model.js +48 -0
  143. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer-members.html +131 -0
  144. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.html +950 -0
  145. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1ProductQuantizer.js +31 -0
  146. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix-members.html +122 -0
  147. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix.html +565 -0
  148. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1QMatrix.js +22 -0
  149. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector-members.html +121 -0
  150. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector.html +542 -0
  151. data/vendor/fastText/website/static/docs/en/html/classfasttext_1_1Vector.js +21 -0
  152. data/vendor/fastText/website/static/docs/en/html/closed.png +0 -0
  153. data/vendor/fastText/website/static/docs/en/html/dictionary_8cc.html +116 -0
  154. data/vendor/fastText/website/static/docs/en/html/dictionary_8h.html +142 -0
  155. data/vendor/fastText/website/static/docs/en/html/dictionary_8h.js +10 -0
  156. data/vendor/fastText/website/static/docs/en/html/dictionary_8h_source.html +127 -0
  157. data/vendor/fastText/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html +145 -0
  158. data/vendor/fastText/website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.js +29 -0
  159. data/vendor/fastText/website/static/docs/en/html/doc.png +0 -0
  160. data/vendor/fastText/website/static/docs/en/html/doxygen.css +1596 -0
  161. data/vendor/fastText/website/static/docs/en/html/doxygen.png +0 -0
  162. data/vendor/fastText/website/static/docs/en/html/dynsections.js +97 -0
  163. data/vendor/fastText/website/static/docs/en/html/fasttext_8cc.html +119 -0
  164. data/vendor/fastText/website/static/docs/en/html/fasttext_8h.html +168 -0
  165. data/vendor/fastText/website/static/docs/en/html/fasttext_8h.js +6 -0
  166. data/vendor/fastText/website/static/docs/en/html/fasttext_8h_source.html +155 -0
  167. data/vendor/fastText/website/static/docs/en/html/favicon.png +0 -0
  168. data/vendor/fastText/website/static/docs/en/html/files.html +125 -0
  169. data/vendor/fastText/website/static/docs/en/html/files.js +4 -0
  170. data/vendor/fastText/website/static/docs/en/html/folderclosed.png +0 -0
  171. data/vendor/fastText/website/static/docs/en/html/folderopen.png +0 -0
  172. data/vendor/fastText/website/static/docs/en/html/functions.html +139 -0
  173. data/vendor/fastText/website/static/docs/en/html/functions_0x7e.html +112 -0
  174. data/vendor/fastText/website/static/docs/en/html/functions_b.html +115 -0
  175. data/vendor/fastText/website/static/docs/en/html/functions_c.html +143 -0
  176. data/vendor/fastText/website/static/docs/en/html/functions_d.html +135 -0
  177. data/vendor/fastText/website/static/docs/en/html/functions_dup.js +27 -0
  178. data/vendor/fastText/website/static/docs/en/html/functions_e.html +115 -0
  179. data/vendor/fastText/website/static/docs/en/html/functions_f.html +112 -0
  180. data/vendor/fastText/website/static/docs/en/html/functions_func.html +563 -0
  181. data/vendor/fastText/website/static/docs/en/html/functions_g.html +145 -0
  182. data/vendor/fastText/website/static/docs/en/html/functions_h.html +112 -0
  183. data/vendor/fastText/website/static/docs/en/html/functions_i.html +121 -0
  184. data/vendor/fastText/website/static/docs/en/html/functions_k.html +106 -0
  185. data/vendor/fastText/website/static/docs/en/html/functions_l.html +140 -0
  186. data/vendor/fastText/website/static/docs/en/html/functions_m.html +153 -0
  187. data/vendor/fastText/website/static/docs/en/html/functions_n.html +164 -0
  188. data/vendor/fastText/website/static/docs/en/html/functions_o.html +116 -0
  189. data/vendor/fastText/website/static/docs/en/html/functions_p.html +161 -0
  190. data/vendor/fastText/website/static/docs/en/html/functions_q.html +135 -0
  191. data/vendor/fastText/website/static/docs/en/html/functions_r.html +116 -0
  192. data/vendor/fastText/website/static/docs/en/html/functions_s.html +159 -0
  193. data/vendor/fastText/website/static/docs/en/html/functions_t.html +138 -0
  194. data/vendor/fastText/website/static/docs/en/html/functions_u.html +106 -0
  195. data/vendor/fastText/website/static/docs/en/html/functions_v.html +106 -0
  196. data/vendor/fastText/website/static/docs/en/html/functions_vars.html +486 -0
  197. data/vendor/fastText/website/static/docs/en/html/functions_w.html +124 -0
  198. data/vendor/fastText/website/static/docs/en/html/functions_z.html +104 -0
  199. data/vendor/fastText/website/static/docs/en/html/globals.html +170 -0
  200. data/vendor/fastText/website/static/docs/en/html/globals_defs.html +113 -0
  201. data/vendor/fastText/website/static/docs/en/html/globals_func.html +155 -0
  202. data/vendor/fastText/website/static/docs/en/html/index.html +100 -0
  203. data/vendor/fastText/website/static/docs/en/html/jquery.js +87 -0
  204. data/vendor/fastText/website/static/docs/en/html/main_8cc.html +582 -0
  205. data/vendor/fastText/website/static/docs/en/html/main_8cc.js +22 -0
  206. data/vendor/fastText/website/static/docs/en/html/matrix_8cc.html +114 -0
  207. data/vendor/fastText/website/static/docs/en/html/matrix_8h.html +121 -0
  208. data/vendor/fastText/website/static/docs/en/html/matrix_8h_source.html +123 -0
  209. data/vendor/fastText/website/static/docs/en/html/menu.js +26 -0
  210. data/vendor/fastText/website/static/docs/en/html/menudata.js +90 -0
  211. data/vendor/fastText/website/static/docs/en/html/model_8cc.html +113 -0
  212. data/vendor/fastText/website/static/docs/en/html/model_8h.html +183 -0
  213. data/vendor/fastText/website/static/docs/en/html/model_8h.js +8 -0
  214. data/vendor/fastText/website/static/docs/en/html/model_8h_source.html +139 -0
  215. data/vendor/fastText/website/static/docs/en/html/namespacefasttext.html +343 -0
  216. data/vendor/fastText/website/static/docs/en/html/namespacefasttext.js +13 -0
  217. data/vendor/fastText/website/static/docs/en/html/namespacefasttext_1_1utils.html +158 -0
  218. data/vendor/fastText/website/static/docs/en/html/namespacemembers.html +125 -0
  219. data/vendor/fastText/website/static/docs/en/html/namespacemembers_enum.html +107 -0
  220. data/vendor/fastText/website/static/docs/en/html/namespacemembers_func.html +110 -0
  221. data/vendor/fastText/website/static/docs/en/html/namespacemembers_type.html +104 -0
  222. data/vendor/fastText/website/static/docs/en/html/namespaces.html +106 -0
  223. data/vendor/fastText/website/static/docs/en/html/namespaces.js +4 -0
  224. data/vendor/fastText/website/static/docs/en/html/nav_f.png +0 -0
  225. data/vendor/fastText/website/static/docs/en/html/nav_g.png +0 -0
  226. data/vendor/fastText/website/static/docs/en/html/nav_h.png +0 -0
  227. data/vendor/fastText/website/static/docs/en/html/navtree.css +146 -0
  228. data/vendor/fastText/website/static/docs/en/html/navtree.js +517 -0
  229. data/vendor/fastText/website/static/docs/en/html/navtreedata.js +40 -0
  230. data/vendor/fastText/website/static/docs/en/html/navtreeindex0.js +253 -0
  231. data/vendor/fastText/website/static/docs/en/html/navtreeindex1.js +139 -0
  232. data/vendor/fastText/website/static/docs/en/html/open.png +0 -0
  233. data/vendor/fastText/website/static/docs/en/html/productquantizer_8cc.html +118 -0
  234. data/vendor/fastText/website/static/docs/en/html/productquantizer_8cc.js +4 -0
  235. data/vendor/fastText/website/static/docs/en/html/productquantizer_8h.html +124 -0
  236. data/vendor/fastText/website/static/docs/en/html/productquantizer_8h_source.html +133 -0
  237. data/vendor/fastText/website/static/docs/en/html/qmatrix_8cc.html +112 -0
  238. data/vendor/fastText/website/static/docs/en/html/qmatrix_8h.html +126 -0
  239. data/vendor/fastText/website/static/docs/en/html/qmatrix_8h_source.html +128 -0
  240. data/vendor/fastText/website/static/docs/en/html/real_8h.html +117 -0
  241. data/vendor/fastText/website/static/docs/en/html/real_8h.js +4 -0
  242. data/vendor/fastText/website/static/docs/en/html/real_8h_source.html +103 -0
  243. data/vendor/fastText/website/static/docs/en/html/resize.js +114 -0
  244. data/vendor/fastText/website/static/docs/en/html/search/all_0.html +26 -0
  245. data/vendor/fastText/website/static/docs/en/html/search/all_0.js +17 -0
  246. data/vendor/fastText/website/static/docs/en/html/search/all_1.html +26 -0
  247. data/vendor/fastText/website/static/docs/en/html/search/all_1.js +8 -0
  248. data/vendor/fastText/website/static/docs/en/html/search/all_10.html +26 -0
  249. data/vendor/fastText/website/static/docs/en/html/search/all_10.js +10 -0
  250. data/vendor/fastText/website/static/docs/en/html/search/all_11.html +26 -0
  251. data/vendor/fastText/website/static/docs/en/html/search/all_11.js +25 -0
  252. data/vendor/fastText/website/static/docs/en/html/search/all_12.html +26 -0
  253. data/vendor/fastText/website/static/docs/en/html/search/all_12.js +15 -0
  254. data/vendor/fastText/website/static/docs/en/html/search/all_13.html +26 -0
  255. data/vendor/fastText/website/static/docs/en/html/search/all_13.js +7 -0
  256. data/vendor/fastText/website/static/docs/en/html/search/all_14.html +26 -0
  257. data/vendor/fastText/website/static/docs/en/html/search/all_14.js +7 -0
  258. data/vendor/fastText/website/static/docs/en/html/search/all_15.html +26 -0
  259. data/vendor/fastText/website/static/docs/en/html/search/all_15.js +11 -0
  260. data/vendor/fastText/website/static/docs/en/html/search/all_16.html +26 -0
  261. data/vendor/fastText/website/static/docs/en/html/search/all_16.js +4 -0
  262. data/vendor/fastText/website/static/docs/en/html/search/all_17.html +26 -0
  263. data/vendor/fastText/website/static/docs/en/html/search/all_17.js +7 -0
  264. data/vendor/fastText/website/static/docs/en/html/search/all_2.html +26 -0
  265. data/vendor/fastText/website/static/docs/en/html/search/all_2.js +17 -0
  266. data/vendor/fastText/website/static/docs/en/html/search/all_3.html +26 -0
  267. data/vendor/fastText/website/static/docs/en/html/search/all_3.js +17 -0
  268. data/vendor/fastText/website/static/docs/en/html/search/all_4.html +26 -0
  269. data/vendor/fastText/website/static/docs/en/html/search/all_4.js +10 -0
  270. data/vendor/fastText/website/static/docs/en/html/search/all_5.html +26 -0
  271. data/vendor/fastText/website/static/docs/en/html/search/all_5.js +12 -0
  272. data/vendor/fastText/website/static/docs/en/html/search/all_6.html +26 -0
  273. data/vendor/fastText/website/static/docs/en/html/search/all_6.js +18 -0
  274. data/vendor/fastText/website/static/docs/en/html/search/all_7.html +26 -0
  275. data/vendor/fastText/website/static/docs/en/html/search/all_7.js +8 -0
  276. data/vendor/fastText/website/static/docs/en/html/search/all_8.html +26 -0
  277. data/vendor/fastText/website/static/docs/en/html/search/all_8.js +11 -0
  278. data/vendor/fastText/website/static/docs/en/html/search/all_9.html +26 -0
  279. data/vendor/fastText/website/static/docs/en/html/search/all_9.js +5 -0
  280. data/vendor/fastText/website/static/docs/en/html/search/all_a.html +26 -0
  281. data/vendor/fastText/website/static/docs/en/html/search/all_a.js +17 -0
  282. data/vendor/fastText/website/static/docs/en/html/search/all_b.html +26 -0
  283. data/vendor/fastText/website/static/docs/en/html/search/all_b.js +27 -0
  284. data/vendor/fastText/website/static/docs/en/html/search/all_c.html +26 -0
  285. data/vendor/fastText/website/static/docs/en/html/search/all_c.js +26 -0
  286. data/vendor/fastText/website/static/docs/en/html/search/all_d.html +26 -0
  287. data/vendor/fastText/website/static/docs/en/html/search/all_d.js +9 -0
  288. data/vendor/fastText/website/static/docs/en/html/search/all_e.html +26 -0
  289. data/vendor/fastText/website/static/docs/en/html/search/all_e.js +35 -0
  290. data/vendor/fastText/website/static/docs/en/html/search/all_f.html +26 -0
  291. data/vendor/fastText/website/static/docs/en/html/search/all_f.js +16 -0
  292. data/vendor/fastText/website/static/docs/en/html/search/classes_0.html +26 -0
  293. data/vendor/fastText/website/static/docs/en/html/search/classes_0.js +4 -0
  294. data/vendor/fastText/website/static/docs/en/html/search/classes_1.html +26 -0
  295. data/vendor/fastText/website/static/docs/en/html/search/classes_1.js +4 -0
  296. data/vendor/fastText/website/static/docs/en/html/search/classes_2.html +26 -0
  297. data/vendor/fastText/website/static/docs/en/html/search/classes_2.js +4 -0
  298. data/vendor/fastText/website/static/docs/en/html/search/classes_3.html +26 -0
  299. data/vendor/fastText/website/static/docs/en/html/search/classes_3.js +4 -0
  300. data/vendor/fastText/website/static/docs/en/html/search/classes_4.html +26 -0
  301. data/vendor/fastText/website/static/docs/en/html/search/classes_4.js +5 -0
  302. data/vendor/fastText/website/static/docs/en/html/search/classes_5.html +26 -0
  303. data/vendor/fastText/website/static/docs/en/html/search/classes_5.js +4 -0
  304. data/vendor/fastText/website/static/docs/en/html/search/classes_6.html +26 -0
  305. data/vendor/fastText/website/static/docs/en/html/search/classes_6.js +4 -0
  306. data/vendor/fastText/website/static/docs/en/html/search/classes_7.html +26 -0
  307. data/vendor/fastText/website/static/docs/en/html/search/classes_7.js +4 -0
  308. data/vendor/fastText/website/static/docs/en/html/search/classes_8.html +26 -0
  309. data/vendor/fastText/website/static/docs/en/html/search/classes_8.js +4 -0
  310. data/vendor/fastText/website/static/docs/en/html/search/close.png +0 -0
  311. data/vendor/fastText/website/static/docs/en/html/search/defines_0.html +26 -0
  312. data/vendor/fastText/website/static/docs/en/html/search/defines_0.js +5 -0
  313. data/vendor/fastText/website/static/docs/en/html/search/defines_1.html +26 -0
  314. data/vendor/fastText/website/static/docs/en/html/search/defines_1.js +4 -0
  315. data/vendor/fastText/website/static/docs/en/html/search/defines_2.html +26 -0
  316. data/vendor/fastText/website/static/docs/en/html/search/defines_2.js +4 -0
  317. data/vendor/fastText/website/static/docs/en/html/search/defines_3.html +26 -0
  318. data/vendor/fastText/website/static/docs/en/html/search/defines_3.js +4 -0
  319. data/vendor/fastText/website/static/docs/en/html/search/enums_0.html +26 -0
  320. data/vendor/fastText/website/static/docs/en/html/search/enums_0.js +4 -0
  321. data/vendor/fastText/website/static/docs/en/html/search/enums_1.html +26 -0
  322. data/vendor/fastText/website/static/docs/en/html/search/enums_1.js +4 -0
  323. data/vendor/fastText/website/static/docs/en/html/search/enums_2.html +26 -0
  324. data/vendor/fastText/website/static/docs/en/html/search/enums_2.js +4 -0
  325. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_0.html +26 -0
  326. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_0.js +4 -0
  327. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_1.html +26 -0
  328. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_1.js +4 -0
  329. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_2.html +26 -0
  330. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_2.js +4 -0
  331. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_3.html +26 -0
  332. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_3.js +4 -0
  333. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_4.html +26 -0
  334. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_4.js +6 -0
  335. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_5.html +26 -0
  336. data/vendor/fastText/website/static/docs/en/html/search/enumvalues_5.js +4 -0
  337. data/vendor/fastText/website/static/docs/en/html/search/files_0.html +26 -0
  338. data/vendor/fastText/website/static/docs/en/html/search/files_0.js +5 -0
  339. data/vendor/fastText/website/static/docs/en/html/search/files_1.html +26 -0
  340. data/vendor/fastText/website/static/docs/en/html/search/files_1.js +5 -0
  341. data/vendor/fastText/website/static/docs/en/html/search/files_2.html +26 -0
  342. data/vendor/fastText/website/static/docs/en/html/search/files_2.js +5 -0
  343. data/vendor/fastText/website/static/docs/en/html/search/files_3.html +26 -0
  344. data/vendor/fastText/website/static/docs/en/html/search/files_3.js +8 -0
  345. data/vendor/fastText/website/static/docs/en/html/search/files_4.html +26 -0
  346. data/vendor/fastText/website/static/docs/en/html/search/files_4.js +5 -0
  347. data/vendor/fastText/website/static/docs/en/html/search/files_5.html +26 -0
  348. data/vendor/fastText/website/static/docs/en/html/search/files_5.js +5 -0
  349. data/vendor/fastText/website/static/docs/en/html/search/files_6.html +26 -0
  350. data/vendor/fastText/website/static/docs/en/html/search/files_6.js +4 -0
  351. data/vendor/fastText/website/static/docs/en/html/search/files_7.html +26 -0
  352. data/vendor/fastText/website/static/docs/en/html/search/files_7.js +5 -0
  353. data/vendor/fastText/website/static/docs/en/html/search/files_8.html +26 -0
  354. data/vendor/fastText/website/static/docs/en/html/search/files_8.js +5 -0
  355. data/vendor/fastText/website/static/docs/en/html/search/functions_0.html +26 -0
  356. data/vendor/fastText/website/static/docs/en/html/search/functions_0.js +14 -0
  357. data/vendor/fastText/website/static/docs/en/html/search/functions_1.html +26 -0
  358. data/vendor/fastText/website/static/docs/en/html/search/functions_1.js +5 -0
  359. data/vendor/fastText/website/static/docs/en/html/search/functions_10.html +26 -0
  360. data/vendor/fastText/website/static/docs/en/html/search/functions_10.js +5 -0
  361. data/vendor/fastText/website/static/docs/en/html/search/functions_11.html +26 -0
  362. data/vendor/fastText/website/static/docs/en/html/search/functions_11.js +18 -0
  363. data/vendor/fastText/website/static/docs/en/html/search/functions_12.html +26 -0
  364. data/vendor/fastText/website/static/docs/en/html/search/functions_12.js +8 -0
  365. data/vendor/fastText/website/static/docs/en/html/search/functions_13.html +26 -0
  366. data/vendor/fastText/website/static/docs/en/html/search/functions_13.js +5 -0
  367. data/vendor/fastText/website/static/docs/en/html/search/functions_14.html +26 -0
  368. data/vendor/fastText/website/static/docs/en/html/search/functions_14.js +4 -0
  369. data/vendor/fastText/website/static/docs/en/html/search/functions_15.html +26 -0
  370. data/vendor/fastText/website/static/docs/en/html/search/functions_15.js +4 -0
  371. data/vendor/fastText/website/static/docs/en/html/search/functions_16.html +26 -0
  372. data/vendor/fastText/website/static/docs/en/html/search/functions_16.js +4 -0
  373. data/vendor/fastText/website/static/docs/en/html/search/functions_17.html +26 -0
  374. data/vendor/fastText/website/static/docs/en/html/search/functions_17.js +7 -0
  375. data/vendor/fastText/website/static/docs/en/html/search/functions_2.html +26 -0
  376. data/vendor/fastText/website/static/docs/en/html/search/functions_2.js +11 -0
  377. data/vendor/fastText/website/static/docs/en/html/search/functions_3.html +26 -0
  378. data/vendor/fastText/website/static/docs/en/html/search/functions_3.js +9 -0
  379. data/vendor/fastText/website/static/docs/en/html/search/functions_4.html +26 -0
  380. data/vendor/fastText/website/static/docs/en/html/search/functions_4.js +4 -0
  381. data/vendor/fastText/website/static/docs/en/html/search/functions_5.html +26 -0
  382. data/vendor/fastText/website/static/docs/en/html/search/functions_5.js +7 -0
  383. data/vendor/fastText/website/static/docs/en/html/search/functions_6.html +26 -0
  384. data/vendor/fastText/website/static/docs/en/html/search/functions_6.js +17 -0
  385. data/vendor/fastText/website/static/docs/en/html/search/functions_7.html +26 -0
  386. data/vendor/fastText/website/static/docs/en/html/search/functions_7.js +5 -0
  387. data/vendor/fastText/website/static/docs/en/html/search/functions_8.html +26 -0
  388. data/vendor/fastText/website/static/docs/en/html/search/functions_8.js +8 -0
  389. data/vendor/fastText/website/static/docs/en/html/search/functions_9.html +26 -0
  390. data/vendor/fastText/website/static/docs/en/html/search/functions_9.js +4 -0
  391. data/vendor/fastText/website/static/docs/en/html/search/functions_a.html +26 -0
  392. data/vendor/fastText/website/static/docs/en/html/search/functions_a.js +8 -0
  393. data/vendor/fastText/website/static/docs/en/html/search/functions_b.html +26 -0
  394. data/vendor/fastText/website/static/docs/en/html/search/functions_b.js +10 -0
  395. data/vendor/fastText/website/static/docs/en/html/search/functions_c.html +26 -0
  396. data/vendor/fastText/website/static/docs/en/html/search/functions_c.js +10 -0
  397. data/vendor/fastText/website/static/docs/en/html/search/functions_d.html +26 -0
  398. data/vendor/fastText/website/static/docs/en/html/search/functions_d.js +6 -0
  399. data/vendor/fastText/website/static/docs/en/html/search/functions_e.html +26 -0
  400. data/vendor/fastText/website/static/docs/en/html/search/functions_e.js +26 -0
  401. data/vendor/fastText/website/static/docs/en/html/search/functions_f.html +26 -0
  402. data/vendor/fastText/website/static/docs/en/html/search/functions_f.js +6 -0
  403. data/vendor/fastText/website/static/docs/en/html/search/mag_sel.png +0 -0
  404. data/vendor/fastText/website/static/docs/en/html/search/namespaces_0.html +26 -0
  405. data/vendor/fastText/website/static/docs/en/html/search/namespaces_0.js +5 -0
  406. data/vendor/fastText/website/static/docs/en/html/search/nomatches.html +12 -0
  407. data/vendor/fastText/website/static/docs/en/html/search/search.css +271 -0
  408. data/vendor/fastText/website/static/docs/en/html/search/search.js +791 -0
  409. data/vendor/fastText/website/static/docs/en/html/search/search_l.png +0 -0
  410. data/vendor/fastText/website/static/docs/en/html/search/search_m.png +0 -0
  411. data/vendor/fastText/website/static/docs/en/html/search/search_r.png +0 -0
  412. data/vendor/fastText/website/static/docs/en/html/search/searchdata.js +42 -0
  413. data/vendor/fastText/website/static/docs/en/html/search/typedefs_0.html +26 -0
  414. data/vendor/fastText/website/static/docs/en/html/search/typedefs_0.js +4 -0
  415. data/vendor/fastText/website/static/docs/en/html/search/typedefs_1.html +26 -0
  416. data/vendor/fastText/website/static/docs/en/html/search/typedefs_1.js +4 -0
  417. data/vendor/fastText/website/static/docs/en/html/search/variables_0.html +26 -0
  418. data/vendor/fastText/website/static/docs/en/html/search/variables_0.js +4 -0
  419. data/vendor/fastText/website/static/docs/en/html/search/variables_1.html +26 -0
  420. data/vendor/fastText/website/static/docs/en/html/search/variables_1.js +6 -0
  421. data/vendor/fastText/website/static/docs/en/html/search/variables_10.html +26 -0
  422. data/vendor/fastText/website/static/docs/en/html/search/variables_10.js +8 -0
  423. data/vendor/fastText/website/static/docs/en/html/search/variables_11.html +26 -0
  424. data/vendor/fastText/website/static/docs/en/html/search/variables_11.js +11 -0
  425. data/vendor/fastText/website/static/docs/en/html/search/variables_12.html +26 -0
  426. data/vendor/fastText/website/static/docs/en/html/search/variables_12.js +4 -0
  427. data/vendor/fastText/website/static/docs/en/html/search/variables_13.html +26 -0
  428. data/vendor/fastText/website/static/docs/en/html/search/variables_13.js +10 -0
  429. data/vendor/fastText/website/static/docs/en/html/search/variables_2.html +26 -0
  430. data/vendor/fastText/website/static/docs/en/html/search/variables_2.js +9 -0
  431. data/vendor/fastText/website/static/docs/en/html/search/variables_3.html +26 -0
  432. data/vendor/fastText/website/static/docs/en/html/search/variables_3.js +9 -0
  433. data/vendor/fastText/website/static/docs/en/html/search/variables_4.html +26 -0
  434. data/vendor/fastText/website/static/docs/en/html/search/variables_4.js +7 -0
  435. data/vendor/fastText/website/static/docs/en/html/search/variables_5.html +26 -0
  436. data/vendor/fastText/website/static/docs/en/html/search/variables_5.js +4 -0
  437. data/vendor/fastText/website/static/docs/en/html/search/variables_6.html +26 -0
  438. data/vendor/fastText/website/static/docs/en/html/search/variables_6.js +5 -0
  439. data/vendor/fastText/website/static/docs/en/html/search/variables_7.html +26 -0
  440. data/vendor/fastText/website/static/docs/en/html/search/variables_7.js +5 -0
  441. data/vendor/fastText/website/static/docs/en/html/search/variables_8.html +26 -0
  442. data/vendor/fastText/website/static/docs/en/html/search/variables_8.js +4 -0
  443. data/vendor/fastText/website/static/docs/en/html/search/variables_9.html +26 -0
  444. data/vendor/fastText/website/static/docs/en/html/search/variables_9.js +10 -0
  445. data/vendor/fastText/website/static/docs/en/html/search/variables_a.html +26 -0
  446. data/vendor/fastText/website/static/docs/en/html/search/variables_a.js +14 -0
  447. data/vendor/fastText/website/static/docs/en/html/search/variables_b.html +26 -0
  448. data/vendor/fastText/website/static/docs/en/html/search/variables_b.js +17 -0
  449. data/vendor/fastText/website/static/docs/en/html/search/variables_c.html +26 -0
  450. data/vendor/fastText/website/static/docs/en/html/search/variables_c.js +6 -0
  451. data/vendor/fastText/website/static/docs/en/html/search/variables_d.html +26 -0
  452. data/vendor/fastText/website/static/docs/en/html/search/variables_d.js +10 -0
  453. data/vendor/fastText/website/static/docs/en/html/search/variables_e.html +26 -0
  454. data/vendor/fastText/website/static/docs/en/html/search/variables_e.js +11 -0
  455. data/vendor/fastText/website/static/docs/en/html/search/variables_f.html +26 -0
  456. data/vendor/fastText/website/static/docs/en/html/search/variables_f.js +6 -0
  457. data/vendor/fastText/website/static/docs/en/html/splitbar.png +0 -0
  458. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node-members.html +108 -0
  459. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node.html +194 -0
  460. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1Node.js +8 -0
  461. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry-members.html +107 -0
  462. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry.html +178 -0
  463. data/vendor/fastText/website/static/docs/en/html/structfasttext_1_1entry.js +7 -0
  464. data/vendor/fastText/website/static/docs/en/html/sync_off.png +0 -0
  465. data/vendor/fastText/website/static/docs/en/html/sync_on.png +0 -0
  466. data/vendor/fastText/website/static/docs/en/html/tab_a.png +0 -0
  467. data/vendor/fastText/website/static/docs/en/html/tab_b.png +0 -0
  468. data/vendor/fastText/website/static/docs/en/html/tab_h.png +0 -0
  469. data/vendor/fastText/website/static/docs/en/html/tab_s.png +0 -0
  470. data/vendor/fastText/website/static/docs/en/html/tabs.css +1 -0
  471. data/vendor/fastText/website/static/docs/en/html/utils_8cc.html +121 -0
  472. data/vendor/fastText/website/static/docs/en/html/utils_8cc.js +5 -0
  473. data/vendor/fastText/website/static/docs/en/html/utils_8h.html +122 -0
  474. data/vendor/fastText/website/static/docs/en/html/utils_8h.js +5 -0
  475. data/vendor/fastText/website/static/docs/en/html/utils_8h_source.html +104 -0
  476. data/vendor/fastText/website/static/docs/en/html/vector_8cc.html +121 -0
  477. data/vendor/fastText/website/static/docs/en/html/vector_8cc.js +4 -0
  478. data/vendor/fastText/website/static/docs/en/html/vector_8h.html +126 -0
  479. data/vendor/fastText/website/static/docs/en/html/vector_8h.js +5 -0
  480. data/vendor/fastText/website/static/docs/en/html/vector_8h_source.html +120 -0
  481. data/vendor/fastText/website/static/fasttext.css +48 -0
  482. data/vendor/fastText/website/static/img/authors/armand_joulin.jpg +0 -0
  483. data/vendor/fastText/website/static/img/authors/christian_puhrsch.png +0 -0
  484. data/vendor/fastText/website/static/img/authors/edouard_grave.jpeg +0 -0
  485. data/vendor/fastText/website/static/img/authors/piotr_bojanowski.jpg +0 -0
  486. data/vendor/fastText/website/static/img/authors/tomas_mikolov.jpg +0 -0
  487. data/vendor/fastText/website/static/img/blog/2016-08-18-blog-post-img1.png +0 -0
  488. data/vendor/fastText/website/static/img/blog/2016-08-18-blog-post-img2.png +0 -0
  489. data/vendor/fastText/website/static/img/blog/2017-05-02-blog-post-img1.jpg +0 -0
  490. data/vendor/fastText/website/static/img/blog/2017-05-02-blog-post-img2.jpg +0 -0
  491. data/vendor/fastText/website/static/img/blog/2017-10-02-blog-post-img1.png +0 -0
  492. data/vendor/fastText/website/static/img/cbo_vs_skipgram.png +0 -0
  493. data/vendor/fastText/website/static/img/fasttext-icon-api.png +0 -0
  494. data/vendor/fastText/website/static/img/fasttext-icon-bg-web.png +0 -0
  495. data/vendor/fastText/website/static/img/fasttext-icon-color-square.png +0 -0
  496. data/vendor/fastText/website/static/img/fasttext-icon-color-web.png +0 -0
  497. data/vendor/fastText/website/static/img/fasttext-icon-faq.png +0 -0
  498. data/vendor/fastText/website/static/img/fasttext-icon-tutorial.png +0 -0
  499. data/vendor/fastText/website/static/img/fasttext-icon-white-web.png +0 -0
  500. data/vendor/fastText/website/static/img/fasttext-logo-color-web.png +0 -0
  501. data/vendor/fastText/website/static/img/fasttext-logo-white-web.png +0 -0
  502. data/vendor/fastText/website/static/img/logo-color.png +0 -0
  503. data/vendor/fastText/website/static/img/model-black.png +0 -0
  504. data/vendor/fastText/website/static/img/model-blue.png +0 -0
  505. data/vendor/fastText/website/static/img/model-red.png +0 -0
  506. data/vendor/fastText/website/static/img/ogimage.png +0 -0
  507. data/vendor/fastText/website/static/img/oss_logo.png +0 -0
  508. data/vendor/fastText/wikifil.pl +57 -0
  509. data/vendor/fastText/word-vector-example.sh +39 -0
  510. metadata +621 -0
@@ -0,0 +1,26 @@
1
+ ## Preprocessing Common Crawl
2
+
3
+ This code downloads, preprocesses and splits per language the data from [Common Crawl](http://commoncrawl.org/).
4
+
5
+ This script uses the scripts and language identifier of [1].
6
+
7
+ This code inherits its requirements from [fastText](https://github.com/facebookresearch/fastText).
8
+
9
+ Set the variable WET_PATHS_URL to the crawl you want to process.
10
+ Please also set the variables NUM_LANGID and NUM_DEDUP in `download_crawl.sh` according to the capacity of your machine.
11
+ Langid processes are mostly limited by CPU usage, while dedup processes are likely to be limited by RAM usage (each uses 2GB of RAM).
12
+
13
+ ### Reference
14
+
15
+ If you use this code, please cite:
16
+
17
+ [1] E. Grave*, P. Bojanowski*, P. Gupta, A. Joulin, T. Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
18
+
19
+ ```
20
+ @inproceedings{grave2018learning,
21
+ title={Learning Word Vectors for 157 Languages},
22
+ author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
23
+ booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
24
+ year={2018}
25
+ }
26
+ ```
@@ -0,0 +1,51 @@
1
+ // Copyright (c) 2018-present, Facebook, Inc.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the MIT license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #include <cstdint>
8
+ #include <iostream>
9
+ #include <fstream>
10
+ #include <string>
11
+ #include <vector>
12
+
13
// 64-bit FNV-1a hash of the `sz` bytes starting at `data`.
//
// `h` is the initial hash state. It defaults to the value used as the first
// seed by the caller in this file; passing a different seed yields a
// different hash function over the same bytes. Each byte is XORed into the
// state, which is then multiplied by 1099511628211 (unsigned arithmetic, so
// the product wraps modulo 2^64).
//
// With sz == 0 the seed is returned unchanged and `data` is never
// dereferenced. The buffer is only read, hence the pointer-to-const.
uint64_t fnv1a_64(const uint8_t *data, size_t sz, uint64_t h=14695981039346656037ull)
{
  for (size_t i = 0; i < sz; i++) {
    h ^= uint64_t(data[i]);
    h *= 1099511628211ull;
  }
  return h;
}
21
+
22
+ int main(int argc, char** argv)
23
+ {
24
+ uint64_t init_values[] = {
25
+ 14695981039346656037ull,
26
+ 9425296925403859339ull,
27
+ 13716263814064014149ull,
28
+ 3525492407291847033ull,
29
+ 8607404175481815707ull,
30
+ 9818874561736458749ull,
31
+ 10026508429719773353ull,
32
+ 3560712257386009938ull
33
+ };
34
+ size_t n = 1ull<<34, num_hashes = 2;
35
+ std::vector<bool> seen(n);
36
+
37
+ std::ios_base::sync_with_stdio(false);
38
+
39
+ for (std::string line; std::getline(std::cin, line);) {
40
+ bool b = true;
41
+ for (size_t i = 0; i < num_hashes; i++) {
42
+ uint64_t h = fnv1a_64((uint8_t*) line.data(), line.length(), init_values[i]) % n;
43
+ b = b && seen[h];
44
+ seen[h] = true;
45
+ }
46
+ if (!b) {
47
+ std::cout << line << std::endl;
48
+ }
49
+ }
50
+ return 0;
51
+ }
@@ -0,0 +1,57 @@
1
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Downloads the WET files of one Common Crawl snapshot, splits their lines
# per language into shard/<lang>.txt (via process_wet_file.sh), then filters
# and de-duplicates every shard (via filter_dedup.sh).
# Every setup step is skipped when its result already exists, so the script
# can be re-run after an interruption.

set -e

# Set this variable to the crawl you want to process.
WET_PATHS_URL="https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-34/wet.paths.gz"

# Set NUM_LANGID and NUM_DEDUP according to the capacity of your machine.
# Please note that each dedup process uses 2GB of RAM, while langid is
# mostly limited by cpu usage.
NUM_LANGID=12
NUM_DEDUP=8
URL="https://commoncrawl.s3.amazonaws.com/"

if [ ! -d fastText ]; then
  git clone https://github.com/facebookresearch/fastText.git
fi

if [ ! -f fastText/fasttext ]; then
  cd fastText
  make
  cd ..
fi

if [ ! -f lid.176.bin ]; then
  wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
fi

# -p: no error when the directories are already there.
mkdir -p tmp shard

if [ ! -f wet.paths ]; then
  wget "${WET_PATHS_URL}"
  gunzip wet.paths.gz
fi

## Language identification
# One process_wet_file.sh invocation per path listed in wet.paths.
xargs -n 1 -P "${NUM_LANGID}" -I '{}' sh process_wet_file.sh "${URL}{}" < wet.paths

## Deduplication
g++ -std=c++11 -O3 -o dedup dedup.cc
g++ -std=c++11 -O3 -o filter_utf8 filter_utf8.cc
find shard -name '*.txt' | xargs -n 1 -P "${NUM_DEDUP}" -I '{}' sh filter_dedup.sh "{}"

## Example of data filtering + tokenization
# Guarded like the fastText clone above: cloning into an existing directory
# fails, and under `set -e` that would abort a re-run at this point.
if [ ! -d mosesdecoder ]; then
  git clone https://github.com/moses-smt/mosesdecoder.git
fi
perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l es < shard/es.dedup > shard/es.tok
@@ -0,0 +1,13 @@
1
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Usage: filter_dedup.sh <path>/<lang>.txt
# Reads shard/<lang>.txt, drops lines rejected by ./filter_utf8, removes
# duplicate lines with ./dedup and writes the result to shard/<lang>.dedup.

set -e

# Language code = file name without directory and ".txt" suffix.
# The two-operand form of basename is POSIX; `--suffix` is a GNU extension.
LG=$(basename "${1}" .txt)

./filter_utf8 < "shard/${LG}.txt" \
  | ./dedup > "shard/${LG}.dedup"
@@ -0,0 +1,105 @@
1
+ // Copyright (c) 2018-present, Facebook, Inc.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the MIT license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #include <cstdint>
8
+ #include <iostream>
9
+ #include <string>
10
+
11
// Check that the next n bytes are continuation bytes, i.e. that each of
// str[0] .. str[n-1] has the bit pattern 10.xxxxxx.
// The caller must make sure those n bytes are readable; n <= 0 yields true.
// The bytes are only read, hence the pointer-to-const.
bool continuation(const uint8_t* str, int n)
{
  for (int i = 0; i < n; i++) {
    if ((str[i] & 0xc0) != 0x80) return false;
  }
  return true;
}
19
+
20
// A 4-byte sequence is invalid when it encodes a code point larger than
// U+10FFFF. That largest code point is encoded in UTF8 as:
//   * 11110.100 10.001111 10.111111 10.111111
// We thus check if the first byte is larger than 0xf4, or if it is equal
// to 0xf4 and the second byte is larger than 0x8f.
// Reads str[0] and str[1] only; both must be readable.
bool invalid(const uint8_t* str)
{
  return str[0] > 0xf4 || (str[0] == 0xf4 && str[1] > 0x8f);
}
29
+
30
// Surrogate halves correspond to the range U+D800 through U+DFFF,
// which is encoded in UTF8 as:
//   * 1110.1101 10.100000 10.000000
//   * 1110.1101 10.111111 10.111111
// We thus check if the first byte is equal to 0xed and if the
// sixth bit of the second byte is set.
// Reads str[0] and str[1] only; both must be readable.
bool surrogate(const uint8_t* str)
{
  // Explicit parentheses and comparison: the result no longer relies on
  // operator precedence and an implicit int-to-bool conversion.
  return str[0] == 0xed && (str[1] & 0x20) != 0;
}
40
+
41
// Sequences of length 2 are overlong if the leading 4 bits (noted as y)
// are equal to 0: 110.yyyyx 10xxxxxx
// Only str[0] is read.
bool overlong_2(const uint8_t* str)
{
  return (str[0] & 0x1e) == 0;
}
47
+
48
// Sequences of length 3 are overlong if the leading 5 bits (noted as y)
// are equal to 0: 1110.yyyy 10.yxxxxx 10.xxxxxx
// Reads str[0] and str[1] only; both must be readable.
bool overlong_3(const uint8_t* str)
{
  return (str[0] & 0x0f) == 0 && (str[1] & 0x20) == 0;
}
54
+
55
// Sequences of length 4 are overlong if the leading 5 bits (noted as y)
// are equal to 0: 11110.yyy 10.yyxxxx 10.xxxxxx 10.xxxxxx
// Reads str[0] and str[1] only; both must be readable.
bool overlong_4(const uint8_t* str)
{
  return (str[0] & 0x07) == 0 && (str[1] & 0x30) == 0;
}
61
+
62
+ bool valid_utf8(uint8_t* str, size_t length)
63
+ {
64
+ uint8_t* end = str + length;
65
+ while (str < end) {
66
+ if (str[0] < 0x80) {
67
+ // 0.xxxxxxx
68
+ str += 1;
69
+ } else if ((str[0] & 0xe0) == 0xc0) {
70
+ // 110.xxxxx 10.xxxxxx
71
+ if (str + 1 >= end) return false;
72
+ if (!continuation(str + 1, 1)) return false;
73
+ if (overlong_2(str)) return false;
74
+ str += 2;
75
+ } else if ((str[0] & 0xf0) == 0xe0) {
76
+ // 1110.xxxx 10.xxxxxx 10.xxxxxx
77
+ if (str + 2 >= end) return false;
78
+ if (!continuation(str + 1, 2)) return false;
79
+ if (overlong_3(str)) return false;
80
+ if (surrogate(str)) return false;
81
+ str += 3;
82
+ } else if ((str[0] & 0xf8) == 0xf0) {
83
+ // 11110.xxx 10.xxxxxx 10.xxxxxx 10.xxxxxx
84
+ if (str + 3 >= end) return false;
85
+ if (!continuation(str + 1, 3)) return false;
86
+ if (overlong_4(str)) return false;
87
+ if (invalid(str)) return false;
88
+ str += 4;
89
+ } else {
90
+ return false;
91
+ }
92
+ }
93
+ return true;
94
+ }
95
+
96
+ int main(int argc, char** argv)
97
+ {
98
+ std::ios_base::sync_with_stdio(false);
99
+ for (std::string line; std::getline(std::cin, line);) {
100
+ if (valid_utf8((uint8_t*) line.data(), line.length())) {
101
+ std::cout << line << std::endl;
102
+ }
103
+ }
104
+ return 0;
105
+ }
@@ -0,0 +1,30 @@
1
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Usage: process_wet_file.sh <url of a .warc.wet.gz file>
# Downloads the file into tmp/, runs fastText language identification on
# every line and appends the retained lines to shard/<lang>.txt.

set -e

URL="$1"

# File name without directory and ".warc.wet.gz" suffix.
# The two-operand form of basename is POSIX; `--suffix` is a GNU extension.
FILENAME=$(basename "${URL}" .warc.wet.gz)

echo "Processing ${FILENAME}."

wget -q -P tmp "${URL}"

#echo "Extracting ${FILENAME}.warc.wet.gz"
gunzip "tmp/${FILENAME}.warc.wet.gz"

#echo "Language identification for ${FILENAME}.warc.wet"
fastText/fasttext predict-prob lid.176.bin "tmp/${FILENAME}.warc.wet" > "tmp/${FILENAME}.lid"

#echo "Splitting ${FILENAME}.warc.wet per language"
# paste puts the prediction in front of each text line: $1 is the label and
# $2 its probability. A line is kept when the probability is above 0.8
# (above 0.4 for __label__hr) and the pasted line is longer than 100
# characters. substr($1, 10) strips the "__label__" prefix to get the
# language code; the two prediction fields are blanked before the line is
# appended to shard/<lang>.txt.
paste "tmp/${FILENAME}.lid" "tmp/${FILENAME}.warc.wet" | \
  awk '($2 > 0.8 || ($1=="__label__hr" && $2 > 0.4)) && length() > 100 {lang = substr($1, 10); $1=""; $2=""; print $0 >> "shard/"lang".txt"}'

#echo "Removing tmp files"
rm "tmp/${FILENAME}.lid"
rm "tmp/${FILENAME}.warc.wet"
@@ -0,0 +1,64 @@
1
+ ---
2
+ id: aligned-vectors
3
+ title: Aligned word vectors
4
+ ---
5
+
6
+ We are publishing aligned word vectors for 44 languages based on the pre-trained vectors computed on [*Wikipedia*](https://www.wikipedia.org) using fastText.
7
+ The alignments are performed with the RCSLS method described in [*Joulin et al (2018)*](https://arxiv.org/abs/1804.07745).
8
+
9
+ ### Vectors
10
+
11
+ The aligned vectors can be downloaded from:
12
+
13
+ |||||
14
+ |-|-|-|-|
15
+ | Afrikaans: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.af.align.vec) | Arabic: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ar.align.vec) | Bulgarian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.bg.align.vec) | Bengali: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.bn.align.vec) |
16
+ | Bosnian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.bs.align.vec) | Catalan: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ca.align.vec) | Czech: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.cs.align.vec) | Danish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.da.align.vec) |
17
+ | German: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.de.align.vec) | Greek: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.el.align.vec) | English: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.en.align.vec) | Spanish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.es.align.vec) |
18
+ | Estonian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.et.align.vec) | Persian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.fa.align.vec) | Finnish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.fi.align.vec) | French: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.fr.align.vec) |
19
+ | Hebrew: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.he.align.vec) | Hindi: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.hi.align.vec) | Croatian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.hr.align.vec) | Hungarian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.hu.align.vec) |
20
+ | Indonesian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.id.align.vec) | Italian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.it.align.vec) | Korean: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ko.align.vec) | Lithuanian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.lt.align.vec) |
21
+ | Latvian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.lv.align.vec) | Macedonian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.mk.align.vec) | Malay: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ms.align.vec) | Dutch: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.nl.align.vec) |
22
+ | Norwegian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.no.align.vec) | Polish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.pl.align.vec) | Portuguese: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.pt.align.vec) | Romanian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ro.align.vec) |
23
+ | Russian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ru.align.vec) | Slovak: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sk.align.vec) | Slovenian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sl.align.vec) | Albanian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sq.align.vec) |
24
+ | Swedish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sv.align.vec) | Tamil: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ta.align.vec) | Thai: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.th.align.vec) | Tagalog: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.tl.align.vec) |
25
+ | Turkish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.tr.align.vec) | Ukrainian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.uk.align.vec) | Vietnamese: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.vi.align.vec) | Chinese: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.zh.align.vec) |
26
+
27
+ ### Format
28
+
29
+ The word vectors come in the default text format of fastText.
30
+ The first line gives the number of vectors and their dimension.
31
+ The other lines contain a word followed by its vector. Each value is space separated.
32
+
33
+ ### License
34
+
35
+ The word vectors are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
36
+
37
+ ### References
38
+
39
+ If you use these word vectors, please cite the following papers:
40
+
41
+ [1] A. Joulin, P. Bojanowski, T. Mikolov, H. Jegou, E. Grave, [*Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion*](https://arxiv.org/abs/1804.07745)
42
+
43
+ ```markup
44
+ @InProceedings{joulin2018loss,
45
+ title={Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion},
46
+ author={Joulin, Armand and Bojanowski, Piotr and Mikolov, Tomas and J\'egou, Herv\'e and Grave, Edouard},
47
+ year={2018},
48
+ booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
49
+ }
50
+ ```
51
+
52
+ [2] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606)
53
+
54
+ ```markup
55
+ @article{bojanowski2017enriching,
56
+ title={Enriching Word Vectors with Subword Information},
57
+ author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
58
+ journal={Transactions of the Association for Computational Linguistics},
59
+ volume={5},
60
+ year={2017},
61
+ issn={2307-387X},
62
+ pages={135--146}
63
+ }
64
+ ```
@@ -0,0 +1,6 @@
1
+ ---
2
+ id: api
3
+ title: API
4
+ ---
5
+
6
+ We automatically generate our [API documentation](/docs/en/html/index.html) with doxygen.
@@ -0,0 +1,66 @@
1
+ ---
2
+ id: cheatsheet
3
+ title: Cheatsheet
4
+ ---
5
+
6
+ ## Word representation learning
7
+
8
+ In order to learn word vectors do:
9
+
10
+ ```bash
11
+ $ ./fasttext skipgram -input data.txt -output model
12
+ ```
13
+
14
+ ## Obtaining word vectors
15
+
16
+ Print word vectors for a text file `queries.txt` containing words.
17
+
18
+ ```bash
19
+ $ ./fasttext print-word-vectors model.bin < queries.txt
20
+ ```
21
+
22
+ ## Text classification
23
+
24
+ In order to train a text classifier do:
25
+
26
+ ```bash
27
+ $ ./fasttext supervised -input train.txt -output model
28
+ ```
29
+
30
+ Once the model has been trained, you can evaluate it by computing the precision and recall at k (P@k and R@k) on a test set using:
31
+
32
+ ```bash
33
+ $ ./fasttext test model.bin test.txt 1
34
+ ```
35
+
36
+ In order to obtain the k most likely labels for a piece of text, use:
37
+
38
+ ```bash
39
+ $ ./fasttext predict model.bin test.txt k
40
+ ```
41
+
42
+ In order to obtain the k most likely labels and their associated probabilities for a piece of text, use:
43
+
44
+ ```bash
45
+ $ ./fasttext predict-prob model.bin test.txt k
46
+ ```
47
+
48
+ If you want to compute vector representations of sentences or paragraphs, please use:
49
+
50
+ ```bash
51
+ $ ./fasttext print-sentence-vectors model.bin < text.txt
52
+ ```
53
+
54
+ ## Quantization
55
+
56
+ In order to create a `.ftz` file with a smaller memory footprint do:
57
+
58
+ ```bash
59
+ $ ./fasttext quantize -output model
60
+ ```
61
+
62
+ All other commands, such as `test`, also work with this model:
63
+
64
+ ```bash
65
+ $ ./fasttext test model.ftz test.txt
66
+ ```
@@ -0,0 +1,125 @@
1
+ ---
2
+ id: crawl-vectors
3
+ title: Word vectors for 157 languages
4
+ ---
5
+
6
+ We distribute pre-trained word vectors for 157 languages, trained on [*Common Crawl*](http://commoncrawl.org/) and [*Wikipedia*](https://www.wikipedia.org) using fastText.
7
+ These models were trained using CBOW with position-weights, in dimension 300, with character n-grams of length 5, a window of size 5 and 10 negatives.
8
+ We also distribute three new word analogy datasets, for French, Hindi and Polish.
9
+
10
+ ### Format
11
+
12
+ The word vectors are available in both binary and text formats.
13
+
14
+ Using the binary models, vectors for out-of-vocabulary words can be obtained with
15
+ ```
16
+ $ ./fasttext print-word-vectors wiki.it.300.bin < oov_words.txt
17
+ ```
18
+ where the file oov_words.txt contains out-of-vocabulary words.
19
+
20
+ In the text format, each line contains a word followed by its vector.
21
+ Each value is space separated, and words are sorted by frequency in descending order.
22
+ These text models can easily be loaded in Python using the following code:
23
+ ```python
24
+ import io
25
+
26
def load_vectors(fname):
    """Load word vectors stored in the fastText text format.

    The first line of ``fname`` holds two integers (number of vectors and
    their dimension). Every following line holds a word followed by its
    space-separated vector components.

    Returns a dict mapping each word to its vector as a list of floats.
    Raises ValueError when the header line is not made of two integers or
    when a component cannot be parsed as a float.
    """
    data = {}
    # The context manager closes the file even if parsing raises.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header is parsed so that a malformed file fails early; the
        # counts themselves are not needed to build the dict.
        n, d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # list(...) materializes the vector: on Python 3 a bare map() is
            # a one-shot iterator that is empty after the first read.
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data
34
+ ```
35
+
36
+ ### Tokenization
37
+
38
+ We used the [*Stanford word segmenter*](https://nlp.stanford.edu/software/segmenter.html) for Chinese, [*Mecab*](http://taku910.github.io/mecab/) for Japanese and [*UETsegmenter*](https://github.com/phongnt570/UETsegmenter) for Vietnamese.
39
+ For languages using the Latin, Cyrillic, Hebrew or Greek scripts, we used the tokenizer from the [*Europarl*](http://www.statmt.org/europarl/) preprocessing tools.
40
+ For the remaining languages, we used the ICU tokenizer.
41
+
42
+ More information about the training of these models can be found in the article [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893).
43
+
44
+ ### License
45
+
46
+ The word vectors are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
47
+
48
+ ### References
49
+
50
+ If you use these word vectors, please cite the following paper:
51
+
52
+ E. Grave\*, P. Bojanowski\*, P. Gupta, A. Joulin, T. Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
53
+
54
+ ```markup
55
+ @inproceedings{grave2018learning,
56
+ title={Learning Word Vectors for 157 Languages},
57
+ author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
58
+ booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
59
+ year={2018}
60
+ }
61
+ ```
62
+
63
+ ### Evaluation datasets
64
+
65
+ The analogy evaluation datasets described in the paper are available here: [French](https://dl.fbaipublicfiles.com/fasttext/word-analogies/questions-words-fr.txt), [Hindi](https://dl.fbaipublicfiles.com/fasttext/word-analogies/questions-words-hi.txt), [Polish](https://dl.fbaipublicfiles.com/fasttext/word-analogies/questions-words-pl.txt).
66
+
67
+ ### Models
68
+
69
+ The models can be downloaded from:
70
+
71
+ ||||
72
+ |-|-|-|
73
+ | Afrikaans: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.vec.gz) | Albanian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.vec.gz) | Alemannic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.vec.gz) |
74
+ | Amharic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.vec.gz) | Arabic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz) | Aragonese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.vec.gz) |
75
+ | Armenian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.vec.gz) | Assamese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.vec.gz) | Asturian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.vec.gz) |
76
+ | Azerbaijani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.vec.gz) | Bashkir: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.vec.gz) | Basque: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.vec.gz) |
77
+ | Bavarian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.vec.gz) | Belarusian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.vec.gz) | Bengali: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.vec.gz) |
78
+ | Bihari: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.vec.gz) | Bishnupriya Manipuri: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.vec.gz) | Bosnian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.vec.gz) |
79
+ | Breton: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.vec.gz) | Bulgarian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz) | Burmese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.vec.gz) |
80
+ | Catalan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.vec.gz) | Cebuano: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.vec.gz) | Central Bicolano: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.vec.gz) |
81
+ | Chechen: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.vec.gz) | Chinese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz) | Chuvash: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.vec.gz) |
82
+ | Corsican: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.vec.gz) | Croatian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.vec.gz) | Czech: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.vec.gz) |
83
+ | Danish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.vec.gz) | Divehi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.vec.gz) | Dutch: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz) |
84
+ | Eastern Punjabi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.vec.gz) | Egyptian Arabic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.vec.gz) | Emilian-Romagnol: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.vec.gz) |
85
+ | English: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz) | Erzya: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.vec.gz) | Esperanto: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.vec.gz) |
86
+ | Estonian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.vec.gz) | Fiji Hindi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.vec.gz) | Finnish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.vec.gz) |
87
+ | French: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz) | Galician: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.vec.gz) | Georgian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.vec.gz) |
88
+ | German: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz) | Goan Konkani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.vec.gz) | Greek: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.vec.gz) |
89
+ | Gujarati: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.vec.gz) | Haitian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.vec.gz) | Hebrew: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.vec.gz) |
90
+ | Hill Mari: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.vec.gz) | Hindi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz) | Hungarian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.vec.gz) |
91
+ | Icelandic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.vec.gz) | Ido: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.vec.gz) | Ilokano: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.vec.gz) |
92
+ | Indonesian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz) | Interlingua: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.vec.gz) | Irish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.vec.gz) |
93
+ | Italian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.vec.gz) | Japanese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz) | Javanese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.vec.gz) |
94
+ | Kannada: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.vec.gz) | Kapampangan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.vec.gz) | Kazakh: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.vec.gz) |
95
+ | Khmer: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.vec.gz) | Kirghiz: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.vec.gz) | Korean: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.vec.gz) |
96
+ | Kurdish (Kurmanji): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.vec.gz) | Kurdish (Sorani): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.vec.gz) | Latin: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.vec.gz) |
97
+ | Latvian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.vec.gz) | Limburgish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.vec.gz) | Lithuanian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.vec.gz) |
98
+ | Lombard: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.vec.gz) | Low Saxon: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.vec.gz) | Luxembourgish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.vec.gz) |
99
+ | Macedonian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.vec.gz) | Maithili: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.vec.gz) | Malagasy: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.vec.gz) |
100
+ | Malay: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.vec.gz) | Malayalam: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.vec.gz) | Maltese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.vec.gz) |
101
+ | Manx: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.vec.gz) | Marathi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.vec.gz) | Mazandarani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.vec.gz) |
102
+ | Meadow Mari: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.vec.gz) | Minangkabau: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.vec.gz) | Mingrelian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.vec.gz) |
103
+ | Mirandese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.vec.gz) | Mongolian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.vec.gz) | Nahuatl: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.vec.gz) |
104
+ | Neapolitan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.vec.gz) | Nepali: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.vec.gz) | Newar: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.vec.gz) |
105
+ | North Frisian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.vec.gz) | Northern Sotho: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.vec.gz) | Norwegian (Bokmål): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.vec.gz) |
106
+ | Norwegian (Nynorsk): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.vec.gz) | Occitan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.vec.gz) | Oriya: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.vec.gz) |
107
+ | Ossetian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.vec.gz) | Palatinate German: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.vec.gz) | Pashto: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.vec.gz) |
108
+ | Persian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz) | Piedmontese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.vec.gz) | Polish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz) |
109
+ | Portuguese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz) | Quechua: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.vec.gz) | Romanian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.vec.gz) |
110
+ | Romansh: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.vec.gz) | Russian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz) | Sakha: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.vec.gz) |
111
+ | Sanskrit: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.vec.gz) | Sardinian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.vec.gz) | Scots: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.vec.gz) |
112
+ | Scottish Gaelic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.vec.gz) | Serbian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.vec.gz) | Serbo-Croatian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.vec.gz) |
113
+ | Sicilian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.vec.gz) | Sindhi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.vec.gz) | Sinhalese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.vec.gz) |
114
+ | Slovak: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.vec.gz) | Slovenian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.vec.gz) | Somali: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.vec.gz) |
115
+ | Southern Azerbaijani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.vec.gz) | Spanish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz) | Sundanese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.vec.gz) |
116
+ | Swahili: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.vec.gz) | Swedish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.vec.gz) | Tagalog: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.vec.gz) |
117
+ | Tajik: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.vec.gz) | Tamil: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.vec.gz) | Tatar: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.vec.gz) |
118
+ | Telugu: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.vec.gz) | Thai: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.vec.gz) | Tibetan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.vec.gz) |
119
+ | Turkish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.vec.gz) | Turkmen: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.vec.gz) | Ukrainian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.vec.gz) |
120
+ | Upper Sorbian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.vec.gz) | Urdu: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.vec.gz) | Uyghur: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.vec.gz) |
121
+ | Uzbek: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.vec.gz) | Venetian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.vec.gz) | Vietnamese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz) |
122
+ | Volapük: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.vec.gz) | Walloon: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.vec.gz) | Waray: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.vec.gz) |
123
+ | Welsh: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.vec.gz) | West Flemish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.vec.gz) | West Frisian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.vec.gz) |
124
+ | Western Punjabi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.vec.gz) | Yiddish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.vec.gz) | Yoruba: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.vec.gz) |
125
+ | Zazaki: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.vec.gz) | Zeelandic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.vec.gz) |