fasttext-community 0.10.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. fasttext_community-0.10.7/LICENSE +21 -0
  2. fasttext_community-0.10.7/MANIFEST.in +5 -0
  3. fasttext_community-0.10.7/PKG-INFO +432 -0
  4. fasttext_community-0.10.7/README.md +347 -0
  5. fasttext_community-0.10.7/pyproject.toml +33 -0
  6. fasttext_community-0.10.7/python/README.md +327 -0
  7. fasttext_community-0.10.7/python/README.rst +406 -0
  8. fasttext_community-0.10.7/python/benchmarks/README.rst +3 -0
  9. fasttext_community-0.10.7/python/fasttext_module/fasttext/FastText.py +624 -0
  10. fasttext_community-0.10.7/python/fasttext_module/fasttext/__init__.py +22 -0
  11. fasttext_community-0.10.7/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc +528 -0
  12. fasttext_community-0.10.7/python/fasttext_module/fasttext/tests/__init__.py +13 -0
  13. fasttext_community-0.10.7/python/fasttext_module/fasttext/tests/helpers.py +161 -0
  14. fasttext_community-0.10.7/python/fasttext_module/fasttext/tests/test_configurations.py +239 -0
  15. fasttext_community-0.10.7/python/fasttext_module/fasttext/tests/test_script.py +480 -0
  16. fasttext_community-0.10.7/python/fasttext_module/fasttext/util/__init__.py +15 -0
  17. fasttext_community-0.10.7/python/fasttext_module/fasttext/util/util.py +209 -0
  18. fasttext_community-0.10.7/python/fasttext_module/fasttext_community.egg-info/PKG-INFO +432 -0
  19. fasttext_community-0.10.7/python/fasttext_module/fasttext_community.egg-info/SOURCES.txt +52 -0
  20. fasttext_community-0.10.7/python/fasttext_module/fasttext_community.egg-info/dependency_links.txt +1 -0
  21. fasttext_community-0.10.7/python/fasttext_module/fasttext_community.egg-info/requires.txt +11 -0
  22. fasttext_community-0.10.7/python/fasttext_module/fasttext_community.egg-info/top_level.txt +2 -0
  23. fasttext_community-0.10.7/setup.cfg +7 -0
  24. fasttext_community-0.10.7/setup.py +165 -0
  25. fasttext_community-0.10.7/src/aligned.h +98 -0
  26. fasttext_community-0.10.7/src/args.cc +494 -0
  27. fasttext_community-0.10.7/src/args.h +97 -0
  28. fasttext_community-0.10.7/src/autotune.cc +477 -0
  29. fasttext_community-0.10.7/src/autotune.h +89 -0
  30. fasttext_community-0.10.7/src/densematrix.cc +265 -0
  31. fasttext_community-0.10.7/src/densematrix.h +85 -0
  32. fasttext_community-0.10.7/src/dictionary.cc +590 -0
  33. fasttext_community-0.10.7/src/dictionary.h +114 -0
  34. fasttext_community-0.10.7/src/fasttext.cc +832 -0
  35. fasttext_community-0.10.7/src/fasttext.h +170 -0
  36. fasttext_community-0.10.7/src/loss.cc +346 -0
  37. fasttext_community-0.10.7/src/loss.h +163 -0
  38. fasttext_community-0.10.7/src/main.cc +454 -0
  39. fasttext_community-0.10.7/src/matrix.cc +25 -0
  40. fasttext_community-0.10.7/src/matrix.h +45 -0
  41. fasttext_community-0.10.7/src/meter.cc +214 -0
  42. fasttext_community-0.10.7/src/meter.h +91 -0
  43. fasttext_community-0.10.7/src/model.cc +93 -0
  44. fasttext_community-0.10.7/src/model.h +79 -0
  45. fasttext_community-0.10.7/src/productquantizer.cc +251 -0
  46. fasttext_community-0.10.7/src/productquantizer.h +63 -0
  47. fasttext_community-0.10.7/src/quantmatrix.cc +125 -0
  48. fasttext_community-0.10.7/src/quantmatrix.h +61 -0
  49. fasttext_community-0.10.7/src/real.h +14 -0
  50. fasttext_community-0.10.7/src/utils.cc +53 -0
  51. fasttext_community-0.10.7/src/utils.h +72 -0
  52. fasttext_community-0.10.7/src/vector.cc +96 -0
  53. fasttext_community-0.10.7/src/vector.h +62 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2016-present, Facebook, Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,5 @@
1
+ include LICENSE
2
+ include PATENTS
3
+
4
+ recursive-include python *.md *.rst
5
+ recursive-include src *.h
@@ -0,0 +1,432 @@
1
+ Metadata-Version: 2.4
2
+ Name: fasttext-community
3
+ Version: 0.10.7
4
+ Summary: fasttext Python bindings
5
+ Author-email: Nurzhan Muratkhan <nurzhanmuratkhan@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/munlicode/fasttext-community
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Topic :: Software Development
12
+ Classifier: Topic :: Scientific/Engineering
13
+ Requires-Python: >=3.9
14
+ Description-Content-Type: text/x-rst
15
+ License-File: LICENSE
16
+ Requires-Dist: pybind11>=2.2
17
+ Requires-Dist: setuptools>=0.7.0
18
+ Requires-Dist: numpy
19
+ Provides-Extra: test
20
+ Requires-Dist: pytest; extra == "test"
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest; extra == "dev"
23
+ Requires-Dist: build; extra == "dev"
24
+ Requires-Dist: twine; extra == "dev"
25
+ Dynamic: license-file
26
+
27
+ fastText |CircleCI|
28
+ ===================
29
+
30
+ `fastText <https://fasttext.cc/>`__ is a library for efficient learning
31
+ of word representations and sentence classification.
32
+
33
+ In this document we present how to use fastText in python.
34
+
35
+ Table of contents
36
+ -----------------
37
+
38
+ - `Requirements <#requirements>`__
39
+ - `Installation <#installation>`__
40
+ - `Usage overview <#usage-overview>`__
41
+ - `Word representation model <#word-representation-model>`__
42
+ - `Text classification model <#text-classification-model>`__
43
+ - `IMPORTANT: Preprocessing data / encoding
44
+ conventions <#important-preprocessing-data-encoding-conventions>`__
45
+ - `More examples <#more-examples>`__
46
+ - `API <#api>`__
47
+ - `train_unsupervised parameters <#train_unsupervised-parameters>`__
48
+ - `train_supervised parameters <#train_supervised-parameters>`__
49
+ - `model object <#model-object>`__
50
+
51
+ Requirements
52
+ ============
53
+
54
+ `fastText <https://fasttext.cc/>`__ builds on modern Mac OS and Linux
55
+ distributions. Since it uses C++11 features, it requires a compiler with
56
+ good C++11 support. You will need `Python <https://www.python.org/>`__
57
+ (version ≥ 3.9), `NumPy <http://www.numpy.org/>`__ &
58
+ `SciPy <https://www.scipy.org/>`__ and
59
+ `pybind11 <https://github.com/pybind/pybind11>`__.
60
+
61
+ Installation
62
+ ============
63
+
64
+ To install the latest release, you can do :
65
+
66
+ .. code:: bash
67
+
68
+ $ pip install fasttext-community
69
+
70
+ or, to get the latest development version of fasttext, you can install
71
+ from our github repository :
72
+
73
+ .. code:: bash
74
+
75
+ $ git clone https://github.com/munlicode/fasttext-community.git
76
+ $ cd fasttext-community
77
+ $ sudo pip install .
78
+ $ # or :
79
+ $ sudo python setup.py install
80
+
81
+ Usage overview
82
+ ==============
83
+
84
+ Word representation model
85
+ -------------------------
86
+
87
+ In order to learn word vectors, as `described
88
+ here <https://fasttext.cc/docs/en/references.html#enriching-word-vectors-with-subword-information>`__,
89
+ we can use ``fasttext.train_unsupervised`` function like this:
90
+
91
+ .. code:: py
92
+
93
+ import fasttext
94
+
95
+ # Skipgram model :
96
+ model = fasttext.train_unsupervised('data.txt', model='skipgram')
97
+
98
+ # or, cbow model :
99
+ model = fasttext.train_unsupervised('data.txt', model='cbow')
100
+
101
+ where ``data.txt`` is a training file containing utf-8 encoded text.
102
+
103
+ The returned ``model`` object represents your learned model, and you can
104
+ use it to retrieve information.
105
+
106
+ .. code:: py
107
+
108
+ print(model.words) # list of words in dictionary
109
+ print(model['king']) # get the vector of the word 'king'
110
+
111
+ Saving and loading a model object
112
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
113
+
114
+ You can save your trained model object by calling the function
115
+ ``save_model``.
116
+
117
+ .. code:: py
118
+
119
+ model.save_model("model_filename.bin")
120
+
121
+ and retrieve it later thanks to the function ``load_model`` :
122
+
123
+ .. code:: py
124
+
125
+ model = fasttext.load_model("model_filename.bin")
126
+
127
+ For more information about word representation usage of fasttext, you
128
+ can refer to our `word representations
129
+ tutorial <https://fasttext.cc/docs/en/unsupervised-tutorial.html>`__.
130
+
131
+ Text classification model
132
+ -------------------------
133
+
134
+ In order to train a text classifier using the method `described
135
+ here <https://fasttext.cc/docs/en/references.html#bag-of-tricks-for-efficient-text-classification>`__,
136
+ we can use ``fasttext.train_supervised`` function like this:
137
+
138
+ .. code:: py
139
+
140
+ import fasttext
141
+
142
+ model = fasttext.train_supervised('data.train.txt')
143
+
144
+ where ``data.train.txt`` is a text file containing a training sentence
145
+ per line along with the labels. By default, we assume that labels are
146
+ words that are prefixed by the string ``__label__``
147
+
148
+ Once the model is trained, we can retrieve the list of words and labels:
149
+
150
+ .. code:: py
151
+
152
+ print(model.words)
153
+ print(model.labels)
154
+
155
+ To evaluate our model by computing the precision at 1 (P@1) and the
156
+ recall on a test set, we use the ``test`` function:
157
+
158
+ .. code:: py
159
+
160
+ def print_results(N, p, r):
161
+ print("N\t" + str(N))
162
+ print("P@{}\t{:.3f}".format(1, p))
163
+ print("R@{}\t{:.3f}".format(1, r))
164
+
165
+ print_results(*model.test('test.txt'))
166
+
167
+ We can also predict labels for a specific text :
168
+
169
+ .. code:: py
170
+
171
+ model.predict("Which baking dish is best to bake a banana bread ?")
172
+
173
+ By default, ``predict`` returns only one label : the one with the
174
+ highest probability. You can also predict more than one label by
175
+ specifying the parameter ``k``:
176
+
177
+ .. code:: py
178
+
179
+ model.predict("Which baking dish is best to bake a banana bread ?", k=3)
180
+
181
+ If you want to predict more than one sentence you can pass an array of
182
+ strings :
183
+
184
+ .. code:: py
185
+
186
+ model.predict(["Which baking dish is best to bake a banana bread ?", "Why not put knives in the dishwasher?"], k=3)
187
+
188
+ Of course, you can also save and load a model to/from a file as `in the
189
+ word representation usage <#saving-and-loading-a-model-object>`__.
190
+
191
+ For more information about text classification usage of fasttext, you
192
+ can refer to our `text classification
193
+ tutorial <https://fasttext.cc/docs/en/supervised-tutorial.html>`__.
194
+
195
+ Compress model files with quantization
196
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
197
+
198
+ When you want to save a supervised model file, fastText can compress it
199
+ in order to have a much smaller model file by sacrificing only a little
200
+ bit of performance.
201
+
202
+ .. code:: py
203
+
204
+ # with the previously trained `model` object, call :
205
+ model.quantize(input='data.train.txt', retrain=True)
206
+
207
+ # then display results and save the new model :
208
+ print_results(*model.test(valid_data))
209
+ model.save_model("model_filename.ftz")
210
+
211
+ ``model_filename.ftz`` will have a much smaller size than
212
+ ``model_filename.bin``.
213
+
214
+ For further reading on quantization, you can refer to `this paragraph
215
+ from our blog
216
+ post <https://fasttext.cc/blog/2017/10/02/blog-post.html#model-compression>`__.
217
+
218
+ IMPORTANT: Preprocessing data / encoding conventions
219
+ ----------------------------------------------------
220
+
221
+ In general it is important to properly preprocess your data. In
222
+ particular our example scripts in the `root
223
+ folder <https://github.com/facebookresearch/fastText>`__ do this.
224
+
225
+ fastText assumes UTF-8 encoded text. All text must be `unicode for
226
+ Python2 <https://docs.python.org/2/library/functions.html#unicode>`__
227
+ and `str for
228
+ Python3 <https://docs.python.org/3.5/library/stdtypes.html#textseq>`__.
229
+ The passed text will be `encoded as UTF-8 by
230
+ pybind11 <https://pybind11.readthedocs.io/en/master/advanced/cast/strings.html?highlight=utf-8#strings-bytes-and-unicode-conversions>`__
231
+ before being passed to the fastText C++ library. This means it is important to
232
+ use UTF-8 encoded text when building a model. On Unix-like systems you
233
+ can convert text using `iconv <https://en.wikipedia.org/wiki/Iconv>`__.
234
+
235
+ fastText will tokenize (split text into pieces) based on the following
236
+ ASCII characters (bytes). In particular, it is not aware of UTF-8
237
+ whitespace. We advise the user to convert UTF-8 whitespace / word
238
+ boundaries into one of the following symbols as appropriate.
239
+
240
+ - space
241
+ - tab
242
+ - vertical tab
243
+ - carriage return
244
+ - formfeed
245
+ - the null character
246
+
247
+ The newline character is used to delimit lines of text. In particular,
248
+ the EOS token is appended to a line of text if a newline character is
249
+ encountered. The only exception is if the number of tokens exceeds the
250
+ MAX\_LINE\_SIZE constant as defined in the `Dictionary
251
+ header <https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h>`__.
252
+ This means if you have text that is not separated by newlines, such as
253
+ the `fil9 dataset <http://mattmahoney.net/dc/textdata>`__, it will be
254
+ broken into chunks with MAX\_LINE\_SIZE of tokens and the EOS token is
255
+ not appended.
256
+
257
+ The length of a token is the number of UTF-8 characters by considering
258
+ the `leading two bits of a
259
+ byte <https://en.wikipedia.org/wiki/UTF-8#Description>`__ to identify
260
+ `subsequent bytes of a multi-byte
261
+ sequence <https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc>`__.
262
+ Knowing this is especially important when choosing the minimum and
263
+ maximum length of subwords. Further, the EOS token (as specified in the
264
+ `Dictionary
265
+ header <https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h>`__)
266
+ is considered a character and will not be broken into subwords.
267
+
268
+ More examples
269
+ -------------
270
+
271
+ In order to have a better knowledge of fastText models, please consider
272
+ the main
273
+ `README <https://github.com/facebookresearch/fastText/blob/master/README.md>`__
274
+ and in particular `the tutorials on our
275
+ website <https://fasttext.cc/docs/en/supervised-tutorial.html>`__.
276
+
277
+ You can find further python examples in `the doc
278
+ folder <https://github.com/facebookresearch/fastText/tree/master/python/doc/examples>`__.
279
+
280
+ As with any package you can get help on any Python function using the
281
+ help function.
282
+
283
+ For example
284
+
285
+ ::
286
+
287
+ +>>> import fasttext
288
+ +>>> help(fasttext.FastText)
289
+
290
+ Help on module fasttext.FastText in fasttext:
291
+
292
+ NAME
293
+ fasttext.FastText
294
+
295
+ DESCRIPTION
296
+ # Copyright (c) 2017-present, Facebook, Inc.
297
+ # All rights reserved.
298
+ #
299
+ # This source code is licensed under the MIT license found in the
300
+ # LICENSE file in the root directory of this source tree.
301
+
302
+ FUNCTIONS
303
+ load_model(path)
304
+ Load a model given a filepath and return a model object.
305
+
306
+ tokenize(text)
307
+ Given a string of text, tokenize it and return a list of tokens
308
+ [...]
309
+
310
+ API
311
+ ===
312
+
313
+ ``train_unsupervised`` parameters
314
+ ---------------------------------
315
+
316
+ .. code:: python
317
+
318
+ input # training file path (required)
319
+ model # unsupervised fasttext model {cbow, skipgram} [skipgram]
320
+ lr # learning rate [0.05]
321
+ dim # size of word vectors [100]
322
+ ws # size of the context window [5]
323
+ epoch # number of epochs [5]
324
+ minCount # minimal number of word occurrences [5]
325
+ minn # min length of char ngram [3]
326
+ maxn # max length of char ngram [6]
327
+ neg # number of negatives sampled [5]
328
+ wordNgrams # max length of word ngram [1]
329
+ loss # loss function {ns, hs, softmax, ova} [ns]
330
+ bucket # number of buckets [2000000]
331
+ thread # number of threads [number of cpus]
332
+ lrUpdateRate # change the rate of updates for the learning rate [100]
333
+ t # sampling threshold [0.0001]
334
+ verbose # verbose [2]
335
+
336
+ ``train_supervised`` parameters
337
+ -------------------------------
338
+
339
+ .. code:: python
340
+
341
+ input # training file path (required)
342
+ lr # learning rate [0.1]
343
+ dim # size of word vectors [100]
344
+ ws # size of the context window [5]
345
+ epoch # number of epochs [5]
346
+ minCount # minimal number of word occurrences [1]
347
+ minCountLabel # minimal number of label occurrences [1]
348
+ minn # min length of char ngram [0]
349
+ maxn # max length of char ngram [0]
350
+ neg # number of negatives sampled [5]
351
+ wordNgrams # max length of word ngram [1]
352
+ loss # loss function {ns, hs, softmax, ova} [softmax]
353
+ bucket # number of buckets [2000000]
354
+ thread # number of threads [number of cpus]
355
+ lrUpdateRate # change the rate of updates for the learning rate [100]
356
+ t # sampling threshold [0.0001]
357
+ label # label prefix ['__label__']
358
+ verbose # verbose [2]
359
+ pretrainedVectors # pretrained word vectors (.vec file) for supervised learning []
360
+
361
+ ``model`` object
362
+ ----------------
363
+
364
+ ``train_supervised``, ``train_unsupervised`` and ``load_model``
365
+ functions return an instance of ``_FastText`` class, that we generally
366
+ name ``model`` object.
367
+
368
+ This object exposes those training arguments as properties : ``lr``,
369
+ ``dim``, ``ws``, ``epoch``, ``minCount``, ``minCountLabel``, ``minn``,
370
+ ``maxn``, ``neg``, ``wordNgrams``, ``loss``, ``bucket``, ``thread``,
371
+ ``lrUpdateRate``, ``t``, ``label``, ``verbose``, ``pretrainedVectors``.
372
+ So ``model.wordNgrams`` will give you the max length of word ngram used
373
+ for training this model.
374
+
375
+ In addition, the object exposes several functions :
376
+
377
+ .. code:: python
378
+
379
+ get_dimension # Get the dimension (size) of a lookup vector (hidden layer).
380
+ # This is equivalent to `dim` property.
381
+ get_input_vector # Given an index, get the corresponding vector of the Input Matrix.
382
+ get_input_matrix # Get a copy of the full input matrix of a Model.
383
+ get_labels # Get the entire list of labels of the dictionary
384
+ # This is equivalent to `labels` property.
385
+ get_line # Split a line of text into words and labels.
386
+ get_output_matrix # Get a copy of the full output matrix of a Model.
387
+ get_sentence_vector # Given a string, get a single vector representation. This function
388
+ # assumes to be given a single line of text. We split words on
389
+ # whitespace (space, newline, tab, vertical tab) and the control
390
+ # characters carriage return, formfeed and the null character.
391
+ get_subword_id # Given a subword, return the index (within input matrix) it hashes to.
392
+ get_subwords # Given a word, get the subwords and their indices.
393
+ get_word_id # Given a word, get the word id within the dictionary.
394
+ get_word_vector # Get the vector representation of word.
395
+ get_words # Get the entire list of words of the dictionary
396
+ # This is equivalent to `words` property.
397
+ is_quantized # whether the model has been quantized
398
+ predict # Given a string, get a list of labels and a list of corresponding probabilities.
399
+ quantize # Quantize the model reducing the size of the model and its memory footprint.
400
+ save_model # Save the model to the given path
401
+ test # Evaluate supervised model using file given by path
402
+ test_label # Return the precision and recall score for each label.
403
+
404
+ The properties ``words``, ``labels`` return the words and labels from
405
+ the dictionary :
406
+
407
+ .. code:: py
408
+
409
+ model.words # equivalent to model.get_words()
410
+ model.labels # equivalent to model.get_labels()
411
+
412
+ The object overrides ``__getitem__`` and ``__contains__`` functions in
413
+ order to return the representation of a word and to check if a word is
414
+ in the vocabulary.
415
+
416
+ .. code:: py
417
+
418
+ model['king'] # equivalent to model.get_word_vector('king')
419
+ 'king' in model # equivalent to `'king' in model.get_words()`
420
+
421
+ Join the fastText community
422
+ ---------------------------
423
+
424
+ - `Facebook page <https://www.facebook.com/groups/1174547215919768>`__
425
+ - `Stack
426
+ overflow <https://stackoverflow.com/questions/tagged/fasttext>`__
427
+ - `Google
428
+ group <https://groups.google.com/forum/#!forum/fasttext-library>`__
429
+ - `GitHub <https://github.com/facebookresearch/fastText>`__
430
+
431
+ .. |CircleCI| image:: https://circleci.com/gh/facebookresearch/fastText/tree/master.svg?style=svg
432
+ :target: https://circleci.com/gh/facebookresearch/fastText/tree/master