ruby-spacy 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +58 -0
  3. data/.yardopts +2 -0
  4. data/Gemfile +18 -0
  5. data/Gemfile.lock +39 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +498 -0
  8. data/Rakefile +12 -0
  9. data/bin/console +15 -0
  10. data/bin/setup +8 -0
  11. data/examples/get_started/lexeme.rb +24 -0
  12. data/examples/get_started/linguistic_annotations.rb +32 -0
  13. data/examples/get_started/most_similar.rb +46 -0
  14. data/examples/get_started/named_entities.rb +24 -0
  15. data/examples/get_started/outputs/test_dep.svg +84 -0
  16. data/examples/get_started/outputs/test_dep_compact.svg +84 -0
  17. data/examples/get_started/outputs/test_ent.html +11 -0
  18. data/examples/get_started/pos_tags_and_dependencies.rb +31 -0
  19. data/examples/get_started/similarity.rb +13 -0
  20. data/examples/get_started/tokenization.rb +22 -0
  21. data/examples/get_started/visualizing_dependencies.rb +14 -0
  22. data/examples/get_started/visualizing_dependencies_compact.rb +12 -0
  23. data/examples/get_started/visualizing_named_entities.rb +12 -0
  24. data/examples/get_started/vocab.rb +10 -0
  25. data/examples/get_started/word_vectors.rb +24 -0
  26. data/examples/japanese/ancestors.rb +44 -0
  27. data/examples/japanese/entity_annotations_and_labels.rb +45 -0
  28. data/examples/japanese/information_extraction.rb +27 -0
  29. data/examples/japanese/lemmatization.rb +32 -0
  30. data/examples/japanese/most_similar.rb +46 -0
  31. data/examples/japanese/named_entity_recognition.rb +27 -0
  32. data/examples/japanese/navigating_parse_tree.rb +34 -0
  33. data/examples/japanese/noun_chunks.rb +23 -0
  34. data/examples/japanese/outputs/test_dep.svg +149 -0
  35. data/examples/japanese/outputs/test_ent.html +16 -0
  36. data/examples/japanese/pos_tagging.rb +34 -0
  37. data/examples/japanese/sentence_segmentation.rb +16 -0
  38. data/examples/japanese/similarity.rb +12 -0
  39. data/examples/japanese/tokenization.rb +38 -0
  40. data/examples/japanese/visualizing_dependencies.rb +13 -0
  41. data/examples/japanese/visualizing_named_entities.rb +14 -0
  42. data/examples/linguistic_features/ancestors.rb +41 -0
  43. data/examples/linguistic_features/entity_annotations_and_labels.rb +29 -0
  44. data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +20 -0
  45. data/examples/linguistic_features/information_extraction.rb +36 -0
  46. data/examples/linguistic_features/iterating_children.rb +24 -0
  47. data/examples/linguistic_features/iterating_lefts_and_rights.rb +20 -0
  48. data/examples/linguistic_features/lemmatization.rb +31 -0
  49. data/examples/linguistic_features/morphology.rb +17 -0
  50. data/examples/linguistic_features/named_entity_recognition.rb +25 -0
  51. data/examples/linguistic_features/navigating_parse_tree.rb +32 -0
  52. data/examples/linguistic_features/noun_chunks.rb +27 -0
  53. data/examples/linguistic_features/outputs/test_ent.html +11 -0
  54. data/examples/linguistic_features/pos_tagging.rb +31 -0
  55. data/examples/linguistic_features/retokenize_1.rb +29 -0
  56. data/examples/linguistic_features/retokenize_2.rb +16 -0
  57. data/examples/linguistic_features/rule_based_morphology.rb +12 -0
  58. data/examples/linguistic_features/sentence_segmentation.rb +16 -0
  59. data/examples/linguistic_features/similarity.rb +14 -0
  60. data/examples/linguistic_features/similarity_between_spans.rb +23 -0
  61. data/examples/linguistic_features/special_case_tokenization_rules.rb +19 -0
  62. data/examples/linguistic_features/tokenization.rb +23 -0
  63. data/examples/rule_based_matching/creating_spans_from_matches.rb +16 -0
  64. data/examples/rule_based_matching/matcher.rb +19 -0
  65. data/lib/ruby-spacy.rb +567 -0
  66. data/lib/ruby-spacy/version.rb +6 -0
  67. data/ruby-spacy.gemspec +42 -0
  68. metadata +157 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 6c149833c6cc16782d7964c27989535ee681f9816f58231d1eecc57f2c8f99c1
4
+ data.tar.gz: 5ac0417c29eea0dfa7a48c394e832bcbd7567fd5e4783f8a6de4d15132c479a3
5
+ SHA512:
6
+ metadata.gz: bbf0271475ebab0f6f64621be98bca42a45fbf0b76a6285d17e3593bf4c6e53bd91c55bd6664ea7dd6bc23448d64cb3035bee55eb9e525662580618a7d5bbab6
7
+ data.tar.gz: 0dd4301b1d9272dcc22ad172b8fb9363c46b52b0c58a34d8bf25499a77b4e96e3617a49d9d4c03a34d2b185d2830c6644a9df8968f81fde69a9f94b45691faf3
data/.gitignore ADDED
@@ -0,0 +1,58 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ # Ignore Byebug command history file.
17
+ .byebug_history
18
+
19
+ ## Specific to RubyMotion:
20
+ .dat*
21
+ .repl_history
22
+ build/
23
+ *.bridgesupport
24
+ build-iPhoneOS/
25
+ build-iPhoneSimulator/
26
+
27
+ ## Specific to RubyMotion (use of CocoaPods):
28
+ #
29
+ # We recommend against adding the Pods directory to your .gitignore. However
30
+ # you should judge for yourself, the pros and cons are mentioned at:
31
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
32
+ #
33
+ # vendor/Pods/
34
+
35
+ ## Documentation cache and generated files:
36
+ /.yardoc/
37
+ /_yardoc/
38
+ /doc/
39
+ /rdoc/
40
+
41
+ ## Environment normalization:
42
+ /.bundle/
43
+ /vendor/bundle
44
+ /lib/bundler/man/
45
+
46
+ # for a library or gem, you might want to ignore these files since the code is
47
+ # intended to run in multiple environments; otherwise, check them in:
48
+ # Gemfile.lock
49
+ # .ruby-version
50
+ # .ruby-gemset
51
+
52
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
53
+ .rvmrc
54
+
55
+ # Used by RuboCop. Remote config files pulled in from inherit_from directive.
56
+ # .rubocop-https?--*
57
+
58
+ .DS_Store
data/.yardopts ADDED
@@ -0,0 +1,2 @@
1
+ --markup-provider=redcarpet
2
+ --markup=markdown
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in ruby-spacy.gemspec
6
+ gemspec
7
+
8
+ gem 'pycall'
9
+ gem 'numpy'
10
+ gem 'terminal-table'
11
+
12
+ group :development do
13
+ gem "rake", "~> 13.0"
14
+ gem "minitest", "~> 5.0"
15
+ gem 'yard'
16
+ gem 'redcarpet'
17
+ gem 'github-markup'
18
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,39 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ ruby-spacy (0.1.0)
5
+ numpy (~> 0.4.0)
6
+ pycall (~> 1.4.0)
7
+ terminal-table (~> 3.0.1)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ github-markup (4.0.0)
13
+ minitest (5.14.4)
14
+ numpy (0.4.0)
15
+ pycall (>= 1.2.0.beta1)
16
+ pycall (1.4.0)
17
+ rake (13.0.3)
18
+ redcarpet (3.5.1)
19
+ terminal-table (3.0.1)
20
+ unicode-display_width (>= 1.1.1, < 3)
21
+ unicode-display_width (2.0.0)
22
+ yard (0.9.26)
23
+
24
+ PLATFORMS
25
+ arm64-darwin-20
26
+
27
+ DEPENDENCIES
28
+ github-markup
29
+ minitest (~> 5.0)
30
+ numpy
31
+ pycall
32
+ rake (~> 13.0)
33
+ redcarpet
34
+ ruby-spacy!
35
+ terminal-table
36
+ yard
37
+
38
+ BUNDLED WITH
39
+ 2.2.21
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Yoichiro Hasebe
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,498 @@
1
+ # ruby-spacy
2
+
3
+ ⚠️ This project is a **work in progress** and is provided as-is. Breaking changes may be committed to this repository without notice.
4
+
5
+ ## Overview
6
+
7
+ **ruby-spacy** is a wrapper module for using [spaCy](https://spacy.io/) from the Ruby programming language via [PyCall](https://github.com/mrkn/pycall.rb). This module aims to make it easy and natural for Ruby programmers to use spaCy. This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
8
+
9
+ | | Functionality |
10
+ |:---|:---------------------------------------------------|
11
+ | ✅ | Tokenization, lemmatization, sentence segmentation |
12
+ | ✅ | Part-of-speech tagging and dependency parsing |
13
+ | ✅ | Named entity recognition |
14
+ | ✅ | Syntactic dependency visualization |
15
+ | ✅ | Access to pre-trained word vectors |
16
+
17
+ ## Installation of prerequisites
18
+
19
+ Make sure that the `enable-shared` option is enabled in your Python installation. You can use [pyenv](https://github.com/pyenv/pyenv) to install any version of Python you like. Install Python 3.8.5, for instance, using pyenv with `enable-shared` as follows:
20
+
21
+ ```shell
22
+ $ env CONFIGURE_OPTS="--enable-shared" pyenv install 3.8.5
23
+ ```
24
+
25
+ Don't forget to make it accessible from your working directory.
26
+
27
+ ```shell
28
+ $ pyenv local 3.8.5
29
+ ```
30
+
31
+ Or alternatively:
32
+
33
+ ```shell
34
+ $ pyenv global 3.8.5
35
+ ```
36
+
37
+ Then, install [spaCy](https://spacy.io/). If you use `pip`, the following command will do:
38
+
39
+ ```shell
40
+ $ pip install spacy
41
+ ```
42
+
43
+ Install trained language models. To get started, `en_core_web_sm` is the most useful model for basic text processing in English. However, if you want to use advanced features of spaCy, such as named entity recognition or document similarity calculation, you should also install a larger model like `en_core_web_lg`.
44
+
45
+
46
+ ```shell
47
+ $ python -m spacy download en_core_web_sm
48
+ $ python -m spacy download en_core_web_lg
49
+ ```
50
+
51
+ See [spaCy: Models & Languages](https://spacy.io/usage/models) for other models in various languages. For instance, to install models for the Japanese language, run the following:
52
+
53
+ ```shell
54
+ $ python -m spacy download ja_core_news_sm
55
+ $ python -m spacy download ja_core_news_lg
56
+ ```
57
+
58
+ ## Installation of ruby-spacy
59
+
60
+ Add this line to your application's Gemfile:
61
+
62
+ ```ruby
63
+ gem 'ruby-spacy'
64
+ ```
65
+
66
+ And then execute:
67
+
68
+ $ bundle install
69
+
70
+ Or install it yourself as:
71
+
72
+ $ gem install ruby-spacy
73
+
74
+ ## Usage
75
+
76
+ See [Examples](#examples) below.
77
+
78
+ ## Examples
79
+
80
+ Many of the following examples are Python-to-Ruby translations of code snippets in [spaCy 101](https://spacy.io/usage/spacy-101). For more examples, look inside the `examples` directory.
81
+
82
+ ### Tokenization
83
+
84
+ → [spaCy: Tokenization](https://spacy.io/usage/spacy-101#annotations-token)
85
+
86
+ Ruby code:
87
+
88
+ ```ruby
89
+ require "ruby-spacy"
90
+ require "terminal-table"
91
+
92
+ nlp = Spacy::Language.new("en_core_web_sm")
93
+
94
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
95
+
96
+ row = []
97
+
98
+ doc.each do |token|
99
+ row << token.text
100
+ end
101
+
102
+ headings = [1,2,3,4,5,6,7,8,9,10,11]
103
+ table = Terminal::Table.new rows: [row], headings: headings
104
+
105
+ puts table
106
+ ```
107
+
108
+ Output:
109
+
110
+ | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
111
+ |:-----:|:--:|:-------:|:--:|:------:|:----:|:-------:|:---:|:-:|:--:|:-------:|
112
+ | Apple | is | looking | at | buying | U.K. | startup | for | $ | 1 | billion |
113
+
114
+ ### Part-of-speech tagging
115
+
116
+ → [spaCy: Part-of-speech tags and dependencies](https://spacy.io/usage/spacy-101#annotations-pos-deps)
117
+
118
+ → [POS and morphology tags](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py)
119
+
120
+ Ruby code:
121
+
122
+ ```ruby
123
+ require "ruby-spacy"
124
+ require "terminal-table"
125
+
126
+ nlp = Spacy::Language.new("en_core_web_sm")
127
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
128
+
129
+ rows = []
130
+
131
+ doc.each do |token|
132
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
133
+ end
134
+
135
+ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
136
+ table = Terminal::Table.new rows: rows, headings: headings
137
+ puts table
138
+ ```
139
+
140
+ Output:
141
+
142
+ | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
143
+ |:--------|:--------|:------|:----|:---------|:------|:---------|:--------|
144
+ | Apple | Apple | PROPN | NNP | nsubj | Xxxxx | true | false |
145
+ | is | be | AUX | VBZ | aux | xx | true | true |
146
+ | looking | look | VERB | VBG | ROOT | xxxx | true | false |
147
+ | at | at | ADP | IN | prep | xx | true | true |
148
+ | buying | buy | VERB | VBG | pcomp | xxxx | true | false |
149
+ | U.K. | U.K. | PROPN | NNP | dobj | X.X. | false | false |
150
+ | startup | startup | NOUN | NN | advcl | xxxx | true | false |
151
+ | for | for | ADP | IN | prep | xxx | true | true |
152
+ | $ | $ | SYM | $ | quantmod | $ | false | false |
153
+ | 1 | 1 | NUM | CD | compound | d | false | false |
154
+ | billion | billion | NUM | CD | pobj | xxxx | true | false |
155
+
156
+ ### Part-of-speech tagging (Japanese)
157
+
158
+ Ruby code:
159
+
160
+ ```ruby
161
+ require( "ruby-spacy")
162
+ require "terminal-table"
163
+
164
+ nlp = Spacy::Language.new("ja_core_news_lg")
165
+ doc = nlp.read("任天堂は1983年にファミコンを14,800円で発売した。")
166
+
167
+ rows = []
168
+
169
+ doc.each do |token|
170
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
171
+ end
172
+
173
+ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
174
+ table = Terminal::Table.new rows: rows, headings: headings
175
+ puts table
176
+ ```
177
+
178
+ Output:
179
+
180
+ | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
181
+ |:-----------|:-----------|:------|:-------------------------|:-------|:-------|:---------|:--------|
182
+ | 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj | xxx | true | false |
183
+ | は | は | ADP | 助詞-係助詞 | case | x | true | true |
184
+ | 1983 | 1983 | NUM | 名詞-数詞 | nummod | dddd | false | false |
185
+ | 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
186
+ | に | に | ADP | 助詞-格助詞 | case | x | true | true |
187
+ | ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj | xxxx | true | false |
188
+ | を | を | ADP | 助詞-格助詞 | case | x | true | true |
189
+ | 14,800 | 14,800 | NUM | 名詞-数詞 | fixed | dd,ddd | false | false |
190
+ | 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
191
+ | で | で | ADP | 助詞-格助詞 | case | x | true | true |
192
+ | 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT | xx | true | false |
193
+ | し | する | AUX | 動詞-非自立可能 | aux | x | true | true |
194
+ | た | た | AUX | 助動詞 | aux | x | true | true |
195
+ | 。 | 。 | PUNCT | 補助記号-句点 | punct | 。 | false | false |
196
+
197
+ ### Visualizing dependency
198
+
199
+ → [spaCy: Visualizers](https://spacy.io/usage/visualizers)
200
+
201
+ Ruby code:
202
+
203
+ ```ruby
204
+ require "ruby-spacy"
205
+
206
+ nlp = Spacy::Language.new("en_core_web_sm")
207
+
208
+ sentence = "Autonomous cars shift insurance liability toward manufacturers"
209
+ doc = nlp.read(sentence)
210
+
211
+ dep_svg = doc.displacy(style: "dep", compact: false)
212
+
213
+ File.open(File.join("test_dep.svg"), "w") do |file|
214
+ file.write(dep_svg)
215
+ end
216
+ ```
217
+
218
+ Output:
219
+
220
+ ![](https://github.com/yohasebe/ruby-spacy/blob/main/examples/get_started/outputs/test_dep.svg)
221
+
222
+ ### Visualizing dependency (compact)
223
+
224
+ Ruby code:
225
+
226
+ ```ruby
227
+ require "ruby-spacy"
228
+
229
+ nlp = Spacy::Language.new("en_core_web_sm")
230
+
231
+ sentence = "Autonomous cars shift insurance liability toward manufacturers"
232
+ doc = nlp.read(sentence)
233
+
234
+ dep_svg = doc.displacy(style: "dep", compact: true)
235
+
236
+ File.open(File.join("test_dep_compact.svg"), "w") do |file|
237
+ file.write(dep_svg)
238
+ end
239
+ ```
240
+
241
+ Output:
242
+
243
+ ![](https://github.com/yohasebe/ruby-spacy/blob/main/examples/get_started/outputs/test_dep_compact.svg)
244
+
245
+ ### Named entity recognition
246
+
247
+ → [spaCy: Named entities](https://spacy.io/usage/spacy-101#annotations-ner)
248
+
249
+ Ruby code:
250
+
251
+ ```ruby
252
+ require "ruby-spacy"
253
+ require "terminal-table"
254
+
255
+ nlp = Spacy::Language.new("en_core_web_sm")
256
+ doc =nlp.read("Apple is looking at buying U.K. startup for $1 billion")
257
+
258
+ rows = []
259
+
260
+ doc.ents.each do |ent|
261
+ rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
262
+ end
263
+
264
+ headings = ["text", "start_char", "end_char", "label"]
265
+ table = Terminal::Table.new rows: rows, headings: headings
266
+ puts table
267
+ ```
268
+
269
+ Output:
270
+
271
+ | text | start_char | end_char | label |
272
+ |:-----------|-----------:|---------:|:------|
273
+ | Apple | 0 | 5 | ORG |
274
+ | U.K. | 27 | 31 | GPE |
275
+ | $1 billion | 44 | 54 | MONEY |
276
+
277
+ ### Named entity recognition (Japanese)
278
+
279
+ Ruby code:
280
+
281
+ ```ruby
282
+ require( "ruby-spacy")
283
+ require "terminal-table"
284
+
285
+ nlp = Spacy::Language.new("ja_core_news_lg")
286
+
287
+ sentence = "任天堂は1983年にファミコンを14,800円で発売した。"
288
+ doc = nlp.read(sentence)
289
+
290
+ rows = []
291
+
292
+ doc.ents.each do |ent|
293
+ rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
294
+ end
295
+
296
+ headings = ["text", "start", "end", "label"]
297
+ table = Terminal::Table.new rows: rows, headings: headings
298
+ print table
299
+ ```
300
+
301
+ Output:
302
+
303
+ | text | start | end | label |
304
+ |:-----------|------:|----:|:--------|
305
+ | 任天堂 | 0 | 3 | ORG |
306
+ | 1983年 | 4 | 9 | DATE |
307
+ | ファミコン | 10 | 15 | PRODUCT |
308
+ | 14,800円 | 16 | 23 | MONEY |
309
+
310
+ ### Checking availability of word vectors
311
+
312
+ → [spaCy: Word vectors and similarity](https://spacy.io/usage/spacy-101#vectors-similarity)
313
+
314
+ Ruby code:
315
+
316
+ ```ruby
317
+ require "ruby-spacy"
318
+ require "terminal-table"
319
+
320
+ nlp = Spacy::Language.new("en_core_web_lg")
321
+ doc = nlp.read("dog cat banana afskfsd")
322
+
323
+ rows = []
324
+
325
+ doc.each do |token|
326
+ rows << [token.text, token.has_vector, token.vector_norm, token.is_oov]
327
+ end
328
+
329
+ headings = ["text", "has_vector", "vector_norm", "is_oov"]
330
+ table = Terminal::Table.new rows: rows, headings: headings
331
+ puts table
332
+ ```
333
+
334
+ Output:
335
+
336
+ | text | has_vector | vector_norm | is_oov |
337
+ |:--------|:-----------|:------------|:-------|
338
+ | dog | true | 7.0336733 | false |
339
+ | cat | true | 6.6808186 | false |
340
+ | banana | true | 6.700014 | false |
341
+ | afskfsd | false | 0.0 | true |
342
+
343
+ ### Similarity calculation
344
+
345
+ Ruby code:
346
+
347
+ ```ruby
348
+ require "ruby-spacy"
349
+
350
+ nlp = Spacy::Language.new("en_core_web_lg")
351
+ doc1 = nlp.read("I like salty fries and hamburgers.")
352
+ doc2 = nlp.read("Fast food tastes very good.")
353
+
354
+ puts "Doc 1: " + doc1
355
+ puts "Doc 2: " + doc2
356
+ puts "Similarity: #{doc1.similarity(doc2)}"
357
+
358
+ ```
359
+
360
+ Output:
361
+
362
+ ```text
363
+ Doc 1: I like salty fries and hamburgers.
364
+ Doc 2: Fast food tastes very good.
365
+ Similarity: 0.7687607012190486
366
+ ```
367
+
368
+ ### Similarity calculation (Japanese)
369
+
370
+ Ruby code:
371
+
372
+ ```ruby
373
+ require "ruby-spacy"
374
+
375
+ nlp = Spacy::Language.new("ja_core_news_lg")
376
+ ja_doc1 = nlp.read("今日は雨ばっかり降って、嫌な天気ですね。")
377
+ puts "doc1: #{ja_doc1.text}"
378
+ ja_doc2 = nlp.read("あいにくの悪天候で残念です。")
379
+ puts "doc2: #{ja_doc2.text}"
380
+ puts "Similarity: #{ja_doc1.similarity(ja_doc2)}"
381
+ ```
382
+
383
+ Output:
384
+
385
+ ```text
386
+ doc1: 今日は雨ばっかり降って、嫌な天気ですね。
387
+ doc2: あいにくの悪天候で残念です。
388
+ Similarity: 0.8684192637149641
389
+ ```
390
+
391
+ ### Word vector calculation
392
+
393
+ **Tokyo - Japan + France = Paris ?**
394
+
395
+ Ruby code:
396
+
397
+ ```ruby
398
+ require "ruby-spacy"
399
+ require "terminal-table"
400
+
401
+ nlp = Spacy::Language.new("en_core_web_lg")
402
+
403
+ tokyo = nlp.get_lexeme("Tokyo")
404
+ japan = nlp.get_lexeme("Japan")
405
+ france = nlp.get_lexeme("France")
406
+
407
+ query = tokyo.vector - japan.vector + france.vector
408
+
409
+ rows = []
410
+
411
+ results = nlp.most_similar(query, 10)
412
+ results.each do |lexeme|
413
+ rows << [lexeme[:key], lexeme[:text], lexeme[:score],]
414
+ end
415
+
416
+ headings = ["key", "text", "score"]
417
+ table = Terminal::Table.new rows: rows, headings: headings
418
+ puts table
419
+ ```
420
+
421
+ Output:
422
+
423
+ | key | text | score |
424
+ |:---------------------|:------------|:-------------------|
425
+ | 1432967385481565694 | FRANCE | 0.8346999883651733 |
426
+ | 6613816697677965370 | France | 0.8346999883651733 |
427
+ | 4362406852232399325 | france | 0.8346999883651733 |
428
+ | 1637573253267610771 | PARIS | 0.7703999876976013 |
429
+ | 15322182186497800017 | paris | 0.7703999876976013 |
430
+ | 10427160276079242800 | Paris | 0.7703999876976013 |
431
+ | 975948890941980630 | TOULOUSE | 0.6381999850273132 |
432
+ | 7944504257273452052 | Toulouse | 0.6381999850273132 |
433
+ | 9614730213792621885 | toulouse | 0.6381999850273132 |
434
+ | 8515538464606421210 | marseille | 0.6370999813079834 |
435
+
436
+
437
+ ### Word vector calculation (Japanese)
438
+
439
+ **東京 - 日本 + フランス = パリ ?**
440
+
441
+ Ruby code:
442
+
443
+ ```ruby
444
+ require "ruby-spacy"
445
+ require "terminal-table"
446
+
447
+ nlp = Spacy::Language.new("ja_core_news_lg")
448
+
449
+ tokyo = nlp.get_lexeme("東京")
450
+ japan = nlp.get_lexeme("日本")
451
+ france = nlp.get_lexeme("フランス")
452
+
453
+ query = tokyo.vector - japan.vector + france.vector
454
+
455
+ rows = []
456
+
457
+ results = nlp.most_similar(query, 10)
458
+ results.each do |lexeme|
459
+ rows << [lexeme[:key], lexeme[:text], lexeme[:score],]
460
+ end
461
+
462
+ headings = ["key", "text", "score"]
463
+ table = Terminal::Table.new rows: rows, headings: headings
464
+ puts table
465
+ ```
466
+
467
+ Output:
468
+
469
+ | key | text | score |
470
+ |:---------------------|:---------------|:-------------------|
471
+ | 12090003238699662352 | パリ | 0.7376999855041504 |
472
+ | 18290786970454458111 | フランス | 0.7221999764442444 |
473
+ | 9360021637096476946 | 東京 | 0.6697999835014343 |
474
+ | 2437546359230213520 | ストラスブール | 0.631600022315979 |
475
+ | 13988178952745813186 | リヨン | 0.5939000248908997 |
476
+ | 10427160276079242800 | Paris | 0.574400007724762 |
477
+ | 5562396768860926997 | ベルギー | 0.5683000087738037 |
478
+ | 15029176915627965481 | ニース | 0.5679000020027161 |
479
+ | 9750625950625019690 | アルザス | 0.5644999742507935 |
480
+ | 2381640614569534741 | 南仏 | 0.5547999739646912 |
481
+
482
+
483
+ ## Author
484
+
485
+ Yoichiro Hasebe [<yohasebe@gmail.com>]
486
+
487
+
488
+ ## Acknowledgments
489
+
490
+ I would like to thank the following open source projects and their creators for making this project possible:
491
+
492
+ - [explosion/spaCy](https://github.com/explosion/spaCy)
493
+ - [mrkn/pycall.rb](https://github.com/mrkn/pycall.rb)
494
+
495
+ ## License
496
+
497
+ This library is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
498
+