ruby-spacy 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +58 -0
- data/.yardopts +2 -0
- data/Gemfile +18 -0
- data/Gemfile.lock +39 -0
- data/LICENSE.txt +21 -0
- data/README.md +498 -0
- data/Rakefile +12 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/examples/get_started/lexeme.rb +24 -0
- data/examples/get_started/linguistic_annotations.rb +32 -0
- data/examples/get_started/most_similar.rb +46 -0
- data/examples/get_started/named_entities.rb +24 -0
- data/examples/get_started/outputs/test_dep.svg +84 -0
- data/examples/get_started/outputs/test_dep_compact.svg +84 -0
- data/examples/get_started/outputs/test_ent.html +11 -0
- data/examples/get_started/pos_tags_and_dependencies.rb +31 -0
- data/examples/get_started/similarity.rb +13 -0
- data/examples/get_started/tokenization.rb +22 -0
- data/examples/get_started/visualizing_dependencies.rb +14 -0
- data/examples/get_started/visualizing_dependencies_compact.rb +12 -0
- data/examples/get_started/visualizing_named_entities.rb +12 -0
- data/examples/get_started/vocab.rb +10 -0
- data/examples/get_started/word_vectors.rb +24 -0
- data/examples/japanese/ancestors.rb +44 -0
- data/examples/japanese/entity_annotations_and_labels.rb +45 -0
- data/examples/japanese/information_extraction.rb +27 -0
- data/examples/japanese/lemmatization.rb +32 -0
- data/examples/japanese/most_similar.rb +46 -0
- data/examples/japanese/named_entity_recognition.rb +27 -0
- data/examples/japanese/navigating_parse_tree.rb +34 -0
- data/examples/japanese/noun_chunks.rb +23 -0
- data/examples/japanese/outputs/test_dep.svg +149 -0
- data/examples/japanese/outputs/test_ent.html +16 -0
- data/examples/japanese/pos_tagging.rb +34 -0
- data/examples/japanese/sentence_segmentation.rb +16 -0
- data/examples/japanese/similarity.rb +12 -0
- data/examples/japanese/tokenization.rb +38 -0
- data/examples/japanese/visualizing_dependencies.rb +13 -0
- data/examples/japanese/visualizing_named_entities.rb +14 -0
- data/examples/linguistic_features/ancestors.rb +41 -0
- data/examples/linguistic_features/entity_annotations_and_labels.rb +29 -0
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +20 -0
- data/examples/linguistic_features/information_extraction.rb +36 -0
- data/examples/linguistic_features/iterating_children.rb +24 -0
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +20 -0
- data/examples/linguistic_features/lemmatization.rb +31 -0
- data/examples/linguistic_features/morphology.rb +17 -0
- data/examples/linguistic_features/named_entity_recognition.rb +25 -0
- data/examples/linguistic_features/navigating_parse_tree.rb +32 -0
- data/examples/linguistic_features/noun_chunks.rb +27 -0
- data/examples/linguistic_features/outputs/test_ent.html +11 -0
- data/examples/linguistic_features/pos_tagging.rb +31 -0
- data/examples/linguistic_features/retokenize_1.rb +29 -0
- data/examples/linguistic_features/retokenize_2.rb +16 -0
- data/examples/linguistic_features/rule_based_morphology.rb +12 -0
- data/examples/linguistic_features/sentence_segmentation.rb +16 -0
- data/examples/linguistic_features/similarity.rb +14 -0
- data/examples/linguistic_features/similarity_between_spans.rb +23 -0
- data/examples/linguistic_features/special_case_tokenization_rules.rb +19 -0
- data/examples/linguistic_features/tokenization.rb +23 -0
- data/examples/rule_based_matching/creating_spans_from_matches.rb +16 -0
- data/examples/rule_based_matching/matcher.rb +19 -0
- data/lib/ruby-spacy.rb +567 -0
- data/lib/ruby-spacy/version.rb +6 -0
- data/ruby-spacy.gemspec +42 -0
- metadata +157 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6c149833c6cc16782d7964c27989535ee681f9816f58231d1eecc57f2c8f99c1
|
4
|
+
data.tar.gz: 5ac0417c29eea0dfa7a48c394e832bcbd7567fd5e4783f8a6de4d15132c479a3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: bbf0271475ebab0f6f64621be98bca42a45fbf0b76a6285d17e3593bf4c6e53bd91c55bd6664ea7dd6bc23448d64cb3035bee55eb9e525662580618a7d5bbab6
|
7
|
+
data.tar.gz: 0dd4301b1d9272dcc22ad172b8fb9363c46b52b0c58a34d8bf25499a77b4e96e3617a49d9d4c03a34d2b185d2830c6644a9df8968f81fde69a9f94b45691faf3
|
data/.gitignore
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/spec/examples.txt
|
9
|
+
/test/tmp/
|
10
|
+
/test/version_tmp/
|
11
|
+
/tmp/
|
12
|
+
|
13
|
+
# Used by dotenv library to load environment variables.
|
14
|
+
# .env
|
15
|
+
|
16
|
+
# Ignore Byebug command history file.
|
17
|
+
.byebug_history
|
18
|
+
|
19
|
+
## Specific to RubyMotion:
|
20
|
+
.dat*
|
21
|
+
.repl_history
|
22
|
+
build/
|
23
|
+
*.bridgesupport
|
24
|
+
build-iPhoneOS/
|
25
|
+
build-iPhoneSimulator/
|
26
|
+
|
27
|
+
## Specific to RubyMotion (use of CocoaPods):
|
28
|
+
#
|
29
|
+
# We recommend against adding the Pods directory to your .gitignore. However
|
30
|
+
# you should judge for yourself, the pros and cons are mentioned at:
|
31
|
+
# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
|
32
|
+
#
|
33
|
+
# vendor/Pods/
|
34
|
+
|
35
|
+
## Documentation cache and generated files:
|
36
|
+
/.yardoc/
|
37
|
+
/_yardoc/
|
38
|
+
/doc/
|
39
|
+
/rdoc/
|
40
|
+
|
41
|
+
## Environment normalization:
|
42
|
+
/.bundle/
|
43
|
+
/vendor/bundle
|
44
|
+
/lib/bundler/man/
|
45
|
+
|
46
|
+
# for a library or gem, you might want to ignore these files since the code is
|
47
|
+
# intended to run in multiple environments; otherwise, check them in:
|
48
|
+
# Gemfile.lock
|
49
|
+
# .ruby-version
|
50
|
+
# .ruby-gemset
|
51
|
+
|
52
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
53
|
+
.rvmrc
|
54
|
+
|
55
|
+
# Used by RuboCop. Remote config files pulled in from inherit_from directive.
|
56
|
+
# .rubocop-https?--*
|
57
|
+
|
58
|
+
.DS_Store
|
data/.yardopts
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
source "https://rubygems.org"
|
4
|
+
|
5
|
+
# Specify your gem's dependencies in ruby-spacy.gemspec
|
6
|
+
gemspec
|
7
|
+
|
8
|
+
gem 'pycall'
|
9
|
+
gem 'numpy'
|
10
|
+
gem 'terminal-table'
|
11
|
+
|
12
|
+
group :development do
|
13
|
+
gem "rake", "~> 13.0"
|
14
|
+
gem "minitest", "~> 5.0"
|
15
|
+
gem 'yard'
|
16
|
+
gem 'redcarpet'
|
17
|
+
gem 'github-markup'
|
18
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
ruby-spacy (0.1.0)
|
5
|
+
numpy (~> 0.4.0)
|
6
|
+
pycall (~> 1.4.0)
|
7
|
+
terminal-table (~> 3.0.1)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
github-markup (4.0.0)
|
13
|
+
minitest (5.14.4)
|
14
|
+
numpy (0.4.0)
|
15
|
+
pycall (>= 1.2.0.beta1)
|
16
|
+
pycall (1.4.0)
|
17
|
+
rake (13.0.3)
|
18
|
+
redcarpet (3.5.1)
|
19
|
+
terminal-table (3.0.1)
|
20
|
+
unicode-display_width (>= 1.1.1, < 3)
|
21
|
+
unicode-display_width (2.0.0)
|
22
|
+
yard (0.9.26)
|
23
|
+
|
24
|
+
PLATFORMS
|
25
|
+
arm64-darwin-20
|
26
|
+
|
27
|
+
DEPENDENCIES
|
28
|
+
github-markup
|
29
|
+
minitest (~> 5.0)
|
30
|
+
numpy
|
31
|
+
pycall
|
32
|
+
rake (~> 13.0)
|
33
|
+
redcarpet
|
34
|
+
ruby-spacy!
|
35
|
+
terminal-table
|
36
|
+
yard
|
37
|
+
|
38
|
+
BUNDLED WITH
|
39
|
+
2.2.21
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2021 Yoichiro Hasebe
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,498 @@
|
|
1
|
+
# ruby-spacy
|
2
|
+
|
3
|
+
⚠️ This project is a **work in progress** and is provided as-is. There may be breaking changes committed to this repository without notice.
|
4
|
+
|
5
|
+
## Overview
|
6
|
+
|
7
|
+
**ruby-spacy** is a wrapper module for using [spaCy](https://spacy.io/) from the Ruby programming language via [PyCall](https://github.com/mrkn/pycall.rb). This module aims to make it easy and natural for Ruby programmers to use spaCy. This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
|
8
|
+
|
9
|
+
| | Functionality |
|
10
|
+
|:---|:---------------------------------------------------|
|
11
|
+
| ✅ | Tokenization, lemmatization, sentence segmentation |
|
12
|
+
| ✅ | Part-of-speech tagging and dependency parsing |
|
13
|
+
| ✅ | Named entity recognition |
|
14
|
+
| ✅ | Syntactic dependency visualization |
|
15
|
+
| ✅ | Access to pre-trained word vectors |
|
16
|
+
|
17
|
+
## Installation of prerequisites
|
18
|
+
|
19
|
+
Make sure that the `enable-shared` option is enabled in your Python installation. You can use [pyenv](https://github.com/pyenv/pyenv) to install any version of Python you like. Install Python 3.8.5, for instance, using pyenv with `enable-shared` as follows:
|
20
|
+
|
21
|
+
```shell
|
22
|
+
$ env CONFIGURE_OPTS="--enable-shared" pyenv install 3.8.5
|
23
|
+
```
|
24
|
+
|
25
|
+
Don't forget to make it accessible from your working directory.
|
26
|
+
|
27
|
+
```shell
|
28
|
+
$ pyenv local 3.8.5
|
29
|
+
```
|
30
|
+
|
31
|
+
Or alternatively:
|
32
|
+
|
33
|
+
```shell
|
34
|
+
$ pyenv global 3.8.5
|
35
|
+
```
|
36
|
+
|
37
|
+
Then, install [spaCy](https://spacy.io/). If you use `pip`, the following command will do:
|
38
|
+
|
39
|
+
```shell
|
40
|
+
$ pip install spacy
|
41
|
+
```
|
42
|
+
|
43
|
+
Install trained language models. For starters, `en_core_web_sm` will be the most useful to conduct basic text processing in English. However, if you want to use advanced features of spaCy, such as named entity recognition or document similarity calculation, you should also install a larger model like `en_core_web_lg`.
|
44
|
+
|
45
|
+
|
46
|
+
```shell
|
47
|
+
$ python -m spacy download en_core_web_sm
|
48
|
+
$ python -m spacy download en_core_web_lg
|
49
|
+
```
|
50
|
+
|
51
|
+
See [spaCy: Models & Languages](https://spacy.io/usage/models) for other models in various languages. To install models for the Japanese language, for instance, you can do it as follows:
|
52
|
+
|
53
|
+
```shell
|
54
|
+
$ python -m spacy download ja_core_news_sm
|
55
|
+
$ python -m spacy download ja_core_news_lg
|
56
|
+
```
|
57
|
+
|
58
|
+
## Installation of ruby-spacy
|
59
|
+
|
60
|
+
Add this line to your application's Gemfile:
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
gem 'ruby-spacy'
|
64
|
+
```
|
65
|
+
|
66
|
+
And then execute:
|
67
|
+
|
68
|
+
$ bundle install
|
69
|
+
|
70
|
+
Or install it yourself as:
|
71
|
+
|
72
|
+
$ gem install ruby-spacy
|
73
|
+
|
74
|
+
## Usage
|
75
|
+
|
76
|
+
See [Examples](#examples) below.
|
77
|
+
|
78
|
+
## Examples
|
79
|
+
|
80
|
+
Many of the following examples are Python-to-Ruby translations of code snippets in [spaCy 101](https://spacy.io/usage/spacy-101). For more examples, look inside the `examples` directory.
|
81
|
+
|
82
|
+
### Tokenization
|
83
|
+
|
84
|
+
→ [spaCy: Tokenization](https://spacy.io/usage/spacy-101#annotations-token)
|
85
|
+
|
86
|
+
Ruby code:
|
87
|
+
|
88
|
+
```ruby
|
89
|
+
require "ruby-spacy"
|
90
|
+
require "terminal-table"
|
91
|
+
|
92
|
+
nlp = Spacy::Language.new("en_core_web_sm")
|
93
|
+
|
94
|
+
doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
|
95
|
+
|
96
|
+
row = []
|
97
|
+
|
98
|
+
doc.each do |token|
|
99
|
+
row << token.text
|
100
|
+
end
|
101
|
+
|
102
|
+
headings = [1,2,3,4,5,6,7,8,9,10,11]
|
103
|
+
table = Terminal::Table.new rows: [row], headings: headings
|
104
|
+
|
105
|
+
puts table
|
106
|
+
```
|
107
|
+
|
108
|
+
Output:
|
109
|
+
|
110
|
+
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
|
111
|
+
|:-----:|:--:|:-------:|:--:|:------:|:----:|:-------:|:---:|:-:|:--:|:-------:|
|
112
|
+
| Apple | is | looking | at | buying | U.K. | startup | for | $ | 1 | billion |
|
113
|
+
|
114
|
+
### Part-of-speech tagging
|
115
|
+
|
116
|
+
→ [spaCy: Part-of-speech tags and dependencies](https://spacy.io/usage/spacy-101#annotations-pos-deps)
|
117
|
+
|
118
|
+
→ [POS and morphology tags](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py)
|
119
|
+
|
120
|
+
Ruby code:
|
121
|
+
|
122
|
+
```ruby
|
123
|
+
require "ruby-spacy"
|
124
|
+
require "terminal-table"
|
125
|
+
|
126
|
+
nlp = Spacy::Language.new("en_core_web_sm")
|
127
|
+
doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
|
128
|
+
|
129
|
+
rows = []
|
130
|
+
|
131
|
+
doc.each do |token|
|
132
|
+
rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
|
133
|
+
end
|
134
|
+
|
135
|
+
headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
|
136
|
+
table = Terminal::Table.new rows: rows, headings: headings
|
137
|
+
puts table
|
138
|
+
```
|
139
|
+
|
140
|
+
Output:
|
141
|
+
|
142
|
+
| text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
|
143
|
+
|:--------|:--------|:------|:----|:---------|:------|:---------|:--------|
|
144
|
+
| Apple | Apple | PROPN | NNP | nsubj | Xxxxx | true | false |
|
145
|
+
| is | be | AUX | VBZ | aux | xx | true | true |
|
146
|
+
| looking | look | VERB | VBG | ROOT | xxxx | true | false |
|
147
|
+
| at | at | ADP | IN | prep | xx | true | true |
|
148
|
+
| buying | buy | VERB | VBG | pcomp | xxxx | true | false |
|
149
|
+
| U.K. | U.K. | PROPN | NNP | dobj | X.X. | false | false |
|
150
|
+
| startup | startup | NOUN | NN | advcl | xxxx | true | false |
|
151
|
+
| for | for | ADP | IN | prep | xxx | true | true |
|
152
|
+
| $ | $ | SYM | $ | quantmod | $ | false | false |
|
153
|
+
| 1 | 1 | NUM | CD | compound | d | false | false |
|
154
|
+
| billion | billion | NUM | CD | pobj | xxxx | true | false |
|
155
|
+
|
156
|
+
### Part-of-speech tagging (Japanese)
|
157
|
+
|
158
|
+
Ruby code:
|
159
|
+
|
160
|
+
```ruby
|
161
|
+
require "ruby-spacy"
|
162
|
+
require "terminal-table"
|
163
|
+
|
164
|
+
nlp = Spacy::Language.new("ja_core_news_lg")
|
165
|
+
doc = nlp.read("任天堂は1983年にファミコンを14,800円で発売した。")
|
166
|
+
|
167
|
+
rows = []
|
168
|
+
|
169
|
+
doc.each do |token|
|
170
|
+
rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
|
171
|
+
end
|
172
|
+
|
173
|
+
headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
|
174
|
+
table = Terminal::Table.new rows: rows, headings: headings
|
175
|
+
puts table
|
176
|
+
```
|
177
|
+
|
178
|
+
Output:
|
179
|
+
|
180
|
+
| text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
|
181
|
+
|:-----------|:-----------|:------|:-------------------------|:-------|:-------|:---------|:--------|
|
182
|
+
| 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj | xxx | true | false |
|
183
|
+
| は | は | ADP | 助詞-係助詞 | case | x | true | true |
|
184
|
+
| 1983 | 1983 | NUM | 名詞-数詞 | nummod | dddd | false | false |
|
185
|
+
| 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
|
186
|
+
| に | に | ADP | 助詞-格助詞 | case | x | true | true |
|
187
|
+
| ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj | xxxx | true | false |
|
188
|
+
| を | を | ADP | 助詞-格助詞 | case | x | true | true |
|
189
|
+
| 14,800 | 14,800 | NUM | 名詞-数詞 | fixed | dd,ddd | false | false |
|
190
|
+
| 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
|
191
|
+
| で | で | ADP | 助詞-格助詞 | case | x | true | true |
|
192
|
+
| 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT | xx | true | false |
|
193
|
+
| し | する | AUX | 動詞-非自立可能 | aux | x | true | true |
|
194
|
+
| た | た | AUX | 助動詞 | aux | x | true | true |
|
195
|
+
| 。 | 。 | PUNCT | 補助記号-句点 | punct | 。 | false | false |
|
196
|
+
|
197
|
+
### Visualizing dependency
|
198
|
+
|
199
|
+
→ [spaCy: Visualizers](https://spacy.io/usage/visualizers)
|
200
|
+
|
201
|
+
Ruby code:
|
202
|
+
|
203
|
+
```ruby
|
204
|
+
require "ruby-spacy"
|
205
|
+
|
206
|
+
nlp = Spacy::Language.new("en_core_web_sm")
|
207
|
+
|
208
|
+
sentence = "Autonomous cars shift insurance liability toward manufacturers"
|
209
|
+
doc = nlp.read(sentence)
|
210
|
+
|
211
|
+
dep_svg = doc.displacy(style: "dep", compact: false)
|
212
|
+
|
213
|
+
File.open(File.join("test_dep.svg"), "w") do |file|
|
214
|
+
file.write(dep_svg)
|
215
|
+
end
|
216
|
+
```
|
217
|
+
|
218
|
+
Output:
|
219
|
+
|
220
|
+
![](https://github.com/yohasebe/ruby-spacy/blob/main/examples/get_started/outputs/test_dep.svg)
|
221
|
+
|
222
|
+
### Visualizing dependency (compact)
|
223
|
+
|
224
|
+
Ruby code:
|
225
|
+
|
226
|
+
```ruby
|
227
|
+
require "ruby-spacy"
|
228
|
+
|
229
|
+
nlp = Spacy::Language.new("en_core_web_sm")
|
230
|
+
|
231
|
+
sentence = "Autonomous cars shift insurance liability toward manufacturers"
|
232
|
+
doc = nlp.read(sentence)
|
233
|
+
|
234
|
+
dep_svg = doc.displacy(style: "dep", compact: true)
|
235
|
+
|
236
|
+
File.open(File.join("test_dep_compact.svg"), "w") do |file|
|
237
|
+
file.write(dep_svg)
|
238
|
+
end
|
239
|
+
```
|
240
|
+
|
241
|
+
Output:
|
242
|
+
|
243
|
+
![](https://github.com/yohasebe/ruby-spacy/blob/main/examples/get_started/outputs/test_dep_compact.svg)
|
244
|
+
|
245
|
+
### Named entity recognition
|
246
|
+
|
247
|
+
→ [spaCy: Named entities](https://spacy.io/usage/spacy-101#annotations-ner)
|
248
|
+
|
249
|
+
Ruby code:
|
250
|
+
|
251
|
+
```ruby
|
252
|
+
require "ruby-spacy"
|
253
|
+
require "terminal-table"
|
254
|
+
|
255
|
+
nlp = Spacy::Language.new("en_core_web_sm")
|
256
|
+
doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
|
257
|
+
|
258
|
+
rows = []
|
259
|
+
|
260
|
+
doc.ents.each do |ent|
|
261
|
+
rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
|
262
|
+
end
|
263
|
+
|
264
|
+
headings = ["text", "start_char", "end_char", "label"]
|
265
|
+
table = Terminal::Table.new rows: rows, headings: headings
|
266
|
+
puts table
|
267
|
+
```
|
268
|
+
|
269
|
+
Output:
|
270
|
+
|
271
|
+
| text | start_char | end_char | label |
|
272
|
+
|:-----------|-----------:|---------:|:------|
|
273
|
+
| Apple | 0 | 5 | ORG |
|
274
|
+
| U.K. | 27 | 31 | GPE |
|
275
|
+
| $1 billion | 44 | 54 | MONEY |
|
276
|
+
|
277
|
+
### Named entity recognition (Japanese)
|
278
|
+
|
279
|
+
Ruby code:
|
280
|
+
|
281
|
+
```ruby
|
282
|
+
require "ruby-spacy"
|
283
|
+
require "terminal-table"
|
284
|
+
|
285
|
+
nlp = Spacy::Language.new("ja_core_news_lg")
|
286
|
+
|
287
|
+
sentence = "任天堂は1983年にファミコンを14,800円で発売した。"
|
288
|
+
doc = nlp.read(sentence)
|
289
|
+
|
290
|
+
rows = []
|
291
|
+
|
292
|
+
doc.ents.each do |ent|
|
293
|
+
rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
|
294
|
+
end
|
295
|
+
|
296
|
+
headings = ["text", "start", "end", "label"]
|
297
|
+
table = Terminal::Table.new rows: rows, headings: headings
|
298
|
+
print table
|
299
|
+
```
|
300
|
+
|
301
|
+
Output:
|
302
|
+
|
303
|
+
| text | start | end | label |
|
304
|
+
|:-----------|------:|----:|:--------|
|
305
|
+
| 任天堂 | 0 | 3 | ORG |
|
306
|
+
| 1983年 | 4 | 9 | DATE |
|
307
|
+
| ファミコン | 10 | 15 | PRODUCT |
|
308
|
+
| 14,800円 | 16 | 23 | MONEY |
|
309
|
+
|
310
|
+
### Checking availability of word vectors
|
311
|
+
|
312
|
+
→ [spaCy: Word vectors and similarity](https://spacy.io/usage/spacy-101#vectors-similarity)
|
313
|
+
|
314
|
+
Ruby code:
|
315
|
+
|
316
|
+
```ruby
|
317
|
+
require "ruby-spacy"
|
318
|
+
require "terminal-table"
|
319
|
+
|
320
|
+
nlp = Spacy::Language.new("en_core_web_lg")
|
321
|
+
doc = nlp.read("dog cat banana afskfsd")
|
322
|
+
|
323
|
+
rows = []
|
324
|
+
|
325
|
+
doc.each do |token|
|
326
|
+
rows << [token.text, token.has_vector, token.vector_norm, token.is_oov]
|
327
|
+
end
|
328
|
+
|
329
|
+
headings = ["text", "has_vector", "vector_norm", "is_oov"]
|
330
|
+
table = Terminal::Table.new rows: rows, headings: headings
|
331
|
+
puts table
|
332
|
+
```
|
333
|
+
|
334
|
+
Output:
|
335
|
+
|
336
|
+
| text | has_vector | vector_norm | is_oov |
|
337
|
+
|:--------|:-----------|:------------|:-------|
|
338
|
+
| dog | true | 7.0336733 | false |
|
339
|
+
| cat | true | 6.6808186 | false |
|
340
|
+
| banana | true | 6.700014 | false |
|
341
|
+
| afskfsd | false | 0.0 | true |
|
342
|
+
|
343
|
+
### Similarity calculation
|
344
|
+
|
345
|
+
Ruby code:
|
346
|
+
|
347
|
+
```ruby
|
348
|
+
require "ruby-spacy"
|
349
|
+
|
350
|
+
nlp = Spacy::Language.new("en_core_web_lg")
|
351
|
+
doc1 = nlp.read("I like salty fries and hamburgers.")
|
352
|
+
doc2 = nlp.read("Fast food tastes very good.")
|
353
|
+
|
354
|
+
puts "Doc 1: " + doc1.text
|
355
|
+
puts "Doc 2: " + doc2.text
|
356
|
+
puts "Similarity: #{doc1.similarity(doc2)}"
|
357
|
+
|
358
|
+
```
|
359
|
+
|
360
|
+
Output:
|
361
|
+
|
362
|
+
```text
|
363
|
+
Doc 1: I like salty fries and hamburgers.
|
364
|
+
Doc 2: Fast food tastes very good.
|
365
|
+
Similarity: 0.7687607012190486
|
366
|
+
```
|
367
|
+
|
368
|
+
### Similarity calculation (Japanese)
|
369
|
+
|
370
|
+
Ruby code:
|
371
|
+
|
372
|
+
```ruby
|
373
|
+
require "ruby-spacy"
|
374
|
+
|
375
|
+
nlp = Spacy::Language.new("ja_core_news_lg")
|
376
|
+
ja_doc1 = nlp.read("今日は雨ばっかり降って、嫌な天気ですね。")
|
377
|
+
puts "doc1: #{ja_doc1.text}"
|
378
|
+
ja_doc2 = nlp.read("あいにくの悪天候で残念です。")
|
379
|
+
puts "doc2: #{ja_doc2.text}"
|
380
|
+
puts "Similarity: #{ja_doc1.similarity(ja_doc2)}"
|
381
|
+
```
|
382
|
+
|
383
|
+
Output:
|
384
|
+
|
385
|
+
```text
|
386
|
+
doc1: 今日は雨ばっかり降って、嫌な天気ですね。
|
387
|
+
doc2: あいにくの悪天候で残念です。
|
388
|
+
Similarity: 0.8684192637149641
|
389
|
+
```
|
390
|
+
|
391
|
+
### Word vector calculation
|
392
|
+
|
393
|
+
**Tokyo - Japan + France = Paris ?**
|
394
|
+
|
395
|
+
Ruby code:
|
396
|
+
|
397
|
+
```ruby
|
398
|
+
require "ruby-spacy"
|
399
|
+
require "terminal-table"
|
400
|
+
|
401
|
+
nlp = Spacy::Language.new("en_core_web_lg")
|
402
|
+
|
403
|
+
tokyo = nlp.get_lexeme("Tokyo")
|
404
|
+
japan = nlp.get_lexeme("Japan")
|
405
|
+
france = nlp.get_lexeme("France")
|
406
|
+
|
407
|
+
query = tokyo.vector - japan.vector + france.vector
|
408
|
+
|
409
|
+
rows = []
|
410
|
+
|
411
|
+
results = nlp.most_similar(query, 10)
|
412
|
+
results.each do |lexeme|
|
413
|
+
rows << [lexeme[:key], lexeme[:text], lexeme[:score],]
|
414
|
+
end
|
415
|
+
|
416
|
+
headings = ["key", "text", "score"]
|
417
|
+
table = Terminal::Table.new rows: rows, headings: headings
|
418
|
+
puts table
|
419
|
+
```
|
420
|
+
|
421
|
+
Output:
|
422
|
+
|
423
|
+
| key | text | score |
|
424
|
+
|:---------------------|:------------|:-------------------|
|
425
|
+
| 1432967385481565694 | FRANCE | 0.8346999883651733 |
|
426
|
+
| 6613816697677965370 | France | 0.8346999883651733 |
|
427
|
+
| 4362406852232399325 | france | 0.8346999883651733 |
|
428
|
+
| 1637573253267610771 | PARIS | 0.7703999876976013 |
|
429
|
+
| 15322182186497800017 | paris | 0.7703999876976013 |
|
430
|
+
| 10427160276079242800 | Paris | 0.7703999876976013 |
|
431
|
+
| 975948890941980630 | TOULOUSE | 0.6381999850273132 |
|
432
|
+
| 7944504257273452052 | Toulouse | 0.6381999850273132 |
|
433
|
+
| 9614730213792621885 | toulouse | 0.6381999850273132 |
|
434
|
+
| 8515538464606421210 | marseille | 0.6370999813079834 |
|
435
|
+
|
436
|
+
|
437
|
+
### Word vector calculation (Japanese)
|
438
|
+
|
439
|
+
**東京 - 日本 + フランス = パリ ?**
|
440
|
+
|
441
|
+
Ruby code:
|
442
|
+
|
443
|
+
```ruby
|
444
|
+
require "ruby-spacy"
|
445
|
+
require "terminal-table"
|
446
|
+
|
447
|
+
nlp = Spacy::Language.new("ja_core_news_lg")
|
448
|
+
|
449
|
+
tokyo = nlp.get_lexeme("東京")
|
450
|
+
japan = nlp.get_lexeme("日本")
|
451
|
+
france = nlp.get_lexeme("フランス")
|
452
|
+
|
453
|
+
query = tokyo.vector - japan.vector + france.vector
|
454
|
+
|
455
|
+
rows = []
|
456
|
+
|
457
|
+
results = nlp.most_similar(query, 10)
|
458
|
+
results.each do |lexeme|
|
459
|
+
rows << [lexeme[:key], lexeme[:text], lexeme[:score],]
|
460
|
+
end
|
461
|
+
|
462
|
+
headings = ["key", "text", "score"]
|
463
|
+
table = Terminal::Table.new rows: rows, headings: headings
|
464
|
+
puts table
|
465
|
+
```
|
466
|
+
|
467
|
+
Output:
|
468
|
+
|
469
|
+
| key | text | score |
|
470
|
+
|:---------------------|:---------------|:-------------------|
|
471
|
+
| 12090003238699662352 | パリ | 0.7376999855041504 |
|
472
|
+
| 18290786970454458111 | フランス | 0.7221999764442444 |
|
473
|
+
| 9360021637096476946 | 東京 | 0.6697999835014343 |
|
474
|
+
| 2437546359230213520 | ストラスブール | 0.631600022315979 |
|
475
|
+
| 13988178952745813186 | リヨン | 0.5939000248908997 |
|
476
|
+
| 10427160276079242800 | Paris | 0.574400007724762 |
|
477
|
+
| 5562396768860926997 | ベルギー | 0.5683000087738037 |
|
478
|
+
| 15029176915627965481 | ニース | 0.5679000020027161 |
|
479
|
+
| 9750625950625019690 | アルザス | 0.5644999742507935 |
|
480
|
+
| 2381640614569534741 | 南仏 | 0.5547999739646912 |
|
481
|
+
|
482
|
+
|
483
|
+
## Author
|
484
|
+
|
485
|
+
Yoichiro Hasebe [<yohasebe@gmail.com>]
|
486
|
+
|
487
|
+
|
488
|
+
## Acknowledgments
|
489
|
+
|
490
|
+
I would like to thank the following open source projects and their creators for making this project possible:
|
491
|
+
|
492
|
+
- [explosion/spaCy](https://github.com/explosion/spaCy)
|
493
|
+
- [mrkn/pycall.rb](https://github.com/mrkn/pycall.rb)
|
494
|
+
|
495
|
+
## License
|
496
|
+
|
497
|
+
This library is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
498
|
+
|