ruby-spacy 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2cdb24ba1156e16b0cd14809b4a4ea0fe832257eed4060e6eba0d55314849151
4
- data.tar.gz: ba2d9c1957f1b650cf0a8902db6b3e901762ba94470c0443debf8b68a8b5f8c0
3
+ metadata.gz: 9add9d3b065bbf5064652cb115f824221d929a20478d182782df5db564cc8f45
4
+ data.tar.gz: f07d502f79883a452e7f250f0fe784425511a0de4f8a43db0b29ca03801bd755
5
5
  SHA512:
6
- metadata.gz: 68f4acdf7375c8bb4107681f3425a6b16ae3544c8c46c6f80cdb37643fe5c2fed2b6a2cac738325d0b3a5f9605495ac9230fa11090a167d6e8efc9d59066d88b
7
- data.tar.gz: 8eb877bea7a8b5d8f699cbf6637797b8d5bc4e6c6dcea228a5db0c7f56fa7add1c44b6e587ee212837124015ed3e0512fdf6f9015cbf7090f6d87bd7d19f4842
6
+ metadata.gz: 373c795a148034f4191cfaf130a23f464dc2b43927bf6aa3165999c78797365ce2f976021ea8b9ab1dd083736e5f9a1da51a5ccf0156d00ec39dac9fd19bde7c
7
+ data.tar.gz: e370e503c23d15a0a44be84bf578775b0a4acc5557468c7fc9468cde44e0e084018be8dc17c3e7c21d9efdaf229611ca234614fcd2e811272051c7c2922b408d
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ruby-spacy (0.1.0)
4
+ ruby-spacy (0.1.2)
5
5
  numpy (~> 0.4.0)
6
6
  pycall (~> 1.4.0)
7
7
  terminal-table (~> 3.0.1)
@@ -23,6 +23,7 @@ GEM
23
23
 
24
24
  PLATFORMS
25
25
  arm64-darwin-20
26
+ x86_64-darwin-20
26
27
 
27
28
  DEPENDENCIES
28
29
  github-markup
data/README.md CHANGED
@@ -111,12 +111,10 @@ Output:
111
111
  |:-----:|:--:|:-------:|:--:|:------:|:----:|:-------:|:---:|:-:|:--:|:-------:|
112
112
  | Apple | is | looking | at | buying | U.K. | startup | for | $ | 1 | billion |
113
113
 
114
- ### Part-of-speech tagging
114
+ ### Part-of-speech and dependency
115
115
 
116
116
  → [spaCy: Part-of-speech tags and dependencies](https://spacy.io/usage/spacy-101#annotations-pos-deps)
117
117
 
118
- → [POS and morphology tags](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py)
119
-
120
118
  Ruby code:
121
119
 
122
120
  ```ruby
@@ -126,73 +124,117 @@ require "terminal-table"
126
124
  nlp = Spacy::Language.new("en_core_web_sm")
127
125
  doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
128
126
 
127
+ headings = ["text", "lemma", "pos", "tag", "dep"]
129
128
  rows = []
130
129
 
131
130
  doc.each do |token|
132
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
131
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
133
132
  end
134
133
 
135
- headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
136
134
  table = Terminal::Table.new rows: rows, headings: headings
137
135
  puts table
138
136
  ```
139
137
 
140
138
  Output:
141
139
 
142
- | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
143
- |:--------|:--------|:------|:----|:---------|:------|:---------|:--------|
144
- | Apple | Apple | PROPN | NNP | nsubj | Xxxxx | true | false |
145
- | is | be | AUX | VBZ | aux | xx | true | true |
146
- | looking | look | VERB | VBG | ROOT | xxxx | true | false |
147
- | at | at | ADP | IN | prep | xx | true | true |
148
- | buying | buy | VERB | VBG | pcomp | xxxx | true | false |
149
- | U.K. | U.K. | PROPN | NNP | dobj | X.X. | false | false |
150
- | startup | startup | NOUN | NN | advcl | xxxx | true | false |
151
- | for | for | ADP | IN | prep | xxx | true | true |
152
- | $ | $ | SYM | $ | quantmod | $ | false | false |
153
- | 1 | 1 | NUM | CD | compound | d | false | false |
154
- | billion | billion | NUM | CD | pobj | xxxx | true | false |
155
-
156
- ### Part-of-speech tagging (Japanese)
140
+ | text | lemma | pos | tag | dep |
141
+ |:--------|:--------|:------|:----|:---------|
142
+ | Apple | Apple | PROPN | NNP | nsubj |
143
+ | is | be | AUX | VBZ | aux |
144
+ | looking | look | VERB | VBG | ROOT |
145
+ | at | at | ADP | IN | prep |
146
+ | buying | buy | VERB | VBG | pcomp |
147
+ | U.K. | U.K. | PROPN | NNP | dobj |
148
+ | startup | startup | NOUN | NN | advcl |
149
+ | for | for | ADP | IN | prep |
150
+ | $ | $ | SYM | $ | quantmod |
151
+ | 1 | 1 | NUM | CD | compound |
152
+ | billion | billion | NUM | CD | pobj |
153
+
154
+ ### Part-of-speech and dependency (Japanese)
157
155
 
158
156
  Ruby code:
159
157
 
160
158
  ```ruby
161
- require( "ruby-spacy")
159
+ require "ruby-spacy"
162
160
  require "terminal-table"
163
161
 
164
162
  nlp = Spacy::Language.new("ja_core_news_lg")
165
- doc = nlp.read("任天堂は1983年にファミリー・コンピュータを14,800円で発売した。")
163
+ doc = nlp.read("任天堂は1983年にファミコンを14,800円で発売した。")
166
164
 
165
+ headings = ["text", "lemma", "pos", "tag", "dep"]
167
166
  rows = []
168
167
 
169
168
  doc.each do |token|
170
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
169
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
171
170
  end
172
171
 
173
- headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
174
172
  table = Terminal::Table.new rows: rows, headings: headings
175
173
  puts table
176
174
  ```
177
175
 
178
176
  Output:
179
177
 
180
- | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
181
- |:-----------|:-----------|:------|:-------------------------|:-------|:-------|:---------|:--------|
182
- | 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj | xxx | true | false |
183
- | は | は | ADP | 助詞-係助詞 | case | x | true | true |
184
- | 1983 | 1983 | NUM | 名詞-数詞 | nummod | dddd | false | false |
185
- | 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
186
- | に | に | ADP | 助詞-格助詞 | case | x | true | true |
187
- | ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj | xxxx | true | false |
188
- | を | を | ADP | 助詞-格助詞 | case | x | true | true |
189
- | 14,800 | 14,800 | NUM | 名詞-数詞 | fixed | dd,ddd | false | false |
190
- | 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
191
- | で | で | ADP | 助詞-格助詞 | case | x | true | true |
192
- | 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT | xx | true | false |
193
- | し | する | AUX | 動詞-非自立可能 | aux | x | true | true |
194
- | た | た | AUX | 助動詞 | aux | x | true | true |
195
- | 。 | 。 | PUNCT | 補助記号-句点 | punct | 。 | false | false |
178
+ | text | lemma | pos | tag | dep |
179
+ |:-----------|:-----------|:------|:-------------------------|:-------|
180
+ | 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj |
181
+ | は | は | ADP | 助詞-係助詞 | case |
182
+ | 1983 | 1983 | NUM | 名詞-数詞 | nummod |
183
+ | 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl |
184
+ | に | に | ADP | 助詞-格助詞 | case |
185
+ | ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj |
186
+ | を | を | ADP | 助詞-格助詞 | case |
187
+ | 14,800 | 14,800 | NUM | 名詞-数詞 | fixed |
188
+ | 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl |
189
+ | で | で | ADP | 助詞-格助詞 | case |
190
+ | 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT |
191
+ | し | する | AUX | 動詞-非自立可能 | aux |
192
+ | た | た | AUX | 助動詞 | aux |
193
+ | 。 | 。 | PUNCT | 補助記号-句点 | punct |
194
+
195
+ ### Morphology
196
+
197
+ → [POS and morphology tags](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py)
198
+
199
+ Ruby code:
200
+
201
+ ```ruby
202
+ require "ruby-spacy"
203
+ require "terminal-table"
204
+
205
+ nlp = Spacy::Language.new("en_core_web_sm")
206
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
207
+
208
+ headings = ["text", "shape", "is_alpha", "is_stop", "morphology"]
209
+ rows = []
210
+
211
+ doc.each do |token|
212
+ morph = token.morphology.map do |k, v|
213
+ "#{k} = #{v}"
214
+ end.join("\n")
215
+ rows << [token.text, token.shape_, token.is_alpha, token.is_stop, morph]
216
+ end
217
+
218
+ table = Terminal::Table.new rows: rows, headings: headings
219
+ puts table
220
+
221
+ ```
222
+
223
+ Output:
224
+
225
+ | text | shape | is_alpha | is_stop | morphology |
226
+ |:--------|:------|:---------|:--------|:------------------------------------------------------------------------------------|
227
+ | Apple | Xxxxx | true | false | NounType = Prop<br />Number = Sing |
228
+ | is | xx | true | true | Mood = Ind<br />Number = Sing<br />Person = 3<br />Tense = Pres<br />VerbForm = Fin |
229
+ | looking | xxxx | true | false | Aspect = Prog<br />Tense = Pres<br />VerbForm = Part |
230
+ | at | xx | true | true | |
231
+ | buying | xxxx | true | false | Aspect = Prog<br />Tense = Pres<br />VerbForm = Part |
232
+ | U.K. | X.X. | false | false | NounType = Prop<br />Number = Sing |
233
+ | startup | xxxx | true | false | Number = Sing |
234
+ | for | xxx | true | true | |
235
+ | $ | $ | false | false | |
236
+ | 1 | d | false | false | NumType = Card |
237
+ | billion | xxxx | true | false | NumType = Card |
196
238
 
197
239
  ### Visualizing dependency
198
240
 
@@ -0,0 +1,45 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
6
+
7
+ headings = ["text", "shape", "is_alpha", "is_stop", "morphology"]
8
+ rows = []
9
+
10
+ doc.each do |token|
11
+ morph = token.morphology.map do |k, v|
12
+ "#{k} = #{v}"
13
+ end.join("\n")
14
+ # end.join("<br />")
15
+ rows << [token.text, token.shape_, token.is_alpha, token.is_stop, morph]
16
+ end
17
+
18
+ table = Terminal::Table.new rows: rows, headings: headings
19
+ puts table
20
+
21
+ # +---------+-------+----------+---------+-----------------+
22
+ # | text | shape | is_alpha | is_stop | morphology |
23
+ # +---------+-------+----------+---------+-----------------+
24
+ # | Apple | Xxxxx | true | false | NounType = Prop |
25
+ # | | | | | Number = Sing |
26
+ # | is | xx | true | true | Mood = Ind |
27
+ # | | | | | Number = Sing |
28
+ # | | | | | Person = 3 |
29
+ # | | | | | Tense = Pres |
30
+ # | | | | | VerbForm = Fin |
31
+ # | looking | xxxx | true | false | Aspect = Prog |
32
+ # | | | | | Tense = Pres |
33
+ # | | | | | VerbForm = Part |
34
+ # | at | xx | true | true | |
35
+ # | buying | xxxx | true | false | Aspect = Prog |
36
+ # | | | | | Tense = Pres |
37
+ # | | | | | VerbForm = Part |
38
+ # | U.K. | X.X. | false | false | NounType = Prop |
39
+ # | | | | | Number = Sing |
40
+ # | startup | xxxx | true | false | Number = Sing |
41
+ # | for | xxx | true | true | |
42
+ # | $ | $ | false | false | |
43
+ # | 1 | d | false | false | NumType = Card |
44
+ # | billion | xxxx | true | false | NumType = Card |
45
+ # +---------+-------+----------+---------+-----------------+
@@ -4,28 +4,28 @@ require "terminal-table"
4
4
  nlp = Spacy::Language.new("en_core_web_sm")
5
5
  doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
6
6
 
7
- headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
7
+ headings = ["text", "lemma", "pos", "tag", "dep"]
8
8
  rows = []
9
9
 
10
10
  doc.each do |token|
11
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
11
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
12
12
  end
13
13
 
14
14
  table = Terminal::Table.new rows: rows, headings: headings
15
15
  puts table
16
16
 
17
- # +---------+---------+-------+-----+----------+-------+----------+---------+
18
- # | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
19
- # +---------+---------+-------+-----+----------+-------+----------+---------+
20
- # | Apple | Apple | PROPN | NNP | nsubj | Xxxxx | true | false |
21
- # | is | be | AUX | VBZ | aux | xx | true | true |
22
- # | looking | look | VERB | VBG | ROOT | xxxx | true | false |
23
- # | at | at | ADP | IN | prep | xx | true | true |
24
- # | buying | buy | VERB | VBG | pcomp | xxxx | true | false |
25
- # | U.K. | U.K. | PROPN | NNP | dobj | X.X. | false | false |
26
- # | startup | startup | NOUN | NN | advcl | xxxx | true | false |
27
- # | for | for | ADP | IN | prep | xxx | true | true |
28
- # | $ | $ | SYM | $ | quantmod | $ | false | false |
29
- # | 1 | 1 | NUM | CD | compound | d | false | false |
30
- # | billion | billion | NUM | CD | pobj | xxxx | true | false |
31
- # +---------+---------+-------+-----+----------+-------+----------+---------+
17
+ # +---------+---------+-------+-----+----------+
18
+ # | text | lemma | pos | tag | dep |
19
+ # +---------+---------+-------+-----+----------+
20
+ # | Apple | Apple | PROPN | NNP | nsubj |
21
+ # | is | be | AUX | VBZ | aux |
22
+ # | looking | look | VERB | VBG | ROOT |
23
+ # | at | at | ADP | IN | prep |
24
+ # | buying | buy | VERB | VBG | pcomp |
25
+ # | U.K. | U.K. | PROPN | NNP | dobj |
26
+ # | startup | startup | NOUN | NN | advcl |
27
+ # | for | for | ADP | IN | prep |
28
+ # | $ | $ | SYM | $ | quantmod |
29
+ # | 1 | 1 | NUM | CD | compound |
30
+ # | billion | billion | NUM | CD | pobj |
31
+ # +---------+---------+-------+-----+----------+
@@ -4,31 +4,31 @@ require "terminal-table"
4
4
  nlp = Spacy::Language.new("ja_core_news_lg")
5
5
  doc = nlp.read("任天堂は1983年にファミコンを14,800円で発売した。")
6
6
 
7
- headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
7
+ headings = ["text", "lemma", "pos", "tag", "dep"]
8
8
  rows = []
9
9
 
10
10
  doc.each do |token|
11
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
11
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
12
12
  end
13
13
 
14
14
  table = Terminal::Table.new rows: rows, headings: headings
15
15
  puts table
16
16
 
17
- # +------------+------------+-------+--------------------------+--------+--------+----------+---------+
18
- # | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
19
- # +------------+------------+-------+--------------------------+--------+--------+----------+---------+
20
- # | 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj | xxx | true | false |
21
- # | は | は | ADP | 助詞-係助詞 | case | x | true | true |
22
- # | 1983 | 1983 | NUM | 名詞-数詞 | nummod | dddd | false | false |
23
- # | 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
24
- # | に | に | ADP | 助詞-格助詞 | case | x | true | true |
25
- # | ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj | xxxx | true | false |
26
- # | を | を | ADP | 助詞-格助詞 | case | x | true | true |
27
- # | 14,800 | 14,800 | NUM | 名詞-数詞 | fixed | dd,ddd | false | false |
28
- # | 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
29
- # | で | で | ADP | 助詞-格助詞 | case | x | true | true |
30
- # | 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT | xx | true | false |
31
- # | し | する | AUX | 動詞-非自立可能 | aux | x | true | true |
32
- # | た | た | AUX | 助動詞 | aux | x | true | true |
33
- # | 。 | 。 | PUNCT | 補助記号-句点 | punct | 。 | false | false |
34
- # +------------+------------+-------+--------------------------+--------+--------+----------+---------+
17
+ # +------------+------------+-------+--------------------------+--------+
18
+ # | text | lemma | pos | tag | dep |
19
+ # +------------+------------+-------+--------------------------+--------+
20
+ # | 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj |
21
+ # | は | は | ADP | 助詞-係助詞 | case |
22
+ # | 1983 | 1983 | NUM | 名詞-数詞 | nummod |
23
+ # | 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl |
24
+ # | に | に | ADP | 助詞-格助詞 | case |
25
+ # | ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj |
26
+ # | を | を | ADP | 助詞-格助詞 | case |
27
+ # | 14,800 | 14,800 | NUM | 名詞-数詞 | fixed |
28
+ # | 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl |
29
+ # | で | で | ADP | 助詞-格助詞 | case |
30
+ # | 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT |
31
+ # | し | する | AUX | 動詞-非自立可能 | aux |
32
+ # | た | た | AUX | 助動詞 | aux |
33
+ # | 。 | 。 | PUNCT | 補助記号-句点 | punct |
34
+ # +------------+------------+-------+--------------------------+--------+
data/lib/ruby-spacy.rb CHANGED
@@ -252,6 +252,26 @@ module Spacy
252
252
  @text
253
253
  end
254
254
 
255
+ # Returns a hash or string of morphological information
256
+ # @param hash [Boolean] if true, a hash will be returned instead of a string
257
+ # @return [Hash, String]
258
+ def morphology(hash = true)
259
+ if @py_token.has_morph
260
+ morph_analysis = @py_token.morph
261
+ if hash
262
+ return morph_analysis.to_dict
263
+ else
264
+ return morph_analysis.to_s
265
+ end
266
+ else
267
+ if hash
268
+ results = {}
269
+ else
270
+ return ""
271
+ end
272
+ end
273
+ end
274
+
255
275
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
256
276
  def method_missing(name, *args)
257
277
  @py_token.send(name, *args)
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Spacy
4
4
  # The version number of the module
5
- VERSION = "0.1.1"
5
+ VERSION = "0.1.2"
6
6
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-spacy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
@@ -75,6 +75,7 @@ files:
75
75
  - bin/setup
76
76
  - examples/get_started/lexeme.rb
77
77
  - examples/get_started/linguistic_annotations.rb
78
+ - examples/get_started/morphology.rb
78
79
  - examples/get_started/most_similar.rb
79
80
  - examples/get_started/named_entities.rb
80
81
  - examples/get_started/outputs/test_dep.svg
@@ -111,7 +112,6 @@ files:
111
112
  - examples/linguistic_features/iterating_children.rb
112
113
  - examples/linguistic_features/iterating_lefts_and_rights.rb
113
114
  - examples/linguistic_features/lemmatization.rb
114
- - examples/linguistic_features/morphology.rb
115
115
  - examples/linguistic_features/named_entity_recognition.rb
116
116
  - examples/linguistic_features/navigating_parse_tree.rb
117
117
  - examples/linguistic_features/noun_chunks.rb
@@ -149,7 +149,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
149
149
  - !ruby/object:Gem::Version
150
150
  version: '0'
151
151
  requirements: []
152
- rubygems_version: 3.2.3
152
+ rubygems_version: 3.2.11
153
153
  signing_key:
154
154
  specification_version: 4
155
155
  summary: A wrapper module for using spaCy natural language processing library from
@@ -1,17 +0,0 @@
1
- require "ruby-spacy"
2
- require "terminal-table"
3
-
4
- nlp = Spacy::Language.new("en_core_web_sm")
5
-
6
- puts "Pipeline: " + nlp.pipe_names.to_s
7
-
8
- doc = nlp.read("I was reading the paper.")
9
-
10
- token = doc[0]
11
-
12
- puts "Morph features of the first word: " + token.morph.to_s
13
- puts "PronType of the word: " + token.morph.get("PronType").to_s
14
-
15
- # Pipeline: ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
16
- # Morph features of the first word: Case=Nom|Number=Sing|Person=1|PronType=Prs
17
- # PronType of the word: ['Prs']