ruby-spacy 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2cdb24ba1156e16b0cd14809b4a4ea0fe832257eed4060e6eba0d55314849151
4
- data.tar.gz: ba2d9c1957f1b650cf0a8902db6b3e901762ba94470c0443debf8b68a8b5f8c0
3
+ metadata.gz: 9add9d3b065bbf5064652cb115f824221d929a20478d182782df5db564cc8f45
4
+ data.tar.gz: f07d502f79883a452e7f250f0fe784425511a0de4f8a43db0b29ca03801bd755
5
5
  SHA512:
6
- metadata.gz: 68f4acdf7375c8bb4107681f3425a6b16ae3544c8c46c6f80cdb37643fe5c2fed2b6a2cac738325d0b3a5f9605495ac9230fa11090a167d6e8efc9d59066d88b
7
- data.tar.gz: 8eb877bea7a8b5d8f699cbf6637797b8d5bc4e6c6dcea228a5db0c7f56fa7add1c44b6e587ee212837124015ed3e0512fdf6f9015cbf7090f6d87bd7d19f4842
6
+ metadata.gz: 373c795a148034f4191cfaf130a23f464dc2b43927bf6aa3165999c78797365ce2f976021ea8b9ab1dd083736e5f9a1da51a5ccf0156d00ec39dac9fd19bde7c
7
+ data.tar.gz: e370e503c23d15a0a44be84bf578775b0a4acc5557468c7fc9468cde44e0e084018be8dc17c3e7c21d9efdaf229611ca234614fcd2e811272051c7c2922b408d
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ruby-spacy (0.1.0)
4
+ ruby-spacy (0.1.2)
5
5
  numpy (~> 0.4.0)
6
6
  pycall (~> 1.4.0)
7
7
  terminal-table (~> 3.0.1)
@@ -23,6 +23,7 @@ GEM
23
23
 
24
24
  PLATFORMS
25
25
  arm64-darwin-20
26
+ x86_64-darwin-20
26
27
 
27
28
  DEPENDENCIES
28
29
  github-markup
data/README.md CHANGED
@@ -111,12 +111,10 @@ Output:
111
111
  |:-----:|:--:|:-------:|:--:|:------:|:----:|:-------:|:---:|:-:|:--:|:-------:|
112
112
  | Apple | is | looking | at | buying | U.K. | startup | for | $ | 1 | billion |
113
113
 
114
- ### Part-of-speech tagging
114
+ ### Part-of-speech and dependency
115
115
 
116
116
  → [spaCy: Part-of-speech tags and dependencies](https://spacy.io/usage/spacy-101#annotations-pos-deps)
117
117
 
118
- → [POS and morphology tags](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py)
119
-
120
118
  Ruby code:
121
119
 
122
120
  ```ruby
@@ -126,73 +124,117 @@ require "terminal-table"
126
124
  nlp = Spacy::Language.new("en_core_web_sm")
127
125
  doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
128
126
 
127
+ headings = ["text", "lemma", "pos", "tag", "dep"]
129
128
  rows = []
130
129
 
131
130
  doc.each do |token|
132
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
131
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
133
132
  end
134
133
 
135
- headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
136
134
  table = Terminal::Table.new rows: rows, headings: headings
137
135
  puts table
138
136
  ```
139
137
 
140
138
  Output:
141
139
 
142
- | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
143
- |:--------|:--------|:------|:----|:---------|:------|:---------|:--------|
144
- | Apple | Apple | PROPN | NNP | nsubj | Xxxxx | true | false |
145
- | is | be | AUX | VBZ | aux | xx | true | true |
146
- | looking | look | VERB | VBG | ROOT | xxxx | true | false |
147
- | at | at | ADP | IN | prep | xx | true | true |
148
- | buying | buy | VERB | VBG | pcomp | xxxx | true | false |
149
- | U.K. | U.K. | PROPN | NNP | dobj | X.X. | false | false |
150
- | startup | startup | NOUN | NN | advcl | xxxx | true | false |
151
- | for | for | ADP | IN | prep | xxx | true | true |
152
- | $ | $ | SYM | $ | quantmod | $ | false | false |
153
- | 1 | 1 | NUM | CD | compound | d | false | false |
154
- | billion | billion | NUM | CD | pobj | xxxx | true | false |
155
-
156
- ### Part-of-speech tagging (Japanese)
140
+ | text | lemma | pos | tag | dep |
141
+ |:--------|:--------|:------|:----|:---------|
142
+ | Apple | Apple | PROPN | NNP | nsubj |
143
+ | is | be | AUX | VBZ | aux |
144
+ | looking | look | VERB | VBG | ROOT |
145
+ | at | at | ADP | IN | prep |
146
+ | buying | buy | VERB | VBG | pcomp |
147
+ | U.K. | U.K. | PROPN | NNP | dobj |
148
+ | startup | startup | NOUN | NN | advcl |
149
+ | for | for | ADP | IN | prep |
150
+ | $ | $ | SYM | $ | quantmod |
151
+ | 1 | 1 | NUM | CD | compound |
152
+ | billion | billion | NUM | CD | pobj |
153
+
154
+ ### Part-of-speech and dependency (Japanese)
157
155
 
158
156
  Ruby code:
159
157
 
160
158
  ```ruby
161
- require( "ruby-spacy")
159
+ require "ruby-spacy"
162
160
  require "terminal-table"
163
161
 
164
162
  nlp = Spacy::Language.new("ja_core_news_lg")
165
- doc = nlp.read("任天堂は1983年にファミリー・コンピュータを14,800円で発売した。")
163
+ doc = nlp.read("任天堂は1983年にファミコンを14,800円で発売した。")
166
164
 
165
+ headings = ["text", "lemma", "pos", "tag", "dep"]
167
166
  rows = []
168
167
 
169
168
  doc.each do |token|
170
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
169
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
171
170
  end
172
171
 
173
- headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
174
172
  table = Terminal::Table.new rows: rows, headings: headings
175
173
  puts table
176
174
  ```
177
175
 
178
176
  Output:
179
177
 
180
- | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
181
- |:-----------|:-----------|:------|:-------------------------|:-------|:-------|:---------|:--------|
182
- | 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj | xxx | true | false |
183
- | は | は | ADP | 助詞-係助詞 | case | x | true | true |
184
- | 1983 | 1983 | NUM | 名詞-数詞 | nummod | dddd | false | false |
185
- | 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
186
- | に | に | ADP | 助詞-格助詞 | case | x | true | true |
187
- | ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj | xxxx | true | false |
188
- | を | を | ADP | 助詞-格助詞 | case | x | true | true |
189
- | 14,800 | 14,800 | NUM | 名詞-数詞 | fixed | dd,ddd | false | false |
190
- | 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
191
- | で | で | ADP | 助詞-格助詞 | case | x | true | true |
192
- | 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT | xx | true | false |
193
- | し | する | AUX | 動詞-非自立可能 | aux | x | true | true |
194
- | た | た | AUX | 助動詞 | aux | x | true | true |
195
- | 。 | 。 | PUNCT | 補助記号-句点 | punct | 。 | false | false |
178
+ | text | lemma | pos | tag | dep |
179
+ |:-----------|:-----------|:------|:-------------------------|:-------|
180
+ | 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj |
181
+ | は | は | ADP | 助詞-係助詞 | case |
182
+ | 1983 | 1983 | NUM | 名詞-数詞 | nummod |
183
+ | 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl |
184
+ | に | に | ADP | 助詞-格助詞 | case |
185
+ | ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj |
186
+ | を | を | ADP | 助詞-格助詞 | case |
187
+ | 14,800 | 14,800 | NUM | 名詞-数詞 | fixed |
188
+ | 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl |
189
+ | で | で | ADP | 助詞-格助詞 | case |
190
+ | 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT |
191
+ | し | する | AUX | 動詞-非自立可能 | aux |
192
+ | た | た | AUX | 助動詞 | aux |
193
+ | 。 | 。 | PUNCT | 補助記号-句点 | punct |
194
+
195
+ ### Morphology
196
+
197
+ → [POS and morphology tags](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py)
198
+
199
+ Ruby code:
200
+
201
+ ```ruby
202
+ require "ruby-spacy"
203
+ require "terminal-table"
204
+
205
+ nlp = Spacy::Language.new("en_core_web_sm")
206
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
207
+
208
+ headings = ["text", "shape", "is_alpha", "is_stop", "morphology"]
209
+ rows = []
210
+
211
+ doc.each do |token|
212
+ morph = token.morphology.map do |k, v|
213
+ "#{k} = #{v}"
214
+ end.join("\n")
215
+ rows << [token.text, token.shape_, token.is_alpha, token.is_stop, morph]
216
+ end
217
+
218
+ table = Terminal::Table.new rows: rows, headings: headings
219
+ puts table
220
+
221
+ ```
222
+
223
+ Output:
224
+
225
+ | text | shape | is_alpha | is_stop | morphology |
226
+ |:--------|:------|:---------|:--------|:------------------------------------------------------------------------------------|
227
+ | Apple | Xxxxx | true | false | NounType = Prop<br />Number = Sing |
228
+ | is | xx | true | true | Mood = Ind<br />Number = Sing<br />Person = 3<br />Tense = Pres<br />VerbForm = Fin |
229
+ | looking | xxxx | true | false | Aspect = Prog<br />Tense = Pres<br />VerbForm = Part |
230
+ | at | xx | true | true | |
231
+ | buying | xxxx | true | false | Aspect = Prog<br />Tense = Pres<br />VerbForm = Part |
232
+ | U.K. | X.X. | false | false | NounType = Prop<br />Number = Sing |
233
+ | startup | xxxx | true | false | Number = Sing |
234
+ | for | xxx | true | true | |
235
+ | $ | $ | false | false | |
236
+ | 1 | d | false | false | NumType = Card |
237
+ | billion | xxxx | true | false | NumType = Card |
196
238
 
197
239
  ### Visualizing dependency
198
240
 
@@ -0,0 +1,45 @@
1
+ require "ruby-spacy"
2
+ require "terminal-table"
3
+
4
+ nlp = Spacy::Language.new("en_core_web_sm")
5
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
6
+
7
+ headings = ["text", "shape", "is_alpha", "is_stop", "morphology"]
8
+ rows = []
9
+
10
+ doc.each do |token|
11
+ morph = token.morphology.map do |k, v|
12
+ "#{k} = #{v}"
13
+ end.join("\n")
14
+ # end.join("<br />")
15
+ rows << [token.text, token.shape_, token.is_alpha, token.is_stop, morph]
16
+ end
17
+
18
+ table = Terminal::Table.new rows: rows, headings: headings
19
+ puts table
20
+
21
+ # +---------+-------+----------+---------+-----------------+
22
+ # | text | shape | is_alpha | is_stop | morphology |
23
+ # +---------+-------+----------+---------+-----------------+
24
+ # | Apple | Xxxxx | true | false | NounType = Prop |
25
+ # | | | | | Number = Sing |
26
+ # | is | xx | true | true | Mood = Ind |
27
+ # | | | | | Number = Sing |
28
+ # | | | | | Person = 3 |
29
+ # | | | | | Tense = Pres |
30
+ # | | | | | VerbForm = Fin |
31
+ # | looking | xxxx | true | false | Aspect = Prog |
32
+ # | | | | | Tense = Pres |
33
+ # | | | | | VerbForm = Part |
34
+ # | at | xx | true | true | |
35
+ # | buying | xxxx | true | false | Aspect = Prog |
36
+ # | | | | | Tense = Pres |
37
+ # | | | | | VerbForm = Part |
38
+ # | U.K. | X.X. | false | false | NounType = Prop |
39
+ # | | | | | Number = Sing |
40
+ # | startup | xxxx | true | false | Number = Sing |
41
+ # | for | xxx | true | true | |
42
+ # | $ | $ | false | false | |
43
+ # | 1 | d | false | false | NumType = Card |
44
+ # | billion | xxxx | true | false | NumType = Card |
45
+ # +---------+-------+----------+---------+-----------------+
@@ -4,28 +4,28 @@ require "terminal-table"
4
4
  nlp = Spacy::Language.new("en_core_web_sm")
5
5
  doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
6
6
 
7
- headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
7
+ headings = ["text", "lemma", "pos", "tag", "dep"]
8
8
  rows = []
9
9
 
10
10
  doc.each do |token|
11
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
11
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
12
12
  end
13
13
 
14
14
  table = Terminal::Table.new rows: rows, headings: headings
15
15
  puts table
16
16
 
17
- # +---------+---------+-------+-----+----------+-------+----------+---------+
18
- # | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
19
- # +---------+---------+-------+-----+----------+-------+----------+---------+
20
- # | Apple | Apple | PROPN | NNP | nsubj | Xxxxx | true | false |
21
- # | is | be | AUX | VBZ | aux | xx | true | true |
22
- # | looking | look | VERB | VBG | ROOT | xxxx | true | false |
23
- # | at | at | ADP | IN | prep | xx | true | true |
24
- # | buying | buy | VERB | VBG | pcomp | xxxx | true | false |
25
- # | U.K. | U.K. | PROPN | NNP | dobj | X.X. | false | false |
26
- # | startup | startup | NOUN | NN | advcl | xxxx | true | false |
27
- # | for | for | ADP | IN | prep | xxx | true | true |
28
- # | $ | $ | SYM | $ | quantmod | $ | false | false |
29
- # | 1 | 1 | NUM | CD | compound | d | false | false |
30
- # | billion | billion | NUM | CD | pobj | xxxx | true | false |
31
- # +---------+---------+-------+-----+----------+-------+----------+---------+
17
+ # +---------+---------+-------+-----+----------+
18
+ # | text | lemma | pos | tag | dep |
19
+ # +---------+---------+-------+-----+----------+
20
+ # | Apple | Apple | PROPN | NNP | nsubj |
21
+ # | is | be | AUX | VBZ | aux |
22
+ # | looking | look | VERB | VBG | ROOT |
23
+ # | at | at | ADP | IN | prep |
24
+ # | buying | buy | VERB | VBG | pcomp |
25
+ # | U.K. | U.K. | PROPN | NNP | dobj |
26
+ # | startup | startup | NOUN | NN | advcl |
27
+ # | for | for | ADP | IN | prep |
28
+ # | $ | $ | SYM | $ | quantmod |
29
+ # | 1 | 1 | NUM | CD | compound |
30
+ # | billion | billion | NUM | CD | pobj |
31
+ # +---------+---------+-------+-----+----------+
@@ -4,31 +4,31 @@ require "terminal-table"
4
4
  nlp = Spacy::Language.new("ja_core_news_lg")
5
5
  doc = nlp.read("任天堂は1983年にファミコンを14,800円で発売した。")
6
6
 
7
- headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
7
+ headings = ["text", "lemma", "pos", "tag", "dep"]
8
8
  rows = []
9
9
 
10
10
  doc.each do |token|
11
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
11
+ rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
12
12
  end
13
13
 
14
14
  table = Terminal::Table.new rows: rows, headings: headings
15
15
  puts table
16
16
 
17
- # +------------+------------+-------+--------------------------+--------+--------+----------+---------+
18
- # | text | lemma | pos | tag | dep | shape | is_alpha | is_stop |
19
- # +------------+------------+-------+--------------------------+--------+--------+----------+---------+
20
- # | 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj | xxx | true | false |
21
- # | は | は | ADP | 助詞-係助詞 | case | x | true | true |
22
- # | 1983 | 1983 | NUM | 名詞-数詞 | nummod | dddd | false | false |
23
- # | 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
24
- # | に | に | ADP | 助詞-格助詞 | case | x | true | true |
25
- # | ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj | xxxx | true | false |
26
- # | を | を | ADP | 助詞-格助詞 | case | x | true | true |
27
- # | 14,800 | 14,800 | NUM | 名詞-数詞 | fixed | dd,ddd | false | false |
28
- # | 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl | x | true | false |
29
- # | で | で | ADP | 助詞-格助詞 | case | x | true | true |
30
- # | 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT | xx | true | false |
31
- # | し | する | AUX | 動詞-非自立可能 | aux | x | true | true |
32
- # | た | た | AUX | 助動詞 | aux | x | true | true |
33
- # | 。 | 。 | PUNCT | 補助記号-句点 | punct | 。 | false | false |
34
- # +------------+------------+-------+--------------------------+--------+--------+----------+---------+
17
+ # +------------+------------+-------+--------------------------+--------+
18
+ # | text | lemma | pos | tag | dep |
19
+ # +------------+------------+-------+--------------------------+--------+
20
+ # | 任天堂 | 任天堂 | PROPN | 名詞-固有名詞-一般 | nsubj |
21
+ # | は | は | ADP | 助詞-係助詞 | case |
22
+ # | 1983 | 1983 | NUM | 名詞-数詞 | nummod |
23
+ # | 年 | 年 | NOUN | 名詞-普通名詞-助数詞可能 | obl |
24
+ # | に | に | ADP | 助詞-格助詞 | case |
25
+ # | ファミコン | ファミコン | NOUN | 名詞-普通名詞-一般 | obj |
26
+ # | を | を | ADP | 助詞-格助詞 | case |
27
+ # | 14,800 | 14,800 | NUM | 名詞-数詞 | fixed |
28
+ # | 円 | 円 | NOUN | 名詞-普通名詞-助数詞可能 | obl |
29
+ # | で | で | ADP | 助詞-格助詞 | case |
30
+ # | 発売 | 発売 | VERB | 名詞-普通名詞-サ変可能 | ROOT |
31
+ # | し | する | AUX | 動詞-非自立可能 | aux |
32
+ # | た | た | AUX | 助動詞 | aux |
33
+ # | 。 | 。 | PUNCT | 補助記号-句点 | punct |
34
+ # +------------+------------+-------+--------------------------+--------+
data/lib/ruby-spacy.rb CHANGED
@@ -252,6 +252,26 @@ module Spacy
252
252
  @text
253
253
  end
254
254
 
255
+ # Returns a hash or string of morphological information
256
+ # @param hash [Boolean] if true, a hash will be returned instead of a string
257
+ # @return [Hash, String]
258
+ def morphology(hash = true)
259
+ if @py_token.has_morph
260
+ morph_analysis = @py_token.morph
261
+ if hash
262
+ return morph_analysis.to_dict
263
+ else
264
+ return morph_analysis.to_s
265
+ end
266
+ else
267
+ if hash
268
+ results = {}
269
+ else
270
+ return ""
271
+ end
272
+ end
273
+ end
274
+
255
275
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
256
276
  def method_missing(name, *args)
257
277
  @py_token.send(name, *args)
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Spacy
4
4
  # The version number of the module
5
- VERSION = "0.1.1"
5
+ VERSION = "0.1.2"
6
6
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-spacy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
@@ -75,6 +75,7 @@ files:
75
75
  - bin/setup
76
76
  - examples/get_started/lexeme.rb
77
77
  - examples/get_started/linguistic_annotations.rb
78
+ - examples/get_started/morphology.rb
78
79
  - examples/get_started/most_similar.rb
79
80
  - examples/get_started/named_entities.rb
80
81
  - examples/get_started/outputs/test_dep.svg
@@ -111,7 +112,6 @@ files:
111
112
  - examples/linguistic_features/iterating_children.rb
112
113
  - examples/linguistic_features/iterating_lefts_and_rights.rb
113
114
  - examples/linguistic_features/lemmatization.rb
114
- - examples/linguistic_features/morphology.rb
115
115
  - examples/linguistic_features/named_entity_recognition.rb
116
116
  - examples/linguistic_features/navigating_parse_tree.rb
117
117
  - examples/linguistic_features/noun_chunks.rb
@@ -149,7 +149,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
149
149
  - !ruby/object:Gem::Version
150
150
  version: '0'
151
151
  requirements: []
152
- rubygems_version: 3.2.3
152
+ rubygems_version: 3.2.11
153
153
  signing_key:
154
154
  specification_version: 4
155
155
  summary: A wrapper module for using spaCy natural language processing library from
@@ -1,17 +0,0 @@
1
- require "ruby-spacy"
2
- require "terminal-table"
3
-
4
- nlp = Spacy::Language.new("en_core_web_sm")
5
-
6
- puts "Pipeline: " + nlp.pipe_names.to_s
7
-
8
- doc = nlp.read("I was reading the paper.")
9
-
10
- token = doc[0]
11
-
12
- puts "Morph features of the first word: " + token.morph.to_s
13
- puts "PronType of the word: " + token.morph.get("PronType").to_s
14
-
15
- # Pipeline: ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
16
- # Morph features of the first word: Case=Nom|Number=Sing|Person=1|PronType=Prs
17
- # PronType of the word: ['Prs']