ruby-spacy 0.1.3 → 0.1.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +2 -1
- data/README.md +7 -7
- data/examples/get_started/lexeme.rb +2 -2
- data/examples/get_started/linguistic_annotations.rb +1 -1
- data/examples/get_started/morphology.rb +1 -1
- data/examples/get_started/named_entities.rb +1 -1
- data/examples/get_started/pos_tags_and_dependencies.rb +1 -1
- data/examples/japanese/ancestors.rb +9 -11
- data/examples/japanese/entity_annotations_and_labels.rb +1 -1
- data/examples/japanese/lemmatization.rb +1 -1
- data/examples/japanese/named_entity_recognition.rb +1 -1
- data/examples/japanese/navigating_parse_tree.rb +18 -18
- data/examples/japanese/noun_chunks.rb +1 -1
- data/examples/japanese/pos_tagging.rb +1 -1
- data/examples/linguistic_features/ancestors.rb +13 -10
- data/examples/linguistic_features/entity_annotations_and_labels.rb +1 -1
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +2 -2
- data/examples/linguistic_features/information_extraction.rb +2 -2
- data/examples/linguistic_features/iterating_children.rb +2 -2
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +4 -4
- data/examples/linguistic_features/lemmatization.rb +1 -1
- data/examples/linguistic_features/named_entity_recognition.rb +1 -1
- data/examples/linguistic_features/navigating_parse_tree.rb +12 -12
- data/examples/linguistic_features/noun_chunks.rb +1 -1
- data/examples/linguistic_features/pos_tagging.rb +1 -1
- data/examples/linguistic_features/retokenize_1.rb +1 -1
- data/examples/linguistic_features/retokenize_2.rb +2 -2
- data/examples/linguistic_features/rule_based_morphology.rb +1 -1
- data/examples/linguistic_features/similarity_between_lexemes.rb +18 -0
- data/examples/rule_based_matching/creating_spans_from_matches.rb +1 -1
- data/lib/ruby-spacy.rb +181 -22
- data/lib/ruby-spacy/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bd5a1c905e5aed7553ac5b1927a6b9cdecaf887c505ea3e38f806e886adeb60c
+  data.tar.gz: 6d3f3fd22e9d927d430d2b9e48dcd018da6eb601813192e6ea14e094cf51e331
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b5419fb75109b837465c64da1ace956b91d0a0ab589cdb71ace9a308ce1af263edc0e2f206a80ab71a3ab17e86e6520ab432b657c5f60548c696a36049773c60
+  data.tar.gz: 385606212f290b701458bd1a555e553417ed20be2d1e2008107396a9adc224590c76317c52d30d7c97435c0650ef8c1a15a43fe4b92c797188944a302da51612
```
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
```diff
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ruby-spacy (0.1.3)
+    ruby-spacy (0.1.4)
       numpy (~> 0.4.0)
       pycall (~> 1.4.0)
       terminal-table (~> 3.0.1)
@@ -24,6 +24,7 @@ GEM
 PLATFORMS
   arm64-darwin-20
   x86_64-darwin-20
+  x86_64-linux
 
 DEPENDENCIES
   github-markup
```
data/README.md
CHANGED
````diff
@@ -128,7 +128,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -166,7 +166,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -212,7 +212,7 @@ doc.each do |token|
   morph = token.morphology.map do |k, v|
     "#{k} = #{v}"
   end.join("\n")
-  rows << [token.text, token.
+  rows << [token.text, token.shape, token.is_alpha, token.is_stop, morph]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -300,7 +300,7 @@ doc =nlp.read("Apple is looking at buying U.K. startup for $1 billion")
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 headings = ["text", "start_char", "end_char", "label"]
@@ -332,7 +332,7 @@ doc = nlp.read(sentence)
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 headings = ["text", "start", "end", "label"]
@@ -393,8 +393,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
 
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 
 ```
````
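The README hunks above all make the same kind of edit: Python-style, underscore-suffixed attribute access (previously reached through `method_missing`) gives way to the Ruby getters that 0.1.4 adds in `lib/ruby-spacy.rb` below. A minimal sketch of the updated entity loop, assuming the `en_core_web_sm` model is installed (the model choice is not part of this diff):

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")

doc.ents.each do |ent|
  # ent.label wraps spaCy's label_ attribute (see the Doc#ents and Span#label
  # changes in the lib/ruby-spacy.rb diff further down)
  puts "#{ent.text} [#{ent.start_char}-#{ent.end_char}] #{ent.label}"
end
```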
data/examples/get_started/lexeme.rb
CHANGED
```diff
@@ -8,8 +8,8 @@ headings = ["text", "shape", "prefix", "suffix", "is_alpha", "is_digit"]
 rows = []
 
 doc.each do |word|
-  lexeme =
-  rows << [lexeme.text, lexeme.
+  lexeme = nlp.vocab(word.text)
+  rows << [lexeme.text, lexeme.shape, lexeme.prefix, lexeme.suffix, lexeme.is_alpha, lexeme.is_digit]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/get_started/morphology.rb
CHANGED
```diff
@@ -12,7 +12,7 @@ doc.each do |token|
     "#{k} = #{v}"
   end.join("\n")
   # end.join("<br />")
-  rows << [token.text, token.
+  rows << [token.text, token.shape, token.is_alpha, token.is_stop, morph]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/get_started/named_entities.rb
CHANGED
```diff
@@ -8,7 +8,7 @@ headings = ["text", "start_char", "end_char", "label"]
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/get_started/pos_tags_and_dependencies.rb
CHANGED
```diff
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/japanese/ancestors.rb
CHANGED
```diff
@@ -23,9 +23,7 @@ puts "The root of the sentence is: " + root.text
 puts "The subject of the sentence is: " + subject.text
 
 subject.subtree.each do |descendant|
-
-  ancestors = Spacy::generator_to_array(descendant.ancestors)
-  rows << [descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, ancestors]
+  rows << [descendant.text, descendant.dep, descendant.n_lefts, descendant.n_rights, descendant.ancestors.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -34,11 +32,11 @@ puts table
 # The sentence: 私の父は寿司が好きだ。
 # The root of the sentence is: 好き
 # The subject of the sentence is: 父
-#
-# | text | dep | n_lefts | n_rights | ancestors
-#
-# | 私 | nmod | 0 | 1 |
-# | の | case | 0 | 0 |
-# | 父 | dislocated | 1 | 1 |
-# | は | case | 0 | 0 |
-#
+# +------+------------+---------+----------+--------------+
+# | text | dep        | n_lefts | n_rights | ancestors    |
+# +------+------------+---------+----------+--------------+
+# | 私 | nmod | 0 | 1 | 父, 好き |
+# | の | case | 0 | 0 | 私, 父, 好き |
+# | 父 | dislocated | 1 | 1 | 好き |
+# | は | case | 0 | 0 | 父, 好き |
+# +------+------------+---------+----------+--------------+
```
data/examples/japanese/entity_annotations_and_labels.rb
CHANGED
```diff
@@ -10,7 +10,7 @@ headings = ["text", "ent_iob", "ent_iob_", "ent_type_"]
 rows = []
 
 doc.each do |ent|
-  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.
+  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/japanese/named_entity_recognition.rb
CHANGED
```diff
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/japanese/navigating_parse_tree.rb
CHANGED
```diff
@@ -9,26 +9,26 @@ headings = ["text", "dep", "head text", "head pos", "children"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
 puts table
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
++------+----------+-----------+----------+------------------------+
+| text | dep      | head text | head pos | children               |
++------+----------+-----------+----------+------------------------+
+| 自動 | compound | 車 | 92 | |
+| 運転 | compound | 車 | 92 | |
+| 車 | nsubj | 転嫁 | 100 | 自動, 運転, は |
+| は | case | 車 | 92 | |
+| 保険 | compound | 責任 | 92 | |
+| 責任 | obj | 転嫁 | 100 | 保険, を |
+| を | case | 責任 | 92 | |
+| 製造 | compound | 者 | 92 | |
+| 者 | obl | 転嫁 | 100 | 製造, に |
+| に | case | 者 | 92 | |
+| 転嫁 | ROOT | 転嫁 | 100 | 車, 責任, 者, する, 。 |
+| する | aux | 転嫁 | 100 | |
+| 。 | punct | 転嫁 | 100 | |
++------+----------+-----------+----------+------------------------+
```
data/examples/japanese/noun_chunks.rb
CHANGED
```diff
@@ -9,7 +9,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
 rows = []
 
 doc.noun_chunks.each do |chunk|
-  rows << [chunk.text, chunk.root.text, chunk.root.
+  rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/japanese/pos_tagging.rb
CHANGED
```diff
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/linguistic_features/ancestors.rb
CHANGED
```diff
@@ -24,18 +24,21 @@ puts "The subject of the sentence is: " + subject.text
 subject.subtree.each do |descendant|
   # need to convert "ancestors" object from a python generator to a ruby array
   ancestors = Spacy::generator_to_array(descendant.ancestors)
-  rows << [descendant.text, descendant.
+  rows << [descendant.text, descendant.dep, descendant.n_lefts, descendant.n_rights, ancestors.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
 print table
 
-#
-#
-#
-#
-# |
-#
-# |
-# |
-#
+# The sentence: Credit and mortgage account holders must submit their requests
+# The root of the sentence is: submit
+# The subject of the sentence is: holders
+# +----------+----------+---------+----------+----------------------------------+
+# | text     | dep      | n_lefts | n_rights | ancestors                        |
+# +----------+----------+---------+----------+----------------------------------+
+# | Credit   | nmod     | 0       | 2        | holders, submit                  |
+# | and      | cc       | 0       | 0        | Credit, holders, submit          |
+# | mortgage | compound | 0       | 0        | account, Credit, holders, submit |
+# | account  | conj     | 1       | 0        | Credit, holders, submit          |
+# | holders  | nsubj    | 1       | 0        | submit                           |
+# +----------+----------+---------+----------+----------------------------------+
```
data/examples/linguistic_features/entity_annotations_and_labels.rb
CHANGED
```diff
@@ -10,7 +10,7 @@ headings = ["text", "ent_iob", "ent_iob_", "ent_type_"]
 rows = []
 
 doc.each do |ent|
-  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.
+  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/linguistic_features/information_extraction.rb
CHANGED
```diff
@@ -19,11 +19,11 @@ texts.each do |text|
   doc.each do |token|
     if token.ent_type_ == "MONEY"
       if ["attr", "dobj"].index token.dep_
-        subj = Spacy.generator_to_array(token.head.lefts).select{|t| t.
+        subj = Spacy.generator_to_array(token.head.lefts).select{|t| t.dep == "nsubj"}
         if !subj.empty?
           puts(subj[0].text + " --> " + token.text)
         end
-      elsif token.dep_ == "pobj" and token.head.
+      elsif token.dep_ == "pobj" and token.head.dep == "prep"
        puts token.head.head.text + " --> " + token.text
       end
     end
```
data/examples/linguistic_features/finding_a_verb_with_a_subject.rb
CHANGED
```diff
@@ -12,7 +12,7 @@ doc.each do |token|
   if token.pos_ == "VERB"
     token.children.each do |child|
       if child.dep_ == "nsubj"
-        results << child.head
+        results << child.head.text
       end
     end
   end
@@ -20,5 +20,5 @@ end
 
 puts results.to_s
 
-# [shift]
+# ["shift"]
 
```
data/examples/linguistic_features/iterating_lefts_and_rights.rb
CHANGED
```diff
@@ -7,14 +7,14 @@ doc = nlp.read("bright red apples on the tree")
 
 puts "Text: " + doc.text
 
-puts "Words to the left of 'apple': " +
-puts "Words to the right of 'apple': " +
+puts "Words to the left of 'apple': " + doc[2].lefts.map(&:text).join(", ")
+puts "Words to the right of 'apple': " + doc[2].rights.map(&:text).join(", ")
 
 puts "Num of the words to the left of 'apple': " + doc[2].n_lefts.to_s
 puts "Num of the words to the right of 'apple': " + doc[2].n_rights.to_s
 
 # Text: bright red apples on the tree
-# Words to the left of 'apple':
-# Words to the right of 'apple':
+# Words to the left of 'apple': bright, red
+# Words to the right of 'apple': on
 # Num of the words to the left of 'apple': 2
 # Num of the words to the right of 'apple': 1
```
data/examples/linguistic_features/named_entity_recognition.rb
CHANGED
```diff
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/linguistic_features/navigating_parse_tree.rb
CHANGED
```diff
@@ -12,21 +12,21 @@ headings = ["text", "dep", "head text", "head pos", "children"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
 puts table
 
 # Lemmatizer mode: rule
-#
-# | text | dep | head text | head pos | children
-#
-# | Autonomous | amod | cars | NOUN |
-# | cars | nsubj | shift | VERB |
-# | shift | ROOT | shift | VERB |
-# | insurance | compound | liability | NOUN |
-# | liability | dobj | shift | VERB |
-# | toward | prep | shift | VERB |
-# | manufacturers | pobj | toward | ADP |
-#
+# +---------------+----------+-----------+----------+-------------------------+
+# | text          | dep      | head text | head pos | children                |
+# +---------------+----------+-----------+----------+-------------------------+
+# | Autonomous    | amod     | cars      | NOUN     |                         |
+# | cars          | nsubj    | shift     | VERB     | Autonomous              |
+# | shift         | ROOT     | shift     | VERB     | cars, liability, toward |
+# | insurance     | compound | liability | NOUN     |                         |
+# | liability     | dobj     | shift     | VERB     | insurance               |
+# | toward        | prep     | shift     | VERB     | manufacturers           |
+# | manufacturers | pobj     | toward    | ADP      |                         |
+# +---------------+----------+-----------+----------+-------------------------+
```
data/examples/linguistic_features/noun_chunks.rb
CHANGED
```diff
@@ -12,7 +12,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
 rows = []
 
 doc.noun_chunks.each do |chunk|
-  rows << [chunk.text, chunk.root.text, chunk.root.
+  rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/linguistic_features/pos_tagging.rb
CHANGED
```diff
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep, token.shape, token.is_alpha, token.is_stop]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/linguistic_features/retokenize_1.rb
CHANGED
```diff
@@ -12,7 +12,7 @@ rows = []
 doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.pos, token.dep, token.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
```
data/examples/linguistic_features/retokenize_2.rb
CHANGED
```diff
@@ -6,11 +6,11 @@ nlp = Spacy::Language.new("en_core_web_sm")
 sentence = "I live in New York"
 doc = nlp.read(sentence)
 
-puts "Before: " + doc.tokens.
+puts "Before: " + doc.tokens.map(&:text).join(", ")
 
 doc.retokenize(3, 4)
 
-puts "After: " + doc.tokens.
+puts "After: " + doc.tokens.map(&:text).join(", ")
 
 # Before: I, live, in, New, York
 # After: I, live, in, New York
```
data/examples/linguistic_features/rule_based_morphology.rb
CHANGED
```diff
@@ -6,7 +6,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
 doc = nlp.read("Where are you?")
 
 puts "Morph features of the third word: " + doc[2].morph.to_s
-puts "POS of the third word: " + doc[2].
+puts "POS of the third word: " + doc[2].pos
 
 # Morph features of the third word: Case=Nom|Person=2|PronType=Prs
 # POS of the third word: PRON
```
data/examples/linguistic_features/similarity_between_lexemes.rb
CHANGED
```diff
@@ -0,0 +1,18 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_lg")
+
+orange = nlp.vocab("orange")
+lemon = nlp.vocab("lemon")
+
+book = nlp.vocab("book")
+magazine = nlp.vocab("magazine")
+
+puts "orange <=> lemon: #{orange.similarity(lemon)}"
+puts "book <=> magazine: #{book.similarity(magazine)}"
+puts "orange <=> book: #{orange.similarity(book)}"
+
+# orange <=> lemon: 0.7080526351928711
+# book <=> magazine: 0.4355940818786621
+# orange <=> book: 0.12197211384773254
```
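The scores in the new example come from word vectors, which is presumably why it loads `en_core_web_lg` rather than a small model. The `Token#lexeme` method added in the `lib/ruby-spacy.rb` diff below suggests another route to the same `Lexeme` objects; a hypothetical sketch, not taken from this diff:

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_lg")
doc = nlp.read("orange lemon")

# Token#lexeme bridges a parsed token to its vocabulary entry (Lexeme),
# so similarity can be computed without calling nlp.vocab directly.
orange = doc[0].lexeme
lemon  = doc[1].lexeme
puts "orange <=> lemon: #{orange.similarity(lemon)}"
```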
data/examples/rule_based_matching/creating_spans_from_matches.rb
CHANGED
```diff
@@ -10,7 +10,7 @@ matches = matcher.match(doc)
 
 matches.each do |match|
   span = Spacy::Span.new(doc, start_index: match[:start_index], end_index: match[:end_index], options: {label: match[:match_id]})
-  puts span.text + " / " + span.
+  puts span.text + " / " + span.label
 end
 
 # Barack Obama / US_PRESIDENT
```
data/lib/ruby-spacy.rb
CHANGED
```diff
@@ -165,6 +165,9 @@ module Spacy
       # so that ents canbe "each"-ed in Ruby
       ent_array = []
       PyCall::List.(@py_doc.ents).each do |ent|
+        ent.define_singleton_method :label do
+          return self.label_
+        end
         ent_array << ent
       end
       ent_array
@@ -252,10 +255,16 @@ module Spacy
     # @param text [String] A text string representing a lexeme
     # @return [Object] Python `Lexeme` object (https://spacy.io/api/lexeme)
     def get_lexeme(text)
-      text = text.gsub("'", "\'")
       @py_nlp.vocab[text]
     end
 
+    # Returns a ruby lexeme object
+    # @param text [String] a text string representing the vocabulary item
+    # @return [Lexeme]
+    def vocab(text)
+      Lexeme.new(@py_nlp.vocab[text])
+    end
+
     # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
     # @param vector [Object] A vector representation of a word (whether existing or non-existing)
     # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
@@ -386,18 +395,24 @@ module Spacy
       chunk_array = []
       py_chunks = PyCall::List.(@py_span.noun_chunks)
       py_chunks.each do |py_span|
-        chunk_array <<
+        chunk_array << Span.new(@doc, py_span: py_span)
       end
       chunk_array
     end
 
+    # Returns the head token
+    # @return [Token]
+    def root
+      Token.new(@py_span.root)
+    end
+
     # Returns an array of spans that represents sentences.
     # @return [Array<Span>]
     def sents
       sentence_array = []
       py_sentences = PyCall::List.(@py_span.sents)
       py_sentences.each do |py_span|
-        sentence_array <<
+        sentence_array << Span.new(@doc, py_span: py_span)
       end
       sentence_array
     end
@@ -407,7 +422,7 @@ module Spacy
     def ents
       ent_array = []
       PyCall::List.(@py_span.ents).each do |py_span|
-        ent_array <<
+        ent_array << Span.new(@doc, py_span: py_span)
       end
       ent_array
     end
@@ -416,7 +431,7 @@ module Spacy
     # @return [Span]
     def sent
       py_span = @py_span.sent
-      return
+      return Span.new(@doc, py_span: py_span)
     end
 
     # Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
@@ -424,9 +439,9 @@ module Spacy
     def [](range)
       if range.is_a?(Range)
         py_span = @py_span[range]
-        return
+        return Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
       else
-        return
+        return Token.new(@py_span[range])
       end
     end
 
@@ -440,7 +455,7 @@ module Spacy
     # Creates a document instance from the span
     # @return [Doc]
     def as_doc
-
+      Doc.new(@doc.py_nlp, text: self.text)
     end
 
     # Returns tokens conjugated to the root of the span.
@@ -448,7 +463,7 @@ module Spacy
     def conjuncts
       conjunct_array = []
       PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
-        conjunct_array <<
+        conjunct_array << Token.new(py_conjunct)
       end
       conjunct_array
     end
@@ -458,7 +473,7 @@ module Spacy
     def lefts
       left_array = []
       PyCall::List.(@py_span.lefts).each do |py_left|
-        left_array <<
+        left_array << Token.new(py_left)
      end
       left_array
     end
@@ -468,7 +483,7 @@ module Spacy
     def rights
       right_array = []
       PyCall::List.(@py_span.rights).each do |py_right|
-        right_array <<
+        right_array << Token.new(py_right)
       end
       right_array
     end
@@ -478,11 +493,17 @@ module Spacy
     def subtree
       subtree_array = []
       PyCall::List.(@py_span.subtree).each do |py_subtree|
-        subtree_array <<
+        subtree_array << Token.new(py_subtree)
       end
       subtree_array
     end
 
+    # Returns the label
+    # @return [String]
+    def label
+      @py_span.label_
+    end
+
     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
       @py_span.send(name, *args)
@@ -506,52 +527,59 @@ module Spacy
       @text = @py_token.text
     end
 
+
+    # Returns the head token
+    # @return [Token]
+    def head
+      Token.new(@py_token.head)
+    end
+
     # Returns the token in question and the tokens that descend from it.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def subtree
       descendant_array = []
       PyCall::List.(@py_token.subtree).each do |descendant|
-        descendant_array << descendant
+        descendant_array << Token.new(descendant)
       end
       descendant_array
     end
 
     # Returns the token's ancestors.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def ancestors
       ancestor_array = []
       PyCall::List.(@py_token.ancestors).each do |ancestor|
-        ancestor_array << ancestor
+        ancestor_array << Token.new(ancestor)
       end
       ancestor_array
     end
 
     # Returns a sequence of the token's immediate syntactic children.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def children
       child_array = []
       PyCall::List.(@py_token.children).each do |child|
-        child_array << child
+        child_array << Token.new(child)
       end
       child_array
     end
 
     # The leftward immediate children of the word in the syntactic dependency parse.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def lefts
       token_array = []
       PyCall::List.(@py_token.lefts).each do |token|
-        token_array << token
+        token_array << Token.new(token)
       end
       token_array
     end
 
     # The rightward immediate children of the word in the syntactic dependency parse.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def rights
       token_array = []
       PyCall::List.(@py_token.rights).each do |token|
-        token_array << token
+        token_array << Token.new(token)
       end
       token_array
     end
@@ -582,12 +610,143 @@ module Spacy
       end
     end
 
+    # Returns the lemma by calling `lemma_' of `@py_token` object
+    # @return [String]
+    def lemma
+      @py_token.lemma_
+    end
+
+    # Returns the lowercase form by calling `lower_' of `@py_token` object
+    # @return [String]
+    def lower
+      @py_token.lower_
+    end
+
+    # Returns the shape (e.g. "Xxxxx") by calling `shape_' of `@py_token` object
+    # @return [String]
+    def shape
+      @py_token.shape_
+    end
+
+    # Returns the pos by calling `pos_' of `@py_token` object
+    # @return [String]
+    def pos
+      @py_token.pos_
+    end
+
+    # Returns the fine-grained pos by calling `tag_' of `@py_token` object
+    # @return [String]
+    def tag
+      @py_token.tag_
+    end
+
+    # Returns the dependency relation by calling `dep_' of `@py_token` object
+    # @return [String]
+    def dep
+      @py_token.dep_
+    end
+
+    # Returns the language by calling `lang_' of `@py_token` object
+    # @return [String]
+    def lang
+      @py_token.lang_
+    end
+
+    # Returns the trailing space character if present by calling `whitespace_' of `@py_token` object
+    # @return [String]
+    def whitespace
+      @py_token.whitespace_
+    end
+
+    # Returns the named entity type by calling `ent_type_' of `@py_token` object
+    # @return [String]
+    def ent_type
+      @py_token.ent_type_
+    end
+
+    # Returns a lexeme object
+    # @return [Lexeme]
+    def lexeme
+      Lexeme.new(@py_token.lex)
+    end
+
     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
       @py_token.send(name, *args)
     end
   end
 
+  # See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
+  class Lexeme
+
+    # @return [Object] a Python `Lexeme` instance accessible via `PyCall`
+    attr_reader :py_lexeme
+
+    # @return [String] a string representing the token
+    attr_reader :text
+
+    # It is recommended to use {Language#vocab} or {Token#lexeme} methods to create tokens.
+    # There is no way to generate a lexeme from scratch but relying on a pre-exising Python {Lexeme} object.
+    # @param py_lexeme [Object] Python `Lexeme` object
+    def initialize(py_lexeme)
+      @py_lexeme = py_lexeme
+      @text = @py_lexeme.text
+    end
+
+    # String representation of the token.
+    # @return [String]
+    def to_s
+      @text
+    end
+
+    # Returns the lowercase form by calling `lower_' of `@py_lexeme` object
+    # @return [String]
+    def lower
+      @py_lexeme.lower_
+    end
+
+    # Returns the shape (e.g. "Xxxxx") by calling `shape_' of `@py_lexeme` object
+    # @return [String]
+    def shape
+      @py_lexeme.shape_
+    end
+
+    # Returns the language by calling `lang_' of `@py_lexeme` object
+    # @return [String]
+    def lang
+      @py_lexeme.lang_
+    end
+
+    # Returns the length-N substring from the start of the word by calling `prefix_' of `@py_lexeme` object
+    # @return [String]
+    def prefix
+      @py_lexeme.prefix_
+    end
+    #
+    # Returns the length-N substring from the end of the word by calling `suffix_' of `@py_lexeme` object
+    # @return [String]
+    def suffix
+      @py_lexeme.suffix_
+    end
+
+    # Returns the lexemes's norm, i.e. a normalized form of the lexeme calling `norm_' of `@py_lexeme` object
+    # @return [String]
+    def norm
+      @py_lexeme.norm_
+    end
+
+    # Returns a semantic similarity estimate.
+    # @param other [Lexeme] the other doc to which a similarity estimation is made
+    # @return [Float]
+    def similarity(other)
+      @py_lexeme.similarity(other.py_lexeme)
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_lexeme.send(name, *args)
+    end
+  end
 
 end
 
```
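Taken together, the `Token` getters and the new `Lexeme` class above are what give the updated examples their underscore-free call style. A minimal sketch of how the 0.1.4 token API reads end to end; the model name is an assumption, and the sentence is borrowed from the navigating_parse_tree example:

```ruby
require "ruby-spacy"
require "terminal-table"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

# Each getter (lemma, pos, tag, dep) wraps the corresponding
# underscore-suffixed spaCy attribute, per the Token diff above.
rows = doc.tokens.map do |token|
  [token.text, token.lemma, token.pos, token.tag, token.dep]
end

puts Terminal::Table.new(rows: rows, headings: ["text", "lemma", "pos", "tag", "dep"])
```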
data/lib/ruby-spacy/version.rb
CHANGED
metadata
CHANGED
```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ruby-spacy
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-
+date: 2021-07-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pycall
@@ -123,6 +123,7 @@ files:
 - examples/linguistic_features/rule_based_morphology.rb
 - examples/linguistic_features/sentence_segmentation.rb
 - examples/linguistic_features/similarity.rb
+- examples/linguistic_features/similarity_between_lexemes.rb
 - examples/linguistic_features/similarity_between_spans.rb
 - examples/linguistic_features/tokenization.rb
 - examples/rule_based_matching/creating_spans_from_matches.rb
@@ -149,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.2.
+rubygems_version: 3.2.11
 signing_key:
 specification_version: 4
 summary: A wrapper module for using spaCy natural language processing library from
```
|