ruby-spacy 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Gemfile.lock +2 -1
- data/README.md +7 -7
- data/examples/get_started/lexeme.rb +2 -2
- data/examples/get_started/linguistic_annotations.rb +1 -1
- data/examples/get_started/morphology.rb +1 -1
- data/examples/get_started/named_entities.rb +1 -1
- data/examples/get_started/pos_tags_and_dependencies.rb +1 -1
- data/examples/japanese/ancestors.rb +9 -11
- data/examples/japanese/entity_annotations_and_labels.rb +1 -1
- data/examples/japanese/lemmatization.rb +1 -1
- data/examples/japanese/named_entity_recognition.rb +1 -1
- data/examples/japanese/navigating_parse_tree.rb +18 -18
- data/examples/japanese/noun_chunks.rb +1 -1
- data/examples/japanese/pos_tagging.rb +1 -1
- data/examples/linguistic_features/ancestors.rb +13 -10
- data/examples/linguistic_features/entity_annotations_and_labels.rb +1 -1
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +2 -2
- data/examples/linguistic_features/information_extraction.rb +2 -2
- data/examples/linguistic_features/iterating_children.rb +2 -2
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +4 -4
- data/examples/linguistic_features/lemmatization.rb +1 -1
- data/examples/linguistic_features/named_entity_recognition.rb +1 -1
- data/examples/linguistic_features/navigating_parse_tree.rb +12 -12
- data/examples/linguistic_features/noun_chunks.rb +1 -1
- data/examples/linguistic_features/pos_tagging.rb +1 -1
- data/examples/linguistic_features/retokenize_1.rb +1 -1
- data/examples/linguistic_features/retokenize_2.rb +2 -2
- data/examples/linguistic_features/rule_based_morphology.rb +1 -1
- data/examples/linguistic_features/similarity_between_lexemes.rb +18 -0
- data/examples/rule_based_matching/creating_spans_from_matches.rb +1 -1
- data/lib/ruby-spacy.rb +181 -22
- data/lib/ruby-spacy/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bd5a1c905e5aed7553ac5b1927a6b9cdecaf887c505ea3e38f806e886adeb60c
+  data.tar.gz: 6d3f3fd22e9d927d430d2b9e48dcd018da6eb601813192e6ea14e094cf51e331
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b5419fb75109b837465c64da1ace956b91d0a0ab589cdb71ace9a308ce1af263edc0e2f206a80ab71a3ab17e86e6520ab432b657c5f60548c696a36049773c60
+  data.tar.gz: 385606212f290b701458bd1a555e553417ed20be2d1e2008107396a9adc224590c76317c52d30d7c97435c0650ef8c1a15a43fe4b92c797188944a302da51612
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ruby-spacy (0.1.3)
+    ruby-spacy (0.1.4)
     numpy (~> 0.4.0)
     pycall (~> 1.4.0)
     terminal-table (~> 3.0.1)
@@ -24,6 +24,7 @@ GEM
 PLATFORMS
   arm64-darwin-20
   x86_64-darwin-20
+  x86_64-linux
 
 DEPENDENCIES
   github-markup
data/README.md
CHANGED
@@ -128,7 +128,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -166,7 +166,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -212,7 +212,7 @@ doc.each do |token|
   morph = token.morphology.map do |k, v|
     "#{k} = #{v}"
   end.join("\n")
-  rows << [token.text, token.
+  rows << [token.text, token.shape, token.is_alpha, token.is_stop, morph]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -300,7 +300,7 @@ doc =nlp.read("Apple is looking at buying U.K. startup for $1 billion")
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 headings = ["text", "start_char", "end_char", "label"]
@@ -332,7 +332,7 @@ doc = nlp.read(sentence)
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 headings = ["text", "start", "end", "label"]
@@ -393,8 +393,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
 
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 
 ```
data/examples/get_started/lexeme.rb
CHANGED
@@ -8,8 +8,8 @@ headings = ["text", "shape", "prefix", "suffix", "is_alpha", "is_digit"]
 rows = []
 
 doc.each do |word|
-  lexeme =
-  rows << [lexeme.text, lexeme.
+  lexeme = nlp.vocab(word.text)
+  rows << [lexeme.text, lexeme.shape, lexeme.prefix, lexeme.suffix, lexeme.is_alpha, lexeme.is_digit]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/morphology.rb
CHANGED
@@ -12,7 +12,7 @@ doc.each do |token|
     "#{k} = #{v}"
   end.join("\n")
   # end.join("<br />")
-  rows << [token.text, token.
+  rows << [token.text, token.shape, token.is_alpha, token.is_stop, morph]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/named_entities.rb
CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "start_char", "end_char", "label"]
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/pos_tags_and_dependencies.rb
CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/ancestors.rb
CHANGED
@@ -23,9 +23,7 @@ puts "The root of the sentence is: " + root.text
 puts "The subject of the sentence is: " + subject.text
 
 subject.subtree.each do |descendant|
-
-  ancestors = Spacy::generator_to_array(descendant.ancestors)
-  rows << [descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, ancestors]
+  rows << [descendant.text, descendant.dep, descendant.n_lefts, descendant.n_rights, descendant.ancestors.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -34,11 +32,11 @@ puts table
 # The sentence: 私の父は寿司が好きだ。
 # The root of the sentence is: 好き
 # The subject of the sentence is: 父
-#
-# | text | dep | n_lefts | n_rights | ancestors
-#
-# | 私 | nmod | 0 | 1 |
-# | の | case | 0 | 0 |
-# | 父 | dislocated | 1 | 1 |
-# | は | case | 0 | 0 |
-#
+# +------+------------+---------+----------+--------------+
+# | text | dep        | n_lefts | n_rights | ancestors    |
+# +------+------------+---------+----------+--------------+
+# | 私   | nmod       | 0       | 1        | 父, 好き     |
+# | の   | case       | 0       | 0        | 私, 父, 好き |
+# | 父   | dislocated | 1       | 1        | 好き         |
+# | は   | case       | 0       | 0        | 父, 好き     |
+# +------+------------+---------+----------+--------------+
data/examples/japanese/entity_annotations_and_labels.rb
CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "ent_iob", "ent_iob_", "ent_type_"]
 rows = []
 
 doc.each do |ent|
-  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.
+  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/named_entity_recognition.rb
CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/navigating_parse_tree.rb
CHANGED
@@ -9,26 +9,26 @@ headings = ["text", "dep", "head text", "head pos", "children"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
 puts table
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
++------+----------+-----------+----------+------------------------+
+| text | dep      | head text | head pos | children               |
++------+----------+-----------+----------+------------------------+
+| 自動 | compound | 車        | 92       |                        |
+| 運転 | compound | 車        | 92       |                        |
+| 車   | nsubj    | 転嫁      | 100      | 自動, 運転, は         |
+| は   | case     | 車        | 92       |                        |
+| 保険 | compound | 責任      | 92       |                        |
+| 責任 | obj      | 転嫁      | 100      | 保険, を               |
+| を   | case     | 責任      | 92       |                        |
+| 製造 | compound | 者        | 92       |                        |
+| 者   | obl      | 転嫁      | 100      | 製造, に               |
+| に   | case     | 者        | 92       |                        |
+| 転嫁 | ROOT     | 転嫁      | 100      | 車, 責任, 者, する, 。 |
+| する | aux      | 転嫁      | 100      |                        |
+| 。   | punct    | 転嫁      | 100      |                        |
++------+----------+-----------+----------+------------------------+
data/examples/japanese/noun_chunks.rb
CHANGED
@@ -9,7 +9,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
 rows = []
 
 doc.noun_chunks.each do |chunk|
-  rows << [chunk.text, chunk.root.text, chunk.root.
+  rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/pos_tagging.rb
CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/ancestors.rb
CHANGED
@@ -24,18 +24,21 @@ puts "The subject of the sentence is: " + subject.text
 subject.subtree.each do |descendant|
   # need to convert "ancestors" object from a python generator to a ruby array
   ancestors = Spacy::generator_to_array(descendant.ancestors)
-  rows << [descendant.text, descendant.
+  rows << [descendant.text, descendant.dep, descendant.n_lefts, descendant.n_rights, ancestors.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
 print table
 
-#
-#
-#
-#
-# |
-#
-# |
-# |
-#
+# The sentence: Credit and mortgage account holders must submit their requests
+# The root of the sentence is: submit
+# The subject of the sentence is: holders
+# +----------+----------+---------+----------+----------------------------------+
+# | text     | dep      | n_lefts | n_rights | ancestors                        |
+# +----------+----------+---------+----------+----------------------------------+
+# | Credit   | nmod     | 0       | 2        | holders, submit                  |
+# | and      | cc       | 0       | 0        | Credit, holders, submit          |
+# | mortgage | compound | 0       | 0        | account, Credit, holders, submit |
+# | account  | conj     | 1       | 0        | Credit, holders, submit          |
+# | holders  | nsubj    | 1       | 0        | submit                           |
+# +----------+----------+---------+----------+----------------------------------+
data/examples/linguistic_features/entity_annotations_and_labels.rb
CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "ent_iob", "ent_iob_", "ent_type_"]
 rows = []
 
 doc.each do |ent|
-  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.
+  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/information_extraction.rb
CHANGED
@@ -19,11 +19,11 @@ texts.each do |text|
   doc.each do |token|
     if token.ent_type_ == "MONEY"
       if ["attr", "dobj"].index token.dep_
-        subj = Spacy.generator_to_array(token.head.lefts).select{|t| t.
+        subj = Spacy.generator_to_array(token.head.lefts).select{|t| t.dep == "nsubj"}
         if !subj.empty?
           puts(subj[0].text + " --> " + token.text)
         end
-      elsif token.dep_ == "pobj" and token.head.
+      elsif token.dep_ == "pobj" and token.head.dep == "prep"
         puts token.head.head.text + " --> " + token.text
       end
     end
data/examples/linguistic_features/finding_a_verb_with_a_subject.rb
CHANGED
@@ -12,7 +12,7 @@ doc.each do |token|
   if token.pos_ == "VERB"
     token.children.each do |child|
       if child.dep_ == "nsubj"
-        results << child.head
+        results << child.head.text
       end
     end
   end
@@ -20,5 +20,5 @@ end
 
 puts results.to_s
 
-# [shift]
+# ["shift"]
 
data/examples/linguistic_features/iterating_lefts_and_rights.rb
CHANGED
@@ -7,14 +7,14 @@ doc = nlp.read("bright red apples on the tree")
 
 puts "Text: " + doc.text
 
-puts "Words to the left of 'apple': " +
-puts "Words to the right of 'apple': " +
+puts "Words to the left of 'apple': " + doc[2].lefts.map(&:text).join(", ")
+puts "Words to the right of 'apple': " + doc[2].rights.map(&:text).join(", ")
 
 puts "Num of the words to the left of 'apple': " + doc[2].n_lefts.to_s
 puts "Num of the words to the right of 'apple': " + doc[2].n_rights.to_s
 
 # Text: bright red apples on the tree
-# Words to the left of 'apple':
-# Words to the right of 'apple':
+# Words to the left of 'apple': bright, red
+# Words to the right of 'apple': on
 # Num of the words to the left of 'apple': 2
 # Num of the words to the right of 'apple': 1
data/examples/linguistic_features/named_entity_recognition.rb
CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/navigating_parse_tree.rb
CHANGED
@@ -12,21 +12,21 @@ headings = ["text", "dep", "head text", "head pos", "children"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
 puts table
 
 # Lemmatizer mode: rule
-#
-# | text | dep | head text | head pos | children
-#
-# | Autonomous | amod | cars | NOUN |
-# | cars | nsubj | shift | VERB |
-# | shift | ROOT | shift | VERB |
-# | insurance | compound | liability | NOUN |
-# | liability | dobj | shift | VERB |
-# | toward | prep | shift | VERB |
-# | manufacturers | pobj | toward | ADP |
-#
+# +---------------+----------+-----------+----------+-------------------------+
+# | text          | dep      | head text | head pos | children                |
+# +---------------+----------+-----------+----------+-------------------------+
+# | Autonomous    | amod     | cars      | NOUN     |                         |
+# | cars          | nsubj    | shift     | VERB     | Autonomous              |
+# | shift         | ROOT     | shift     | VERB     | cars, liability, toward |
+# | insurance     | compound | liability | NOUN     |                         |
+# | liability     | dobj     | shift     | VERB     | insurance               |
+# | toward        | prep     | shift     | VERB     | manufacturers           |
+# | manufacturers | pobj     | toward    | ADP      |                         |
+# +---------------+----------+-----------+----------+-------------------------+
data/examples/linguistic_features/noun_chunks.rb
CHANGED
@@ -12,7 +12,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
 rows = []
 
 doc.noun_chunks.each do |chunk|
-  rows << [chunk.text, chunk.root.text, chunk.root.
+  rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/pos_tagging.rb
CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep, token.shape, token.is_alpha, token.is_stop]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/retokenize_1.rb
CHANGED
@@ -12,7 +12,7 @@ rows = []
 doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)
 
 doc.each do |token|
-  rows << [token.text, token.
+  rows << [token.text, token.pos, token.dep, token.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/retokenize_2.rb
CHANGED
@@ -6,11 +6,11 @@ nlp = Spacy::Language.new("en_core_web_sm")
 sentence = "I live in New York"
 doc = nlp.read(sentence)
 
-puts "Before: " + doc.tokens.
+puts "Before: " + doc.tokens.map(&:text).join(", ")
 
 doc.retokenize(3, 4)
 
-puts "After: " + doc.tokens.
+puts "After: " + doc.tokens.map(&:text).join(", ")
 
 # Before: I, live, in, New, York
 # After: I, live, in, New York
data/examples/linguistic_features/rule_based_morphology.rb
CHANGED
@@ -6,7 +6,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
 doc = nlp.read("Where are you?")
 
 puts "Morph features of the third word: " + doc[2].morph.to_s
-puts "POS of the third word: " + doc[2].
+puts "POS of the third word: " + doc[2].pos
 
 # Morph features of the third word: Case=Nom|Person=2|PronType=Prs
 # POS of the third word: PRON
data/examples/linguistic_features/similarity_between_lexemes.rb
ADDED
@@ -0,0 +1,18 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_lg")
+
+orange = nlp.vocab("orange")
+lemon = nlp.vocab("lemon")
+
+book = nlp.vocab("book")
+magazine = nlp.vocab("magazine")
+
+puts "orange <=> lemon: #{orange.similarity(lemon)}"
+puts "book <=> magazine: #{book.similarity(magazine)}"
+puts "orange <=> book: #{orange.similarity(book)}"
+
+# orange <=> lemon: 0.7080526351928711
+# book <=> magazine: 0.4355940818786621
+# orange <=> book: 0.12197211384773254
data/examples/rule_based_matching/creating_spans_from_matches.rb
CHANGED
@@ -10,7 +10,7 @@ matches = matcher.match(doc)
 
 matches.each do |match|
   span = Spacy::Span.new(doc, start_index: match[:start_index], end_index: match[:end_index], options: {label: match[:match_id]})
-  puts span.text + " / " + span.
+  puts span.text + " / " + span.label
 end
 
 # Barack Obama / US_PRESIDENT
data/lib/ruby-spacy.rb
CHANGED
@@ -165,6 +165,9 @@ module Spacy
       # so that ents canbe "each"-ed in Ruby
       ent_array = []
       PyCall::List.(@py_doc.ents).each do |ent|
+        ent.define_singleton_method :label do
+          return self.label_
+        end
         ent_array << ent
       end
       ent_array
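This hunk is what lets the revised examples above call `ent.label` instead of `ent.label_`: each Python entity span gets a singleton `label` method that proxies to spaCy's `label_` attribute. A minimal sketch of the call site, assuming the `en_core_web_sm` model is installed:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")

doc.ents.each do |ent|
  # `label` is the new shortcut; `label_` still works via method_missing
  puts ent.text + " / " + ent.label
end
# typically: Apple / ORG, U.K. / GPE, $1 billion / MONEY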
@@ -252,10 +255,16 @@ module Spacy
     # @param text [String] A text string representing a lexeme
     # @return [Object] Python `Lexeme` object (https://spacy.io/api/lexeme)
     def get_lexeme(text)
-      text = text.gsub("'", "\'")
       @py_nlp.vocab[text]
     end
 
+    # Returns a ruby lexeme object
+    # @param text [String] a text string representing the vocabulary item
+    # @return [Lexeme]
+    def vocab(text)
+      Lexeme.new(@py_nlp.vocab[text])
+    end
+
     # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
     # @param vector [Object] A vector representation of a word (whether existing or non-existing)
     # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
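The new `vocab` method is what the revised lexeme.rb example and the new similarity_between_lexemes.rb example call: unlike `get_lexeme`, which returns the raw Python object, it wraps the result in the Ruby `Lexeme` class introduced later in this diff. A sketch, assuming a model with word vectors such as `en_core_web_lg`:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_lg")

orange = nlp.vocab("orange")  # a Spacy::Lexeme, not a raw PyCall object
puts orange.text              # => "orange"
puts orange.shape             # => "xxxx"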
@@ -386,18 +395,24 @@ module Spacy
       chunk_array = []
       py_chunks = PyCall::List.(@py_span.noun_chunks)
       py_chunks.each do |py_span|
-        chunk_array <<
+        chunk_array << Span.new(@doc, py_span: py_span)
       end
       chunk_array
     end
 
+    # Returns the head token
+    # @return [Token]
+    def root
+      Token.new(@py_span.root)
+    end
+
     # Returns an array of spans that represents sentences.
     # @return [Array<Span>]
     def sents
       sentence_array = []
       py_sentences = PyCall::List.(@py_span.sents)
       py_sentences.each do |py_span|
-        sentence_array <<
+        sentence_array << Span.new(@doc, py_span: py_span)
       end
       sentence_array
     end
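Besides filling in the `Span` wrapping for noun chunks and sentences, this hunk adds `Span#root`, so a chunk's syntactic head comes back as a wrapped `Token`. A sketch of the intended usage, borrowing the sentence from the navigating_parse_tree.rb example above:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

chunk = doc.noun_chunks.first  # "Autonomous cars"
puts chunk.root.text           # => "cars"
puts chunk.root.head.text      # => "shift" (Token#head is added further down)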
@@ -407,7 +422,7 @@ module Spacy
     def ents
       ent_array = []
       PyCall::List.(@py_span.ents).each do |py_span|
-        ent_array <<
+        ent_array << Span.new(@doc, py_span: py_span)
       end
       ent_array
     end
@@ -416,7 +431,7 @@ module Spacy
     # @return [Span]
     def sent
       py_span = @py_span.sent
-      return
+      return Span.new(@doc, py_span: py_span)
     end
 
     # Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
|
@@ -424,9 +439,9 @@ module Spacy
|
|
424
439
|
def [](range)
|
425
440
|
if range.is_a?(Range)
|
426
441
|
py_span = @py_span[range]
|
427
|
-
return
|
442
|
+
return Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
|
428
443
|
else
|
429
|
-
return
|
444
|
+
return Token.new(@py_span[range])
|
430
445
|
end
|
431
446
|
end
|
432
447
|
|
@@ -440,7 +455,7 @@ module Spacy
     # Creates a document instance from the span
     # @return [Doc]
     def as_doc
-
+      Doc.new(@doc.py_nlp, text: self.text)
     end
 
     # Returns tokens conjugated to the root of the span.
@@ -448,7 +463,7 @@ module Spacy
     def conjuncts
       conjunct_array = []
       PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
-        conjunct_array <<
+        conjunct_array << Token.new(py_conjunct)
       end
       conjunct_array
     end
@@ -458,7 +473,7 @@ module Spacy
     def lefts
       left_array = []
       PyCall::List.(@py_span.lefts).each do |py_left|
-        left_array <<
+        left_array << Token.new(py_left)
       end
       left_array
     end
@@ -468,7 +483,7 @@ module Spacy
     def rights
       right_array = []
       PyCall::List.(@py_span.rights).each do |py_right|
-        right_array <<
+        right_array << Token.new(py_right)
      end
       right_array
     end
@@ -478,11 +493,17 @@ module Spacy
     def subtree
       subtree_array = []
       PyCall::List.(@py_span.subtree).each do |py_subtree|
-        subtree_array <<
+        subtree_array << Token.new(py_subtree)
       end
       subtree_array
     end
 
+    # Returns the label
+    # @return [String]
+    def label
+      @py_span.label_
+    end
+
     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
       @py_span.send(name, *args)
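`Span#label` mirrors the singleton method patched onto `Doc#ents` earlier and is what the revised creating_spans_from_matches.rb example calls. A sketch reusing that example's span construction; the sentence and the `PERSON_NAME` label here are made up for illustration:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Barack Obama was the 44th president of the United States")

# span over tokens 0..1 ("Barack Obama") tagged with a custom label
span = Spacy::Span.new(doc, start_index: 0, end_index: 1, options: {label: "PERSON_NAME"})
puts span.text   # => "Barack Obama"
puts span.label  # => "PERSON_NAME"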
@@ -506,52 +527,59 @@ module Spacy
       @text = @py_token.text
     end
 
+
+    # Returns the head token
+    # @return [Token]
+    def head
+      Token.new(@py_token.head)
+    end
+
     # Returns the token in question and the tokens that descend from it.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def subtree
       descendant_array = []
       PyCall::List.(@py_token.subtree).each do |descendant|
-        descendant_array << descendant
+        descendant_array << Token.new(descendant)
       end
       descendant_array
     end
 
     # Returns the token's ancestors.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def ancestors
       ancestor_array = []
       PyCall::List.(@py_token.ancestors).each do |ancestor|
-        ancestor_array << ancestor
+        ancestor_array << Token.new(ancestor)
       end
       ancestor_array
     end
 
     # Returns a sequence of the token's immediate syntactic children.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def children
       child_array = []
       PyCall::List.(@py_token.children).each do |child|
-        child_array << child
+        child_array << Token.new(child)
       end
       child_array
     end
 
     # The leftward immediate children of the word in the syntactic dependency parse.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def lefts
       token_array = []
       PyCall::List.(@py_token.lefts).each do |token|
-        token_array << token
+        token_array << Token.new(token)
      end
       token_array
     end
 
     # The rightward immediate children of the word in the syntactic dependency parse.
-    # @return [Array<
+    # @return [Array<Token>] an array of tokens
     def rights
       token_array = []
       PyCall::List.(@py_token.rights).each do |token|
-        token_array << token
+        token_array << Token.new(token)
       end
       token_array
     end
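With `head`, `subtree`, `ancestors`, `children`, `lefts`, and `rights` all returning wrapped `Token` objects rather than raw PyCall handles, tree navigation now composes without `Spacy::generator_to_array`. A sketch using the sentence from iterating_lefts_and_rights.rb:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("bright red apples on the tree")
apples = doc[2]

puts apples.children.map(&:text).join(", ")  # => "bright, red, on"
puts apples.lefts.map(&:text).join(", ")     # => "bright, red"
puts doc[5].head.text                        # head of "tree" => "on"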
@@ -582,12 +610,143 @@ module Spacy
       end
     end
 
+    # Returns the lemma by calling `lemma_' of `@py_token` object
+    # @return [String]
+    def lemma
+      @py_token.lemma_
+    end
+
+    # Returns the lowercase form by calling `lower_' of `@py_token` object
+    # @return [String]
+    def lower
+      @py_token.lower_
+    end
+
+    # Returns the shape (e.g. "Xxxxx") by calling `shape_' of `@py_token` object
+    # @return [String]
+    def shape
+      @py_token.shape_
+    end
+
+    # Returns the pos by calling `pos_' of `@py_token` object
+    # @return [String]
+    def pos
+      @py_token.pos_
+    end
+
+    # Returns the fine-grained pos by calling `tag_' of `@py_token` object
+    # @return [String]
+    def tag
+      @py_token.tag_
+    end
+
+    # Returns the dependency relation by calling `dep_' of `@py_token` object
+    # @return [String]
+    def dep
+      @py_token.dep_
+    end
+
+    # Returns the language by calling `lang_' of `@py_token` object
+    # @return [String]
+    def lang
+      @py_token.lang_
+    end
+
+    # Returns the trailing space character if present by calling `whitespace_' of `@py_token` object
+    # @return [String]
+    def whitespace
+      @py_token.whitespace_
+    end
+
+    # Returns the named entity type by calling `ent_type_' of `@py_token` object
+    # @return [String]
+    def ent_type
+      @py_token.ent_type_
+    end
+
+    # Returns a lexeme object
+    # @return [Lexeme]
+    def lexeme
+      Lexeme.new(@py_token.lex)
+    end
+
     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
       @py_token.send(name, *args)
     end
   end
 
+  # See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
+  class Lexeme
+
+    # @return [Object] a Python `Lexeme` instance accessible via `PyCall`
+    attr_reader :py_lexeme
+
+    # @return [String] a string representing the token
+    attr_reader :text
+
+    # It is recommended to use {Language#vocab} or {Token#lexeme} methods to create tokens.
+    # There is no way to generate a lexeme from scratch but relying on a pre-exising Python {Lexeme} object.
+    # @param py_lexeme [Object] Python `Lexeme` object
+    def initialize(py_lexeme)
+      @py_lexeme = py_lexeme
+      @text = @py_lexeme.text
+    end
+
+    # String representation of the token.
+    # @return [String]
+    def to_s
+      @text
+    end
+
+    # Returns the lowercase form by calling `lower_' of `@py_lexeme` object
+    # @return [String]
+    def lower
+      @py_lexeme.lower_
+    end
+
+    # Returns the shape (e.g. "Xxxxx") by calling `shape_' of `@py_lexeme` object
+    # @return [String]
+    def shape
+      @py_lexeme.shape_
+    end
+
+    # Returns the language by calling `lang_' of `@py_lexeme` object
+    # @return [String]
+    def lang
+      @py_lexeme.lang_
+    end
+
+    # Returns the length-N substring from the start of the word by calling `prefix_' of `@py_lexeme` object
+    # @return [String]
+    def prefix
+      @py_lexeme.prefix_
+    end
+    #
+    # Returns the length-N substring from the end of the word by calling `suffix_' of `@py_lexeme` object
+    # @return [String]
+    def suffix
+      @py_lexeme.suffix_
+    end
+
+    # Returns the lexemes's norm, i.e. a normalized form of the lexeme calling `norm_' of `@py_lexeme` object
+    # @return [String]
+    def norm
+      @py_lexeme.norm_
+    end
+
+    # Returns a semantic similarity estimate.
+    # @param other [Lexeme] the other doc to which a similarity estimation is made
+    # @return [Float]
+    def similarity(other)
+      @py_lexeme.similarity(other.py_lexeme)
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_lexeme.send(name, *args)
+    end
+  end
 
 end
 
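Together with the `Token` readers above, the new `Lexeme` class backs `Language#vocab` and `Token#lexeme`, so underscore-free attribute access now covers tokens, spans, and lexemes alike. A closing sketch, assuming `en_core_web_lg` for the vectors:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_lg")
doc = nlp.read("Apples taste good.")

token = doc[0]
puts token.lemma  # => "apple" (was token.lemma_ in 0.1.3)
puts token.pos    # => "NOUN"

lex = token.lexeme  # a Spacy::Lexeme via the new Token#lexeme
puts lex.lower      # => "apples"
puts lex.similarity(nlp.vocab("oranges"))  # a Float similarity score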
data/lib/ruby-spacy/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ruby-spacy
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-
+date: 2021-07-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pycall
@@ -123,6 +123,7 @@ files:
 - examples/linguistic_features/rule_based_morphology.rb
 - examples/linguistic_features/sentence_segmentation.rb
 - examples/linguistic_features/similarity.rb
+- examples/linguistic_features/similarity_between_lexemes.rb
 - examples/linguistic_features/similarity_between_spans.rb
 - examples/linguistic_features/tokenization.rb
 - examples/rule_based_matching/creating_spans_from_matches.rb
@@ -149,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.2.
+rubygems_version: 3.2.11
 signing_key:
 specification_version: 4
 summary: A wrapper module for using spaCy natural language processing library from
|