ruby-spacy 0.1.3 → 0.1.4

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/Gemfile.lock +2 -1
  4. data/README.md +7 -7
  5. data/examples/get_started/lexeme.rb +2 -2
  6. data/examples/get_started/linguistic_annotations.rb +1 -1
  7. data/examples/get_started/morphology.rb +1 -1
  8. data/examples/get_started/named_entities.rb +1 -1
  9. data/examples/get_started/pos_tags_and_dependencies.rb +1 -1
  10. data/examples/japanese/ancestors.rb +9 -11
  11. data/examples/japanese/entity_annotations_and_labels.rb +1 -1
  12. data/examples/japanese/lemmatization.rb +1 -1
  13. data/examples/japanese/named_entity_recognition.rb +1 -1
  14. data/examples/japanese/navigating_parse_tree.rb +18 -18
  15. data/examples/japanese/noun_chunks.rb +1 -1
  16. data/examples/japanese/pos_tagging.rb +1 -1
  17. data/examples/linguistic_features/ancestors.rb +13 -10
  18. data/examples/linguistic_features/entity_annotations_and_labels.rb +1 -1
  19. data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +2 -2
  20. data/examples/linguistic_features/information_extraction.rb +2 -2
  21. data/examples/linguistic_features/iterating_children.rb +2 -2
  22. data/examples/linguistic_features/iterating_lefts_and_rights.rb +4 -4
  23. data/examples/linguistic_features/lemmatization.rb +1 -1
  24. data/examples/linguistic_features/named_entity_recognition.rb +1 -1
  25. data/examples/linguistic_features/navigating_parse_tree.rb +12 -12
  26. data/examples/linguistic_features/noun_chunks.rb +1 -1
  27. data/examples/linguistic_features/pos_tagging.rb +1 -1
  28. data/examples/linguistic_features/retokenize_1.rb +1 -1
  29. data/examples/linguistic_features/retokenize_2.rb +2 -2
  30. data/examples/linguistic_features/rule_based_morphology.rb +1 -1
  31. data/examples/linguistic_features/similarity_between_lexemes.rb +18 -0
  32. data/examples/rule_based_matching/creating_spans_from_matches.rb +1 -1
  33. data/lib/ruby-spacy.rb +181 -22
  34. data/lib/ruby-spacy/version.rb +1 -1
  35. metadata +4 -3
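
At a glance, most of this release replaces direct calls to spaCy's underscore-suffixed Python attributes with plain Ruby wrapper methods (`token.lemma_` becomes `token.lemma`, `ent.label_` becomes `ent.label`, and so on), makes tree-navigation methods return arrays of wrapped `Spacy::Token` objects instead of raw PyCall objects, and introduces a `Spacy::Lexeme` class. A minimal before/after sketch of the renamed accessors (illustrative only; assumes a working PyCall setup with the `en_core_web_sm` model installed):

  require "ruby-spacy"

  nlp = Spacy::Language.new("en_core_web_sm")
  doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

  doc.each do |token|
    # 0.1.3 reached into Python directly: token.lemma_, token.pos_, token.dep_
    # 0.1.4 wraps these as plain Ruby methods that return strings:
    puts [token.text, token.lemma, token.pos, token.tag, token.dep].join("\t")
  end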
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f2571b8bd9a0e170c5462d73b57ab75d535936f591c6b7954d0ca6b2a9d74aeb
-  data.tar.gz: 55bffce60937e8ae1f4135a076185b64f2e989c9431e67914d926ee60b2cc537
+  metadata.gz: bd5a1c905e5aed7553ac5b1927a6b9cdecaf887c505ea3e38f806e886adeb60c
+  data.tar.gz: 6d3f3fd22e9d927d430d2b9e48dcd018da6eb601813192e6ea14e094cf51e331
 SHA512:
-  metadata.gz: b3de6a6179eef74f224db186075fe084c93fb2fc802c4831b4b23c069594ac8c6aada546336d16065b9eeef116ba3a686afc6c441e68a66ba0fbddd7ffa321a6
-  data.tar.gz: 05b174052487979bf1c81a54469264068dac76d17d17b6f870b7666472625ca596f022abf6f6d734f5b02d4d623b11351c57e1d1d90deb192aaaa36d59e7255c
+  metadata.gz: b5419fb75109b837465c64da1ace956b91d0a0ab589cdb71ace9a308ce1af263edc0e2f206a80ab71a3ab17e86e6520ab432b657c5f60548c696a36049773c60
+  data.tar.gz: 385606212f290b701458bd1a555e553417ed20be2d1e2008107396a9adc224590c76317c52d30d7c97435c0650ef8c1a15a43fe4b92c797188944a302da51612
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
 # Change Log
 
+## 0.1.4 - 2021-06-26
+### Added
+- `Spacy::Lexeme` class
+- `Spacy::Token#lexeme` method
+
 ## 0.1.3 - 2021-06-26
 - Code cleanup
 
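Both additions land in data/lib/ruby-spacy.rb later in this diff: `Language#vocab` and `Token#lexeme` construct the new `Spacy::Lexeme` wrapper. A short usage sketch (illustrative only; assumes the `en_core_web_sm` model is installed):

  require "ruby-spacy"

  nlp = Spacy::Language.new("en_core_web_sm")

  # Language#vocab wraps a Python Lexeme in the new Spacy::Lexeme class
  lexeme = nlp.vocab("apple")
  puts lexeme.text   # => "apple"
  puts lexeme.shape  # => "xxxxx"

  # Token#lexeme returns the same kind of object from a parsed token
  doc = nlp.read("I ate an apple.")
  puts doc[3].lexeme.norm  # => "apple"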
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    ruby-spacy (0.1.3)
+    ruby-spacy (0.1.4)
       numpy (~> 0.4.0)
       pycall (~> 1.4.0)
       terminal-table (~> 3.0.1)
@@ -24,6 +24,7 @@ GEM
 PLATFORMS
   arm64-darwin-20
   x86_64-darwin-20
+  x86_64-linux
 
 DEPENDENCIES
   github-markup
data/README.md CHANGED
@@ -128,7 +128,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -166,7 +166,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -212,7 +212,7 @@ doc.each do |token|
   morph = token.morphology.map do |k, v|
     "#{k} = #{v}"
   end.join("\n")
-  rows << [token.text, token.shape_, token.is_alpha, token.is_stop, morph]
+  rows << [token.text, token.shape, token.is_alpha, token.is_stop, morph]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -300,7 +300,7 @@ doc =nlp.read("Apple is looking at buying U.K. startup for $1 billion")
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 headings = ["text", "start_char", "end_char", "label"]
@@ -332,7 +332,7 @@ doc = nlp.read(sentence)
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 headings = ["text", "start", "end", "label"]
@@ -393,8 +393,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
 doc1 = nlp.read("I like salty fries and hamburgers.")
 doc2 = nlp.read("Fast food tastes very good.")
 
-puts "Doc 1: " + doc1
-puts "Doc 2: " + doc2
+puts "Doc 1: " + doc1.text
+puts "Doc 2: " + doc2.text
 puts "Similarity: #{doc1.similarity(doc2)}"
 
 ```
data/examples/get_started/lexeme.rb CHANGED
@@ -8,8 +8,8 @@ headings = ["text", "shape", "prefix", "suffix", "is_alpha", "is_digit"]
 rows = []
 
 doc.each do |word|
-  lexeme = doc.vocab[word.text]
-  rows << [lexeme.text, lexeme.shape_, lexeme.prefix_, lexeme.suffix_, lexeme.is_alpha, lexeme.is_digit]
+  lexeme = nlp.vocab(word.text)
+  rows << [lexeme.text, lexeme.shape, lexeme.prefix, lexeme.suffix, lexeme.is_alpha, lexeme.is_digit]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/linguistic_annotations.rb CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "pos", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.pos_, token.dep_]
+  rows << [token.text, token.pos, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/morphology.rb CHANGED
@@ -12,7 +12,7 @@ doc.each do |token|
     "#{k} = #{v}"
   end.join("\n")
   # end.join("<br />")
-  rows << [token.text, token.shape_, token.is_alpha, token.is_stop, morph]
+  rows << [token.text, token.shape, token.is_alpha, token.is_stop, morph]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/named_entities.rb CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "start_char", "end_char", "label"]
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/pos_tags_and_dependencies.rb CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/ancestors.rb CHANGED
@@ -23,9 +23,7 @@ puts "The root of the sentence is: " + root.text
 puts "The subject of the sentence is: " + subject.text
 
 subject.subtree.each do |descendant|
-  # need to convert "ancestors" object from a python generator to a ruby array
-  ancestors = Spacy::generator_to_array(descendant.ancestors)
-  rows << [descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, ancestors]
+  rows << [descendant.text, descendant.dep, descendant.n_lefts, descendant.n_rights, descendant.ancestors.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
@@ -34,11 +32,11 @@ puts table
 # The sentence: 私の父は寿司が好きだ。
 # The root of the sentence is: 好き
 # The subject of the sentence is: 父
-# +------+------------+---------+----------+----------------+
-# | text | dep        | n_lefts | n_rights | ancestors      |
-# +------+------------+---------+----------+----------------+
-# | 私   | nmod       | 0       | 1        | [父, 好き]     |
-# | の   | case       | 0       | 0        | [私, 父, 好き] |
-# | 父   | dislocated | 1       | 1        | [好き]         |
-# | は   | case       | 0       | 0        | [父, 好き]     |
-# +------+------------+---------+----------+----------------+
+# +------+------------+---------+----------+--------------+
+# | text | dep        | n_lefts | n_rights | ancestors    |
+# +------+------------+---------+----------+--------------+
+# | 私   | nmod       | 0       | 1        | 父, 好き     |
+# | の   | case       | 0       | 0        | 私, 父, 好き |
+# | 父   | dislocated | 1       | 1        | 好き         |
+# | は   | case       | 0       | 0        | 父, 好き     |
+# +------+------------+---------+----------+--------------+
data/examples/japanese/entity_annotations_and_labels.rb CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "ent_iob", "ent_iob_", "ent_type_"]
 rows = []
 
 doc.each do |ent|
-  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type_]
+  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/lemmatization.rb CHANGED
@@ -9,7 +9,7 @@ headings = ["text", "lemma"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.lemma_]
+  rows << [token.text, token.lemma]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/named_entity_recognition.rb CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/navigating_parse_tree.rb CHANGED
@@ -9,26 +9,26 @@ headings = ["text", "dep", "head text", "head pos", "children"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.dep_, token.head.text, token.head.pos_, token.children.to_s]
+  rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
 puts table
 
-# +------+----------+-----------+----------+--------------------------+
-# | text | dep      | head text | head pos | children                 |
-# +------+----------+-----------+----------+--------------------------+
-# | 自動 | compound | 車        | NOUN     | []                       |
-# | 運転 | compound | 車        | NOUN     | []                       |
-# | 車   | nsubj    | 転嫁      | VERB     | [自動, 運転, は]         |
-# | は   | case     | 車        | NOUN     | []                       |
-# | 保険 | compound | 責任      | NOUN     | []                       |
-# | 責任 | obj      | 転嫁      | VERB     | [保険, を]               |
-# | を   | case     | 責任      | NOUN     | []                       |
-# | 製造 | compound | 者        | NOUN     | []                       |
-# | 者   | obl      | 転嫁      | VERB     | [製造, に]               |
-# | に   | case     | 者        | NOUN     | []                       |
-# | 転嫁 | ROOT     | 転嫁      | VERB     | [車, 責任, 者, する, 。] |
-# | する | aux      | 転嫁      | VERB     | []                       |
-# | 。   | punct    | 転嫁      | VERB     | []                       |
-# +------+----------+-----------+----------+--------------------------+
+# +------+----------+-----------+----------+------------------------+
+# | text | dep      | head text | head pos | children               |
+# +------+----------+-----------+----------+------------------------+
+# | 自動 | compound | 車        | 92       |                        |
+# | 運転 | compound | 車        | 92       |                        |
+# | 車   | nsubj    | 転嫁      | 100      | 自動, 運転, は         |
+# | は   | case     | 車        | 92       |                        |
+# | 保険 | compound | 責任      | 92       |                        |
+# | 責任 | obj      | 転嫁      | 100      | 保険, を               |
+# | を   | case     | 責任      | 92       |                        |
+# | 製造 | compound | 者        | 92       |                        |
+# | 者   | obl      | 転嫁      | 100      | 製造, に               |
+# | に   | case     | 者        | 92       |                        |
+# | 転嫁 | ROOT     | 転嫁      | 100      | 車, 責任, 者, する, 。 |
+# | する | aux      | 転嫁      | 100      |                        |
+# | 。   | punct    | 転嫁      | 100      |                        |
+# +------+----------+-----------+----------+------------------------+
data/examples/japanese/noun_chunks.rb CHANGED
@@ -9,7 +9,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
 rows = []
 
 doc.noun_chunks.each do |chunk|
-  rows << [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
+  rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/pos_tagging.rb CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/ancestors.rb CHANGED
@@ -24,18 +24,21 @@ puts "The subject of the sentence is: " + subject.text
 subject.subtree.each do |descendant|
   # need to convert "ancestors" object from a python generator to a ruby array
   ancestors = Spacy::generator_to_array(descendant.ancestors)
-  rows << [descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, ancestors]
+  rows << [descendant.text, descendant.dep, descendant.n_lefts, descendant.n_rights, ancestors.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
 print table
 
-# +----------+----------+---------+----------+------------------------------------+
-# | text     | dep      | n_lefts | n_rights | ancestors                          |
-# +----------+----------+---------+----------+------------------------------------+
-# | Credit   | nmod     | 0       | 2        | [holders, submit]                  |
-# | and      | cc       | 0       | 0        | [Credit, holders, submit]          |
-# | mortgage | compound | 0       | 0        | [account, Credit, holders, submit] |
-# | account  | conj     | 1       | 0        | [Credit, holders, submit]          |
-# | holders  | nsubj    | 1       | 0        | [submit]                           |
-# +----------+----------+---------+----------+------------------------------------+
+# The sentence: Credit and mortgage account holders must submit their requests
+# The root of the sentence is: submit
+# The subject of the sentence is: holders
+# +----------+----------+---------+----------+----------------------------------+
+# | text     | dep      | n_lefts | n_rights | ancestors                        |
+# +----------+----------+---------+----------+----------------------------------+
+# | Credit   | nmod     | 0       | 2        | holders, submit                  |
+# | and      | cc       | 0       | 0        | Credit, holders, submit          |
+# | mortgage | compound | 0       | 0        | account, Credit, holders, submit |
+# | account  | conj     | 1       | 0        | Credit, holders, submit          |
+# | holders  | nsubj    | 1       | 0        | submit                           |
+# +----------+----------+---------+----------+----------------------------------+
data/examples/linguistic_features/entity_annotations_and_labels.rb CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "ent_iob", "ent_iob_", "ent_type_"]
 rows = []
 
 doc.each do |ent|
-  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type_]
+  rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/finding_a_verb_with_a_subject.rb CHANGED
@@ -10,11 +10,11 @@ results = []
 
 doc.each do |token|
   if token.dep_ == "nsubj" && token.head.pos_ == "VERB"
-    results << token.head
+    results << token.head.text
   end
 end
 
 puts results.to_s
 
-# [shift]
+# ["shift"]
 
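(The switch from `token.head` to `token.head.text` follows from `Token#head` now returning a wrapped `Spacy::Token`: pushing the text keeps `results` an array of plain strings, which is also why the expected output changes from `[shift]` to `["shift"]`.)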
data/examples/linguistic_features/information_extraction.rb CHANGED
@@ -19,11 +19,11 @@ texts.each do |text|
   doc.each do |token|
     if token.ent_type_ == "MONEY"
       if ["attr", "dobj"].index token.dep_
-        subj = Spacy.generator_to_array(token.head.lefts).select{|t| t.dep_ == "nsubj"}
+        subj = Spacy.generator_to_array(token.head.lefts).select{|t| t.dep == "nsubj"}
         if !subj.empty?
           puts(subj[0].text + " --> " + token.text)
         end
-      elsif token.dep_ == "pobj" and token.head.dep_ == "prep"
+      elsif token.dep_ == "pobj" and token.head.dep == "prep"
         puts token.head.head.text + " --> " + token.text
       end
     end
data/examples/linguistic_features/iterating_children.rb CHANGED
@@ -12,7 +12,7 @@ doc.each do |token|
   if token.pos_ == "VERB"
     token.children.each do |child|
       if child.dep_ == "nsubj"
-        results << child.head
+        results << child.head.text
       end
     end
   end
@@ -20,5 +20,5 @@ end
 
 puts results.to_s
 
-# [shift]
+# ["shift"]
 
data/examples/linguistic_features/iterating_lefts_and_rights.rb CHANGED
@@ -7,14 +7,14 @@ doc = nlp.read("bright red apples on the tree")
 
 puts "Text: " + doc.text
 
-puts "Words to the left of 'apple': " + Spacy.generator_to_array(doc[2].lefts).to_s
-puts "Words to the right of 'apple': " + Spacy.generator_to_array(doc[2].rights).to_s
+puts "Words to the left of 'apple': " + doc[2].lefts.map(&:text).join(", ")
+puts "Words to the right of 'apple': " + doc[2].rights.map(&:text).join(", ")
 
 puts "Num of the words to the left of 'apple': " + doc[2].n_lefts.to_s
 puts "Num of the words to the right of 'apple': " + doc[2].n_rights.to_s
 
 # Text: bright red apples on the tree
-# Words to the left of 'apple': [bright, red]
-# Words to the right of 'apple': [on]
+# Words to the left of 'apple': bright, red
+# Words to the right of 'apple': on
 # Num of the words to the left of 'apple': 2
 # Num of the words to the right of 'apple': 1
data/examples/linguistic_features/lemmatization.rb CHANGED
@@ -12,7 +12,7 @@ headings = ["lemma"]
 rows = []
 
 doc.each do |token|
-  rows << [token.lemma_]
+  rows << [token.lemma]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/named_entity_recognition.rb CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
 rows = []
 
 doc.ents.each do |ent|
-  rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+  rows << [ent.text, ent.start_char, ent.end_char, ent.label]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/navigating_parse_tree.rb CHANGED
@@ -12,21 +12,21 @@ headings = ["text", "dep", "head text", "head pos", "children"]
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.dep_, token.head.text, token.head.pos_, token.children.to_s]
+  rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
 puts table
 
 # Lemmatizer mode: rule
-# +---------------+----------+-----------+----------+---------------------------+
-# | text          | dep      | head text | head pos | children                  |
-# +---------------+----------+-----------+----------+---------------------------+
-# | Autonomous    | amod     | cars      | NOUN     | []                        |
-# | cars          | nsubj    | shift     | VERB     | [Autonomous]              |
-# | shift         | ROOT     | shift     | VERB     | [cars, liability, toward] |
-# | insurance     | compound | liability | NOUN     | []                        |
-# | liability     | dobj     | shift     | VERB     | [insurance]               |
-# | toward        | prep     | shift     | VERB     | [manufacturers]           |
-# | manufacturers | pobj     | toward    | ADP      | []                        |
-# +---------------+----------+-----------+----------+---------------------------+
+# +---------------+----------+-----------+----------+-------------------------+
+# | text          | dep      | head text | head pos | children                |
+# +---------------+----------+-----------+----------+-------------------------+
+# | Autonomous    | amod     | cars      | NOUN     |                         |
+# | cars          | nsubj    | shift     | VERB     | Autonomous              |
+# | shift         | ROOT     | shift     | VERB     | cars, liability, toward |
+# | insurance     | compound | liability | NOUN     |                         |
+# | liability     | dobj     | shift     | VERB     | insurance               |
+# | toward        | prep     | shift     | VERB     | manufacturers           |
+# | manufacturers | pobj     | toward    | ADP      |                         |
+# +---------------+----------+-----------+----------+-------------------------+
data/examples/linguistic_features/noun_chunks.rb CHANGED
@@ -12,7 +12,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
 rows = []
 
 doc.noun_chunks.each do |chunk|
-  rows << [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
+  rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/pos_tagging.rb CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"
 rows = []
 
 doc.each do |token|
-  rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
+  rows << [token.text, token.lemma, token.pos, token.tag, token.dep, token.shape, token.is_alpha, token.is_stop]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/retokenize_1.rb CHANGED
@@ -12,7 +12,7 @@ rows = []
 doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)
 
 doc.each do |token|
-  rows << [token.text, token.pos_, token.dep_, token.head.text]
+  rows << [token.text, token.pos, token.dep, token.head.text]
 end
 
 table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/retokenize_2.rb CHANGED
@@ -6,11 +6,11 @@ nlp = Spacy::Language.new("en_core_web_sm")
 sentence = "I live in New York"
 doc = nlp.read(sentence)
 
-puts "Before: " + doc.tokens.collect{|t| t}.join(", ")
+puts "Before: " + doc.tokens.map(&:text).join(", ")
 
 doc.retokenize(3, 4)
 
-puts "After: " + doc.tokens.collect{|t| t}.join(", ")
+puts "After: " + doc.tokens.map(&:text).join(", ")
 
 # Before: I, live, in, New, York
 # After: I, live, in, New York
data/examples/linguistic_features/rule_based_morphology.rb CHANGED
@@ -6,7 +6,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
 doc = nlp.read("Where are you?")
 
 puts "Morph features of the third word: " + doc[2].morph.to_s
-puts "POS of the third word: " + doc[2].pos_.to_s
+puts "POS of the third word: " + doc[2].pos
 
 # Morph features of the third word: Case=Nom|Person=2|PronType=Prs
 # POS of the third word: PRON
data/examples/linguistic_features/similarity_between_lexemes.rb ADDED
@@ -0,0 +1,18 @@
+require "ruby-spacy"
+require "terminal-table"
+
+nlp = Spacy::Language.new("en_core_web_lg")
+
+orange = nlp.vocab("orange")
+lemon = nlp.vocab("lemon")
+
+book = nlp.vocab("book")
+magazine = nlp.vocab("magazine")
+
+puts "orange <=> lemon: #{orange.similarity(lemon)}"
+puts "book <=> magazine: #{book.similarity(magazine)}"
+puts "orange <=> book: #{orange.similarity(book)}"
+
+# orange <=> lemon: 0.7080526351928711
+# book <=> magazine: 0.4355940818786621
+# orange <=> book: 0.12197211384773254
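Note that lexeme similarity depends on word vectors, which is why this new example loads `en_core_web_lg`; the small pipelines such as `en_core_web_sm` ship without static word vectors, so their similarity scores are not meaningful. The exact figures in the output comments will also vary across model versions.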
data/examples/rule_based_matching/creating_spans_from_matches.rb CHANGED
@@ -10,7 +10,7 @@ matches = matcher.match(doc)
 
 matches.each do |match|
   span = Spacy::Span.new(doc, start_index: match[:start_index], end_index: match[:end_index], options: {label: match[:match_id]})
-  puts span.text + " / " + span.label_
+  puts span.text + " / " + span.label
 end
 
 # Barack Obama / US_PRESIDENT
data/lib/ruby-spacy.rb CHANGED
@@ -165,6 +165,9 @@ module Spacy
       # so that ents can be "each"-ed in Ruby
       ent_array = []
       PyCall::List.(@py_doc.ents).each do |ent|
+        ent.define_singleton_method :label do
+          return self.label_
+        end
         ent_array << ent
       end
       ent_array
@@ -252,10 +255,16 @@ module Spacy
     # @param text [String] A text string representing a lexeme
     # @return [Object] Python `Lexeme` object (https://spacy.io/api/lexeme)
    def get_lexeme(text)
-      text = text.gsub("'", "\'")
      @py_nlp.vocab[text]
    end
 
+    # Returns a Ruby lexeme object
+    # @param text [String] a text string representing the vocabulary item
+    # @return [Lexeme]
+    def vocab(text)
+      Lexeme.new(@py_nlp.vocab[text])
+    end
+
     # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
     # @param vector [Object] A vector representation of a word (whether existing or non-existing)
     # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
@@ -386,18 +395,24 @@ module Spacy
       chunk_array = []
       py_chunks = PyCall::List.(@py_span.noun_chunks)
       py_chunks.each do |py_span|
-        chunk_array << Spacy::Span.new(@doc, py_span: py_span)
+        chunk_array << Span.new(@doc, py_span: py_span)
      end
      chunk_array
    end
 
+    # Returns the root token of the span
+    # @return [Token]
+    def root
+      Token.new(@py_span.root)
+    end
+
     # Returns an array of spans that represents sentences.
     # @return [Array<Span>]
     def sents
       sentence_array = []
       py_sentences = PyCall::List.(@py_span.sents)
       py_sentences.each do |py_span|
-        sentence_array << Spacy::Span.new(@doc, py_span: py_span)
+        sentence_array << Span.new(@doc, py_span: py_span)
       end
       sentence_array
     end
@@ -407,7 +422,7 @@ module Spacy
     def ents
       ent_array = []
       PyCall::List.(@py_span.ents).each do |py_span|
-        ent_array << Spacy::Span.new(@doc, py_span: py_span)
+        ent_array << Span.new(@doc, py_span: py_span)
       end
       ent_array
     end
@@ -416,7 +431,7 @@ module Spacy
     # @return [Span]
     def sent
       py_span = @py_span.sent
-      return Spacy::Span.new(@doc, py_span: py_span)
+      return Span.new(@doc, py_span: py_span)
     end
 
     # Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
@@ -424,9 +439,9 @@ module Spacy
     def [](range)
       if range.is_a?(Range)
         py_span = @py_span[range]
-        return Spacy::Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
+        return Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
       else
-        return Spacy::Token.new(@py_span[range])
+        return Token.new(@py_span[range])
       end
     end
 
@@ -440,7 +455,7 @@ module Spacy
     # Creates a document instance from the span
     # @return [Doc]
     def as_doc
-      Spacy::Doc.new(@doc.py_nlp, text: self.text)
+      Doc.new(@doc.py_nlp, text: self.text)
     end
 
     # Returns tokens conjugated to the root of the span.
@@ -448,7 +463,7 @@ module Spacy
     def conjuncts
       conjunct_array = []
       PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
-        conjunct_array << Spacy::Token.new(py_conjunct)
+        conjunct_array << Token.new(py_conjunct)
       end
       conjunct_array
     end
@@ -458,7 +473,7 @@ module Spacy
     def lefts
       left_array = []
       PyCall::List.(@py_span.lefts).each do |py_left|
-        left_array << Spacy::Token.new(py_left)
+        left_array << Token.new(py_left)
       end
       left_array
     end
@@ -468,7 +483,7 @@ module Spacy
     def rights
       right_array = []
       PyCall::List.(@py_span.rights).each do |py_right|
-        right_array << Spacy::Token.new(py_right)
+        right_array << Token.new(py_right)
       end
       right_array
     end
@@ -478,11 +493,17 @@ module Spacy
     def subtree
       subtree_array = []
       PyCall::List.(@py_span.subtree).each do |py_subtree|
-        subtree_array << Spacy::Token.new(py_subtree)
+        subtree_array << Token.new(py_subtree)
       end
       subtree_array
     end
 
+    # Returns the label
+    # @return [String]
+    def label
+      @py_span.label_
+    end
+
     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
       @py_span.send(name, *args)
@@ -506,52 +527,59 @@ module Spacy
       @text = @py_token.text
     end
 
+
+    # Returns the head token
+    # @return [Token]
+    def head
+      Token.new(@py_token.head)
+    end
+
     # Returns the token in question and the tokens that descend from it.
-    # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+    # @return [Array<Token>] an array of tokens
     def subtree
       descendant_array = []
       PyCall::List.(@py_token.subtree).each do |descendant|
-        descendant_array << descendant
+        descendant_array << Token.new(descendant)
       end
       descendant_array
     end
 
     # Returns the token's ancestors.
-    # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+    # @return [Array<Token>] an array of tokens
     def ancestors
       ancestor_array = []
       PyCall::List.(@py_token.ancestors).each do |ancestor|
-        ancestor_array << ancestor
+        ancestor_array << Token.new(ancestor)
      end
      ancestor_array
    end
 
     # Returns a sequence of the token's immediate syntactic children.
-    # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+    # @return [Array<Token>] an array of tokens
     def children
       child_array = []
       PyCall::List.(@py_token.children).each do |child|
-        child_array << child
+        child_array << Token.new(child)
       end
       child_array
     end
 
     # The leftward immediate children of the word in the syntactic dependency parse.
-    # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+    # @return [Array<Token>] an array of tokens
     def lefts
       token_array = []
       PyCall::List.(@py_token.lefts).each do |token|
-        token_array << token
+        token_array << Token.new(token)
       end
       token_array
     end
 
     # The rightward immediate children of the word in the syntactic dependency parse.
-    # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+    # @return [Array<Token>] an array of tokens
     def rights
       token_array = []
       PyCall::List.(@py_token.rights).each do |token|
-        token_array << token
+        token_array << Token.new(token)
      end
      token_array
    end
@@ -582,12 +610,143 @@ module Spacy
       end
     end
 
+    # Returns the lemma by calling `lemma_` of `@py_token` object
+    # @return [String]
+    def lemma
+      @py_token.lemma_
+    end
+
+    # Returns the lowercase form by calling `lower_` of `@py_token` object
+    # @return [String]
+    def lower
+      @py_token.lower_
+    end
+
+    # Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_token` object
+    # @return [String]
+    def shape
+      @py_token.shape_
+    end
+
+    # Returns the pos by calling `pos_` of `@py_token` object
+    # @return [String]
+    def pos
+      @py_token.pos_
+    end
+
+    # Returns the fine-grained pos by calling `tag_` of `@py_token` object
+    # @return [String]
+    def tag
+      @py_token.tag_
+    end
+
+    # Returns the dependency relation by calling `dep_` of `@py_token` object
+    # @return [String]
+    def dep
+      @py_token.dep_
+    end
+
+    # Returns the language by calling `lang_` of `@py_token` object
+    # @return [String]
+    def lang
+      @py_token.lang_
+    end
+
+    # Returns the trailing space character if present by calling `whitespace_` of `@py_token` object
+    # @return [String]
+    def whitespace
+      @py_token.whitespace_
+    end
+
+    # Returns the named entity type by calling `ent_type_` of `@py_token` object
+    # @return [String]
+    def ent_type
+      @py_token.ent_type_
+    end
+
+    # Returns a lexeme object
+    # @return [Lexeme]
+    def lexeme
+      Lexeme.new(@py_token.lex)
+    end
+
     # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
     def method_missing(name, *args)
       @py_token.send(name, *args)
     end
   end
 
+  # See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
+  class Lexeme
+
+    # @return [Object] a Python `Lexeme` instance accessible via `PyCall`
+    attr_reader :py_lexeme
+
+    # @return [String] a string representing the lexeme
+    attr_reader :text
+
+    # It is recommended to use the {Language#vocab} or {Token#lexeme} methods to create lexemes.
+    # There is no way to generate a lexeme from scratch other than from a pre-existing Python `Lexeme` object.
+    # @param py_lexeme [Object] Python `Lexeme` object
+    def initialize(py_lexeme)
+      @py_lexeme = py_lexeme
+      @text = @py_lexeme.text
+    end
+
+    # String representation of the lexeme.
+    # @return [String]
+    def to_s
+      @text
+    end
+
+    # Returns the lowercase form by calling `lower_` of `@py_lexeme` object
+    # @return [String]
+    def lower
+      @py_lexeme.lower_
+    end
+
+    # Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_lexeme` object
+    # @return [String]
+    def shape
+      @py_lexeme.shape_
+    end
+
+    # Returns the language by calling `lang_` of `@py_lexeme` object
+    # @return [String]
+    def lang
+      @py_lexeme.lang_
+    end
+
+    # Returns the length-N substring from the start of the word by calling `prefix_` of `@py_lexeme` object
+    # @return [String]
+    def prefix
+      @py_lexeme.prefix_
+    end
+
+    # Returns the length-N substring from the end of the word by calling `suffix_` of `@py_lexeme` object
+    # @return [String]
+    def suffix
+      @py_lexeme.suffix_
+    end
+
+    # Returns the lexeme's norm, i.e. a normalized form of the lexeme, by calling `norm_` of `@py_lexeme` object
+    # @return [String]
+    def norm
+      @py_lexeme.norm_
+    end
+
+    # Returns a semantic similarity estimate.
+    # @param other [Lexeme] the other lexeme to which a similarity estimation is made
+    # @return [Float]
+    def similarity(other)
+      @py_lexeme.similarity(other.py_lexeme)
+    end
+
+    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+    def method_missing(name, *args)
+      @py_lexeme.send(name, *args)
+    end
+  end
 
 end
 
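Taken together, these changes move the public API from raw PyCall objects toward wrapped `Spacy::Token`, `Spacy::Span`, and `Spacy::Lexeme` instances, with `method_missing` still delegating anything unwrapped to the underlying Python object. A rough sketch of how the 0.1.4 surface reads (illustrative only; assumes the `en_core_web_sm` model is installed):

  require "ruby-spacy"

  nlp = Spacy::Language.new("en_core_web_sm")
  doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")

  token = doc[0]
  puts token.head.pos                        # Token#head returns a Spacy::Token, so wrappers chain
  puts token.children.map(&:text).join(", ") # children are wrapped tokens, not PyCall objects

  doc.ents.each do |ent|
    puts "#{ent.text} (#{ent.label})"        # label is a singleton alias for Python's label_
  end

  puts token.is_alpha                        # unwrapped attributes still pass through method_missing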
data/lib/ruby-spacy/version.rb CHANGED
@@ -2,5 +2,5 @@
 
 module Spacy
   # The version number of the module
-  VERSION = "0.1.3"
+  VERSION = "0.1.4"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ruby-spacy
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-06-28 00:00:00.000000000 Z
+date: 2021-07-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pycall
@@ -123,6 +123,7 @@ files:
 - examples/linguistic_features/rule_based_morphology.rb
 - examples/linguistic_features/sentence_segmentation.rb
 - examples/linguistic_features/similarity.rb
+- examples/linguistic_features/similarity_between_lexemes.rb
 - examples/linguistic_features/similarity_between_spans.rb
 - examples/linguistic_features/tokenization.rb
 - examples/rule_based_matching/creating_spans_from_matches.rb
@@ -149,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.2.3
+rubygems_version: 3.2.11
 signing_key:
 specification_version: 4
 summary: A wrapper module for using spaCy natural language processing library from