ruby-spacy 0.1.3 → 0.1.4

Files changed (35)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/Gemfile.lock +2 -1
  4. data/README.md +7 -7
  5. data/examples/get_started/lexeme.rb +2 -2
  6. data/examples/get_started/linguistic_annotations.rb +1 -1
  7. data/examples/get_started/morphology.rb +1 -1
  8. data/examples/get_started/named_entities.rb +1 -1
  9. data/examples/get_started/pos_tags_and_dependencies.rb +1 -1
  10. data/examples/japanese/ancestors.rb +9 -11
  11. data/examples/japanese/entity_annotations_and_labels.rb +1 -1
  12. data/examples/japanese/lemmatization.rb +1 -1
  13. data/examples/japanese/named_entity_recognition.rb +1 -1
  14. data/examples/japanese/navigating_parse_tree.rb +18 -18
  15. data/examples/japanese/noun_chunks.rb +1 -1
  16. data/examples/japanese/pos_tagging.rb +1 -1
  17. data/examples/linguistic_features/ancestors.rb +13 -10
  18. data/examples/linguistic_features/entity_annotations_and_labels.rb +1 -1
  19. data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +2 -2
  20. data/examples/linguistic_features/information_extraction.rb +2 -2
  21. data/examples/linguistic_features/iterating_children.rb +2 -2
  22. data/examples/linguistic_features/iterating_lefts_and_rights.rb +4 -4
  23. data/examples/linguistic_features/lemmatization.rb +1 -1
  24. data/examples/linguistic_features/named_entity_recognition.rb +1 -1
  25. data/examples/linguistic_features/navigating_parse_tree.rb +12 -12
  26. data/examples/linguistic_features/noun_chunks.rb +1 -1
  27. data/examples/linguistic_features/pos_tagging.rb +1 -1
  28. data/examples/linguistic_features/retokenize_1.rb +1 -1
  29. data/examples/linguistic_features/retokenize_2.rb +2 -2
  30. data/examples/linguistic_features/rule_based_morphology.rb +1 -1
  31. data/examples/linguistic_features/similarity_between_lexemes.rb +18 -0
  32. data/examples/rule_based_matching/creating_spans_from_matches.rb +1 -1
  33. data/lib/ruby-spacy.rb +181 -22
  34. data/lib/ruby-spacy/version.rb +1 -1
  35. metadata +4 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: f2571b8bd9a0e170c5462d73b57ab75d535936f591c6b7954d0ca6b2a9d74aeb
- data.tar.gz: 55bffce60937e8ae1f4135a076185b64f2e989c9431e67914d926ee60b2cc537
+ metadata.gz: bd5a1c905e5aed7553ac5b1927a6b9cdecaf887c505ea3e38f806e886adeb60c
+ data.tar.gz: 6d3f3fd22e9d927d430d2b9e48dcd018da6eb601813192e6ea14e094cf51e331
  SHA512:
- metadata.gz: b3de6a6179eef74f224db186075fe084c93fb2fc802c4831b4b23c069594ac8c6aada546336d16065b9eeef116ba3a686afc6c441e68a66ba0fbddd7ffa321a6
- data.tar.gz: 05b174052487979bf1c81a54469264068dac76d17d17b6f870b7666472625ca596f022abf6f6d734f5b02d4d623b11351c57e1d1d90deb192aaaa36d59e7255c
+ metadata.gz: b5419fb75109b837465c64da1ace956b91d0a0ab589cdb71ace9a308ce1af263edc0e2f206a80ab71a3ab17e86e6520ab432b657c5f60548c696a36049773c60
+ data.tar.gz: 385606212f290b701458bd1a555e553417ed20be2d1e2008107396a9adc224590c76317c52d30d7c97435c0650ef8c1a15a43fe4b92c797188944a302da51612
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
  # Change Log
 
+ ## 0.1.4 - 2021-06-26
+ ### Added
+ - `Spacy::Lexeme` class
+
+ - `Spacy::Token#morpheme` method
  ## 0.1.3 - 2021-06-26
  - Code cleanup
 
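The changelog entries above summarize the theme of this release: spaCy's Python-style trailing-underscore attributes (`lemma_`, `pos_`, and so on) gain plain Ruby accessors, and lexemes get a first-class wrapper class. A minimal before/after sketch of the token API (assuming the `en_core_web_sm` model used throughout the examples is installed):

```ruby
require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Apple is looking at buying a U.K. startup")

doc.each do |token|
  # 0.1.3 style: token.lemma_, token.pos_ (forwarded to Python via method_missing)
  # 0.1.4 style: plain Ruby accessors that call the underscored attributes internally
  puts "#{token.text}\t#{token.lemma}\t#{token.pos}"
end
```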
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
  remote: .
  specs:
- ruby-spacy (0.1.3)
+ ruby-spacy (0.1.4)
  numpy (~> 0.4.0)
  pycall (~> 1.4.0)
  terminal-table (~> 3.0.1)
@@ -24,6 +24,7 @@ GEM
  PLATFORMS
  arm64-darwin-20
  x86_64-darwin-20
+ x86_64-linux
 
  DEPENDENCIES
  github-markup
data/README.md CHANGED
@@ -128,7 +128,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
+ rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -166,7 +166,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
+ rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -212,7 +212,7 @@ doc.each do |token|
  morph = token.morphology.map do |k, v|
  "#{k} = #{v}"
  end.join("\n")
- rows << [token.text, token.shape_, token.is_alpha, token.is_stop, morph]
+ rows << [token.text, token.shape, token.is_alpha, token.is_stop, morph]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -300,7 +300,7 @@ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
  rows = []
 
  doc.ents.each do |ent|
- rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+ rows << [ent.text, ent.start_char, ent.end_char, ent.label]
  end
 
  headings = ["text", "start_char", "end_char", "label"]
@@ -332,7 +332,7 @@ doc = nlp.read(sentence)
  rows = []
 
  doc.ents.each do |ent|
- rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+ rows << [ent.text, ent.start_char, ent.end_char, ent.label]
  end
 
  headings = ["text", "start", "end", "label"]
@@ -393,8 +393,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")
 
- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"
 
  ```
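One note on the last README hunk: a `Spacy::Doc` is not a Ruby `String`, so `"Doc 1: " + doc1` raises a `TypeError`; appending `.text` is the fix. In short:

```ruby
doc1 = nlp.read("I like salty fries and hamburgers.")

# puts "Doc 1: " + doc1       # TypeError: no implicit conversion into String
puts "Doc 1: " + doc1.text    # works: Doc#text returns the plain String
```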
data/examples/get_started/lexeme.rb CHANGED
@@ -8,8 +8,8 @@ headings = ["text", "shape", "prefix", "suffix", "is_alpha", "is_digit"]
  rows = []
 
  doc.each do |word|
- lexeme = doc.vocab[word.text]
- rows << [lexeme.text, lexeme.shape_, lexeme.prefix_, lexeme.suffix_, lexeme.is_alpha, lexeme.is_digit]
+ lexeme = nlp.vocab(word.text)
+ rows << [lexeme.text, lexeme.shape, lexeme.prefix, lexeme.suffix, lexeme.is_alpha, lexeme.is_digit]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/linguistic_annotations.rb CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "pos", "dep"]
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.pos_, token.dep_]
+ rows << [token.text, token.pos, token.dep]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/morphology.rb CHANGED
@@ -12,7 +12,7 @@ doc.each do |token|
  "#{k} = #{v}"
  end.join("\n")
  # end.join("<br />")
- rows << [token.text, token.shape_, token.is_alpha, token.is_stop, morph]
+ rows << [token.text, token.shape, token.is_alpha, token.is_stop, morph]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/named_entities.rb CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "start_char", "end_char", "label"]
  rows = []
 
  doc.ents.each do |ent|
- rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+ rows << [ent.text, ent.start_char, ent.end_char, ent.label]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/get_started/pos_tags_and_dependencies.rb CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
+ rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/ancestors.rb CHANGED
@@ -23,9 +23,7 @@ puts "The root of the sentence is: " + root.text
  puts "The subject of the sentence is: " + subject.text
 
  subject.subtree.each do |descendant|
- # need to convert "ancestors" object from a python generator to a ruby array
- ancestors = Spacy::generator_to_array(descendant.ancestors)
- rows << [descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, ancestors]
+ rows << [descendant.text, descendant.dep, descendant.n_lefts, descendant.n_rights, descendant.ancestors.map(&:text).join(", ")]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -34,11 +32,11 @@ puts table
  # The sentence: 私の父は寿司が好きだ。
  # The root of the sentence is: 好き
  # The subject of the sentence is: 父
- # +------+------------+---------+----------+----------------+
- # | text | dep | n_lefts | n_rights | ancestors |
- # +------+------------+---------+----------+----------------+
- # | 私 | nmod | 0 | 1 | [父, 好き] |
- # | の | case | 0 | 0 | [私, 父, 好き] |
- # | 父 | dislocated | 1 | 1 | [好き] |
- # | は | case | 0 | 0 | [父, 好き] |
- # +------+------------+---------+----------+----------------+
+ # +------+------------+---------+----------+--------------+
+ # | text | dep | n_lefts | n_rights | ancestors |
+ # +------+------------+---------+----------+--------------+
+ # | 私 | nmod | 0 | 1 | 父, 好き |
+ # | の | case | 0 | 0 | 私, 父, 好き |
+ # | 父 | dislocated | 1 | 1 | 好き |
+ # | は | case | 0 | 0 | 父, 好き |
+ # +------+------------+---------+----------+--------------+
data/examples/japanese/entity_annotations_and_labels.rb CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "ent_iob", "ent_iob_", "ent_type_"]
  rows = []
 
  doc.each do |ent|
- rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type_]
+ rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/lemmatization.rb CHANGED
@@ -9,7 +9,7 @@ headings = ["text", "lemma"]
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.lemma_]
+ rows << [token.text, token.lemma]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/named_entity_recognition.rb CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
  rows = []
 
  doc.ents.each do |ent|
- rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+ rows << [ent.text, ent.start_char, ent.end_char, ent.label]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/navigating_parse_tree.rb CHANGED
@@ -9,26 +9,26 @@ headings = ["text", "dep", "head text", "head pos", "children"]
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.dep_, token.head.text, token.head.pos_, token.children.to_s]
+ rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
  puts table
 
- # +------+----------+-----------+----------+--------------------------+
- # | text | dep | head text | head pos | children |
- # +------+----------+-----------+----------+--------------------------+
- # | 自動 | compound | 車 | NOUN | [] |
- # | 運転 | compound | 車 | NOUN | [] |
- # | 車 | nsubj | 転嫁 | VERB | [自動, 運転, は] |
- # | は | case | 車 | NOUN | [] |
- # | 保険 | compound | 責任 | NOUN | [] |
- # | 責任 | obj | 転嫁 | VERB | [保険, を] |
- # | を | case | 責任 | NOUN | [] |
- # | 製造 | compound | 者 | NOUN | [] |
- # | 者 | obl | 転嫁 | VERB | [製造, に] |
- # | に | case | 者 | NOUN | [] |
- # | 転嫁 | ROOT | 転嫁 | VERB | [車, 責任, 者, する, 。] |
- # | する | aux | 転嫁 | VERB | [] |
- # | 。 | punct | 転嫁 | VERB | [] |
- # +------+----------+-----------+----------+--------------------------+
+ # +------+----------+-----------+----------+------------------------+
+ # | text | dep | head text | head pos | children |
+ # +------+----------+-----------+----------+------------------------+
+ # | 自動 | compound | 車 | 92 | |
+ # | 運転 | compound | 車 | 92 | |
+ # | 車 | nsubj | 転嫁 | 100 | 自動, 運転, は |
+ # | は | case | 車 | 92 | |
+ # | 保険 | compound | 責任 | 92 | |
+ # | 責任 | obj | 転嫁 | 100 | 保険, を |
+ # | を | case | 責任 | 92 | |
+ # | 製造 | compound | 者 | 92 | |
+ # | 者 | obl | 転嫁 | 100 | 製造, に |
+ # | に | case | 者 | 92 | |
+ # | 転嫁 | ROOT | 転嫁 | 100 | 車, 責任, 者, する, 。 |
+ # | する | aux | 転嫁 | 100 | |
+ # | 。 | punct | 転嫁 | 100 | |
+ # +------+----------+-----------+----------+------------------------+
data/examples/japanese/noun_chunks.rb CHANGED
@@ -9,7 +9,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
  rows = []
 
  doc.noun_chunks.each do |chunk|
- rows << [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
+ rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/japanese/pos_tagging.rb CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep"]
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_]
+ rows << [token.text, token.lemma, token.pos, token.tag, token.dep]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/ancestors.rb CHANGED
@@ -24,18 +24,21 @@ puts "The subject of the sentence is: " + subject.text
  subject.subtree.each do |descendant|
  # need to convert "ancestors" object from a python generator to a ruby array
  ancestors = Spacy::generator_to_array(descendant.ancestors)
- rows << [descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, ancestors]
+ rows << [descendant.text, descendant.dep, descendant.n_lefts, descendant.n_rights, ancestors.map(&:text).join(", ")]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
  print table
 
- # +----------+----------+---------+----------+------------------------------------+
- # | text | dep | n_lefts | n_rights | ancestors |
- # +----------+----------+---------+----------+------------------------------------+
- # | Credit | nmod | 0 | 2 | [holders, submit] |
- # | and | cc | 0 | 0 | [Credit, holders, submit] |
- # | mortgage | compound | 0 | 0 | [account, Credit, holders, submit] |
- # | account | conj | 1 | 0 | [Credit, holders, submit] |
- # | holders | nsubj | 1 | 0 | [submit] |
- # +----------+----------+---------+----------+------------------------------------+
+ # The sentence: Credit and mortgage account holders must submit their requests
+ # The root of the sentence is: submit
+ # The subject of the sentence is: holders
+ # +----------+----------+---------+----------+----------------------------------+
+ # | text | dep | n_lefts | n_rights | ancestors |
+ # +----------+----------+---------+----------+----------------------------------+
+ # | Credit | nmod | 0 | 2 | holders, submit |
+ # | and | cc | 0 | 0 | Credit, holders, submit |
+ # | mortgage | compound | 0 | 0 | account, Credit, holders, submit |
+ # | account | conj | 1 | 0 | Credit, holders, submit |
+ # | holders | nsubj | 1 | 0 | submit |
+ # +----------+----------+---------+----------+----------------------------------+
data/examples/linguistic_features/entity_annotations_and_labels.rb CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "ent_iob", "ent_iob_", "ent_type_"]
  rows = []
 
  doc.each do |ent|
- rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type_]
+ rows << [ent.text, ent.ent_iob, ent.ent_iob_, ent.ent_type]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/finding_a_verb_with_a_subject.rb CHANGED
@@ -10,11 +10,11 @@ results = []
 
  doc.each do |token|
  if token.dep_ == "nsubj" && token.head.pos_ == "VERB"
- results << token.head
+ results << token.head.text
  end
  end
 
  puts results.to_s
 
- # [shift]
+ # ["shift"]
 
data/examples/linguistic_features/information_extraction.rb CHANGED
@@ -19,11 +19,11 @@ texts.each do |text|
  doc.each do |token|
  if token.ent_type_ == "MONEY"
  if ["attr", "dobj"].index token.dep_
- subj = Spacy.generator_to_array(token.head.lefts).select{|t| t.dep_ == "nsubj"}
+ subj = Spacy.generator_to_array(token.head.lefts).select{|t| t.dep == "nsubj"}
  if !subj.empty?
  puts(subj[0].text + " --> " + token.text)
  end
- elsif token.dep_ == "pobj" and token.head.dep_ == "prep"
+ elsif token.dep_ == "pobj" and token.head.dep == "prep"
  puts token.head.head.text + " --> " + token.text
  end
  end
data/examples/linguistic_features/iterating_children.rb CHANGED
@@ -12,7 +12,7 @@ doc.each do |token|
  if token.pos_ == "VERB"
  token.children.each do |child|
  if child.dep_ == "nsubj"
- results << child.head
+ results << child.head.text
  end
  end
  end
@@ -20,5 +20,5 @@ end
 
  puts results.to_s
 
- # [shift]
+ # ["shift"]
 
data/examples/linguistic_features/iterating_lefts_and_rights.rb CHANGED
@@ -7,14 +7,14 @@ doc = nlp.read("bright red apples on the tree")
 
  puts "Text: " + doc.text
 
- puts "Words to the left of 'apple': " + Spacy.generator_to_array(doc[2].lefts).to_s
- puts "Words to the right of 'apple': " + Spacy.generator_to_array(doc[2].rights).to_s
+ puts "Words to the left of 'apple': " + doc[2].lefts.map(&:text).join(", ")
+ puts "Words to the right of 'apple': " + doc[2].rights.map(&:text).join(", ")
 
  puts "Num of the words to the left of 'apple': " + doc[2].n_lefts.to_s
  puts "Num of the words to the right of 'apple': " + doc[2].n_rights.to_s
 
  # Text: bright red apples on the tree
- # Words to the left of 'apple': [bright, red]
- # Words to the right of 'apple': [on]
+ # Words to the left of 'apple': bright, red
+ # Words to the right of 'apple': on
  # Num of the words to the left of 'apple': 2
  # Num of the words to the right of 'apple': 1
data/examples/linguistic_features/lemmatization.rb CHANGED
@@ -12,7 +12,7 @@ headings = ["lemma"]
  rows = []
 
  doc.each do |token|
- rows << [token.lemma_]
+ rows << [token.lemma]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/named_entity_recognition.rb CHANGED
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
  rows = []
 
  doc.ents.each do |ent|
- rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+ rows << [ent.text, ent.start_char, ent.end_char, ent.label]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/navigating_parse_tree.rb CHANGED
@@ -12,21 +12,21 @@ headings = ["text", "dep", "head text", "head pos", "children"]
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.dep_, token.head.text, token.head.pos_, token.children.to_s]
+ rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
  puts table
 
  # Lemmatizer mode: rule
- # +---------------+----------+-----------+----------+---------------------------+
- # | text | dep | head text | head pos | children |
- # +---------------+----------+-----------+----------+---------------------------+
- # | Autonomous | amod | cars | NOUN | [] |
- # | cars | nsubj | shift | VERB | [Autonomous] |
- # | shift | ROOT | shift | VERB | [cars, liability, toward] |
- # | insurance | compound | liability | NOUN | [] |
- # | liability | dobj | shift | VERB | [insurance] |
- # | toward | prep | shift | VERB | [manufacturers] |
- # | manufacturers | pobj | toward | ADP | [] |
- # +---------------+----------+-----------+----------+---------------------------+
+ # +---------------+----------+-----------+----------+-------------------------+
+ # | text | dep | head text | head pos | children |
+ # +---------------+----------+-----------+----------+-------------------------+
+ # | Autonomous | amod | cars | NOUN | |
+ # | cars | nsubj | shift | VERB | Autonomous |
+ # | shift | ROOT | shift | VERB | cars, liability, toward |
+ # | insurance | compound | liability | NOUN | |
+ # | liability | dobj | shift | VERB | insurance |
+ # | toward | prep | shift | VERB | manufacturers |
+ # | manufacturers | pobj | toward | ADP | |
+ # +---------------+----------+-----------+----------+-------------------------+
data/examples/linguistic_features/noun_chunks.rb CHANGED
@@ -12,7 +12,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
  rows = []
 
  doc.noun_chunks.each do |chunk|
- rows << [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
+ rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/pos_tagging.rb CHANGED
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
+ rows << [token.text, token.lemma, token.pos, token.tag, token.dep, token.shape, token.is_alpha, token.is_stop]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/retokenize_1.rb CHANGED
@@ -12,7 +12,7 @@ rows = []
  doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)
 
  doc.each do |token|
- rows << [token.text, token.pos_, token.dep_, token.head.text]
+ rows << [token.text, token.pos, token.dep, token.head.text]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
data/examples/linguistic_features/retokenize_2.rb CHANGED
@@ -6,11 +6,11 @@ nlp = Spacy::Language.new("en_core_web_sm")
  sentence = "I live in New York"
  doc = nlp.read(sentence)
 
- puts "Before: " + doc.tokens.collect{|t| t}.join(", ")
+ puts "Before: " + doc.tokens.map(&:text).join(", ")
 
  doc.retokenize(3, 4)
 
- puts "After: " + doc.tokens.collect{|t| t}.join(", ")
+ puts "After: " + doc.tokens.map(&:text).join(", ")
 
  # Before: I, live, in, New, York
  # After: I, live, in, New York
data/examples/linguistic_features/rule_based_morphology.rb CHANGED
@@ -6,7 +6,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
  doc = nlp.read("Where are you?")
 
  puts "Morph features of the third word: " + doc[2].morph.to_s
- puts "POS of the third word: " + doc[2].pos_.to_s
+ puts "POS of the third word: " + doc[2].pos
 
  # Morph features of the third word: Case=Nom|Person=2|PronType=Prs
  # POS of the third word: PRON
data/examples/linguistic_features/similarity_between_lexemes.rb ADDED
@@ -0,0 +1,18 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_lg")
+
+ orange = nlp.vocab("orange")
+ lemon = nlp.vocab("lemon")
+
+ book = nlp.vocab("book")
+ magazine = nlp.vocab("magazine")
+
+ puts "orange <=> lemon: #{orange.similarity(lemon)}"
+ puts "book <=> magazine: #{book.similarity(magazine)}"
+ puts "orange <=> book: #{orange.similarity(book)}"
+
+ # orange <=> lemon: 0.7080526351928711
+ # book <=> magazine: 0.4355940818786621
+ # orange <=> book: 0.12197211384773254
data/examples/rule_based_matching/creating_spans_from_matches.rb CHANGED
@@ -10,7 +10,7 @@ matches = matcher.match(doc)
 
  matches.each do |match|
  span = Spacy::Span.new(doc, start_index: match[:start_index], end_index: match[:end_index], options: {label: match[:match_id]})
- puts span.text + " / " + span.label_
+ puts span.text + " / " + span.label
  end
 
  # Barack Obama / US_PRESIDENT
data/lib/ruby-spacy.rb CHANGED
@@ -165,6 +165,9 @@ module Spacy
  # so that ents can be "each"-ed in Ruby
  ent_array = []
  PyCall::List.(@py_doc.ents).each do |ent|
+ ent.define_singleton_method :label do
+ return self.label_
+ end
  ent_array << ent
  end
  ent_array
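With this singleton method attached, each entity yielded by `Doc#ents` answers to `label` as well as the Python-side `label_`, which is what lets the README and example scripts drop the underscore. A quick sketch (sentence borrowed from the README):

```ruby
nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")

doc.ents.each do |ent|
  # `label` is the singleton method defined above; it simply forwards to `label_`
  puts "#{ent.text}: #{ent.label}"
end
```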
@@ -252,10 +255,16 @@ module Spacy
  # @param text [String] A text string representing a lexeme
  # @return [Object] Python `Lexeme` object (https://spacy.io/api/lexeme)
  def get_lexeme(text)
- text = text.gsub("'", "\'")
  @py_nlp.vocab[text]
  end
 
+ # Returns a ruby lexeme object
+ # @param text [String] a text string representing the vocabulary item
+ # @return [Lexeme]
+ def vocab(text)
+ Lexeme.new(@py_nlp.vocab[text])
+ end
+
  # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
  # @param vector [Object] A vector representation of a word (whether existing or non-existing)
  # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
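Unlike `get_lexeme`, which hands back the raw Python object, the new `vocab` method wraps the result in the `Spacy::Lexeme` class introduced later in this file. For example:

```ruby
nlp = Spacy::Language.new("en_core_web_sm")

lexeme = nlp.vocab("orange")  # => Spacy::Lexeme
puts lexeme.text              # "orange"
puts lexeme.shape             # "xxxx"
```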
@@ -386,18 +395,24 @@ module Spacy
  chunk_array = []
  py_chunks = PyCall::List.(@py_span.noun_chunks)
  py_chunks.each do |py_span|
- chunk_array << Spacy::Span.new(@doc, py_span: py_span)
+ chunk_array << Span.new(@doc, py_span: py_span)
  end
  chunk_array
  end
 
+ # Returns the head token
+ # @return [Token]
+ def root
+ Token.new(@py_span.root)
+ end
+
  # Returns an array of spans that represents sentences.
  # @return [Array<Span>]
  def sents
  sentence_array = []
  py_sentences = PyCall::List.(@py_span.sents)
  py_sentences.each do |py_span|
- sentence_array << Spacy::Span.new(@doc, py_span: py_span)
+ sentence_array << Span.new(@doc, py_span: py_span)
  end
  sentence_array
  end
@@ -407,7 +422,7 @@ module Spacy
  def ents
  ent_array = []
  PyCall::List.(@py_span.ents).each do |py_span|
- ent_array << Spacy::Span.new(@doc, py_span: py_span)
+ ent_array << Span.new(@doc, py_span: py_span)
  end
  ent_array
  end
@@ -416,7 +431,7 @@
  # @return [Span]
  def sent
  py_span = @py_span.sent
- return Spacy::Span.new(@doc, py_span: py_span)
+ return Span.new(@doc, py_span: py_span)
  end
 
  # Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
@@ -424,9 +439,9 @@
  def [](range)
  if range.is_a?(Range)
  py_span = @py_span[range]
- return Spacy::Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
+ return Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
  else
- return Spacy::Token.new(@py_span[range])
+ return Token.new(@py_span[range])
  end
  end
 
@@ -440,7 +455,7 @@
  # Creates a document instance from the span
  # @return [Doc]
  def as_doc
- Spacy::Doc.new(@doc.py_nlp, text: self.text)
+ Doc.new(@doc.py_nlp, text: self.text)
  end
 
  # Returns tokens conjugated to the root of the span.
@@ -448,7 +463,7 @@
  def conjuncts
  conjunct_array = []
  PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
- conjunct_array << Spacy::Token.new(py_conjunct)
+ conjunct_array << Token.new(py_conjunct)
  end
  conjunct_array
  end
@@ -458,7 +473,7 @@
  def lefts
  left_array = []
  PyCall::List.(@py_span.lefts).each do |py_left|
- left_array << Spacy::Token.new(py_left)
+ left_array << Token.new(py_left)
  end
  left_array
  end
@@ -468,7 +483,7 @@
  def rights
  right_array = []
  PyCall::List.(@py_span.rights).each do |py_right|
- right_array << Spacy::Token.new(py_right)
+ right_array << Token.new(py_right)
  end
  right_array
  end
@@ -478,11 +493,17 @@
  def subtree
  subtree_array = []
  PyCall::List.(@py_span.subtree).each do |py_subtree|
- subtree_array << Spacy::Token.new(py_subtree)
+ subtree_array << Token.new(py_subtree)
  end
  subtree_array
  end
 
+ # Returns the label
+ # @return [String]
+ def label
+ @py_span.label_
+ end
+
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
  def method_missing(name, *args)
  @py_span.send(name, *args)
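Together with `root` above, the new `label` reader lets span-based code such as `creating_spans_from_matches.rb` stay underscore-free. Reusing the noun-chunk pattern from the examples in this changeset:

```ruby
nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

doc.noun_chunks.each do |chunk|
  # each chunk is a Spacy::Span; `root` now returns a wrapped Spacy::Token
  puts "#{chunk.text} -> #{chunk.root.text}"
end
```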
@@ -506,52 +527,59 @@ module Spacy
  @text = @py_token.text
  end
 
+
+ # Returns the head token
+ # @return [Token]
+ def head
+ Token.new(@py_token.head)
+ end
+
  # Returns the token in question and the tokens that descend from it.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def subtree
  descendant_array = []
  PyCall::List.(@py_token.subtree).each do |descendant|
- descendant_array << descendant
+ descendant_array << Token.new(descendant)
  end
  descendant_array
  end
 
  # Returns the token's ancestors.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def ancestors
  ancestor_array = []
  PyCall::List.(@py_token.ancestors).each do |ancestor|
- ancestor_array << ancestor
+ ancestor_array << Token.new(ancestor)
  end
  ancestor_array
  end
 
  # Returns a sequence of the token's immediate syntactic children.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def children
  child_array = []
  PyCall::List.(@py_token.children).each do |child|
- child_array << child
+ child_array << Token.new(child)
  end
  child_array
  end
 
  # The leftward immediate children of the word in the syntactic dependency parse.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def lefts
  token_array = []
  PyCall::List.(@py_token.lefts).each do |token|
- token_array << token
+ token_array << Token.new(token)
  end
  token_array
  end
 
  # The rightward immediate children of the word in the syntactic dependency parse.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def rights
  token_array = []
  PyCall::List.(@py_token.rights).each do |token|
- token_array << token
+ token_array << Token.new(token)
  end
  token_array
  end
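Since `head`, `subtree`, `ancestors`, `children`, `lefts`, and `rights` now return `Spacy::Token` objects in plain Ruby arrays, the `Spacy.generator_to_array` conversions seen in the old examples become unnecessary, e.g.:

```ruby
nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("bright red apples on the tree")

token = doc[2]                            # the token "apples"
puts token.head.text                      # head is now a Spacy::Token
puts token.lefts.map(&:text).join(", ")   # "bright, red" -- an ordinary Array
```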
@@ -582,12 +610,143 @@ module Spacy
  end
  end
 
+ # Returns the lemma by calling `lemma_` of `@py_token` object
+ # @return [String]
+ def lemma
+ @py_token.lemma_
+ end
+
+ # Returns the lowercase form by calling `lower_` of `@py_token` object
+ # @return [String]
+ def lower
+ @py_token.lower_
+ end
+
+ # Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_token` object
+ # @return [String]
+ def shape
+ @py_token.shape_
+ end
+
+ # Returns the pos by calling `pos_` of `@py_token` object
+ # @return [String]
+ def pos
+ @py_token.pos_
+ end
+
+ # Returns the fine-grained pos by calling `tag_` of `@py_token` object
+ # @return [String]
+ def tag
+ @py_token.tag_
+ end
+
+ # Returns the dependency relation by calling `dep_` of `@py_token` object
+ # @return [String]
+ def dep
+ @py_token.dep_
+ end
+
+ # Returns the language by calling `lang_` of `@py_token` object
+ # @return [String]
+ def lang
+ @py_token.lang_
+ end
+
+ # Returns the trailing space character if present by calling `whitespace_` of `@py_token` object
+ # @return [String]
+ def whitespace
+ @py_token.whitespace_
+ end
+
+ # Returns the named entity type by calling `ent_type_` of `@py_token` object
+ # @return [String]
+ def ent_type
+ @py_token.ent_type_
+ end
+
+ # Returns a lexeme object
+ # @return [Lexeme]
+ def lexeme
+ Lexeme.new(@py_token.lex)
+ end
+
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
  def method_missing(name, *args)
  @py_token.send(name, *args)
  end
  end
 
+ # See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
+ class Lexeme
+
+ # @return [Object] a Python `Lexeme` instance accessible via `PyCall`
+ attr_reader :py_lexeme
+
+ # @return [String] a string representing the lexeme
+ attr_reader :text
+
+ # It is recommended to use {Language#vocab} or {Token#lexeme} methods to create lexemes.
+ # There is no way to generate a lexeme from scratch other than relying on a pre-existing Python {Lexeme} object.
+ # @param py_lexeme [Object] Python `Lexeme` object
+ def initialize(py_lexeme)
+ @py_lexeme = py_lexeme
+ @text = @py_lexeme.text
+ end
+
+ # String representation of the lexeme.
+ # @return [String]
+ def to_s
+ @text
+ end
+
+ # Returns the lowercase form by calling `lower_` of `@py_lexeme` object
+ # @return [String]
+ def lower
+ @py_lexeme.lower_
+ end
+
+ # Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_lexeme` object
+ # @return [String]
+ def shape
+ @py_lexeme.shape_
+ end
+
+ # Returns the language by calling `lang_` of `@py_lexeme` object
+ # @return [String]
+ def lang
+ @py_lexeme.lang_
+ end
+
+ # Returns the length-N substring from the start of the word by calling `prefix_` of `@py_lexeme` object
+ # @return [String]
+ def prefix
+ @py_lexeme.prefix_
+ end
+
+ # Returns the length-N substring from the end of the word by calling `suffix_` of `@py_lexeme` object
+ # @return [String]
+ def suffix
+ @py_lexeme.suffix_
+ end
+
+ # Returns the lexeme's norm, i.e. a normalized form of the lexeme, by calling `norm_` of `@py_lexeme` object
+ # @return [String]
+ def norm
+ @py_lexeme.norm_
+ end
+
+ # Returns a semantic similarity estimate.
+ # @param other [Lexeme] the other lexeme to which a similarity estimation is made
+ # @return [Float]
+ def similarity(other)
+ @py_lexeme.similarity(other.py_lexeme)
+ end
+
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+ def method_missing(name, *args)
+ @py_lexeme.send(name, *args)
+ end
+ end
 
  end
 
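A short sketch tying the new `Lexeme` pieces together; note that a meaningful `similarity` score requires a model shipping word vectors, such as the `en_core_web_lg` model used in the new example file:

```ruby
nlp = Spacy::Language.new("en_core_web_lg")

orange = nlp.vocab("orange")
lemon  = nlp.vocab("lemon")

puts orange.prefix              # leading substring, "o" by default
puts orange.suffix              # trailing substring, "nge" by default
puts orange.similarity(lemon)   # vector similarity, roughly 0.708 per the example
```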
data/lib/ruby-spacy/version.rb CHANGED
@@ -2,5 +2,5 @@
 
  module Spacy
  # The version number of the module
- VERSION = "0.1.3"
+ VERSION = "0.1.4"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: ruby-spacy
  version: !ruby/object:Gem::Version
- version: 0.1.3
+ version: 0.1.4
  platform: ruby
  authors:
  - Yoichiro Hasebe
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2021-06-28 00:00:00.000000000 Z
+ date: 2021-07-03 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: pycall
@@ -123,6 +123,7 @@ files:
  - examples/linguistic_features/rule_based_morphology.rb
  - examples/linguistic_features/sentence_segmentation.rb
  - examples/linguistic_features/similarity.rb
+ - examples/linguistic_features/similarity_between_lexemes.rb
  - examples/linguistic_features/similarity_between_spans.rb
  - examples/linguistic_features/tokenization.rb
  - examples/rule_based_matching/creating_spans_from_matches.rb
@@ -149,7 +150,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.2.3
+ rubygems_version: 3.2.11
  signing_key:
  specification_version: 4
  summary: A wrapper module for using spaCy natural language processing library from