ruby-spacy 0.1.0

Files changed (68)
  1. checksums.yaml +7 -0
  2. data/.gitignore +58 -0
  3. data/.yardopts +2 -0
  4. data/Gemfile +18 -0
  5. data/Gemfile.lock +39 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +498 -0
  8. data/Rakefile +12 -0
  9. data/bin/console +15 -0
  10. data/bin/setup +8 -0
  11. data/examples/get_started/lexeme.rb +24 -0
  12. data/examples/get_started/linguistic_annotations.rb +32 -0
  13. data/examples/get_started/most_similar.rb +46 -0
  14. data/examples/get_started/named_entities.rb +24 -0
  15. data/examples/get_started/outputs/test_dep.svg +84 -0
  16. data/examples/get_started/outputs/test_dep_compact.svg +84 -0
  17. data/examples/get_started/outputs/test_ent.html +11 -0
  18. data/examples/get_started/pos_tags_and_dependencies.rb +31 -0
  19. data/examples/get_started/similarity.rb +13 -0
  20. data/examples/get_started/tokenization.rb +22 -0
  21. data/examples/get_started/visualizing_dependencies.rb +14 -0
  22. data/examples/get_started/visualizing_dependencies_compact.rb +12 -0
  23. data/examples/get_started/visualizing_named_entities.rb +12 -0
  24. data/examples/get_started/vocab.rb +10 -0
  25. data/examples/get_started/word_vectors.rb +24 -0
  26. data/examples/japanese/ancestors.rb +44 -0
  27. data/examples/japanese/entity_annotations_and_labels.rb +45 -0
  28. data/examples/japanese/information_extraction.rb +27 -0
  29. data/examples/japanese/lemmatization.rb +32 -0
  30. data/examples/japanese/most_similar.rb +46 -0
  31. data/examples/japanese/named_entity_recognition.rb +27 -0
  32. data/examples/japanese/navigating_parse_tree.rb +34 -0
  33. data/examples/japanese/noun_chunks.rb +23 -0
  34. data/examples/japanese/outputs/test_dep.svg +149 -0
  35. data/examples/japanese/outputs/test_ent.html +16 -0
  36. data/examples/japanese/pos_tagging.rb +34 -0
  37. data/examples/japanese/sentence_segmentation.rb +16 -0
  38. data/examples/japanese/similarity.rb +12 -0
  39. data/examples/japanese/tokenization.rb +38 -0
  40. data/examples/japanese/visualizing_dependencies.rb +13 -0
  41. data/examples/japanese/visualizing_named_entities.rb +14 -0
  42. data/examples/linguistic_features/ancestors.rb +41 -0
  43. data/examples/linguistic_features/entity_annotations_and_labels.rb +29 -0
  44. data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +20 -0
  45. data/examples/linguistic_features/information_extraction.rb +36 -0
  46. data/examples/linguistic_features/iterating_children.rb +24 -0
  47. data/examples/linguistic_features/iterating_lefts_and_rights.rb +20 -0
  48. data/examples/linguistic_features/lemmatization.rb +31 -0
  49. data/examples/linguistic_features/morphology.rb +17 -0
  50. data/examples/linguistic_features/named_entity_recognition.rb +25 -0
  51. data/examples/linguistic_features/navigating_parse_tree.rb +32 -0
  52. data/examples/linguistic_features/noun_chunks.rb +27 -0
  53. data/examples/linguistic_features/outputs/test_ent.html +11 -0
  54. data/examples/linguistic_features/pos_tagging.rb +31 -0
  55. data/examples/linguistic_features/retokenize_1.rb +29 -0
  56. data/examples/linguistic_features/retokenize_2.rb +16 -0
  57. data/examples/linguistic_features/rule_based_morphology.rb +12 -0
  58. data/examples/linguistic_features/sentence_segmentation.rb +16 -0
  59. data/examples/linguistic_features/similarity.rb +14 -0
  60. data/examples/linguistic_features/similarity_between_spans.rb +23 -0
  61. data/examples/linguistic_features/special_case_tokenization_rules.rb +19 -0
  62. data/examples/linguistic_features/tokenization.rb +23 -0
  63. data/examples/rule_based_matching/creating_spans_from_matches.rb +16 -0
  64. data/examples/rule_based_matching/matcher.rb +19 -0
  65. data/lib/ruby-spacy.rb +567 -0
  66. data/lib/ruby-spacy/version.rb +6 -0
  67. data/ruby-spacy.gemspec +42 -0
  68. metadata +157 -0
@@ -0,0 +1,27 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_sm")
+
+ lemmatizer = nlp.get_pipe("lemmatizer")
+ puts "Lemmatizer mode: " + lemmatizer.mode
+
+ doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")
+
+ headings = ["text", "root.text", "root.dep", "root.head.text"]
+ rows = []
+
+ doc.noun_chunks.each do |chunk|
+   rows << [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
+ end
+
+ table = Terminal::Table.new rows: rows, headings: headings
+ puts table
+
+ # +---------------------+---------------+----------+----------------+
+ # | text                | root.text     | root.dep | root.head.text |
+ # +---------------------+---------------+----------+----------------+
+ # | Autonomous cars     | cars          | nsubj    | shift          |
+ # | insurance liability | liability     | dobj     | shift          |
+ # | manufacturers       | manufacturers | pobj     | toward         |
+ # +---------------------+---------------+----------+----------------+
@@ -0,0 +1,11 @@
+ <div class="entities" style="line-height: 2.5; direction: ltr">When
+ <mark class="entity" style="background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
+     Sebastian Thrun
+     <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">PERSON</span>
+ </mark>
+ started working on self-driving cars at Google in
+ <mark class="entity" style="background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
+     2007
+     <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">DATE</span>
+ </mark>
+ , few people outside of the company took him seriously.</div>
@@ -0,0 +1,31 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_sm")
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
+
+ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
+ rows = []
+
+ doc.each do |token|
+   rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
+ end
+
+ table = Terminal::Table.new rows: rows, headings: headings
+ puts table
+
+ # +---------+---------+-------+-----+----------+-------+----------+---------+
+ # | text    | lemma   | pos   | tag | dep      | shape | is_alpha | is_stop |
+ # +---------+---------+-------+-----+----------+-------+----------+---------+
+ # | Apple   | Apple   | PROPN | NNP | nsubj    | Xxxxx | true     | false   |
+ # | is      | be      | AUX   | VBZ | aux      | xx    | true     | true    |
+ # | looking | look    | VERB  | VBG | ROOT     | xxxx  | true     | false   |
+ # | at      | at      | ADP   | IN  | prep     | xx    | true     | true    |
+ # | buying  | buy     | VERB  | VBG | pcomp    | xxxx  | true     | false   |
+ # | U.K.    | U.K.    | PROPN | NNP | dobj     | X.X.  | false    | false   |
+ # | startup | startup | NOUN  | NN  | advcl    | xxxx  | true     | false   |
+ # | for     | for     | ADP   | IN  | prep     | xxx   | true     | true    |
+ # | $       | $       | SYM   | $   | quantmod | $     | false    | false   |
+ # | 1       | 1       | NUM   | CD  | compound | d     | false    | false   |
+ # | billion | billion | NUM   | CD  | pobj     | xxxx  | true     | false   |
+ # +---------+---------+-------+-----+----------+-------+----------+---------+
@@ -0,0 +1,29 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_sm")
+
+ sentence = "Credit and mortgage account holders must submit their requests"
+ doc = nlp.read(sentence)
+
+ headings = ["text", "pos", "dep", "head text"]
+ rows = []
+
+ doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)
+
+ doc.each do |token|
+   rows << [token.text, token.pos_, token.dep_, token.head.text]
+ end
+
+ table = Terminal::Table.new rows: rows, headings: headings
+ puts table
+
+ # +-------------------------------------+------+-------+-----------+
+ # | text                                | pos  | dep   | head text |
+ # +-------------------------------------+------+-------+-----------+
+ # | Credit and mortgage account holders | NOUN | nsubj | submit    |
+ # | must                                | AUX  | aux   | submit    |
+ # | submit                              | VERB | ROOT  | submit    |
+ # | their                               | PRON | poss  | requests  |
+ # | requests                            | NOUN | dobj  | submit    |
+ # +-------------------------------------+------+-------+-----------+
@@ -0,0 +1,16 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_sm")
+
+ sentence = "I live in New York"
+ doc = nlp.read(sentence)
+
+ puts "Before: " + doc.tokens.join(", ")
+
+ doc.retokenize(3, 4)
+
+ puts "After: " + doc.tokens.join(", ")
+
+ # Before: I, live, in, New, York
+ # After: I, live, in, New York
@@ -0,0 +1,12 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_sm")
+
+ doc = nlp.read("Where are you?")
+
+ puts "Morph features of the third word: " + doc[2].morph.to_s
+ puts "POS of the third word: " + doc[2].pos_.to_s
+
+ # Morph features of the third word: Case=Nom|Person=2|PronType=Prs
+ # POS of the third word: PRON
@@ -0,0 +1,16 @@
+ require "ruby-spacy"
+
+ nlp = Spacy::Language.new("en_core_web_sm")
+
+ doc = nlp.read("This is a sentence. This is another sentence.")
+
+
+ puts "doc has annotation SENT_START: " + doc.has_annotation("SENT_START").to_s
+
+ doc.sents.each do |sent|
+   puts sent.text
+ end
+
+ # doc has annotation SENT_START: true
+ # This is a sentence.
+ # This is another sentence.
@@ -0,0 +1,14 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_lg")
+ doc1 = nlp.read("I like salty fries and hamburgers.")
+ doc2 = nlp.read("Fast food tastes very good.")
+
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
+ puts "Similarity: #{doc1.similarity(doc2)}"
+
+ # Doc 1: I like salty fries and hamburgers.
+ # Doc 2: Fast food tastes very good.
+ # Similarity: 0.7687607012190486
@@ -0,0 +1,23 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_lg")
+ doc1 = nlp.read("I like salty fries and hamburgers.")
+ doc2 = nlp.read("Fast food tastes very good.")
+
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
+ puts "Similarity: #{doc1.similarity(doc2)}"
+
+ span1 = doc1.span(2, 2)   # salty fries
+ span2 = doc1.span(5 .. 5) # hamburgers
+ puts "Span 1: " + span1.text
+ puts "Span 2: " + span2.text
+ puts "Similarity: #{span1.similarity(span2)}"
+
+ # Doc 1: I like salty fries and hamburgers.
+ # Doc 2: Fast food tastes very good.
+ # Similarity: 0.7687607012190486
+ # Span 1: salty fries
+ # Span 2: hamburgers
+ # Similarity: 0.6949787735939026
@@ -0,0 +1,19 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_sm")
+
+ doc = nlp.read("gimme that")
+
+ puts doc.tokens.join(" ")
+
+ # Add special case rule
+ special_case = [{ORTH: "gim"}, {ORTH: "me"}]
+ tokenizer = nlp.tokenizer
+ tokenizer.add_special_case("gimme", special_case)
+
+ # Check new tokenization
+ puts nlp.read("gimme that").tokens.join(" ")
+
+ # gimme that
+ # gim me that
@@ -0,0 +1,23 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_sm")
+
+ doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")
+
+ headings = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+ row = []
+
+ doc.each do |token|
+   row << token.text
+ end
+
+ table = Terminal::Table.new rows: [row], headings: headings
+ puts table
+
+ # +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
+ # | 1     | 2  | 3       | 4  | 5      | 6    | 7       | 8   | 9 | 10 | 11      |
+ # +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
+ # | Apple | is | looking | at | buying | U.K. | startup | for | $ | 1  | billion |
+ # +-------+----+---------+----+--------+------+---------+-----+---+----+---------+
+
@@ -0,0 +1,16 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_lg")
+ matcher = nlp.matcher
+ matcher.add("US_PRESIDENT", [[{LOWER: "barack"}, {LOWER: "obama"}]])
+ doc = nlp.read("Barack Obama was the 44th president of the United States")
+
+ matches = matcher.match(doc)
+
+ matches.each do |match|
+   span = Spacy::Span.new(doc, start_index: match[:start_index], end_index: match[:end_index], options: {label: match[:match_id]})
+   puts span.text + " / " + span.label_
+ end
+
+ # Barack Obama / US_PRESIDENT
@@ -0,0 +1,19 @@
+ require "ruby-spacy"
+
+ nlp = Spacy::Language.new("en_core_web_sm")
+
+ pattern = [[{LOWER: "hello"}, {IS_PUNCT: true}, {LOWER: "world"}]]
+
+ matcher = nlp.matcher
+ matcher.add("HelloWorld", pattern)
+
+ doc = nlp.read("Hello, world! Hello world!")
+ matches = matcher.match(doc)
+
+ matches.each do |match|
+   string_id = nlp.vocab_string_lookup(match[:match_id])
+   span = doc.span(match[:start_index]..match[:end_index])
+   puts "#{string_id}, #{span.text}"
+ end
+
+ # HelloWorld, Hello, world
data/lib/ruby-spacy.rb ADDED
@@ -0,0 +1,567 @@
+ # frozen_string_literal: true
+
+ require_relative "ruby-spacy/version"
+ require 'enumerator'
+ require 'strscan'
+ require 'pycall/import'
+ require 'numpy'
+ include PyCall::Import
+
+ # This module covers the areas of spaCy functionality concerned with _using_ its many varieties of language models, not with _building_ them.
+ module Spacy
+   # A utility module method to convert a Python generator object to a Ruby array,
+   # mainly used on the objects returned from dependency-related methods
+   # such as {Span#rights}, {Span#lefts}, and {Span#subtree}.
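+   # @example Converting a Python generator to a Ruby array (a sketch; assumes the `en_core_web_sm` model is installed)
+   #   nlp = Spacy::Language.new("en_core_web_sm")
+   #   doc = nlp.read("Autonomous cars shift insurance liability")
+   #   chunks = Spacy.generator_to_array(doc.py_doc.noun_chunks)
+   #   puts chunks.map(&:text).join(", ")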
+   def self.generator_to_array(py_generator)
+     PyCall::List.(py_generator)
+   end
+
+   # See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
+   class Span
+
+     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
+     attr_reader :spacy_span_id
+
+     # @return [Object] a Python `Span` instance accessible via `PyCall`
+     attr_reader :py_span
+
+     # @return [Doc] the document to which the span belongs
+     attr_reader :doc
+
+     include Enumerable
+
+     alias_method :length, :count
+     alias_method :len, :count
+     alias_method :size, :count
+
+     # It is recommended to use the {Doc#span} method to create a span. If you need to
+     # create one using {Span#initialize}, use either of these two signatures: `Span.new(doc, py_span: py_span)` or `Span.new(doc, start_index: i, end_index: j, options: opts)`.
+     # @param doc [Doc] the document to which this span belongs
+     # @param start_index [Integer] the index of the item starting the span inside a doc
+     # @param end_index [Integer] the index of the item ending the span inside a doc
+     # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
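+     # @example Creating a span with {Doc#span} (a sketch; assumes the `en_core_web_sm` model is installed)
+     #   nlp = Spacy::Language.new("en_core_web_sm")
+     #   doc = nlp.read("I like salty fries and hamburgers.")
+     #   span = doc.span(2, 2)
+     #   puts span.text  # => "salty fries"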
+     def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
+       @doc = doc
+       @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
+       if py_span
+         @py_span = py_span
+       else
+         options = PyCall::Dict.(options)
+         PyCall.exec("#{@spacy_span_id}_opts = #{options}")
+         PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
+         @py_span = PyCall.eval(@spacy_span_id)
+       end
+     end
+
+     # Returns an array of tokens contained in the span.
+     # @return [Array<Token>]
+     def tokens
+       results = []
+       PyCall::List.(@py_span).each do |py_token|
+         results << Token.new(py_token)
+       end
+       results
+     end
+
+     # Iterates over the elements in the span, yielding a token instance each time.
+     def each
+       PyCall::List.(@py_span).each do |py_token|
+         yield Token.new(py_token)
+       end
+     end
+
+     # Returns an array of spans of noun chunks.
+     # @return [Array<Span>]
+     def noun_chunks
+       chunk_array = []
+       py_chunks = PyCall::List.(@py_span.noun_chunks)
+       py_chunks.each do |py_span|
+         chunk_array << Spacy::Span.new(@doc, py_span: py_span)
+       end
+       chunk_array
+     end
+
+     # Returns an array of spans that represent sentences.
+     # @return [Array<Span>]
+     def sents
+       sentence_array = []
+       py_sentences = PyCall::List.(@py_span.sents)
+       py_sentences.each do |py_span|
+         sentence_array << Spacy::Span.new(@doc, py_span: py_span)
+       end
+       sentence_array
+     end
+
+     # Returns an array of spans that represent named entities.
+     # @return [Array<Span>]
+     def ents
+       ent_array = []
+       PyCall::List.(@py_span.ents).each do |py_span|
+         ent_array << Spacy::Span.new(@doc, py_span: py_span)
+       end
+       ent_array
+     end
+
+     # Returns a span that represents the sentence that the given span is part of.
+     # @return [Span]
+     def sent
+       py_span = @py_span.sent
+       Spacy::Span.new(@doc, py_span: py_span)
+     end
+
+     # Returns a span if a range object is given, or a token if an integer representing a position in the span is given.
+     # @param range [Range] an ordinary Ruby range object such as `0..3`, `1...4`, or `3 .. -1`
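+     # @example Indexing into a span (a sketch; assumes `span` is a {Span} such as the one created above)
+     #   puts span[0].text       # the first token of the span
+     #   puts span[0 .. 1].text  # a sub-span covering the first two tokens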
115
+ def [](range)
116
+ if range.is_a?(Range)
117
+ py_span = @py_span[range]
118
+ return Spacy::Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
119
+ else
120
+ return Spacy::Token.new(@py_span[range])
121
+ end
122
+ end
123
+
124
+ # Returns a semantic similarity estimate.
125
+ # @param other [Span] the other span to which a similarity estimation is conducted
126
+ # @return [Float]
127
+ def similarity(other)
128
+ PyCall.eval("#{@spacy_span_id}.similarity(#{other.spacy_span_id})")
129
+ end
130
+
131
+ # Creates a document instance
132
+ # @return [Doc]
133
+ def as_doc
134
+ Spacy::Doc.new(@doc.spacy_nlp_id, self.text)
135
+ end
136
+
137
+ # Returns Tokens conjugated to the root of the span.
138
+ # @return [Array<Token>] an array of tokens
139
+ def conjuncts
140
+ conjunct_array = []
141
+ PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
142
+ conjunct_array << Spacy::Token.new(py_conjunct)
143
+ end
144
+ conjunct_array
145
+ end
146
+
147
+ # Returns Tokens that are to the left of the span, whose heads are within the span.
148
+ # @return [Array<Token>] an array of tokens
149
+ def lefts
150
+ left_array = []
151
+ PyCall::List.(@py_span.lefts).each do |py_left|
152
+ left_array << Spacy::Token.new(py_left)
153
+ end
154
+ left_array
155
+ end
156
+
157
+ # Returns Tokens that are to the right of the span, whose heads are within the span.
158
+ # @return [Array<Token>] an array of Tokens
159
+ def rights
160
+ right_array = []
161
+ PyCall::List.(@py_span.rights).each do |py_right|
162
+ right_array << Spacy::Token.new(py_right)
163
+ end
164
+ right_array
165
+ end
166
+
167
+ # Returns Tokens that are within the span and tokens that descend from them.
168
+ # @return [Array<Token>] an array of tokens
169
+ def subtree
170
+ subtree_array = []
171
+ PyCall::List.(@py_span.subtree).each do |py_subtree|
172
+ subtree_array << Spacy::Token.new(py_subtree)
173
+ end
174
+ subtree_array
175
+ end
176
+
177
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
178
+ def method_missing(name, *args)
179
+ @py_span.send(name, *args)
180
+ end
181
+ end
182
+
183
+ # See also spaCy Python API document for [`Token`](https://spacy.io/api/token).
184
+ class Token
185
+
186
+ # @return [Object] a Python `Token` instance accessible via `PyCall`
187
+ attr_reader :py_token
188
+
189
+ # @return [String] a string representing the token
190
+ attr_reader :text
191
+
192
+ # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens. There is no way to generate a token from scratch but relying on a pre-exising Python {Token} object.
193
+ # @param py_token [Object] Python `Token` object
194
+ def initialize(py_token)
195
+ @py_token = py_token
196
+ @text = @py_token.text
197
+ end
198
+
199
+ # Returns the token in question and the tokens that descend from it.
200
+ # @return [Array<Object>] an (Ruby) array of Python `Token` objects
201
+ def subtree
202
+ descendant_array = []
203
+ PyCall::List.(@py_token.subtree).each do |descendant|
204
+ descendant_array << descendant
205
+ end
206
+ descendant_array
207
+ end
208
+
209
+ # Returns the token's ancestors.
210
+ # @return [Array<Object>] an (Ruby) array of Python `Token` objects
211
+ def ancestors
212
+ ancestor_array = []
213
+ PyCall::List.(@py_token.ancestors).each do |ancestor|
214
+ ancestor_array << ancestor
215
+ end
216
+ ancestor_array
217
+ end
218
+
219
+ # Returns a sequence of the token's immediate syntactic children.
220
+ # @return [Array<Object>] an (Ruby) array of Python `Token` objects
221
+ def children
222
+ child_array = []
223
+ PyCall::List.(@py_token.children).each do |child|
224
+ child_array << child
225
+ end
226
+ child_array
227
+ end
228
+
229
+ # The leftward immediate children of the word in the syntactic dependency parse.
230
+ # @return [Array<Object>] an (Ruby) array of Python `Token` objects
231
+ def lefts
232
+ token_array = []
233
+ PyCall::List.(@py_token.lefts).each do |token|
234
+ token_array << token
235
+ end
236
+ token_array
237
+ end
238
+
239
+ # The rightward immediate children of the word in the syntactic dependency parse.
240
+ # @return [Array<Object>] an (Ruby) array of Python `Token` objects
241
+ def rights
242
+ token_array = []
243
+ PyCall::List.(@py_token.rights).each do |token|
244
+ token_array << token
245
+ end
246
+ token_array
247
+ end
248
+
249
+ # String representation of the token.
250
+ # @return [String]
251
+ def to_s
252
+ @text
253
+ end
254
+
255
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
256
+ def method_missing(name, *args)
257
+ @py_token.send(name, *args)
258
+ end
259
+ end
260
+
261
+ # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
262
+ class Doc
263
+
264
+ # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
265
+ attr_reader :spacy_nlp_id
266
+
267
+ # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
268
+ attr_reader :spacy_doc_id
269
+
270
+ # @return [Object] a Python `Doc` instance accessible via `PyCall`
271
+ attr_reader :py_doc
272
+
273
+ # @return [String] a text string of the document
274
+ attr_reader :text
275
+
276
+ include Enumerable
277
+
278
+ alias_method :length, :count
279
+ alias_method :len, :count
280
+ alias_method :size, :count
281
+
282
+ # Creates a new instance of {Doc}.
283
+ # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
284
+ # @param text [String] The text string to be analyzed
285
+ def initialize(nlp_id, text)
286
+ @text = text
287
+ @spacy_nlp_id = nlp_id
288
+ @spacy_doc_id = "doc_#{text.object_id}"
289
+ quoted = text.gsub('"', '\"')
290
+ PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
291
+ PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
292
+ @py_doc = PyCall.eval(@spacy_doc_id)
293
+ end
294
+
295
+
296
+ # Retokenizes the text merging a span into a single token.
297
+ # @param start_index [Integer] The start position of the span to be retokenized in the document
298
+ # @param end_index [Integer] The end position of the span to be retokenized in the document
299
+ # @param attributes [Hash] Attributes to set on the merged token
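+     # @example Merging "New York" into a single token (a sketch following the retokenization example scripts above)
+     #   doc = nlp.read("I live in New York")
+     #   doc.retokenize(3, 4)
+     #   puts doc.tokens.join(", ")  # => I, live, in, New York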
+     def retokenize(start_index, end_index, attributes = {})
+       py_attrs = PyCall::Dict.(attributes)
+       PyCall.exec(<<PY)
+ with #{@spacy_doc_id}.retokenize() as retokenizer:
+     retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
+ PY
+       @py_doc = PyCall.eval(@spacy_doc_id)
+     end
+
+     # Retokenizes the text, splitting the specified token.
+     # @param pos_in_doc [Integer] The position of the token to be split in the document
+     # @param split_array [Array<String>] text strings of the split results
+     # @param head_pos_in_split [Integer] The position of the head element within the split elements
+     # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
+     # @param attributes [Hash] The attributes of the split elements
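+     # @example Splitting "NewYork" into two tokens (a hedged sketch modeled on spaCy's retokenizer.split example; indices are illustrative)
+     #   doc = nlp.read("I live in NewYork")
+     #   doc.retokenize_split(3, ["New", "York"], 1, 2)
+     #   puts doc.tokens.join(", ")  # => I, live, in, New, York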
314
+ def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
315
+ py_attrs = PyCall::Dict.(attributes)
316
+ py_split_array = PyCall::List.(split_array)
317
+ PyCall.exec(<<PY)
318
+ with #{@spacy_doc_id}.retokenize() as retokenizer:
319
+ heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
320
+ attrs = #{py_attrs}
321
+ split_array = #{py_split_array}
322
+ retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
323
+ PY
324
+ @py_doc = PyCall.eval(@spacy_doc_id)
325
+ end
326
+
327
+ # String representation of the token.
328
+ # @return [String]
329
+ def to_s
330
+ @text
331
+ end
332
+
333
+ # Returns an array of tokens contained in the doc.
334
+ # @return [Array<Token>]
335
+ def tokens
336
+ results = []
337
+ PyCall::List.(@py_doc).each do |py_token|
338
+ results << Token.new(py_token)
339
+ end
340
+ results
341
+ end
342
+
343
+ # Iterates over the elements in the doc yielding a token instance.
344
+ def each
345
+ PyCall::List.(@py_doc).each do |py_token|
346
+ yield Token.new(py_token)
347
+ end
348
+ end
349
+
350
+ # Returns a span of the specified range within the doc.
351
+ # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
352
+ # @param range_or_start [Range, Integer] A range object, or, alternatively, an integer that represents the start position of the span
353
+ # @param optional_size [Integer] An integer representing the size of the span
354
+ # @return [Span]
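+     # @example The two calling conventions (a sketch; assumes `doc` reads "I like salty fries and hamburgers.")
+     #   span1 = doc.span(2, 2)    # start position and size => "salty fries"
+     #   span2 = doc.span(2 .. 3)  # range of token positions => "salty fries"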
+     def span(range_or_start, optional_size = nil)
+       if optional_size
+         start_index = range_or_start
+         temp = tokens[start_index ... start_index + optional_size]
+       else
+         start_index = range_or_start.first
+         range = range_or_start
+         temp = tokens[range]
+       end
+
+       end_index = start_index + temp.size - 1
+
+       Span.new(self, start_index: start_index, end_index: end_index)
+     end
+
+     # Returns an array of spans representing noun chunks.
+     # @return [Array<Span>]
+     def noun_chunks
+       chunk_array = []
+       py_chunks = PyCall::List.(@py_doc.noun_chunks)
+       py_chunks.each do |py_chunk|
+         chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
+       end
+       chunk_array
+     end
+
+     # Returns an array of spans representing sentences.
+     # @return [Array<Span>]
+     def sents
+       sentence_array = []
+       py_sentences = PyCall::List.(@py_doc.sents)
+       py_sentences.each do |py_sent|
+         sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
+       end
+       sentence_array
+     end
+
+     # Returns an array of spans representing named entities.
+     # @return [Array<Span>]
+     def ents
+       # so that ents can be "each"-ed in Ruby
+       ent_array = []
+       PyCall::List.(@py_doc.ents).each do |ent|
+         ent_array << ent
+       end
+       ent_array
+     end
+
+     # Returns a span if given a range object; returns a token if given an integer representing a position in the doc.
+     # @param range [Range] an ordinary Ruby range object such as `0..3`, `1...4`, or `3 .. -1`
+     def [](range)
+       if range.is_a?(Range)
+         py_span = @py_doc[range]
+         return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
+       else
+         return Token.new(@py_doc[range])
+       end
+     end
+
+     # Returns a semantic similarity estimate.
+     # @param other [Doc] the other doc to which a similarity estimation is made
+     # @return [Float]
+     def similarity(other)
+       PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
+     end
+
+     # Visualizes the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
+     # @param style [String] Either `dep` or `ent`
+     # @param compact [Boolean] Only relevant to the `dep` style
+     # @return [String] in the case of `dep`, the output is SVG text; in the case of `ent`, the output is HTML text
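+     # @example Saving a dependency visualization as an SVG file (a sketch based on the visualization example scripts above)
+     #   doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")
+     #   svg = doc.displacy(style: "dep", compact: true)
+     #   File.open("test_dep.svg", "w") { |f| f.write(svg) }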
+     def displacy(style: "dep", compact: false)
+       PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
+     end
+
+     # Methods defined in Python but not wrapped in ruby-spacy can be called via this dynamic method handling mechanism.
+     def method_missing(name, *args)
+       @py_doc.send(name, *args)
+     end
+   end
+
+   # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
+   class Matcher
+
+     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
+     attr_reader :spacy_matcher_id
+
+     # @return [Object] a Python `Matcher` instance accessible via `PyCall`
+     attr_reader :py_matcher
+
+     # Creates a {Matcher} instance.
+     # @param nlp_id [String] The id string of the `nlp`, an instance of the {Language} class
+     def initialize(nlp_id)
+       @spacy_matcher_id = "doc_#{nlp_id}_matcher"
+       PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
+       @py_matcher = PyCall.eval(@spacy_matcher_id)
+     end
+
+     # Adds a label string and a text pattern.
+     # @param text [String] a label string given to the pattern
+     # @param pattern [Array<Array<Hash>>] alternative sequences of token patterns
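+     # @example Registering a pattern (a sketch; the hash keys follow spaCy's rule-based matching attributes)
+     #   matcher = nlp.matcher
+     #   matcher.add("HelloWorld", [[{LOWER: "hello"}, {IS_PUNCT: true}, {LOWER: "world"}]])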
+     def add(text, pattern)
+       @py_matcher.add(text, pattern)
+     end
+
+     # Executes the match.
+     # @param doc [Doc] A {Doc} instance
+     # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] The id of the matched pattern, the starting position, and the end position
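+     # @example Iterating over matches (a sketch mirroring the matcher example script above)
+     #   doc = nlp.read("Hello, world! Hello world!")
+     #   matcher.match(doc).each do |match|
+     #     puts doc.span(match[:start_index] .. match[:end_index]).text
+     #   end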
+     def match(doc)
+       str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
+       s = StringScanner.new(str_results[1..-2])
+       results = []
+       while s.scan_until(/(\d+), (\d+), (\d+)/)
+         next unless s.matched
+         triple = s.matched.split(", ")
+         match_id = triple[0].to_i
+         start_index = triple[1].to_i
+         end_index = triple[2].to_i - 1
+         results << {match_id: match_id, start_index: start_index, end_index: end_index}
+       end
+       results
+     end
+   end
+
+   # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
+   class Language
+
+     # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
+     attr_reader :spacy_nlp_id
+
+     # @return [Object] a Python `Language` instance accessible via `PyCall`
+     attr_reader :py_nlp
+
+     # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
+     # @param model [String] A language model installed in the system
+     def initialize(model = "en_core_web_sm")
+       @spacy_nlp_id = "nlp_#{model.object_id}"
+       PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
+       PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
+       @py_nlp = PyCall.eval(@spacy_nlp_id)
+     end
+
+     # Reads and analyzes the given text.
+     # @param text [String] A text to be read and analyzed
+     def read(text)
+       Doc.new(@spacy_nlp_id, text)
+     end
+
+     # Generates a matcher for the current language model.
+     # @return [Matcher]
+     def matcher
+       Matcher.new(@spacy_nlp_id)
+     end
+
+     # A utility method to look up a vocabulary item of the given id.
+     # @param id [Integer] A vocabulary id
+     # @return [String] The string that the given id stands for in the vocabulary
+     def vocab_string_lookup(id)
+       PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+     end
+
+     # A utility method to list pipeline components.
+     # @return [Array<String>] An array of text strings representing pipeline components
+     def pipe_names
+       pipe_array = []
+       PyCall::List.(@py_nlp.pipe_names).each do |pipe|
+         pipe_array << pipe
+       end
+       pipe_array
+     end
+
+     # A utility method to get the tokenizer Python object.
+     # @return [Object] Python `Tokenizer` object
+     def tokenizer
+       PyCall.eval("#{@spacy_nlp_id}.tokenizer")
+     end
+
+     # A utility method to get a Python `Lexeme` object.
+     # @param text [String] A text string representing a lexeme
+     # @return [Object] Python `Lexeme` object
+     def get_lexeme(text)
+       text = text.gsub("'") { "\\'" } # escape single quotes for the Python string literal below
+       PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
+     end
+
+     # Returns _n_ lexemes whose vector representations are most similar to the given vector representation of a word.
+     # @param vector [Object] A vector representation of a word (whether existing or non-existing)
+     # @param n [Integer] The number of lexemes to return
+     # @return [Array<Hash{:key => Integer, :text => String, :best_row => Integer, :score => Float}>] An array of hash objects, each containing the `key`, `text`, `best_row`, and similarity `score` of a lexeme
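+     # @example Finding words similar to "dog" (a sketch; assumes a model with word vectors, such as `en_core_web_lg`)
+     #   nlp = Spacy::Language.new("en_core_web_lg")
+     #   vector = nlp.get_lexeme("dog").vector
+     #   nlp.most_similar(vector, 10).each { |lexeme| puts lexeme[:text] }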
+     def most_similar(vector, n)
+       vec_array = Numpy.asarray([vector])
+       py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
+       key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
+       keys = key_texts.map { |kt| kt[0] }
+       texts = key_texts.map { |kt| kt[1] }
+       best_rows = PyCall::List.(py_result[1])[0]
+       scores = PyCall::List.(py_result[2])[0]
+
+       results = []
+       n.times do |i|
+         results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
+       end
+
+       results
+     end
+
+     # Methods defined in Python but not wrapped in ruby-spacy can be called via this dynamic method handling mechanism.
+     def method_missing(name, *args)
+       @py_nlp.send(name, *args)
+     end
+   end
+ end