ruby-spacy 0.1.0 → 0.1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +23 -0
  3. data/Gemfile.lock +3 -1
  4. data/README.md +123 -77
  5. data/examples/get_started/lexeme.rb +2 -2
  6. data/examples/get_started/linguistic_annotations.rb +1 -1
  7. data/examples/get_started/morphology.rb +45 -0
  8. data/examples/get_started/most_similar.rb +28 -27
  9. data/examples/get_started/named_entities.rb +1 -1
  10. data/examples/get_started/pos_tags_and_dependencies.rb +18 -18
  11. data/examples/get_started/similarity.rb +2 -2
  12. data/examples/japanese/ancestors.rb +9 -11
  13. data/examples/japanese/entity_annotations_and_labels.rb +1 -1
  14. data/examples/japanese/lemmatization.rb +1 -1
  15. data/examples/japanese/most_similar.rb +28 -27
  16. data/examples/japanese/named_entity_recognition.rb +1 -1
  17. data/examples/japanese/navigating_parse_tree.rb +18 -18
  18. data/examples/japanese/noun_chunks.rb +1 -1
  19. data/examples/japanese/pos_tagging.rb +20 -20
  20. data/examples/japanese/visualizing_dependencies.rb +2 -2
  21. data/examples/japanese/visualizing_named_entities.rb +1 -1
  22. data/examples/linguistic_features/ancestors.rb +13 -10
  23. data/examples/linguistic_features/entity_annotations_and_labels.rb +1 -1
  24. data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +2 -2
  25. data/examples/linguistic_features/information_extraction.rb +2 -2
  26. data/examples/linguistic_features/iterating_children.rb +2 -2
  27. data/examples/linguistic_features/iterating_lefts_and_rights.rb +5 -5
  28. data/examples/linguistic_features/lemmatization.rb +1 -1
  29. data/examples/linguistic_features/named_entity_recognition.rb +1 -1
  30. data/examples/linguistic_features/navigating_parse_tree.rb +12 -12
  31. data/examples/linguistic_features/noun_chunks.rb +1 -1
  32. data/examples/linguistic_features/pos_tagging.rb +1 -1
  33. data/examples/linguistic_features/retokenize_1.rb +1 -1
  34. data/examples/linguistic_features/retokenize_2.rb +2 -2
  35. data/examples/linguistic_features/rule_based_morphology.rb +1 -1
  36. data/examples/linguistic_features/similarity.rb +2 -2
  37. data/examples/linguistic_features/similarity_between_lexemes.rb +18 -0
  38. data/examples/linguistic_features/similarity_between_spans.rb +2 -2
  39. data/examples/rule_based_matching/creating_spans_from_matches.rb +1 -1
  40. data/lib/ruby-spacy.rb +493 -300
  41. data/lib/ruby-spacy/version.rb +1 -1
  42. data/ruby-spacy.gemspec +1 -1
  43. metadata +6 -5
  44. data/examples/linguistic_features/morphology.rb +0 -17
  45. data/examples/linguistic_features/special_case_tokenization_rules.rb +0 -19
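The thread running through these changes is an API cleanup: underscore-suffixed spaCy attributes (`token.pos_`, `ent.label_`, `span.label_`) are now wrapped in plain Ruby methods (`token.pos`, `ent.label`, `span.label`), and the implementation moves from string-based `PyCall.exec`/`PyCall.eval` calls to direct method calls on PyCall objects. A minimal before/after sketch (assuming `en_core_web_sm` is installed):

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

# 0.1.0 style: Python attribute names leaked through method_missing
# doc.each { |token| puts "#{token.text}/#{token.pos_}" }

# 0.1.4 style: dedicated Ruby wrappers around pos_, tag_, dep_, lemma_, shape_, etc.
doc.each { |token| puts "#{token.text}/#{token.pos}" }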
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
  rows = []
 
  doc.ents.each do |ent|
- rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+ rows << [ent.text, ent.start_char, ent.end_char, ent.label]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -12,21 +12,21 @@ headings = ["text", "dep", "head text", "head pos", "children"]
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.dep_, token.head.text, token.head.pos_, token.children.to_s]
+ rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
  puts table
 
  # Lemmatizer mode: rule
- # +---------------+----------+-----------+----------+---------------------------+
- # | text          | dep      | head text | head pos | children                  |
- # +---------------+----------+-----------+----------+---------------------------+
- # | Autonomous    | amod     | cars      | NOUN     | []                        |
- # | cars          | nsubj    | shift     | VERB     | [Autonomous]              |
- # | shift         | ROOT     | shift     | VERB     | [cars, liability, toward] |
- # | insurance     | compound | liability | NOUN     | []                        |
- # | liability     | dobj     | shift     | VERB     | [insurance]               |
- # | toward        | prep     | shift     | VERB     | [manufacturers]           |
- # | manufacturers | pobj     | toward    | ADP      | []                        |
- # +---------------+----------+-----------+----------+---------------------------+
+ # +---------------+----------+-----------+----------+-------------------------+
+ # | text          | dep      | head text | head pos | children                |
+ # +---------------+----------+-----------+----------+-------------------------+
+ # | Autonomous    | amod     | cars      | NOUN     |                         |
+ # | cars          | nsubj    | shift     | VERB     | Autonomous              |
+ # | shift         | ROOT     | shift     | VERB     | cars, liability, toward |
+ # | insurance     | compound | liability | NOUN     |                         |
+ # | liability     | dobj     | shift     | VERB     | insurance               |
+ # | toward        | prep     | shift     | VERB     | manufacturers           |
+ # | manufacturers | pobj     | toward    | ADP      |                         |
+ # +---------------+----------+-----------+----------+-------------------------+
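Note the new expected output above: `Token#children` (along with `#subtree`, `#ancestors`, `#lefts`, and `#rights`) now yields `Spacy::Token` objects rather than raw Python tokens, which is why `map(&:text)` works. A small sketch under the same assumption that `en_core_web_sm` is installed:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

# The ROOT token's children arrive as Spacy::Token wrappers.
root = doc.tokens.find { |token| token.dep == "ROOT" }
puts root.children.map(&:text).join(", ")  # => cars, liability, toward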
@@ -12,7 +12,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
  rows = []
 
  doc.noun_chunks.each do |chunk|
- rows << [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
+ rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
+ rows << [token.text, token.lemma, token.pos, token.tag, token.dep, token.shape, token.is_alpha, token.is_stop]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -12,7 +12,7 @@ rows = []
  doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)
 
  doc.each do |token|
- rows << [token.text, token.pos_, token.dep_, token.head.text]
+ rows << [token.text, token.pos, token.dep, token.head.text]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -6,11 +6,11 @@ nlp = Spacy::Language.new("en_core_web_sm")
  sentence = "I live in New York"
  doc = nlp.read(sentence)
 
- puts "Before: " + doc.tokens.collect{|t| t}.join(", ")
+ puts "Before: " + doc.tokens.map(&:text).join(", ")
 
  doc.retokenize(3, 4)
 
- puts "After: " + doc.tokens.collect{|t| t}.join(", ")
+ puts "After: " + doc.tokens.map(&:text).join(", ")
 
  # Before: I, live, in, New, York
  # After: I, live, in, New York
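As the rewritten implementation in `lib/ruby-spacy.rb` below shows, `Doc#retokenize` slices `@py_doc[start_index .. end_index]`, so both token indices are inclusive, unlike Python spaCy's exclusive-end slicing. A brief sketch, again assuming `en_core_web_sm`:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("I live in New York")

# Merge tokens 3..4 ("New", "York") into one token; both indices are inclusive.
doc.retokenize(3, 4)
puts doc.tokens.map(&:text).join(" | ")  # => I | live | in | New York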
@@ -6,7 +6,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
  doc = nlp.read("Where are you?")
 
  puts "Morph features of the third word: " + doc[2].morph.to_s
- puts "POS of the third word: " + doc[2].pos_.to_s
+ puts "POS of the third word: " + doc[2].pos
 
  # Morph features of the third word: Case=Nom|Person=2|PronType=Prs
  # POS of the third word: PRON
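The new `Token#morphology` helper (defined in `lib/ruby-spacy.rb` below) exposes the same information as `doc[2].morph`, either as a hash (`to_dict` on the Python side) or, when passed `false`, as the pipe-delimited string shown above. A quick sketch, assuming `en_core_web_sm`:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Where are you?")

puts doc[2].morphology(false)  # => Case=Nom|Person=2|PronType=Prs
puts doc[2].morphology         # dict-like hash of features (Case, Person, PronType)
puts doc[2].pos                # => PRON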
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")
 
- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"
 
  # Doc 1: I like salty fries and hamburgers.
@@ -0,0 +1,18 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_lg")
+
+ orange = nlp.vocab("orange")
+ lemon = nlp.vocab("lemon")
+
+ book = nlp.vocab("book")
+ magazine = nlp.vocab("magazine")
+
+ puts "orange <=> lemon: #{orange.similarity(lemon)}"
+ puts "book <=> magazine: #{book.similarity(magazine)}"
+ puts "orange <=> book: #{orange.similarity(book)}"
+
+ # orange <=> lemon: 0.7080526351928711
+ # book <=> magazine: 0.4355940818786621
+ # orange <=> book: 0.12197211384773254
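The `nlp.vocab(...)` calls in this new example return the `Spacy::Lexeme` wrapper introduced in this release; the same objects are also reachable from a token via `Token#lexeme`. A short sketch of that second route (assuming `en_core_web_lg` for word vectors):

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_lg")
doc = nlp.read("I like oranges.")

lex = doc[2].lexeme                      # Spacy::Lexeme for "oranges"
puts lex.lower                           # lowercase form (Python lower_)
puts lex.similarity(nlp.vocab("lemon"))  # lexeme-to-lexeme similarity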
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")
 
- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"
 
  span1 = doc1.span(2, 2) # salty fries
@@ -10,7 +10,7 @@ matches = matcher.match(doc)
 
  matches.each do |match|
  span = Spacy::Span.new(doc, start_index: match[:start_index], end_index: match[:end_index], options: {label: match[:match_id]})
- puts span.text + " / " + span.label_
+ puts span.text + " / " + span.label
  end
 
  # Barack Obama / US_PRESIDENT
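`Matcher#match` returns plain Ruby hashes with `:match_id`, `:start_index`, and `:end_index` keys, which feed straight into `Spacy::Span.new` as above. A minimal end-to-end sketch with a hypothetical two-proper-noun pattern (not the pattern used in this example file), assuming `en_core_web_sm`:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
matcher = nlp.matcher
# Hypothetical pattern: two consecutive proper nouns.
matcher.add("PROPN_PAIR", [[{"POS" => "PROPN"}, {"POS" => "PROPN"}]])

doc = nlp.read("Barack Obama was the 44th president of the United States")
matcher.match(doc).each do |m|
  span = Spacy::Span.new(doc, start_index: m[:start_index], end_index: m[:end_index])
  puts span.text  # e.g. "Barack Obama"
end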
data/lib/ruby-spacy.rb CHANGED
@@ -3,12 +3,34 @@
  require_relative "ruby-spacy/version"
  require 'enumerator'
  require 'strscan'
- require 'pycall/import'
  require 'numpy'
+ require 'pycall/import'
  include PyCall::Import
 
  # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
  module Spacy
+
+ extend PyCall::Import
+ spacy = PyCall.import_module('spacy')
+
+ # Python `Language` class
+ PyLanguage = spacy.language.Language
+
+ # Python `Doc` class object
+ PyDoc = spacy.tokens.Doc
+
+ # Python `Span` class object
+ PySpan = spacy.tokens.Span
+
+ # Python `Token` class object
+ PyToken = spacy.tokens.Token
+
+ # Python `Matcher` class object
+ PyMatcher = spacy.matcher.Matcher
+
+ # Python `displacy` object
+ PyDisplacy = spacy.displacy
+
  # A utility module method to convert Python's generator object to a Ruby array,
  # mainly used on the items inside the array returned from dependency-related methods
  # such as {Span#rights}, {Span#lefts} and {Span#subtree}.
@@ -16,12 +38,320 @@ module Spacy
  PyCall::List.(py_generator)
  end
 
+ # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
+ class Doc
+
+ # @return [Object] a Python `Language` instance accessible via `PyCall`
+ attr_reader :py_nlp
+
+ # @return [Object] a Python `Doc` instance accessible via `PyCall`
+ attr_reader :py_doc
+
+ # @return [String] a text string of the document
+ attr_reader :text
+
+ include Enumerable
+
+ alias_method :length, :count
+ alias_method :len, :count
+ alias_method :size, :count
+
+ # It is recommended to use the {Language#read} method to create a doc. If you need to
+ # create one using {Doc#initialize}, there are two method signatures:
+ # `Spacy::Doc.new(nlp, py_doc: Object)` and `Spacy::Doc.new(nlp, text: String)`.
+ # @param nlp [Language] an instance of {Language} class
+ # @param py_doc [Object] an instance of Python `Doc` class
+ # @param text [String] the text string to be analyzed
+ def initialize(nlp, py_doc: nil, text: nil)
+ @py_nlp = nlp
+ if py_doc
+ @py_doc = py_doc
+ else
+ @py_doc = nlp.(text)
+ end
+ @text = @py_doc.text
+ end
+
+ # Retokenizes the text merging a span into a single token.
+ # @param start_index [Integer] the start position of the span to be retokenized in the document
+ # @param end_index [Integer] the end position of the span to be retokenized in the document
+ # @param attributes [Hash] attributes to set on the merged token
+ def retokenize(start_index, end_index, attributes = {})
+ PyCall.with(@py_doc.retokenize()) do |retokenizer|
+ retokenizer.merge(@py_doc[start_index .. end_index], attrs: attributes)
+ end
+ end
+
+ # Retokenizes the text splitting the specified token.
+ # @param pos_in_doc [Integer] the position of the span to be retokenized in the document
+ # @param split_array [Array<String>] text strings of the split results
+ # @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
+ # @param attributes [Hash] the attributes of the split elements
+ def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
+ PyCall.with(@py_doc.retokenize()) do |retokenizer|
+ heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
+ retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
+ end
+ end
+
+ # String representation of the document.
+ # @return [String]
+ def to_s
+ @text
+ end
+
+ # Returns an array of tokens contained in the doc.
+ # @return [Array<Token>]
+ def tokens
+ results = []
+ PyCall::List.(@py_doc).each do |py_token|
+ results << Token.new(py_token)
+ end
+ results
+ end
+
+ # Iterates over the elements in the doc yielding a token instance each time.
+ def each
+ PyCall::List.(@py_doc).each do |py_token|
+ yield Token.new(py_token)
+ end
+ end
+
+ # Returns a span of the specified range within the doc.
+ # The method should be used in either of two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
+ # @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
+ # @param optional_size [Integer] an integer representing the size of the span
+ # @return [Span]
+ def span(range_or_start, optional_size = nil)
+ if optional_size
+ start_index = range_or_start
+ temp = tokens[start_index ... start_index + optional_size]
+ else
+ start_index = range_or_start.first
+ range = range_or_start
+ temp = tokens[range]
+ end
+
+ end_index = start_index + temp.size - 1
+
+ Span.new(self, start_index: start_index, end_index: end_index)
+ end
+
+ # Returns an array of spans representing noun chunks.
+ # @return [Array<Span>]
+ def noun_chunks
+ chunk_array = []
+ py_chunks = PyCall::List.(@py_doc.noun_chunks)
+ py_chunks.each do |py_chunk|
+ chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
+ end
+ chunk_array
+ end
+
+ # Returns an array of spans each representing a sentence.
+ # @return [Array<Span>]
+ def sents
+ sentence_array = []
+ py_sentences = PyCall::List.(@py_doc.sents)
+ py_sentences.each do |py_sent|
+ sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
+ end
+ sentence_array
+ end
+
+ # Returns an array of spans each representing a named entity.
+ # @return [Array<Span>]
+ def ents
+ # so that ents can be "each"-ed in Ruby
+ ent_array = []
+ PyCall::List.(@py_doc.ents).each do |ent|
+ ent.define_singleton_method :label do
+ return self.label_
+ end
+ ent_array << ent
+ end
+ ent_array
+ end
+
+ # Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
+ # @param range [Range] an ordinary Ruby range object such as `0..3`, `1...4`, or `3 .. -1`
+ def [](range)
+ if range.is_a?(Range)
+ py_span = @py_doc[range]
+ return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
+ else
+ return Token.new(@py_doc[range])
+ end
+ end
+
+ # Returns a semantic similarity estimate.
+ # @param other [Doc] the other doc to which a similarity estimation is made
+ # @return [Float]
+ def similarity(other)
+ py_doc.similarity(other.py_doc)
+ end
+
+ # Visualize the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
+ # @param style [String] either `dep` or `ent`
+ # @param compact [Boolean] only relevant to the `dep` style
+ # @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be HTML.
+ def displacy(style: "dep", compact: false)
+ PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
+ end
+
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+ def method_missing(name, *args)
+ @py_doc.send(name, *args)
+ end
+ end
+
+ # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
+ class Language
+
+ # @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
+ attr_reader :spacy_nlp_id
+
+ # @return [Object] a Python `Language` instance accessible via `PyCall`
+ attr_reader :py_nlp
+
+ # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
+ # @param model [String] A language model installed in the system
+ def initialize(model = "en_core_web_sm")
+ @spacy_nlp_id = "nlp_#{model.object_id}"
+ PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
+ @py_nlp = PyCall.eval(@spacy_nlp_id)
+ end
+
+ # Reads and analyzes the given text.
+ # @param text [String] a text to be read and analyzed
+ def read(text)
+ Doc.new(py_nlp, text: text)
+ end
+
+ # Generates a matcher for the current language model.
+ # @return [Matcher]
+ def matcher
+ Matcher.new(@py_nlp)
+ end
+
+ # A utility method to look up a vocabulary item of the given id.
+ # @param id [Integer] a vocabulary id
+ # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
+ def vocab_string_lookup(id)
+ PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+ end
+
+ # A utility method to list pipeline components.
+ # @return [Array<String>] An array of text strings representing pipeline components
+ def pipe_names
+ pipe_array = []
+ PyCall::List.(@py_nlp.pipe_names).each do |pipe|
+ pipe_array << pipe
+ end
+ pipe_array
+ end
+
+ # A utility method to get a Python `Lexeme` object.
+ # @param text [String] A text string representing a lexeme
+ # @return [Object] Python `Lexeme` object (https://spacy.io/api/lexeme)
+ def get_lexeme(text)
+ @py_nlp.vocab[text]
+ end
+
+ # Returns a Ruby lexeme object
+ # @param text [String] a text string representing the vocabulary item
+ # @return [Lexeme]
+ def vocab(text)
+ Lexeme.new(@py_nlp.vocab[text])
+ end
+
+ # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
+ # @param vector [Object] A vector representation of a word (whether existing or non-existing)
+ # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects, each of which contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
+ def most_similar(vector, n)
+ vec_array = Numpy.asarray([vector])
+ py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
+ key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist}]")
+ keys = key_texts.map{|kt| kt[0]}
+ texts = key_texts.map{|kt| kt[1]}
+ best_rows = PyCall::List.(py_result[1])[0]
+ scores = PyCall::List.(py_result[2])[0]
+
+ results = []
+ n.times do |i|
+ result = {key: keys[i].to_i,
+ text: texts[i],
+ best_row: best_rows[i],
+ score: scores[i]
+ }
+ result.each_key do |key|
+ result.define_singleton_method(key){ result[key] }
+ end
+ results << result
+ end
+ results
+ end
+
+ # Utility function to batch-process many texts
+ # @param texts [Array<String>]
+ # @param disable [Array<String>]
+ # @param batch_size [Integer]
+ # @return [Array<Doc>]
+ def pipe(texts, disable: [], batch_size: 50)
+ docs = []
+ PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
+ docs << Doc.new(@py_nlp, py_doc: py_doc)
+ end
+ docs
+ end
+
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+ def method_missing(name, *args)
+ @py_nlp.send(name, *args)
+ end
+ end
+
+ # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
+ class Matcher
+
+ # @return [Object] a Python `Matcher` instance accessible via `PyCall`
+ attr_reader :py_matcher
+
+ # Creates a {Matcher} instance
+ # @param nlp [Language] an instance of {Language} class
+ def initialize(nlp)
+ @py_matcher = PyMatcher.(nlp.vocab)
+ end
+
+ # Adds a label string and a text pattern.
+ # @param text [String] a label string given to the pattern
+ # @param pattern [Array<Array<Hash>>] sequences of text patterns that are alternative to each other
+ def add(text, pattern)
+ @py_matcher.add(text, pattern)
+ end
+
+ # Executes the match.
+ # @param doc [Doc] a {Doc} instance
+ # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
+ def match(doc)
+ str_results = @py_matcher.(doc.py_doc).to_s
+ s = StringScanner.new(str_results[1..-2])
+ results = []
+ while s.scan_until(/(\d+), (\d+), (\d+)/)
+ next unless s.matched
+ triple = s.matched.split(", ")
+ match_id = triple[0].to_i
+ start_index = triple[1].to_i
+ end_index = triple[2].to_i - 1
+ results << {match_id: match_id, start_index: start_index, end_index: end_index}
+ end
+ results
+ end
+ end
+
  # See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
  class Span
 
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
- attr_reader :spacy_span_id
-
  # @return [Object] a Python `Span` instance accessible via `PyCall`
  attr_reader :py_span
 
@@ -35,21 +365,18 @@ module Spacy
  alias_method :size, :count
 
  # It is recommended to use {Doc#span} method to create a span. If you need to
- # create one using {Span#initialize}, either of the two method signatures should be used: `Spacy.new(doc, py_span)` and `Spacy.new(doc, start_index, end_index, options)`.
+ # create one using {Span#initialize}, there are two method signatures:
+ # `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
  # @param doc [Doc] the document to which this span belongs to
  # @param start_index [Integer] the index of the item starting the span inside a doc
  # @param end_index [Integer] the index of the item ending the span inside a doc
  # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
  def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
  @doc = doc
- @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
  if py_span
  @py_span = py_span
  else
- options = PyCall::Dict.(options)
- PyCall.exec("#{@spacy_span_id}_opts = #{options}")
- PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
- @py_span = PyCall.eval(@spacy_span_id)
+ @py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
  end
  end
 
@@ -63,7 +390,7 @@ module Spacy
  results
  end
 
- # Iterates over the elements in the span yielding a token instance.
+ # Iterates over the elements in the span yielding a token instance each time.
  def each
  PyCall::List.(@py_span).each do |py_token|
  yield Token.new(py_token)
@@ -76,18 +403,24 @@ module Spacy
  chunk_array = []
  py_chunks = PyCall::List.(@py_span.noun_chunks)
  py_chunks.each do |py_span|
- chunk_array << Spacy::Span.new(@doc, py_span: py_span)
+ chunk_array << Span.new(@doc, py_span: py_span)
  end
  chunk_array
  end
 
+ # Returns the head token
+ # @return [Token]
+ def root
+ Token.new(@py_span.root)
+ end
+
  # Returns an array of spans that represents sentences.
  # @return [Array<Span>]
  def sents
  sentence_array = []
  py_sentences = PyCall::List.(@py_span.sents)
  py_sentences.each do |py_span|
- sentence_array << Spacy::Span.new(@doc, py_span: py_span)
+ sentence_array << Span.new(@doc, py_span: py_span)
  end
  sentence_array
  end
@@ -97,8 +430,7 @@ module Spacy
  def ents
  ent_array = []
  PyCall::List.(@py_span.ents).each do |py_span|
- # ent_array << ent
- ent_array << Spacy::Span.new(@doc, py_span: py_span)
+ ent_array << Span.new(@doc, py_span: py_span)
  end
  ent_array
  end
@@ -106,18 +438,18 @@ module Spacy
  # Returns a span that represents the sentence that the given span is part of.
  # @return [Span]
  def sent
- py_span =@py_span.sent
- return Spacy::Span.new(@doc, py_span: py_span)
+ py_span = @py_span.sent
+ return Span.new(@doc, py_span: py_span)
  end
 
- # Returns a span if a range object is given, or a token if an integer representing the position of the doc is given.
+ # Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
  # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
  def [](range)
  if range.is_a?(Range)
  py_span = @py_span[range]
- return Spacy::Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
+ return Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
  else
- return Spacy::Token.new(@py_span[range])
+ return Token.new(@py_span[range])
  end
  end
 
@@ -125,31 +457,31 @@ module Spacy
  # @param other [Span] the other span to which a similarity estimation is conducted
  # @return [Float]
  def similarity(other)
- PyCall.eval("#{@spacy_span_id}.similarity(#{other.spacy_span_id})")
+ py_span.similarity(other.py_span)
  end
 
- # Creates a document instance
+ # Creates a document instance from the span
  # @return [Doc]
  def as_doc
- Spacy::Doc.new(@doc.spacy_nlp_id, self.text)
+ Doc.new(@doc.py_nlp, text: self.text)
  end
 
- # Returns Tokens conjugated to the root of the span.
+ # Returns tokens conjugated to the root of the span.
  # @return [Array<Token>] an array of tokens
  def conjuncts
  conjunct_array = []
  PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
- conjunct_array << Spacy::Token.new(py_conjunct)
+ conjunct_array << Token.new(py_conjunct)
  end
  conjunct_array
  end
 
- # Returns Tokens that are to the left of the span, whose heads are within the span.
+ # Returns tokens that are to the left of the span, whose heads are within the span.
  # @return [Array<Token>] an array of tokens
  def lefts
  left_array = []
  PyCall::List.(@py_span.lefts).each do |py_left|
- left_array << Spacy::Token.new(py_left)
+ left_array << Token.new(py_left)
  end
  left_array
  end
@@ -159,7 +491,7 @@ module Spacy
  def rights
  right_array = []
  PyCall::List.(@py_span.rights).each do |py_right|
- right_array << Spacy::Token.new(py_right)
+ right_array << Token.new(py_right)
  end
  right_array
  end
@@ -169,11 +501,17 @@ module Spacy
  def subtree
  subtree_array = []
  PyCall::List.(@py_span.subtree).each do |py_subtree|
- subtree_array << Spacy::Token.new(py_subtree)
+ subtree_array << Token.new(py_subtree)
  end
  subtree_array
  end
 
+ # Returns the label
+ # @return [String]
+ def label
+ @py_span.label_
+ end
+
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
  def method_missing(name, *args)
  @py_span.send(name, *args)
@@ -189,59 +527,67 @@ module Spacy
  # @return [String] a string representing the token
  attr_reader :text
 
- # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens. There is no way to generate a token from scratch but relying on a pre-exising Python {Token} object.
+ # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
+ # There is no way to generate a token from scratch but relying on a pre-existing Python {Token} object.
  # @param py_token [Object] Python `Token` object
  def initialize(py_token)
  @py_token = py_token
  @text = @py_token.text
  end
 
+
+ # Returns the head token
+ # @return [Token]
+ def head
+ Token.new(@py_token.head)
+ end
+
  # Returns the token in question and the tokens that descend from it.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def subtree
  descendant_array = []
  PyCall::List.(@py_token.subtree).each do |descendant|
- descendant_array << descendant
+ descendant_array << Token.new(descendant)
  end
  descendant_array
  end
 
  # Returns the token's ancestors.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def ancestors
  ancestor_array = []
  PyCall::List.(@py_token.ancestors).each do |ancestor|
- ancestor_array << ancestor
+ ancestor_array << Token.new(ancestor)
  end
  ancestor_array
  end
 
  # Returns a sequence of the token's immediate syntactic children.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def children
  child_array = []
  PyCall::List.(@py_token.children).each do |child|
- child_array << child
+ child_array << Token.new(child)
  end
  child_array
  end
 
  # The leftward immediate children of the word in the syntactic dependency parse.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def lefts
  token_array = []
  PyCall::List.(@py_token.lefts).each do |token|
- token_array << token
+ token_array << Token.new(token)
  end
  token_array
  end
 
  # The rightward immediate children of the word in the syntactic dependency parse.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def rights
  token_array = []
  PyCall::List.(@py_token.rights).each do |token|
- token_array << token
+ token_array << Token.new(token)
  end
  token_array
  end
@@ -252,314 +598,161 @@ module Spacy
  @text
  end
 
- # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
- def method_missing(name, *args)
- @py_token.send(name, *args)
- end
- end
-
- # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
- class Doc
-
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
- attr_reader :spacy_nlp_id
-
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
- attr_reader :spacy_doc_id
-
- # @return [Object] a Python `Doc` instance accessible via `PyCall`
- attr_reader :py_doc
-
- # @return [String] a text string of the document
- attr_reader :text
-
- include Enumerable
-
- alias_method :length, :count
- alias_method :len, :count
- alias_method :size, :count
-
- # Creates a new instance of {Doc}.
- # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
- # @param text [String] The text string to be analyzed
- def initialize(nlp_id, text)
- @text = text
- @spacy_nlp_id = nlp_id
- @spacy_doc_id = "doc_#{text.object_id}"
- quoted = text.gsub('"', '\"')
- PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
- PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
- @py_doc = PyCall.eval(@spacy_doc_id)
- end
-
-
- # Retokenizes the text merging a span into a single token.
- # @param start_index [Integer] The start position of the span to be retokenized in the document
- # @param end_index [Integer] The end position of the span to be retokenized in the document
- # @param attributes [Hash] Attributes to set on the merged token
- def retokenize(start_index, end_index, attributes = {})
- py_attrs = PyCall::Dict.(attributes)
- PyCall.exec(<<PY)
- with #{@spacy_doc_id}.retokenize() as retokenizer:
- retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
- PY
- @py_doc = PyCall.eval(@spacy_doc_id)
- end
-
- # Retokenizes the text splitting the specified token.
- # @param pos_in_doc [Integer] The position of the span to be retokenized in the document
- # @param split_array [Array<String>] text strings of the split results
- # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
- # @param attributes [Hash] The attributes of the split elements
- def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
- py_attrs = PyCall::Dict.(attributes)
- py_split_array = PyCall::List.(split_array)
- PyCall.exec(<<PY)
- with #{@spacy_doc_id}.retokenize() as retokenizer:
- heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
- attrs = #{py_attrs}
- split_array = #{py_split_array}
- retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
- PY
- @py_doc = PyCall.eval(@spacy_doc_id)
+ # Returns a hash or string of morphological information
+ # @param hash [Boolean] if true, a hash will be returned instead of a string
+ # @return [Hash, String]
+ def morphology(hash = true)
+ if @py_token.has_morph
+ morph_analysis = @py_token.morph
+ if hash
+ return morph_analysis.to_dict
+ else
+ return morph_analysis.to_s
+ end
+ else
+ if hash
+ results = {}
+ else
+ return ""
+ end
+ end
  end
 
- # String representation of the token.
+ # Returns the lemma by calling `lemma_` of the `@py_token` object
  # @return [String]
- def to_s
- @text
+ def lemma
+ @py_token.lemma_
  end
 
- # Returns an array of tokens contained in the doc.
- # @return [Array<Token>]
- def tokens
- results = []
- PyCall::List.(@py_doc).each do |py_token|
- results << Token.new(py_token)
- end
- results
+ # Returns the lowercase form by calling `lower_` of the `@py_token` object
+ # @return [String]
+ def lower
+ @py_token.lower_
  end
 
- # Iterates over the elements in the doc yielding a token instance.
- def each
- PyCall::List.(@py_doc).each do |py_token|
- yield Token.new(py_token)
- end
+ # Returns the shape (e.g. "Xxxxx") by calling `shape_` of the `@py_token` object
+ # @return [String]
+ def shape
+ @py_token.shape_
  end
 
- # Returns a span of the specified range within the doc.
- # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
- # @param range_or_start [Range, Integer] A range object, or, alternatively, an integer that represents the start position of the span
- # @param optional_size [Integer] An integer representing the size of the span
- # @return [Span]
- def span(range_or_start, optional_size = nil)
- if optional_size
- start_index = range_or_start
- temp = tokens[start_index ... start_index + optional_size]
- else
- start_index = range_or_start.first
- range = range_or_start
- temp = tokens[range]
- end
-
- end_index = start_index + temp.size - 1
-
- Span.new(self, start_index: start_index, end_index: end_index)
+ # Returns the pos by calling `pos_` of the `@py_token` object
+ # @return [String]
+ def pos
+ @py_token.pos_
  end
 
- # Returns an array of spans representing noun chunks.
- # @return [Array<Span>]
- def noun_chunks
- chunk_array = []
- py_chunks = PyCall::List.(@py_doc.noun_chunks)
- py_chunks.each do |py_chunk|
- chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
- end
- chunk_array
+ # Returns the fine-grained pos by calling `tag_` of the `@py_token` object
+ # @return [String]
+ def tag
+ @py_token.tag_
  end
 
- # Returns an array of spans representing sentences.
- # @return [Array<Span>]
- def sents
- sentence_array = []
- py_sentences = PyCall::List.(@py_doc.sents)
- py_sentences.each do |py_sent|
- sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
- end
- sentence_array
+ # Returns the dependency relation by calling `dep_` of the `@py_token` object
+ # @return [String]
+ def dep
+ @py_token.dep_
  end
-
- # Returns an array of spans representing named entities.
- # @return [Array<Span>]
- def ents
- # so that ents canbe "each"-ed in Ruby
- ent_array = []
- PyCall::List.(@py_doc.ents).each do |ent|
- ent_array << ent
- end
- ent_array
+
+ # Returns the language by calling `lang_` of the `@py_token` object
+ # @return [String]
+ def lang
+ @py_token.lang_
  end
 
- # Returns a span if given a range object; returns a token if given an integer representing a position in the doc.
- # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
- def [](range)
- if range.is_a?(Range)
- py_span = @py_doc[range]
- return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
- else
- return Token.new(@py_doc[range])
- end
+ # Returns the trailing space character if present by calling `whitespace_` of the `@py_token` object
+ # @return [String]
+ def whitespace
+ @py_token.whitespace_
  end
 
- # Returns a semantic similarity estimate.
- # @param other [Doc] the other doc to which a similarity estimation is made
- # @return [Float]
- def similarity(other)
- PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
+ # Returns the named entity type by calling `ent_type_` of the `@py_token` object
+ # @return [String]
+ def ent_type
+ @py_token.ent_type_
  end
 
- # Visualize the document in one of two styles: dep (dependencies) or ent (named entities).
- # @param style [String] Either `dep` or `ent`
- # @param compact [Boolean] Only relevant to the `dep' style
- # @return [String] in the case of `dep`, the output text is an SVG while in the `ent` style, the output text is an HTML.
- def displacy(style: "dep", compact: false)
- PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
+ # Returns a lexeme object
+ # @return [Lexeme]
+ def lexeme
+ Lexeme.new(@py_token.lex)
  end
 
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
  def method_missing(name, *args)
- @py_doc.send(name, *args)
+ @py_token.send(name, *args)
  end
  end
 
- # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
- class Matcher
+ # See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
+ class Lexeme
 
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
- attr_reader :spacy_matcher_id
-
- # @return [Object] a Python `Matcher` instance accessible via `PyCall`
- attr_reader :py_matcher
-
- # Creates a {Matcher} instance
- # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
- def initialize(nlp_id)
- @spacy_matcher_id = "doc_#{nlp_id}_matcher"
- PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
- @py_matcher = PyCall.eval(@spacy_matcher_id)
- end
+ # @return [Object] a Python `Lexeme` instance accessible via `PyCall`
+ attr_reader :py_lexeme
 
- # Adds a label string and a text pattern.
- # @param text [String] a label string given to the pattern
- # @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
- def add(text, pattern)
- @py_matcher.add(text, pattern)
- end
+ # @return [String] a string representing the lexeme
+ attr_reader :text
 
- # Execute the match.
- # @param doc [Doc] An {Doc} instance
- # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] The id of the matched pattern, the starting position, and the end position
- def match(doc)
- str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
- s = StringScanner.new(str_results[1..-2])
- results = []
- while s.scan_until(/(\d+), (\d+), (\d+)/)
- next unless s.matched
- triple = s.matched.split(", ")
- match_id = triple[0].to_i
- start_index = triple[1].to_i
- end_index = triple[2].to_i - 1
- results << {match_id: match_id, start_index: start_index, end_index: end_index}
- end
- results
+ # It is recommended to use {Language#vocab} or {Token#lexeme} methods to create lexemes.
+ # There is no way to generate a lexeme from scratch but relying on a pre-existing Python {Lexeme} object.
+ # @param py_lexeme [Object] Python `Lexeme` object
+ def initialize(py_lexeme)
+ @py_lexeme = py_lexeme
+ @text = @py_lexeme.text
  end
- end
-
- # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
- class Language
-
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
- attr_reader :spacy_nlp_id
-
- # @return [Object] a Python `Language` instance accessible via `PyCall`
- attr_reader :py_nlp
 
- # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
- # @param model [String] A language model installed in the system
- def initialize(model = "en_core_web_sm")
- @spacy_nlp_id = "nlp_#{model.object_id}"
- PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
- PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
- @py_nlp = PyCall.eval(@spacy_nlp_id)
+ # String representation of the lexeme.
+ # @return [String]
+ def to_s
+ @text
  end
 
- # Reads and analyze the given text.
- # @param text [String] A text to be read and analyzed
- def read(text)
- Doc.new(@spacy_nlp_id, text)
+ # Returns the lowercase form by calling `lower_` of the `@py_lexeme` object
+ # @return [String]
+ def lower
+ @py_lexeme.lower_
  end
 
- # Generates a matcher for the current language model.
- # @return [Matcher]
- def matcher
- Matcher.new(@spacy_nlp_id)
+ # Returns the shape (e.g. "Xxxxx") by calling `shape_` of the `@py_lexeme` object
+ # @return [String]
+ def shape
+ @py_lexeme.shape_
  end
 
- # A utility method to lookup a vocabulary item of the given id.
- # @param id [Integer] A vocabulary id
- # @return [Object] A Python `Lexeme` object
- def vocab_string_lookup(id)
- PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+ # Returns the language by calling `lang_` of the `@py_lexeme` object
+ # @return [String]
+ def lang
+ @py_lexeme.lang_
  end
 
- # A utility method to list pipeline components.
- # @return [Array<String>] An array of text strings representing pipeline components
- def pipe_names
- pipe_array = []
- PyCall::List.(@py_nlp.pipe_names).each do |pipe|
- pipe_array << pipe
- end
- pipe_array
+ # Returns the length-N substring from the start of the word by calling `prefix_` of the `@py_lexeme` object
+ # @return [String]
+ def prefix
+ @py_lexeme.prefix_
  end
-
- # A utility method to get the tokenizer Python object.
- # @return [Object] Python `Tokenizer` object
- def tokenizer
- return PyCall.eval("#{@spacy_nlp_id}.tokenizer")
+ #
+ # Returns the length-N substring from the end of the word by calling `suffix_` of the `@py_lexeme` object
+ # @return [String]
+ def suffix
+ @py_lexeme.suffix_
  end
 
- # A utility method to get a Python `Lexeme` object.
- # @param text [String] A text string representing a lexeme
- # @return [Object] Python `Tokenizer` object
- def get_lexeme(text)
- text = text.gsub("'", "\'")
- py_lexeme = PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
- return py_lexeme
+ # Returns the lexeme's norm, i.e. a normalized form of the lexeme, by calling `norm_` of the `@py_lexeme` object
+ # @return [String]
+ def norm
+ @py_lexeme.norm_
  end
 
- # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
- # @param vector [Object] A vector representation of a word (whether existing or non-existing)
- # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
- def most_similar(vector, n)
- vec_array = Numpy.asarray([vector])
- py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
- key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
- keys = key_texts.map{|kt| kt[0]}
- texts = key_texts.map{|kt| kt[1]}
- best_rows = PyCall::List.(py_result[1])[0]
- scores = PyCall::List.(py_result[2])[0]
-
- results = []
- n.times do |i|
- results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
- end
-
- results
+ # Returns a semantic similarity estimate.
+ # @param other [Lexeme] the other lexeme to which a similarity estimation is made
+ # @return [Float]
+ def similarity(other)
+ @py_lexeme.similarity(other.py_lexeme)
  end
 
- # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism....
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
  def method_missing(name, *args)
- @py_nlp.send(name, *args)
+ @py_lexeme.send(name, *args)
  end
  end
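The net effect of the `ruby-spacy.rb` rewrite: every wrapper now holds its Python counterpart directly (`py_nlp`, `py_doc`, `py_span`, `py_token`, `py_lexeme`) instead of round-tripping source strings through `PyCall.exec`/`PyCall.eval`, and anything not yet wrapped still passes through `method_missing`. A closing sketch of that escape hatch, assuming `en_core_web_sm`:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("This is a sentence. This is another one.")

# Wrapped, idiomatic Ruby:
puts doc.sents.map(&:text).join(" / ")

# Not wrapped: is_sent_start is delegated to the Python Token via method_missing.
puts doc[0].is_sent_start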