ruby-spacy 0.1.0 → 0.1.4.1

Files changed (45)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +23 -0
  3. data/Gemfile.lock +3 -1
  4. data/README.md +123 -77
  5. data/examples/get_started/lexeme.rb +2 -2
  6. data/examples/get_started/linguistic_annotations.rb +1 -1
  7. data/examples/get_started/morphology.rb +45 -0
  8. data/examples/get_started/most_similar.rb +28 -27
  9. data/examples/get_started/named_entities.rb +1 -1
  10. data/examples/get_started/pos_tags_and_dependencies.rb +18 -18
  11. data/examples/get_started/similarity.rb +2 -2
  12. data/examples/japanese/ancestors.rb +9 -11
  13. data/examples/japanese/entity_annotations_and_labels.rb +1 -1
  14. data/examples/japanese/lemmatization.rb +1 -1
  15. data/examples/japanese/most_similar.rb +28 -27
  16. data/examples/japanese/named_entity_recognition.rb +1 -1
  17. data/examples/japanese/navigating_parse_tree.rb +18 -18
  18. data/examples/japanese/noun_chunks.rb +1 -1
  19. data/examples/japanese/pos_tagging.rb +20 -20
  20. data/examples/japanese/visualizing_dependencies.rb +2 -2
  21. data/examples/japanese/visualizing_named_entities.rb +1 -1
  22. data/examples/linguistic_features/ancestors.rb +13 -10
  23. data/examples/linguistic_features/entity_annotations_and_labels.rb +1 -1
  24. data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +2 -2
  25. data/examples/linguistic_features/information_extraction.rb +2 -2
  26. data/examples/linguistic_features/iterating_children.rb +2 -2
  27. data/examples/linguistic_features/iterating_lefts_and_rights.rb +5 -5
  28. data/examples/linguistic_features/lemmatization.rb +1 -1
  29. data/examples/linguistic_features/named_entity_recognition.rb +1 -1
  30. data/examples/linguistic_features/navigating_parse_tree.rb +12 -12
  31. data/examples/linguistic_features/noun_chunks.rb +1 -1
  32. data/examples/linguistic_features/pos_tagging.rb +1 -1
  33. data/examples/linguistic_features/retokenize_1.rb +1 -1
  34. data/examples/linguistic_features/retokenize_2.rb +2 -2
  35. data/examples/linguistic_features/rule_based_morphology.rb +1 -1
  36. data/examples/linguistic_features/similarity.rb +2 -2
  37. data/examples/linguistic_features/similarity_between_lexemes.rb +18 -0
  38. data/examples/linguistic_features/similarity_between_spans.rb +2 -2
  39. data/examples/rule_based_matching/creating_spans_from_matches.rb +1 -1
  40. data/lib/ruby-spacy.rb +493 -300
  41. data/lib/ruby-spacy/version.rb +1 -1
  42. data/ruby-spacy.gemspec +1 -1
  43. metadata +6 -5
  44. data/examples/linguistic_features/morphology.rb +0 -17
  45. data/examples/linguistic_features/special_case_tokenization_rules.rb +0 -19
@@ -10,7 +10,7 @@ headings = ["text", "start", "end", "label"]
  rows = []
 
  doc.ents.each do |ent|
- rows << [ent.text, ent.start_char, ent.end_char, ent.label_]
+ rows << [ent.text, ent.start_char, ent.end_char, ent.label]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
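Most of the example tweaks in this release follow a single API change: spaCy's Python attributes with trailing underscores (`label_`, `dep_`, `pos_`, `tag_`, `lemma_`, `shape_`) are now wrapped as plain Ruby methods (`label`, `dep`, `pos`, and so on). A minimal sketch of the new accessor style, assuming `en_core_web_sm` is installed (the sentence and the output shown are illustrative):

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Apple is looking at buying U.K. startup for $1 billion")

doc.ents.each do |ent|
  # ent.label (not ent.label_) returns the entity type as a Ruby string
  puts "#{ent.text}: #{ent.label}"
end
# Apple: ORG
# U.K.: GPE
# $1 billion: MONEY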
@@ -12,21 +12,21 @@ headings = ["text", "dep", "head text", "head pos", "children"]
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.dep_, token.head.text, token.head.pos_, token.children.to_s]
+ rows << [token.text, token.dep, token.head.text, token.head.pos, token.children.map(&:text).join(", ")]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
  puts table
 
  # Lemmatizer mode: rule
- # +---------------+----------+-----------+----------+---------------------------+
- # | text | dep | head text | head pos | children |
- # +---------------+----------+-----------+----------+---------------------------+
- # | Autonomous | amod | cars | NOUN | [] |
- # | cars | nsubj | shift | VERB | [Autonomous] |
- # | shift | ROOT | shift | VERB | [cars, liability, toward] |
- # | insurance | compound | liability | NOUN | [] |
- # | liability | dobj | shift | VERB | [insurance] |
- # | toward | prep | shift | VERB | [manufacturers] |
- # | manufacturers | pobj | toward | ADP | [] |
- # +---------------+----------+-----------+----------+---------------------------+
+ # +---------------+----------+-----------+----------+-------------------------+
+ # | text | dep | head text | head pos | children |
+ # +---------------+----------+-----------+----------+-------------------------+
+ # | Autonomous | amod | cars | NOUN | |
+ # | cars | nsubj | shift | VERB | Autonomous |
+ # | shift | ROOT | shift | VERB | cars, liability, toward |
+ # | insurance | compound | liability | NOUN | |
+ # | liability | dobj | shift | VERB | insurance |
+ # | toward | prep | shift | VERB | manufacturers |
+ # | manufacturers | pobj | toward | ADP | |
+ # +---------------+----------+-----------+----------+-------------------------+
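The `map(&:text)` change is needed because `Token#children` now returns an array of `Spacy::Token` objects rather than raw Python tokens (see the library diff below). A hedged sketch of walking the parse tree with the new wrappers, reusing the sentence from the expected output above:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

root = doc.tokens.find { |token| token.dep == "ROOT" }
# children yields Spacy::Token instances, so Ruby Enumerable methods apply directly
puts root.children.map(&:text).join(", ")   # cars, liability, toward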
@@ -12,7 +12,7 @@ headings = ["text", "root.text", "root.dep", "root.head.text"]
  rows = []
 
  doc.noun_chunks.each do |chunk|
- rows << [chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]
+ rows << [chunk.text, chunk.root.text, chunk.root.dep, chunk.root.head.text]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -8,7 +8,7 @@ headings = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"
  rows = []
 
  doc.each do |token|
- rows << [token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
+ rows << [token.text, token.lemma, token.pos, token.tag, token.dep, token.shape, token.is_alpha, token.is_stop]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -12,7 +12,7 @@ rows = []
  doc.retokenize(doc[4].left_edge.i, doc[4].right_edge.i)
 
  doc.each do |token|
- rows << [token.text, token.pos_, token.dep_, token.head.text]
+ rows << [token.text, token.pos, token.dep, token.head.text]
  end
 
  table = Terminal::Table.new rows: rows, headings: headings
@@ -6,11 +6,11 @@ nlp = Spacy::Language.new("en_core_web_sm")
  sentence = "I live in New York"
  doc = nlp.read(sentence)
 
- puts "Before: " + doc.tokens.collect{|t| t}.join(", ")
+ puts "Before: " + doc.tokens.map(&:text).join(", ")
 
  doc.retokenize(3, 4)
 
- puts "After: " + doc.tokens.collect{|t| t}.join(", ")
+ puts "After: " + doc.tokens.map(&:text).join(", ")
 
  # Before: I, live, in, New, York
  # After: I, live, in, New York
@@ -6,7 +6,7 @@ nlp = Spacy::Language.new("en_core_web_sm")
  doc = nlp.read("Where are you?")
 
  puts "Morph features of the third word: " + doc[2].morph.to_s
- puts "POS of the third word: " + doc[2].pos_.to_s
+ puts "POS of the third word: " + doc[2].pos
 
  # Morph features of the third word: Case=Nom|Person=2|PronType=Prs
  # POS of the third word: PRON
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")
 
- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"
 
  # Doc 1: I like salty fries and hamburgers.
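This fix is needed because `String#+` only accepts a String, so concatenating a `Spacy::Doc` directly raised a TypeError. Either `doc.text` or string interpolation works, since the new `Doc#to_s` (see the library diff below) returns the document text. A small sketch:

nlp = Spacy::Language.new("en_core_web_lg")
doc = nlp.read("Fast food tastes very good.")

puts "Doc: " + doc.text   # explicit accessor
puts "Doc: #{doc}"        # interpolation calls Doc#to_s, which returns the text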
@@ -0,0 +1,18 @@
+ require "ruby-spacy"
+ require "terminal-table"
+
+ nlp = Spacy::Language.new("en_core_web_lg")
+
+ orange = nlp.vocab("orange")
+ lemon = nlp.vocab("lemon")
+
+ book = nlp.vocab("book")
+ magazine = nlp.vocab("magazine")
+
+ puts "orange <=> lemon: #{orange.similarity(lemon)}"
+ puts "book <=> magazine: #{book.similarity(magazine)}"
+ puts "orange <=> book: #{orange.similarity(book)}"
+
+ # orange <=> lemon: 0.7080526351928711
+ # book <=> magazine: 0.4355940818786621
+ # orange <=> book: 0.12197211384773254
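This new example exercises `Language#vocab`, which wraps a Python `Lexeme` in the new `Spacy::Lexeme` class (see the library diff below). Note that `Lexeme#similarity` depends on word vectors, so a vector-equipped model such as `en_core_web_lg` is required; with `en_core_web_sm` the scores are not meaningful. A hedged variant with illustrative words:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_lg")
fries = nlp.vocab("fries")
# similarity returns a Float (cosine similarity of the two word vectors)
puts fries.similarity(nlp.vocab("hamburgers"))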
@@ -5,8 +5,8 @@ nlp = Spacy::Language.new("en_core_web_lg")
  doc1 = nlp.read("I like salty fries and hamburgers.")
  doc2 = nlp.read("Fast food tastes very good.")
 
- puts "Doc 1: " + doc1
- puts "Doc 2: " + doc2
+ puts "Doc 1: " + doc1.text
+ puts "Doc 2: " + doc2.text
  puts "Similarity: #{doc1.similarity(doc2)}"
 
  span1 = doc1.span(2, 2) # salty fries
@@ -10,7 +10,7 @@ matches = matcher.match(doc)
 
  matches.each do |match|
  span = Spacy::Span.new(doc, start_index: match[:start_index], end_index: match[:end_index], options: {label: match[:match_id]})
- puts span.text + " / " + span.label_
+ puts span.text + " / " + span.label
  end
 
  # Barack Obama / US_PRESIDENT
data/lib/ruby-spacy.rb CHANGED
@@ -3,12 +3,34 @@
  require_relative "ruby-spacy/version"
  require 'enumerator'
  require 'strscan'
- require 'pycall/import'
  require 'numpy'
+ require 'pycall/import'
  include PyCall::Import
 
  # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
  module Spacy
+
+ extend PyCall::Import
+ spacy = PyCall.import_module('spacy')
+
+ # Python `Language` class
+ PyLanguage = spacy.language.Language
+
+ # Python `Doc` class object
+ PyDoc = spacy.tokens.Doc
+
+ # Python `Span` class object
+ PySpan = spacy.tokens.Span
+
+ # Python `Token` class object
+ PyToken = spacy.tokens.Token
+
+ # Python `Matcher` class object
+ PyMatcher = spacy.matcher.Matcher
+
+ # Python `displacy` object
+ PyDisplacy = spacy.displacy
+
  # A utility module method to convert Python's generator object to a Ruby array,
  # mainly used on the items inside the array returned from dependency-related methods
  # such as {Span#rights}, {Span#lefts} and {Span#subtree}.
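The constants above replace the old string-built `PyCall.exec` calls: spaCy is imported once with `PyCall.import_module` and its classes are kept as Ruby constants. A minimal sketch of the pattern, assuming spaCy is installed in the Python environment that PyCall picks up:

require "pycall"

spacy = PyCall.import_module("spacy")

# Attribute access walks the Python module tree; the result is a PyCall
# wrapper around the Python class, callable with .() like a Ruby proc.
py_span_class = spacy.tokens.Span
puts py_span_class   # => <class 'spacy.tokens.span.Span'>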
@@ -16,12 +38,320 @@ module Spacy
  PyCall::List.(py_generator)
  end
 
+ # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
+ class Doc
+
+ # @return [Object] a Python `Language` instance accessible via `PyCall`
+ attr_reader :py_nlp
+
+ # @return [Object] a Python `Doc` instance accessible via `PyCall`
+ attr_reader :py_doc
+
+ # @return [String] a text string of the document
+ attr_reader :text
+
+ include Enumerable
+
+ alias_method :length, :count
+ alias_method :len, :count
+ alias_method :size, :count
+
+ # It is recommended to use the {Language#read} method to create a doc. If you need to
+ # create one using {Doc#initialize}, there are two method signatures:
+ # `Spacy::Doc.new(nlp, py_doc: Object)` and `Spacy::Doc.new(nlp, text: String)`.
+ # @param nlp [Language] an instance of {Language} class
+ # @param py_doc [Object] an instance of Python `Doc` class
+ # @param text [String] the text string to be analyzed
+ def initialize(nlp, py_doc: nil, text: nil)
+ @py_nlp = nlp
+ if py_doc
+ @py_doc = py_doc
+ else
+ @py_doc = nlp.(text)
+ end
+ @text = @py_doc.text
+ end
+
+ # Retokenizes the text merging a span into a single token.
+ # @param start_index [Integer] the start position of the span to be retokenized in the document
+ # @param end_index [Integer] the end position of the span to be retokenized in the document
+ # @param attributes [Hash] attributes to set on the merged token
+ def retokenize(start_index, end_index, attributes = {})
+ PyCall.with(@py_doc.retokenize()) do |retokenizer|
+ retokenizer.merge(@py_doc[start_index .. end_index], attrs: attributes)
+ end
+ end
+
+ # Retokenizes the text splitting the specified token.
+ # @param pos_in_doc [Integer] the position of the span to be retokenized in the document
+ # @param split_array [Array<String>] text strings of the split results
+ # @param head_pos_in_split [Integer] the position of the head element within the split results
+ # @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
+ # @param attributes [Hash] the attributes of the split elements
+ def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
+ PyCall.with(@py_doc.retokenize()) do |retokenizer|
+ heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
+ retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
+ end
+ end
+
+ # String representation of the document.
+ # @return [String]
+ def to_s
+ @text
+ end
+
+ # Returns an array of tokens contained in the doc.
+ # @return [Array<Token>]
+ def tokens
+ results = []
+ PyCall::List.(@py_doc).each do |py_token|
+ results << Token.new(py_token)
+ end
+ results
+ end
+
+ # Iterates over the elements in the doc yielding a token instance each time.
+ def each
+ PyCall::List.(@py_doc).each do |py_token|
+ yield Token.new(py_token)
+ end
+ end
+
+ # Returns a span of the specified range within the doc.
+ # The method can be used in either of two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
+ # @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
+ # @param optional_size [Integer] an integer representing the size of the span
+ # @return [Span]
+ def span(range_or_start, optional_size = nil)
+ if optional_size
+ start_index = range_or_start
+ temp = tokens[start_index ... start_index + optional_size]
+ else
+ start_index = range_or_start.first
+ range = range_or_start
+ temp = tokens[range]
+ end
+
+ end_index = start_index + temp.size - 1
+
+ Span.new(self, start_index: start_index, end_index: end_index)
+ end
+
+ # Returns an array of spans representing noun chunks.
+ # @return [Array<Span>]
+ def noun_chunks
+ chunk_array = []
+ py_chunks = PyCall::List.(@py_doc.noun_chunks)
+ py_chunks.each do |py_chunk|
+ chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
+ end
+ chunk_array
+ end
+
+ # Returns an array of spans each representing a sentence.
+ # @return [Array<Span>]
+ def sents
+ sentence_array = []
+ py_sentences = PyCall::List.(@py_doc.sents)
+ py_sentences.each do |py_sent|
+ sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
+ end
+ sentence_array
+ end
+
+ # Returns an array of spans each representing a named entity.
+ # @return [Array<Span>]
+ def ents
+ # so that ents can be "each"-ed in Ruby
+ ent_array = []
+ PyCall::List.(@py_doc.ents).each do |ent|
+ ent.define_singleton_method :label do
+ return self.label_
+ end
+ ent_array << ent
+ end
+ ent_array
+ end
+
+ # Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
+ # @param range [Range, Integer] an ordinary Ruby range object such as `0..3`, `1...4`, or `3 .. -1`, or an integer index
+ def [](range)
+ if range.is_a?(Range)
+ py_span = @py_doc[range]
+ return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
+ else
+ return Token.new(@py_doc[range])
+ end
+ end
+
+ # Returns a semantic similarity estimate.
+ # @param other [Doc] the other doc to which a similarity estimation is made
+ # @return [Float]
+ def similarity(other)
+ py_doc.similarity(other.py_doc)
+ end
+
+ # Visualizes the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
+ # @param style [String] either `dep` or `ent`
+ # @param compact [Boolean] only relevant to the `dep` style
+ # @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be HTML.
+ def displacy(style: "dep", compact: false)
+ PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
+ end
+
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+ def method_missing(name, *args)
+ @py_doc.send(name, *args)
+ end
+ end
+
+ # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
+ class Language
+
+ # @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
+ attr_reader :spacy_nlp_id
+
+ # @return [Object] a Python `Language` instance accessible via `PyCall`
+ attr_reader :py_nlp
+
+ # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
+ # @param model [String] a language model installed in the system
+ def initialize(model = "en_core_web_sm")
+ @spacy_nlp_id = "nlp_#{model.object_id}"
+ PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
+ @py_nlp = PyCall.eval(@spacy_nlp_id)
+ end
+
+ # Reads and analyzes the given text.
+ # @param text [String] a text to be read and analyzed
+ def read(text)
+ Doc.new(py_nlp, text: text)
+ end
+
+ # Generates a matcher for the current language model.
+ # @return [Matcher]
+ def matcher
+ Matcher.new(@py_nlp)
+ end
+
+ # A utility method to look up a vocabulary item of the given id.
+ # @param id [Integer] a vocabulary id
+ # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
+ def vocab_string_lookup(id)
+ PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+ end
+
+ # A utility method to list pipeline components.
+ # @return [Array<String>] an array of text strings representing pipeline components
+ def pipe_names
+ pipe_array = []
+ PyCall::List.(@py_nlp.pipe_names).each do |pipe|
+ pipe_array << pipe
+ end
+ pipe_array
+ end
+
+ # A utility method to get a Python `Lexeme` object.
+ # @param text [String] a text string representing a lexeme
+ # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
+ def get_lexeme(text)
+ @py_nlp.vocab[text]
+ end
+
+ # Returns a Ruby lexeme object.
+ # @param text [String] a text string representing the vocabulary item
+ # @return [Lexeme]
+ def vocab(text)
+ Lexeme.new(@py_nlp.vocab[text])
+ end
+
+ # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
+ # @param vector [Object] a vector representation of a word (whether existing or non-existing)
+ # @param n [Integer] the number of lexemes to return
+ # @return [Array<Hash{:key => Integer, :text => String, :best_row => Integer, :score => Float}>] an array of hash objects, each of which contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
+ def most_similar(vector, n)
+ vec_array = Numpy.asarray([vector])
+ py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
+ key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist}]")
+ keys = key_texts.map{|kt| kt[0]}
+ texts = key_texts.map{|kt| kt[1]}
+ best_rows = PyCall::List.(py_result[1])[0]
+ scores = PyCall::List.(py_result[2])[0]
+
+ results = []
+ n.times do |i|
+ result = {key: keys[i].to_i,
+ text: texts[i],
+ best_row: best_rows[i],
+ score: scores[i]
+ }
+ result.each_key do |key|
+ result.define_singleton_method(key){ result[key] }
+ end
+ results << result
+ end
+ results
+ end
+
+ # A utility method to batch-process many texts.
+ # @param texts [Array<String>]
+ # @param disable [Array<String>]
+ # @param batch_size [Integer]
+ # @return [Array<Doc>]
+ def pipe(texts, disable: [], batch_size: 50)
+ docs = []
+ PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
+ docs << Doc.new(@py_nlp, py_doc: py_doc)
+ end
+ docs
+ end
+
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
+ def method_missing(name, *args)
+ @py_nlp.send(name, *args)
+ end
+ end
+
+ # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
+ class Matcher
+
+ # @return [Object] a Python `Matcher` instance accessible via `PyCall`
+ attr_reader :py_matcher
+
+ # Creates a {Matcher} instance.
+ # @param nlp [Language] an instance of {Language} class
+ def initialize(nlp)
+ @py_matcher = PyMatcher.(nlp.vocab)
+ end
+
+ # Adds a label string and a text pattern.
+ # @param text [String] a label string given to the pattern
+ # @param pattern [Array<Array<Hash>>] sequences of text patterns that are alternative to each other
+ def add(text, pattern)
+ @py_matcher.add(text, pattern)
+ end
+
+ # Executes the match.
+ # @param doc [Doc] a {Doc} instance
+ # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
+ def match(doc)
+ str_results = @py_matcher.(doc.py_doc).to_s
+ s = StringScanner.new(str_results[1..-2])
+ results = []
+ while s.scan_until(/(\d+), (\d+), (\d+)/)
+ next unless s.matched
+ triple = s.matched.split(", ")
+ match_id = triple[0].to_i
+ start_index = triple[1].to_i
+ end_index = triple[2].to_i - 1
+ results << {match_id: match_id, start_index: start_index, end_index: end_index}
+ end
+ results
+ end
+ end
+
  # See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
  class Span
 
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
- attr_reader :spacy_span_id
-
  # @return [Object] a Python `Span` instance accessible via `PyCall`
  attr_reader :py_span
 
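Taken together, the rewritten Doc, Language, and Matcher classes run the whole pipeline through PyCall objects instead of eval'd strings. A hedged end-to-end sketch mirroring the rule_based_matching example above (the pattern literal follows spaCy's Matcher syntax and is illustrative):

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
matcher = nlp.matcher
matcher.add("US_PRESIDENT", [[{LOWER: "barack"}, {LOWER: "obama"}]])

doc = nlp.read("Barack Obama was the 44th president of the United States")
matcher.match(doc).each do |m|
  # each match hash carries :match_id, :start_index, and :end_index
  span = Spacy::Span.new(doc, start_index: m[:start_index], end_index: m[:end_index], options: {label: m[:match_id]})
  puts span.text + " / " + span.label   # Barack Obama / US_PRESIDENT
end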
@@ -35,21 +365,18 @@ module Spacy
  alias_method :size, :count
 
  # It is recommended to use {Doc#span} method to create a span. If you need to
- # create one using {Span#initialize}, either of the two method signatures should be used: `Spacy.new(doc, py_span)` and `Spacy.new(doc, start_index, end_index, options)`.
+ # create one using {Span#initialize}, there are two method signatures:
+ # `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
  # @param doc [Doc] the document to which this span belongs
  # @param start_index [Integer] the index of the item starting the span inside a doc
  # @param end_index [Integer] the index of the item ending the span inside a doc
  # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
  def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
  @doc = doc
- @spacy_span_id = "doc_#{doc.object_id}_span_#{start_index}_#{end_index}"
  if py_span
  @py_span = py_span
  else
- options = PyCall::Dict.(options)
- PyCall.exec("#{@spacy_span_id}_opts = #{options}")
- PyCall.exec("#{@spacy_span_id} = Span(#{@doc.spacy_doc_id}, #{start_index}, #{end_index + 1}, **#{@spacy_span_id}_opts)")
- @py_span = PyCall.eval(@spacy_span_id)
+ @py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
  end
  end
 
@@ -63,7 +390,7 @@ module Spacy
  results
  end
 
- # Iterates over the elements in the span yielding a token instance.
+ # Iterates over the elements in the span yielding a token instance each time.
  def each
  PyCall::List.(@py_span).each do |py_token|
  yield Token.new(py_token)
@@ -76,18 +403,24 @@ module Spacy
  chunk_array = []
  py_chunks = PyCall::List.(@py_span.noun_chunks)
  py_chunks.each do |py_span|
- chunk_array << Spacy::Span.new(@doc, py_span: py_span)
+ chunk_array << Span.new(@doc, py_span: py_span)
  end
  chunk_array
  end
 
+ # Returns the root token of the span
+ # @return [Token]
+ def root
+ Token.new(@py_span.root)
+ end
+
  # Returns an array of spans that represent sentences.
  # @return [Array<Span>]
  def sents
  sentence_array = []
  py_sentences = PyCall::List.(@py_span.sents)
  py_sentences.each do |py_span|
- sentence_array << Spacy::Span.new(@doc, py_span: py_span)
+ sentence_array << Span.new(@doc, py_span: py_span)
  end
  sentence_array
  end
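The new `Span#root` wrapper returns a `Spacy::Token`, so chunk-level dependency queries chain in plain Ruby. A hedged sketch, assuming `en_core_web_sm` is installed and reusing the sentence from the dependency table above:

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Autonomous cars shift insurance liability toward manufacturers")

doc.noun_chunks.each do |chunk|
  # root.dep and root.head.text work without touching PyCall directly
  puts "#{chunk.text} -> #{chunk.root.text} (#{chunk.root.dep} of #{chunk.root.head.text})"
end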
@@ -97,8 +430,7 @@ module Spacy
  def ents
  ent_array = []
  PyCall::List.(@py_span.ents).each do |py_span|
- # ent_array << ent
- ent_array << Spacy::Span.new(@doc, py_span: py_span)
+ ent_array << Span.new(@doc, py_span: py_span)
  end
  ent_array
  end
@@ -106,18 +438,18 @@ module Spacy
  # Returns a span that represents the sentence that the given span is part of.
  # @return [Span]
  def sent
- py_span =@py_span.sent
- return Spacy::Span.new(@doc, py_span: py_span)
+ py_span = @py_span.sent
+ return Span.new(@doc, py_span: py_span)
  end
 
- # Returns a span if a range object is given, or a token if an integer representing the position of the doc is given.
+ # Returns a span if a range object is given or a token if an integer representing a position in the doc is given.
  # @param range [Range, Integer] an ordinary Ruby range object such as `0..3`, `1...4`, or `3 .. -1`, or an integer index
  def [](range)
  if range.is_a?(Range)
  py_span = @py_span[range]
- return Spacy::Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
+ return Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
  else
- return Spacy::Token.new(@py_span[range])
+ return Token.new(@py_span[range])
  end
  end
 
@@ -125,31 +457,31 @@ module Spacy
  # @param other [Span] the other span to which a similarity estimation is conducted
  # @return [Float]
  def similarity(other)
- PyCall.eval("#{@spacy_span_id}.similarity(#{other.spacy_span_id})")
+ py_span.similarity(other.py_span)
  end
 
- # Creates a document instance
+ # Creates a document instance from the span
  # @return [Doc]
  def as_doc
- Spacy::Doc.new(@doc.spacy_nlp_id, self.text)
+ Doc.new(@doc.py_nlp, text: self.text)
  end
 
- # Returns Tokens conjugated to the root of the span.
+ # Returns tokens that are coordinated with the root of the span.
  # @return [Array<Token>] an array of tokens
  def conjuncts
  conjunct_array = []
  PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
- conjunct_array << Spacy::Token.new(py_conjunct)
+ conjunct_array << Token.new(py_conjunct)
  end
  conjunct_array
  end
 
- # Returns Tokens that are to the left of the span, whose heads are within the span.
+ # Returns tokens that are to the left of the span, whose heads are within the span.
  # @return [Array<Token>] an array of tokens
  def lefts
  left_array = []
  PyCall::List.(@py_span.lefts).each do |py_left|
- left_array << Spacy::Token.new(py_left)
+ left_array << Token.new(py_left)
  end
  left_array
  end
@@ -159,7 +491,7 @@ module Spacy
  def rights
  right_array = []
  PyCall::List.(@py_span.rights).each do |py_right|
- right_array << Spacy::Token.new(py_right)
+ right_array << Token.new(py_right)
  end
  right_array
  end
@@ -169,11 +501,17 @@ module Spacy
  def subtree
  subtree_array = []
  PyCall::List.(@py_span.subtree).each do |py_subtree|
- subtree_array << Spacy::Token.new(py_subtree)
+ subtree_array << Token.new(py_subtree)
  end
  subtree_array
  end
 
+ # Returns the label of the span
+ # @return [String]
+ def label
+ @py_span.label_
+ end
+
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
  def method_missing(name, *args)
  @py_span.send(name, *args)
@@ -189,59 +527,67 @@ module Spacy
  # @return [String] a string representing the token
  attr_reader :text
 
- # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens. There is no way to generate a token from scratch but relying on a pre-exising Python {Token} object.
+ # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
+ # There is no way to generate a token from scratch except by relying on a pre-existing Python `Token` object.
  # @param py_token [Object] Python `Token` object
  def initialize(py_token)
  @py_token = py_token
  @text = @py_token.text
  end
 
+
+ # Returns the head token
+ # @return [Token]
+ def head
+ Token.new(@py_token.head)
+ end
+
  # Returns the token in question and the tokens that descend from it.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def subtree
  descendant_array = []
  PyCall::List.(@py_token.subtree).each do |descendant|
- descendant_array << descendant
+ descendant_array << Token.new(descendant)
  end
  descendant_array
  end
 
  # Returns the token's ancestors.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def ancestors
  ancestor_array = []
  PyCall::List.(@py_token.ancestors).each do |ancestor|
- ancestor_array << ancestor
+ ancestor_array << Token.new(ancestor)
  end
  ancestor_array
  end
 
  # Returns a sequence of the token's immediate syntactic children.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def children
  child_array = []
  PyCall::List.(@py_token.children).each do |child|
- child_array << child
+ child_array << Token.new(child)
  end
  child_array
  end
 
  # The leftward immediate children of the word in the syntactic dependency parse.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def lefts
  token_array = []
  PyCall::List.(@py_token.lefts).each do |token|
- token_array << token
+ token_array << Token.new(token)
  end
  token_array
  end
 
  # The rightward immediate children of the word in the syntactic dependency parse.
- # @return [Array<Object>] an (Ruby) array of Python `Token` objects
+ # @return [Array<Token>] an array of tokens
  def rights
  token_array = []
  PyCall::List.(@py_token.rights).each do |token|
- token_array << token
+ token_array << Token.new(token)
  end
  token_array
  end
@@ -252,314 +598,161 @@ module Spacy
  @text
  end
 
- # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
- def method_missing(name, *args)
- @py_token.send(name, *args)
- end
- end
-
- # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
- class Doc
-
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
- attr_reader :spacy_nlp_id
-
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
- attr_reader :spacy_doc_id
-
- # @return [Object] a Python `Doc` instance accessible via `PyCall`
- attr_reader :py_doc
-
- # @return [String] a text string of the document
- attr_reader :text
-
- include Enumerable
-
- alias_method :length, :count
- alias_method :len, :count
- alias_method :size, :count
-
- # Creates a new instance of {Doc}.
- # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
- # @param text [String] The text string to be analyzed
- def initialize(nlp_id, text)
- @text = text
- @spacy_nlp_id = nlp_id
- @spacy_doc_id = "doc_#{text.object_id}"
- quoted = text.gsub('"', '\"')
- PyCall.exec(%Q[text_#{text.object_id} = """#{quoted}"""])
- PyCall.exec("#{@spacy_doc_id} = #{nlp_id}(text_#{text.object_id})")
- @py_doc = PyCall.eval(@spacy_doc_id)
- end
-
-
- # Retokenizes the text merging a span into a single token.
- # @param start_index [Integer] The start position of the span to be retokenized in the document
- # @param end_index [Integer] The end position of the span to be retokenized in the document
- # @param attributes [Hash] Attributes to set on the merged token
- def retokenize(start_index, end_index, attributes = {})
- py_attrs = PyCall::Dict.(attributes)
- PyCall.exec(<<PY)
- with #{@spacy_doc_id}.retokenize() as retokenizer:
- retokenizer.merge(#{@spacy_doc_id}[#{start_index} : #{end_index + 1}], attrs=#{py_attrs})
- PY
- @py_doc = PyCall.eval(@spacy_doc_id)
- end
-
- # Retokenizes the text splitting the specified token.
- # @param pos_in_doc [Integer] The position of the span to be retokenized in the document
- # @param split_array [Array<String>] text strings of the split results
- # @param ancestor_pos [Integer] The position of the immediate ancestor element of the split elements in the document
- # @param attributes [Hash] The attributes of the split elements
- def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
- py_attrs = PyCall::Dict.(attributes)
- py_split_array = PyCall::List.(split_array)
- PyCall.exec(<<PY)
- with #{@spacy_doc_id}.retokenize() as retokenizer:
- heads = [(#{@spacy_doc_id}[#{pos_in_doc}], #{head_pos_in_split}), #{@spacy_doc_id}[#{ancestor_pos}]]
- attrs = #{py_attrs}
- split_array = #{py_split_array}
- retokenizer.split(#{@spacy_doc_id}[#{pos_in_doc}], split_array, heads=heads, attrs=attrs)
- PY
- @py_doc = PyCall.eval(@spacy_doc_id)
+ # Returns a hash or string of morphological information
+ # @param hash [Boolean] if true, a hash will be returned instead of a string
+ # @return [Hash, String]
+ def morphology(hash = true)
+ if @py_token.has_morph
+ morph_analysis = @py_token.morph
+ if hash
+ return morph_analysis.to_dict
+ else
+ return morph_analysis.to_s
+ end
+ else
+ if hash
+ results = {}
+ else
+ return ""
+ end
+ end
  end
 
- # String representation of the token.
+ # Returns the lemma by calling `lemma_` of the `@py_token` object
  # @return [String]
- def to_s
- @text
+ def lemma
+ @py_token.lemma_
  end
 
- # Returns an array of tokens contained in the doc.
- # @return [Array<Token>]
- def tokens
- results = []
- PyCall::List.(@py_doc).each do |py_token|
- results << Token.new(py_token)
- end
- results
+ # Returns the lowercase form by calling `lower_` of the `@py_token` object
+ # @return [String]
+ def lower
+ @py_token.lower_
  end
 
- # Iterates over the elements in the doc yielding a token instance.
- def each
- PyCall::List.(@py_doc).each do |py_token|
- yield Token.new(py_token)
- end
+ # Returns the shape (e.g. "Xxxxx") by calling `shape_` of the `@py_token` object
+ # @return [String]
+ def shape
+ @py_token.shape_
  end
 
- # Returns a span of the specified range within the doc.
- # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
- # @param range_or_start [Range, Integer] A range object, or, alternatively, an integer that represents the start position of the span
- # @param optional_size [Integer] An integer representing the size of the span
- # @return [Span]
- def span(range_or_start, optional_size = nil)
- if optional_size
- start_index = range_or_start
- temp = tokens[start_index ... start_index + optional_size]
- else
- start_index = range_or_start.first
- range = range_or_start
- temp = tokens[range]
- end
-
- end_index = start_index + temp.size - 1
-
- Span.new(self, start_index: start_index, end_index: end_index)
+ # Returns the coarse-grained pos by calling `pos_` of the `@py_token` object
+ # @return [String]
+ def pos
+ @py_token.pos_
  end
 
- # Returns an array of spans representing noun chunks.
- # @return [Array<Span>]
- def noun_chunks
- chunk_array = []
- py_chunks = PyCall::List.(@py_doc.noun_chunks)
- py_chunks.each do |py_chunk|
- chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
- end
- chunk_array
+ # Returns the fine-grained pos by calling `tag_` of the `@py_token` object
+ # @return [String]
+ def tag
+ @py_token.tag_
  end
 
- # Returns an array of spans representing sentences.
- # @return [Array<Span>]
- def sents
- sentence_array = []
- py_sentences = PyCall::List.(@py_doc.sents)
- py_sentences.each do |py_sent|
- sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
- end
- sentence_array
+ # Returns the dependency relation by calling `dep_` of the `@py_token` object
+ # @return [String]
+ def dep
+ @py_token.dep_
  end
-
- # Returns an array of spans representing named entities.
- # @return [Array<Span>]
- def ents
- # so that ents canbe "each"-ed in Ruby
- ent_array = []
- PyCall::List.(@py_doc.ents).each do |ent|
- ent_array << ent
- end
- ent_array
+
+ # Returns the language by calling `lang_` of the `@py_token` object
+ # @return [String]
+ def lang
+ @py_token.lang_
  end
 
- # Returns a span if given a range object; returns a token if given an integer representing a position in the doc.
- # @param range [Range] an ordinary Ruby's range object such as `0..3`, `1...4`, or `3 .. -1`
- def [](range)
- if range.is_a?(Range)
- py_span = @py_doc[range]
- return Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
- else
- return Token.new(@py_doc[range])
- end
+ # Returns the trailing space character if present by calling `whitespace_` of the `@py_token` object
+ # @return [String]
+ def whitespace
+ @py_token.whitespace_
  end
 
- # Returns a semantic similarity estimate.
- # @param other [Doc] the other doc to which a similarity estimation is made
- # @return [Float]
- def similarity(other)
- PyCall.eval("#{@spacy_doc_id}.similarity(#{other.spacy_doc_id})")
+ # Returns the named entity type by calling `ent_type_` of the `@py_token` object
+ # @return [String]
+ def ent_type
+ @py_token.ent_type_
  end
 
- # Visualize the document in one of two styles: dep (dependencies) or ent (named entities).
- # @param style [String] Either `dep` or `ent`
- # @param compact [Boolean] Only relevant to the `dep' style
- # @return [String] in the case of `dep`, the output text is an SVG while in the `ent` style, the output text is an HTML.
- def displacy(style: "dep", compact: false)
- PyCall.eval("displacy.render(#{@spacy_doc_id}, style='#{style}', options={'compact': #{compact.to_s.capitalize}}, jupyter=False)")
+ # Returns a lexeme object
+ # @return [Lexeme]
+ def lexeme
+ Lexeme.new(@py_token.lex)
  end
 
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
  def method_missing(name, *args)
- @py_doc.send(name, *args)
+ @py_token.send(name, *args)
  end
  end
 
- # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
- class Matcher
+ # See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
+ class Lexeme
 
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
- attr_reader :spacy_matcher_id
-
- # @return [Object] a Python `Matcher` instance accessible via `PyCall`
- attr_reader :py_matcher
-
- # Creates a {Matcher} instance
- # @param nlp_id [String] The id string of the `nlp`, an instance of {Language} class
- def initialize(nlp_id)
- @spacy_matcher_id = "doc_#{nlp_id}_matcher"
- PyCall.exec("#{@spacy_matcher_id} = Matcher(#{nlp_id}.vocab)")
- @py_matcher = PyCall.eval(@spacy_matcher_id)
- end
+ # @return [Object] a Python `Lexeme` instance accessible via `PyCall`
+ attr_reader :py_lexeme
 
- # Adds a label string and a text pattern.
- # @param text [String] a label string given to the pattern
- # @param pattern [Array<Array<Hash>>] alternative sequences of text patterns
- def add(text, pattern)
- @py_matcher.add(text, pattern)
- end
+ # @return [String] a string representing the lexeme
+ attr_reader :text
 
- # Execute the match.
- # @param doc [Doc] An {Doc} instance
- # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] The id of the matched pattern, the starting position, and the end position
- def match(doc)
- str_results = PyCall.eval("#{@spacy_matcher_id}(#{doc.spacy_doc_id})").to_s
- s = StringScanner.new(str_results[1..-2])
- results = []
- while s.scan_until(/(\d+), (\d+), (\d+)/)
- next unless s.matched
- triple = s.matched.split(", ")
- match_id = triple[0].to_i
- start_index = triple[1].to_i
- end_index = triple[2].to_i - 1
- results << {match_id: match_id, start_index: start_index, end_index: end_index}
- end
- results
+ # It is recommended to use {Language#vocab} or {Token#lexeme} methods to create lexemes.
+ # There is no way to generate a lexeme from scratch except by relying on a pre-existing Python `Lexeme` object.
+ # @param py_lexeme [Object] Python `Lexeme` object
+ def initialize(py_lexeme)
+ @py_lexeme = py_lexeme
+ @text = @py_lexeme.text
  end
- end
-
- # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
- class Language
-
- # @return [String] an identifier string that can be used when referring to the Python object inside `PyCall::exec` or `PyCall::eval`
- attr_reader :spacy_nlp_id
-
- # @return [Object] a Python `Language` instance accessible via `PyCall`
- attr_reader :py_nlp
 
- # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
- # @param model [String] A language model installed in the system
- def initialize(model = "en_core_web_sm")
- @spacy_nlp_id = "nlp_#{model.object_id}"
- PyCall.exec("import spacy; from spacy.tokens import Span; from spacy.matcher import Matcher; from spacy import displacy")
- PyCall.exec("#{@spacy_nlp_id} = spacy.load('#{model}')")
- @py_nlp = PyCall.eval(@spacy_nlp_id)
+ # String representation of the lexeme.
+ # @return [String]
+ def to_s
+ @text
  end
 
- # Reads and analyze the given text.
- # @param text [String] A text to be read and analyzed
- def read(text)
- Doc.new(@spacy_nlp_id, text)
+ # Returns the lowercase form by calling `lower_` of the `@py_lexeme` object
+ # @return [String]
+ def lower
+ @py_lexeme.lower_
  end
 
- # Generates a matcher for the current language model.
- # @return [Matcher]
- def matcher
- Matcher.new(@spacy_nlp_id)
+ # Returns the shape (e.g. "Xxxxx") by calling `shape_` of the `@py_lexeme` object
+ # @return [String]
+ def shape
+ @py_lexeme.shape_
  end
 
- # A utility method to lookup a vocabulary item of the given id.
- # @param id [Integer] A vocabulary id
- # @return [Object] A Python `Lexeme` object
- def vocab_string_lookup(id)
- PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
+ # Returns the language by calling `lang_` of the `@py_lexeme` object
+ # @return [String]
+ def lang
+ @py_lexeme.lang_
  end
 
- # A utility method to list pipeline components.
- # @return [Array<String>] An array of text strings representing pipeline components
- def pipe_names
- pipe_array = []
- PyCall::List.(@py_nlp.pipe_names).each do |pipe|
- pipe_array << pipe
- end
- pipe_array
+ # Returns the length-N substring from the start of the word by calling `prefix_` of the `@py_lexeme` object
+ # @return [String]
+ def prefix
+ @py_lexeme.prefix_
  end
-
- # A utility method to get the tokenizer Python object.
- # @return [Object] Python `Tokenizer` object
- def tokenizer
- return PyCall.eval("#{@spacy_nlp_id}.tokenizer")
+
+ # Returns the length-N substring from the end of the word by calling `suffix_` of the `@py_lexeme` object
+ # @return [String]
+ def suffix
+ @py_lexeme.suffix_
  end
 
- # A utility method to get a Python `Lexeme` object.
- # @param text [String] A text string representing a lexeme
- # @return [Object] Python `Tokenizer` object
- def get_lexeme(text)
- text = text.gsub("'", "\'")
- py_lexeme = PyCall.eval("#{@spacy_nlp_id}.vocab['#{text}']")
- return py_lexeme
+ # Returns the lexeme's norm, i.e. a normalized form of the lexeme, by calling `norm_` of the `@py_lexeme` object
+ # @return [String]
+ def norm
+ @py_lexeme.norm_
  end
 
- # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
- # @param vector [Object] A vector representation of a word (whether existing or non-existing)
- # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
- def most_similar(vector, n)
- vec_array = Numpy.asarray([vector])
- py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: n)
- key_texts = PyCall.eval("[[str(n), #{@spacy_nlp_id}.vocab[n].text] for n in #{py_result[0][0].tolist()}]")
- keys = key_texts.map{|kt| kt[0]}
- texts = key_texts.map{|kt| kt[1]}
- best_rows = PyCall::List.(py_result[1])[0]
- scores = PyCall::List.(py_result[2])[0]
-
- results = []
- n.times do |i|
- results << {key: keys[i].to_i, text: texts[i], best_row: best_rows[i], score: scores[i]}
- end
-
- results
+ # Returns a semantic similarity estimate.
+ # @param other [Lexeme] the other lexeme to which a similarity estimation is made
+ # @return [Float]
+ def similarity(other)
+ @py_lexeme.similarity(other.py_lexeme)
  end
 
- # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism....
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
  def method_missing(name, *args)
- @py_nlp.send(name, *args)
+ @py_lexeme.send(name, *args)
  end
  end
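With the Token and Lexeme wrappers in place, every underscore attribute used in the examples above is reachable as a plain Ruby method. A closing hedged sketch, assuming en_core_web_sm is installed (the outputs shown as comments are illustrative):

require "ruby-spacy"

nlp = Spacy::Language.new("en_core_web_sm")
doc = nlp.read("Where are you?")

token = doc[2]                # a Spacy::Token
puts token.pos                # PRON
puts token.morphology(false)  # Case=Nom|Person=2|PronType=Prs

lex = token.lexeme            # a Spacy::Lexeme via Token#lexeme
puts lex.lower                # you
puts lex.shape                # xxx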