ruby-spacy 0.1.4.1 → 0.1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +48 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +7 -7
- data/Gemfile.lock +88 -9
- data/README.md +7 -10
- data/examples/get_started/lexeme.rb +3 -1
- data/examples/get_started/linguistic_annotations.rb +3 -1
- data/examples/get_started/morphology.rb +3 -1
- data/examples/get_started/most_similar.rb +3 -1
- data/examples/get_started/named_entities.rb +4 -2
- data/examples/get_started/pos_tags_and_dependencies.rb +3 -1
- data/examples/get_started/similarity.rb +4 -2
- data/examples/get_started/tokenization.rb +3 -1
- data/examples/get_started/visualizing_dependencies.rb +2 -2
- data/examples/get_started/visualizing_dependencies_compact.rb +2 -0
- data/examples/get_started/visualizing_named_entities.rb +4 -2
- data/examples/get_started/vocab.rb +3 -1
- data/examples/get_started/word_vectors.rb +3 -1
- data/examples/japanese/ancestors.rb +6 -4
- data/examples/japanese/entity_annotations_and_labels.rb +4 -2
- data/examples/japanese/information_extraction.rb +6 -6
- data/examples/japanese/lemmatization.rb +3 -1
- data/examples/japanese/most_similar.rb +3 -1
- data/examples/japanese/named_entity_recognition.rb +3 -2
- data/examples/japanese/navigating_parse_tree.rb +19 -17
- data/examples/japanese/noun_chunks.rb +2 -0
- data/examples/japanese/pos_tagging.rb +3 -1
- data/examples/japanese/sentence_segmentation.rb +3 -2
- data/examples/japanese/similarity.rb +2 -0
- data/examples/japanese/tokenization.rb +2 -0
- data/examples/japanese/visualizing_dependencies.rb +3 -1
- data/examples/japanese/visualizing_named_entities.rb +4 -2
- data/examples/linguistic_features/ancestors.rb +7 -5
- data/examples/linguistic_features/entity_annotations_and_labels.rb +4 -2
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +3 -5
- data/examples/linguistic_features/information_extraction.rb +9 -9
- data/examples/linguistic_features/iterating_children.rb +6 -8
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +7 -5
- data/examples/linguistic_features/lemmatization.rb +3 -1
- data/examples/linguistic_features/named_entity_recognition.rb +3 -1
- data/examples/linguistic_features/navigating_parse_tree.rb +3 -1
- data/examples/linguistic_features/noun_chunks.rb +3 -1
- data/examples/linguistic_features/pos_tagging.rb +3 -1
- data/examples/linguistic_features/retokenize_1.rb +2 -0
- data/examples/linguistic_features/retokenize_2.rb +4 -2
- data/examples/linguistic_features/rule_based_morphology.rb +4 -2
- data/examples/linguistic_features/sentence_segmentation.rb +3 -2
- data/examples/linguistic_features/similarity.rb +4 -2
- data/examples/linguistic_features/similarity_between_lexemes.rb +2 -0
- data/examples/linguistic_features/similarity_between_spans.rb +7 -5
- data/examples/linguistic_features/tokenization.rb +3 -2
- data/examples/rule_based_matching/creating_spans_from_matches.rb +5 -3
- data/examples/rule_based_matching/matcher.rb +4 -2
- data/lib/ruby-spacy/version.rb +1 -1
- data/lib/ruby-spacy.rb +147 -142
- data/ruby-spacy.gemspec +15 -17
- metadata +68 -10
data/lib/ruby-spacy.rb
CHANGED
@@ -1,17 +1,15 @@
 # frozen_string_literal: true
 
 require_relative "ruby-spacy/version"
-require
-require
-require
-require 'pycall/import'
-include PyCall::Import
+require "strscan"
+require "numpy"
+require "pycall"
 
 # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
 module Spacy
+  MAX_RETRIAL = 20
 
-
-  spacy = PyCall.import_module('spacy')
+  spacy = PyCall.import_module("spacy")
 
   # Python `Language` class
   PyLanguage = spacy.language.Language
@@ -24,23 +22,22 @@ module Spacy
 
   # Python `Token` class object
   PyToken = spacy.tokens.Token
-
+
   # Python `Matcher` class object
   PyMatcher = spacy.matcher.Matcher
 
   # Python `displacy` object
   PyDisplacy = spacy.displacy
 
-  # A utility module method to convert Python's generator object to a Ruby array,
+  # A utility module method to convert Python's generator object to a Ruby array,
   # mainly used on the items inside the array returned from dependency-related methods
   # such as {Span#rights}, {Span#lefts} and {Span#subtree}.
   def self.generator_to_array(py_generator)
-    PyCall::List.(py_generator)
+    PyCall::List.call(py_generator)
   end
 
   # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
   class Doc
-
     # @return [Object] a Python `Language` instance accessible via `PyCall`
     attr_reader :py_nlp
 
@@ -52,23 +49,19 @@ module Spacy
 
     include Enumerable
 
-
-
-
+    alias length count
+    alias len count
+    alias size count
 
-    # It is recommended to use {Language#read} method to create a doc. If you need to
-    # create one using {Doc#initialize}, there are two method signatures:
+    # It is recommended to use {Language#read} method to create a doc. If you need to
+    # create one using {Doc#initialize}, there are two method signatures:
     # `Spacy::Doc.new(nlp_id, py_doc: Object)` and `Spacy::Doc.new(nlp_id, text: String)`.
     # @param nlp [Language] an instance of {Language} class
     # @param py_doc [Object] an instance of Python `Doc` class
     # @param text [String] the text string to be analyzed
     def initialize(nlp, py_doc: nil, text: nil)
       @py_nlp = nlp
-
-      @py_doc = py_doc
-      else
-        @py_doc = nlp.(text)
-      end
+      @py_doc = py_doc || @py_doc = nlp.call(text)
       @text = @py_doc.text
     end
 
@@ -77,25 +70,25 @@ module Spacy
     # @param end_index [Integer] the end position of the span to be retokenized in the document
     # @param attributes [Hash] attributes to set on the merged token
     def retokenize(start_index, end_index, attributes = {})
-      PyCall.with(@py_doc.retokenize
-        retokenizer.merge(@py_doc[start_index
+      PyCall.with(@py_doc.retokenize) do |retokenizer|
+        retokenizer.merge(@py_doc[start_index..end_index], attrs: attributes)
       end
     end
 
     # Retokenizes the text splitting the specified token.
     # @param pos_in_doc [Integer] the position of the span to be retokenized in the document
-    # @param split_array [Array<String>] text strings of the split results
+    # @param split_array [Array<String>] text strings of the split results
     # @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
     # @param attributes [Hash] the attributes of the split elements
     def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
-      PyCall.with(@py_doc.retokenize
+      PyCall.with(@py_doc.retokenize) do |retokenizer|
        heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
        retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
      end
    end
 
    # String representation of the document.
-    # @return [String]
+    # @return [String]
    def to_s
      @text
    end
@@ -104,7 +97,7 @@ module Spacy
    # @return [Array<Token>]
    def tokens
      results = []
-      PyCall::List.(@py_doc).each do |py_token|
+      PyCall::List.call(@py_doc).each do |py_token|
        results << Token.new(py_token)
      end
      results
@@ -112,12 +105,12 @@ module Spacy
 
    # Iterates over the elements in the doc yielding a token instance each time.
    def each
-      PyCall::List.(@py_doc).each do |py_token|
+      PyCall::List.call(@py_doc).each do |py_token|
        yield Token.new(py_token)
      end
    end
 
-    # Returns a span of the specified range within the doc.
+    # Returns a span of the specified range within the doc.
    # The method should be used either of the two ways: `Doc#span(range)` or `Doc#span{start_pos, size_of_span}`.
    # @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
    # @param optional_size [Integer] an integer representing the size of the span
@@ -125,7 +118,7 @@ module Spacy
    def span(range_or_start, optional_size = nil)
      if optional_size
        start_index = range_or_start
-        temp = tokens[start_index
+        temp = tokens[start_index...start_index + optional_size]
      else
        start_index = range_or_start.first
        range = range_or_start
@@ -141,7 +134,7 @@ module Spacy
    # @return [Array<Span>]
    def noun_chunks
      chunk_array = []
-      py_chunks = PyCall::List.(@py_doc.noun_chunks)
+      py_chunks = PyCall::List.call(@py_doc.noun_chunks)
      py_chunks.each do |py_chunk|
        chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
      end
@@ -152,7 +145,7 @@ module Spacy
    # @return [Array<Span>]
    def sents
      sentence_array = []
-      py_sentences = PyCall::List.(@py_doc.sents)
+      py_sentences = PyCall::List.call(@py_doc.sents)
      py_sentences.each do |py_sent|
        sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
      end
@@ -164,9 +157,9 @@ module Spacy
    def ents
      # so that ents canbe "each"-ed in Ruby
      ent_array = []
-      PyCall::List.(@py_doc.ents).each do |ent|
+      PyCall::List.call(@py_doc.ents).each do |ent|
        ent.define_singleton_method :label do
-
+          label_
        end
        ent_array << ent
      end
@@ -178,15 +171,15 @@ module Spacy
    def [](range)
      if range.is_a?(Range)
        py_span = @py_doc[range]
-
+        Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
      else
-
+        Token.new(@py_doc[range])
      end
    end
 
    # Returns a semantic similarity estimate.
    # @param other [Doc] the other doc to which a similarity estimation is made
-    # @return [Float]
+    # @return [Float]
    def similarity(other)
      py_doc.similarity(other.py_doc)
    end
@@ -196,18 +189,21 @@ module Spacy
    # @param compact [Boolean] only relevant to the `dep' style
    # @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be an HTML.
    def displacy(style: "dep", compact: false)
-      PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
+      PyDisplacy.render(py_doc, style: style, options: { compact: compact }, jupyter: false)
    end
 
    # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
    def method_missing(name, *args)
      @py_doc.send(name, *args)
    end
+
+    def respond_to_missing?(sym)
+      sym ? true : super
+    end
  end
 
  # See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
  class Language
-
    # @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
    attr_reader :spacy_nlp_id
 
@@ -216,10 +212,16 @@ module Spacy
 
    # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
    # @param model [String] A language model installed in the system
-    def initialize(model = "en_core_web_sm")
+    def initialize(model = "en_core_web_sm", max_retrial = MAX_RETRIAL, retrial = 0)
      @spacy_nlp_id = "nlp_#{model.object_id}"
      PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
      @py_nlp = PyCall.eval(@spacy_nlp_id)
+    rescue StandardError
+      retrial += 1
+      raise "Error: Pycall failed to load Spacy" unless retrial <= max_retrial
+
+      sleep 0.5
+      initialize(model, max_retrial, retrial)
    end
 
    # Reads and analyze the given text.
@@ -245,7 +247,7 @@ module Spacy
    # @return [Array<String>] An array of text strings representing pipeline components
    def pipe_names
      pipe_array = []
-      PyCall::List.(@py_nlp.pipe_names).each do |pipe|
+      PyCall::List.call(@py_nlp.pipe_names).each do |pipe|
        pipe_array << pipe
      end
      pipe_array
@@ -268,24 +270,23 @@ module Spacy
    # Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
    # @param vector [Object] A vector representation of a word (whether existing or non-existing)
    # @return [Array<Hash{:key => Integer, :text => String, :best_rows => Array<Float>, :score => Float}>] An array of hash objects each contains the `key`, `text`, `best_row` and similarity `score` of a lexeme
-    def most_similar(vector,
+    def most_similar(vector, num)
      vec_array = Numpy.asarray([vector])
-      py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n:
-      key_texts = PyCall.eval("[[str(
-      keys = key_texts.map{|kt| kt[0]}
-      texts = key_texts.map{|kt| kt[1]}
-      best_rows = PyCall::List.(py_result[1])[0]
-      scores = PyCall::List.(py_result[2])[0]
+      py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: num)
+      key_texts = PyCall.eval("[[str(num), #{@spacy_nlp_id}.vocab[num].text] for num in #{py_result[0][0].tolist}]")
+      keys = key_texts.map { |kt| kt[0] }
+      texts = key_texts.map { |kt| kt[1] }
+      best_rows = PyCall::List.call(py_result[1])[0]
+      scores = PyCall::List.call(py_result[2])[0]
 
      results = []
-
-        result = {key: keys[i].to_i,
-
-
-
-                 }
+      num.times do |i|
+        result = { key: keys[i].to_i,
+                   text: texts[i],
+                   best_row: best_rows[i],
+                   score: scores[i] }
        result.each_key do |key|
-          result.define_singleton_method(key){ result[key] }
+          result.define_singleton_method(key) { result[key] }
        end
        results << result
      end
@@ -297,9 +298,9 @@ module Spacy
    # @param disable [Array<String>]
    # @param batch_size [Integer]
    # @return [Array<Doc>]
-    def pipe(texts, disable: [], batch_size: 50)
+    def pipe(texts, disable: [], batch_size: 50)
      docs = []
-      PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
+      PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
        docs << Doc.new(@py_nlp, py_doc: py_doc)
      end
      docs
@@ -309,18 +310,21 @@ module Spacy
    def method_missing(name, *args)
      @py_nlp.send(name, *args)
    end
+
+    def respond_to_missing?(sym)
+      sym ? true : super
+    end
  end
 
  # See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
  class Matcher
-
    # @return [Object] a Python `Matcher` instance accessible via `PyCall`
    attr_reader :py_matcher
 
    # Creates a {Matcher} instance
    # @param nlp [Language] an instance of {Language} class
    def initialize(nlp)
-      @py_matcher = PyMatcher.(nlp.vocab)
+      @py_matcher = PyMatcher.call(nlp.vocab)
    end
 
    # Adds a label string and a text pattern.
@@ -334,16 +338,17 @@ module Spacy
    # @param doc [Doc] an {Doc} instance
    # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
    def match(doc)
-      str_results = @py_matcher.(doc.py_doc).to_s
+      str_results = @py_matcher.call(doc.py_doc).to_s
      s = StringScanner.new(str_results[1..-2])
      results = []
      while s.scan_until(/(\d+), (\d+), (\d+)/)
        next unless s.matched
+
        triple = s.matched.split(", ")
        match_id = triple[0].to_i
        start_index = triple[1].to_i
        end_index = triple[2].to_i - 1
-        results << {match_id: match_id, start_index: start_index, end_index: end_index}
+        results << { match_id: match_id, start_index: start_index, end_index: end_index }
      end
      results
    end
@@ -351,7 +356,6 @@ module Spacy
 
  # See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
  class Span
-
    # @return [Object] a Python `Span` instance accessible via `PyCall`
    attr_reader :py_span
 
@@ -360,11 +364,11 @@ module Spacy
 
    include Enumerable
 
-
-
-
+    alias length count
+    alias len count
+    alias size count
 
-    # It is recommended to use {Doc#span} method to create a span. If you need to
+    # It is recommended to use {Doc#span} method to create a span. If you need to
    # create one using {Span#initialize}, there are two method signatures:
    # `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
    # @param doc [Doc] the document to which this span belongs to
@@ -373,18 +377,14 @@ module Spacy
    # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
    def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
      @doc = doc
-
-      @py_span = py_span
-      else
-        @py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
-      end
+      @py_span = py_span || @py_span = PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
    end
 
    # Returns an array of tokens contained in the span.
    # @return [Array<Token>]
    def tokens
      results = []
-      PyCall::List.(@py_span).each do |py_token|
+      PyCall::List.call(@py_span).each do |py_token|
        results << Token.new(py_token)
      end
      results
@@ -392,7 +392,7 @@ module Spacy
 
    # Iterates over the elements in the span yielding a token instance each time.
    def each
-      PyCall::List.(@py_span).each do |py_token|
+      PyCall::List.call(@py_span).each do |py_token|
        yield Token.new(py_token)
      end
    end
@@ -401,7 +401,7 @@ module Spacy
    # @return [Array<Span>]
    def noun_chunks
      chunk_array = []
-      py_chunks = PyCall::List.(@py_span.noun_chunks)
+      py_chunks = PyCall::List.call(@py_span.noun_chunks)
      py_chunks.each do |py_span|
        chunk_array << Span.new(@doc, py_span: py_span)
      end
@@ -410,7 +410,7 @@ module Spacy
 
    # Returns the head token
    # @return [Token]
-    def root
+    def root
      Token.new(@py_span.root)
    end
 
@@ -418,7 +418,7 @@ module Spacy
    # @return [Array<Span>]
    def sents
      sentence_array = []
-      py_sentences = PyCall::List.(@py_span.sents)
+      py_sentences = PyCall::List.call(@py_span.sents)
      py_sentences.each do |py_span|
        sentence_array << Span.new(@doc, py_span: py_span)
      end
@@ -429,7 +429,7 @@ module Spacy
    # @return [Array<Span>]
    def ents
      ent_array = []
-      PyCall::List.(@py_span.ents).each do |py_span|
+      PyCall::List.call(@py_span.ents).each do |py_span|
        ent_array << Span.new(@doc, py_span: py_span)
      end
      ent_array
@@ -438,8 +438,8 @@ module Spacy
    # Returns a span that represents the sentence that the given span is part of.
    # @return [Span]
    def sent
-      py_span = @py_span.sent
-
+      py_span = @py_span.sent
+      Span.new(@doc, py_span: py_span)
    end
 
    # Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
@@ -447,67 +447,67 @@ module Spacy
    def [](range)
      if range.is_a?(Range)
        py_span = @py_span[range]
-
+        Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
      else
-
+        Token.new(@py_span[range])
      end
    end
 
    # Returns a semantic similarity estimate.
    # @param other [Span] the other span to which a similarity estimation is conducted
-    # @return [Float]
+    # @return [Float]
    def similarity(other)
      py_span.similarity(other.py_span)
    end
 
    # Creates a document instance from the span
-    # @return [Doc]
+    # @return [Doc]
    def as_doc
-      Doc.new(@doc.py_nlp, text:
+      Doc.new(@doc.py_nlp, text: text)
    end
 
    # Returns tokens conjugated to the root of the span.
    # @return [Array<Token>] an array of tokens
    def conjuncts
      conjunct_array = []
-      PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
+      PyCall::List.call(@py_span.conjuncts).each do |py_conjunct|
        conjunct_array << Token.new(py_conjunct)
      end
      conjunct_array
    end
 
    # Returns tokens that are to the left of the span, whose heads are within the span.
-    # @return [Array<Token>] an array of tokens
+    # @return [Array<Token>] an array of tokens
    def lefts
      left_array = []
-      PyCall::List.(@py_span.lefts).each do |py_left|
+      PyCall::List.call(@py_span.lefts).each do |py_left|
        left_array << Token.new(py_left)
      end
      left_array
    end
 
    # Returns Tokens that are to the right of the span, whose heads are within the span.
-    # @return [Array<Token>] an array of Tokens
+    # @return [Array<Token>] an array of Tokens
    def rights
      right_array = []
-      PyCall::List.(@py_span.rights).each do |py_right|
+      PyCall::List.call(@py_span.rights).each do |py_right|
        right_array << Token.new(py_right)
      end
      right_array
    end
 
    # Returns Tokens that are within the span and tokens that descend from them.
-    # @return [Array<Token>] an array of tokens
+    # @return [Array<Token>] an array of tokens
    def subtree
      subtree_array = []
-      PyCall::List.(@py_span.subtree).each do |py_subtree|
+      PyCall::List.call(@py_span.subtree).each do |py_subtree|
        subtree_array << Token.new(py_subtree)
      end
      subtree_array
    end
 
    # Returns the label
-    # @return [String]
+    # @return [String]
    def label
      @py_span.label_
    end
@@ -516,11 +516,14 @@ module Spacy
    def method_missing(name, *args)
      @py_span.send(name, *args)
    end
+
+    def respond_to_missing?(sym)
+      sym ? true : super
+    end
  end
 
  # See also spaCy Python API document for [`Token`](https://spacy.io/api/token).
  class Token
-
    # @return [Object] a Python `Token` instance accessible via `PyCall`
    attr_reader :py_token
 
@@ -528,17 +531,16 @@ module Spacy
    attr_reader :text
 
    # It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
-    # There is no way to generate a token from scratch but relying on a pre-exising Python
+    # There is no way to generate a token from scratch but relying on a pre-exising Python `Token` object.
    # @param py_token [Object] Python `Token` object
    def initialize(py_token)
      @py_token = py_token
      @text = @py_token.text
    end
 
-
    # Returns the head token
    # @return [Token]
-    def head
+    def head
      Token.new(@py_token.head)
    end
 
@@ -546,7 +548,7 @@ module Spacy
    # @return [Array<Token>] an array of tokens
    def subtree
      descendant_array = []
-      PyCall::List.(@py_token.subtree).each do |descendant|
+      PyCall::List.call(@py_token.subtree).each do |descendant|
        descendant_array << Token.new(descendant)
      end
      descendant_array
@@ -556,7 +558,7 @@ module Spacy
    # @return [Array<Token>] an array of tokens
    def ancestors
      ancestor_array = []
-      PyCall::List.(@py_token.ancestors).each do |ancestor|
+      PyCall::List.call(@py_token.ancestors).each do |ancestor|
        ancestor_array << Token.new(ancestor)
      end
      ancestor_array
@@ -566,7 +568,7 @@ module Spacy
    # @return [Array<Token>] an array of tokens
    def children
      child_array = []
-      PyCall::List.(@py_token.children).each do |child|
+      PyCall::List.call(@py_token.children).each do |child|
        child_array << Token.new(child)
      end
      child_array
@@ -576,7 +578,7 @@ module Spacy
    # @return [Array<Token>] an array of tokens
    def lefts
      token_array = []
-      PyCall::List.(@py_token.lefts).each do |token|
+      PyCall::List.call(@py_token.lefts).each do |token|
        token_array << Token.new(token)
      end
      token_array
@@ -586,89 +588,87 @@ module Spacy
    # @return [Array<Token>] an array of tokens
    def rights
      token_array = []
-      PyCall::List.(@py_token.rights).each do |token|
+      PyCall::List.call(@py_token.rights).each do |token|
        token_array << Token.new(token)
      end
      token_array
    end
 
    # String representation of the token.
-    # @return [String]
+    # @return [String]
    def to_s
      @text
    end
 
    # Returns a hash or string of morphological information
    # @param hash [Boolean] if true, a hash will be returned instead of a string
-    # @return [Hash, String]
-    def morphology(hash
+    # @return [Hash, String]
+    def morphology(hash: true)
      if @py_token.has_morph
        morph_analysis = @py_token.morph
-        if hash
-          return morph_analysis.to_dict
-        else
-          return morph_analysis.to_s
-        end
-      else
        if hash
-
+          morph_analysis.to_dict
        else
-
+          morph_analysis.to_s
        end
+      elsif hash
+        {}
+      else
+        ""
      end
    end
 
    # Returns the lemma by calling `lemma_' of `@py_token` object
-    # @return [String]
+    # @return [String]
    def lemma
      @py_token.lemma_
    end
 
    # Returns the lowercase form by calling `lower_' of `@py_token` object
-    # @return [String]
+    # @return [String]
    def lower
      @py_token.lower_
    end
 
    # Returns the shape (e.g. "Xxxxx") by calling `shape_' of `@py_token` object
-    # @return [String]
+    # @return [String]
    def shape
      @py_token.shape_
    end
 
    # Returns the pos by calling `pos_' of `@py_token` object
-    # @return [String]
+    # @return [String]
    def pos
      @py_token.pos_
    end
 
    # Returns the fine-grained pos by calling `tag_' of `@py_token` object
-    # @return [String]
-    def tag
+    # @return [String]
+    def tag
      @py_token.tag_
    end
 
    # Returns the dependency relation by calling `dep_' of `@py_token` object
-    # @return [String]
+    # @return [String]
    def dep
      @py_token.dep_
    end
-
+
    # Returns the language by calling `lang_' of `@py_token` object
-    # @return [String]
-    def lang
+    # @return [String]
+    def lang
      @py_token.lang_
    end
 
    # Returns the trailing space character if present by calling `whitespace_' of `@py_token` object
-    # @return [String]
-    def whitespace
+    # @return [String]
+    def whitespace
      @py_token.whitespace_
    end
 
    # Returns the named entity type by calling `ent_type_' of `@py_token` object
-    # @return [String]
-    def ent_type
+    # @return [String]
+    def ent_type
      @py_token.ent_type_
    end
 
@@ -682,11 +682,14 @@ module Spacy
    def method_missing(name, *args)
      @py_token.send(name, *args)
    end
+
+    def respond_to_missing?(sym)
+      sym ? true : super
+    end
  end
 
  # See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
-  class Lexeme
-
+  class Lexeme
    # @return [Object] a Python `Lexeme` instance accessible via `PyCall`
    attr_reader :py_lexeme
 
@@ -702,50 +705,50 @@ module Spacy
    end
 
    # String representation of the token.
-    # @return [String]
+    # @return [String]
    def to_s
      @text
    end
 
    # Returns the lowercase form by calling `lower_' of `@py_lexeme` object
-    # @return [String]
+    # @return [String]
    def lower
      @py_lexeme.lower_
    end
 
    # Returns the shape (e.g. "Xxxxx") by calling `shape_' of `@py_lexeme` object
-    # @return [String]
+    # @return [String]
    def shape
      @py_lexeme.shape_
    end
 
    # Returns the language by calling `lang_' of `@py_lexeme` object
-    # @return [String]
-    def lang
+    # @return [String]
+    def lang
      @py_lexeme.lang_
    end
 
    # Returns the length-N substring from the start of the word by calling `prefix_' of `@py_lexeme` object
-    # @return [String]
-    def prefix
+    # @return [String]
+    def prefix
      @py_lexeme.prefix_
    end
-
+
    # Returns the length-N substring from the end of the word by calling `suffix_' of `@py_lexeme` object
-    # @return [String]
+    # @return [String]
    def suffix
      @py_lexeme.suffix_
    end
 
    # Returns the lexemes's norm, i.e. a normalized form of the lexeme calling `norm_' of `@py_lexeme` object
-    # @return [String]
+    # @return [String]
    def norm
      @py_lexeme.norm_
    end
 
    # Returns a semantic similarity estimate.
-    # @param other [Lexeme] the other
-    # @return [Float]
+    # @param other [Lexeme] the other lexeme to which a similarity estimation is made
+    # @return [Float]
    def similarity(other)
      @py_lexeme.similarity(other.py_lexeme)
    end
@@ -754,7 +757,9 @@ module Spacy
    def method_missing(name, *args)
      @py_lexeme.send(name, *args)
    end
-  end
 
+    def respond_to_missing?(sym)
+      sym ? true : super
+    end
+  end
  end
-
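The two user-visible behavior changes in this diff are the retry loop added to Spacy::Language#initialize (up to MAX_RETRIAL attempts with a 0.5-second sleep before raising "Error: Pycall failed to load Spacy") and the switch of Token#morphology from a positional to a keyword argument, now returning an empty Hash or String for tokens without morphological features. A minimal usage sketch against the 0.1.5.1 API, assuming a spaCy installation with the en_core_web_sm model available (the sample sentence and output handling are illustrative only):

require "ruby-spacy"

# Model loading now retries automatically if PyCall fails transiently,
# instead of raising on the first failure.
nlp = Spacy::Language.new("en_core_web_sm")

# Language#read returns a Spacy::Doc; Doc is Enumerable over Token.
doc = nlp.read("Salt Lake City is the capital of Utah.")
doc.each { |token| puts "#{token.text}\t#{token.pos}" }

# morphology now takes a keyword argument: hash: true yields a Hash,
# hash: false a String; tokens without features yield {} or "".
p doc.tokens.first.morphology(hash: true)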