ruby-spacy 0.1.4.1 → 0.1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +48 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +7 -7
- data/Gemfile.lock +2 -2
- data/README.md +7 -10
- data/examples/get_started/lexeme.rb +3 -1
- data/examples/get_started/linguistic_annotations.rb +3 -1
- data/examples/get_started/morphology.rb +3 -1
- data/examples/get_started/most_similar.rb +3 -1
- data/examples/get_started/named_entities.rb +4 -2
- data/examples/get_started/pos_tags_and_dependencies.rb +3 -1
- data/examples/get_started/similarity.rb +4 -2
- data/examples/get_started/tokenization.rb +3 -1
- data/examples/get_started/visualizing_dependencies.rb +2 -2
- data/examples/get_started/visualizing_dependencies_compact.rb +2 -0
- data/examples/get_started/visualizing_named_entities.rb +4 -2
- data/examples/get_started/vocab.rb +3 -1
- data/examples/get_started/word_vectors.rb +3 -1
- data/examples/japanese/ancestors.rb +6 -4
- data/examples/japanese/entity_annotations_and_labels.rb +4 -2
- data/examples/japanese/information_extraction.rb +6 -6
- data/examples/japanese/lemmatization.rb +3 -1
- data/examples/japanese/most_similar.rb +3 -1
- data/examples/japanese/named_entity_recognition.rb +3 -2
- data/examples/japanese/navigating_parse_tree.rb +19 -17
- data/examples/japanese/noun_chunks.rb +2 -0
- data/examples/japanese/pos_tagging.rb +3 -1
- data/examples/japanese/sentence_segmentation.rb +3 -2
- data/examples/japanese/similarity.rb +2 -0
- data/examples/japanese/tokenization.rb +2 -0
- data/examples/japanese/visualizing_dependencies.rb +3 -1
- data/examples/japanese/visualizing_named_entities.rb +4 -2
- data/examples/linguistic_features/ancestors.rb +7 -5
- data/examples/linguistic_features/entity_annotations_and_labels.rb +4 -2
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +3 -5
- data/examples/linguistic_features/information_extraction.rb +9 -9
- data/examples/linguistic_features/iterating_children.rb +6 -8
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +7 -5
- data/examples/linguistic_features/lemmatization.rb +3 -1
- data/examples/linguistic_features/named_entity_recognition.rb +3 -1
- data/examples/linguistic_features/navigating_parse_tree.rb +3 -1
- data/examples/linguistic_features/noun_chunks.rb +3 -1
- data/examples/linguistic_features/pos_tagging.rb +3 -1
- data/examples/linguistic_features/retokenize_1.rb +2 -0
- data/examples/linguistic_features/retokenize_2.rb +4 -2
- data/examples/linguistic_features/rule_based_morphology.rb +4 -2
- data/examples/linguistic_features/sentence_segmentation.rb +3 -2
- data/examples/linguistic_features/similarity.rb +4 -2
- data/examples/linguistic_features/similarity_between_lexemes.rb +2 -0
- data/examples/linguistic_features/similarity_between_spans.rb +7 -5
- data/examples/linguistic_features/tokenization.rb +3 -2
- data/examples/rule_based_matching/creating_spans_from_matches.rb +5 -3
- data/examples/rule_based_matching/matcher.rb +4 -2
- data/lib/ruby-spacy/version.rb +1 -1
- data/lib/ruby-spacy.rb +139 -141
- data/ruby-spacy.gemspec +15 -17
- data/tags +132 -0
- metadata +69 -10
data/lib/ruby-spacy.rb
CHANGED
@@ -1,17 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative "ruby-spacy/version"
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require 'pycall/import'
|
8
|
-
include PyCall::Import
|
4
|
+
require "strscan"
|
5
|
+
require "numpy"
|
6
|
+
require "pycall/import"
|
9
7
|
|
10
8
|
# This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
|
11
9
|
module Spacy
|
12
|
-
|
13
10
|
extend PyCall::Import
|
14
|
-
spacy = PyCall.import_module(
|
11
|
+
spacy = PyCall.import_module("spacy")
|
15
12
|
|
16
13
|
# Python `Language` class
|
17
14
|
PyLanguage = spacy.language.Language
|
@@ -24,23 +21,22 @@ module Spacy
|
|
24
21
|
|
25
22
|
# Python `Token` class object
|
26
23
|
PyToken = spacy.tokens.Token
|
27
|
-
|
24
|
+
|
28
25
|
# Python `Matcher` class object
|
29
26
|
PyMatcher = spacy.matcher.Matcher
|
30
27
|
|
31
28
|
# Python `displacy` object
|
32
29
|
PyDisplacy = spacy.displacy
|
33
30
|
|
34
|
-
# A utility module method to convert Python's generator object to a Ruby array,
|
31
|
+
# A utility module method to convert Python's generator object to a Ruby array,
|
35
32
|
# mainly used on the items inside the array returned from dependency-related methods
|
36
33
|
# such as {Span#rights}, {Span#lefts} and {Span#subtree}.
|
37
34
|
def self.generator_to_array(py_generator)
|
38
|
-
PyCall::List.(py_generator)
|
35
|
+
PyCall::List.call(py_generator)
|
39
36
|
end
|
40
37
|
|
41
38
|
# See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
|
42
39
|
class Doc
|
43
|
-
|
44
40
|
# @return [Object] a Python `Language` instance accessible via `PyCall`
|
45
41
|
attr_reader :py_nlp
|
46
42
|
|
@@ -52,23 +48,19 @@ module Spacy
|
|
52
48
|
|
53
49
|
include Enumerable
|
54
50
|
|
55
|
-
|
56
|
-
|
57
|
-
|
51
|
+
alias length count
|
52
|
+
alias len count
|
53
|
+
alias size count
|
58
54
|
|
59
|
-
# It is recommended to use {Language#read} method to create a doc. If you need to
|
60
|
-
# create one using {Doc#initialize}, there are two method signatures:
|
55
|
+
# It is recommended to use {Language#read} method to create a doc. If you need to
|
56
|
+
# create one using {Doc#initialize}, there are two method signatures:
|
61
57
|
# `Spacy::Doc.new(nlp_id, py_doc: Object)` and `Spacy::Doc.new(nlp_id, text: String)`.
|
62
58
|
# @param nlp [Language] an instance of {Language} class
|
63
59
|
# @param py_doc [Object] an instance of Python `Doc` class
|
64
60
|
# @param text [String] the text string to be analyzed
|
65
61
|
def initialize(nlp, py_doc: nil, text: nil)
|
66
62
|
@py_nlp = nlp
|
67
|
-
|
68
|
-
@py_doc = py_doc
|
69
|
-
else
|
70
|
-
@py_doc = nlp.(text)
|
71
|
-
end
|
63
|
+
@py_doc = py_doc || @py_doc = nlp.call(text)
|
72
64
|
@text = @py_doc.text
|
73
65
|
end
|
74
66
|
|
@@ -77,25 +69,25 @@ module Spacy
|
|
77
69
|
# @param end_index [Integer] the end position of the span to be retokenized in the document
|
78
70
|
# @param attributes [Hash] attributes to set on the merged token
|
79
71
|
def retokenize(start_index, end_index, attributes = {})
|
80
|
-
PyCall.with(@py_doc.retokenize
|
81
|
-
retokenizer.merge(@py_doc[start_index
|
72
|
+
PyCall.with(@py_doc.retokenize) do |retokenizer|
|
73
|
+
retokenizer.merge(@py_doc[start_index..end_index], attrs: attributes)
|
82
74
|
end
|
83
75
|
end
|
84
76
|
|
85
77
|
# Retokenizes the text splitting the specified token.
|
86
78
|
# @param pos_in_doc [Integer] the position of the span to be retokenized in the document
|
87
|
-
# @param split_array [Array<String>] text strings of the split results
|
79
|
+
# @param split_array [Array<String>] text strings of the split results
|
88
80
|
# @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
|
89
81
|
# @param attributes [Hash] the attributes of the split elements
|
90
82
|
def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
|
91
|
-
PyCall.with(@py_doc.retokenize
|
83
|
+
PyCall.with(@py_doc.retokenize) do |retokenizer|
|
92
84
|
heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
|
93
85
|
retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
|
94
86
|
end
|
95
87
|
end
|
96
88
|
|
97
89
|
# String representation of the document.
|
98
|
-
# @return [String]
|
90
|
+
# @return [String]
|
99
91
|
def to_s
|
100
92
|
@text
|
101
93
|
end
|
@@ -104,7 +96,7 @@ module Spacy
|
|
104
96
|
# @return [Array<Token>]
|
105
97
|
def tokens
|
106
98
|
results = []
|
107
|
-
PyCall::List.(@py_doc).each do |py_token|
|
99
|
+
PyCall::List.call(@py_doc).each do |py_token|
|
108
100
|
results << Token.new(py_token)
|
109
101
|
end
|
110
102
|
results
|
@@ -112,12 +104,12 @@ module Spacy
|
|
112
104
|
|
113
105
|
# Iterates over the elements in the doc yielding a token instance each time.
|
114
106
|
def each
|
115
|
-
PyCall::List.(@py_doc).each do |py_token|
|
107
|
+
PyCall::List.call(@py_doc).each do |py_token|
|
116
108
|
yield Token.new(py_token)
|
117
109
|
end
|
118
110
|
end
|
119
111
|
|
120
|
-
# Returns a span of the specified range within the doc.
|
112
|
+
# Returns a span of the specified range within the doc.
|
121
113
|
# The method should be used in either of two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
|
122
114
|
# @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
|
123
115
|
# @param optional_size [Integer] an integer representing the size of the span
|
@@ -125,7 +117,7 @@ module Spacy
|
|
125
117
|
def span(range_or_start, optional_size = nil)
|
126
118
|
if optional_size
|
127
119
|
start_index = range_or_start
|
128
|
-
temp = tokens[start_index
|
120
|
+
temp = tokens[start_index...start_index + optional_size]
|
129
121
|
else
|
130
122
|
start_index = range_or_start.first
|
131
123
|
range = range_or_start
|
@@ -141,7 +133,7 @@ module Spacy
|
|
141
133
|
# @return [Array<Span>]
|
142
134
|
def noun_chunks
|
143
135
|
chunk_array = []
|
144
|
-
py_chunks = PyCall::List.(@py_doc.noun_chunks)
|
136
|
+
py_chunks = PyCall::List.call(@py_doc.noun_chunks)
|
145
137
|
py_chunks.each do |py_chunk|
|
146
138
|
chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
|
147
139
|
end
|
@@ -152,7 +144,7 @@ module Spacy
|
|
152
144
|
# @return [Array<Span>]
|
153
145
|
def sents
|
154
146
|
sentence_array = []
|
155
|
-
py_sentences = PyCall::List.(@py_doc.sents)
|
147
|
+
py_sentences = PyCall::List.call(@py_doc.sents)
|
156
148
|
py_sentences.each do |py_sent|
|
157
149
|
sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
|
158
150
|
end
|
@@ -164,9 +156,9 @@ module Spacy
|
|
164
156
|
def ents
|
165
157
|
# so that ents can be "each"-ed in Ruby
|
166
158
|
ent_array = []
|
167
|
-
PyCall::List.(@py_doc.ents).each do |ent|
|
159
|
+
PyCall::List.call(@py_doc.ents).each do |ent|
|
168
160
|
ent.define_singleton_method :label do
|
169
|
-
|
161
|
+
label_
|
170
162
|
end
|
171
163
|
ent_array << ent
|
172
164
|
end
|
@@ -178,15 +170,15 @@ module Spacy
|
|
178
170
|
def [](range)
|
179
171
|
if range.is_a?(Range)
|
180
172
|
py_span = @py_doc[range]
|
181
|
-
|
173
|
+
Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
|
182
174
|
else
|
183
|
-
|
175
|
+
Token.new(@py_doc[range])
|
184
176
|
end
|
185
177
|
end
|
186
178
|
|
187
179
|
# Returns a semantic similarity estimate.
|
188
180
|
# @param other [Doc] the other doc to which a similarity estimation is made
|
189
|
-
# @return [Float]
|
181
|
+
# @return [Float]
|
190
182
|
def similarity(other)
|
191
183
|
py_doc.similarity(other.py_doc)
|
192
184
|
end
|
@@ -196,18 +188,21 @@ module Spacy
|
|
196
188
|
# @param compact [Boolean] only relevant to the `dep` style
|
197
189
|
# @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be an HTML.
|
198
190
|
def displacy(style: "dep", compact: false)
|
199
|
-
PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
|
191
|
+
PyDisplacy.render(py_doc, style: style, options: { compact: compact }, jupyter: false)
|
200
192
|
end
|
201
193
|
|
202
194
|
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
203
195
|
def method_missing(name, *args)
|
204
196
|
@py_doc.send(name, *args)
|
205
197
|
end
|
198
|
+
|
199
|
+
def respond_to_missing?(sym)
|
200
|
+
sym ? true : super
|
201
|
+
end
|
206
202
|
end
|
207
203
|
|
208
204
|
# See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
|
209
205
|
class Language
|
210
|
-
|
211
206
|
# @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
|
212
207
|
attr_reader :spacy_nlp_id
|
213
208
|
|
@@ -245,7 +240,7 @@ module Spacy
|
|
245
240
|
# @return [Array<String>] An array of text strings representing pipeline components
|
246
241
|
def pipe_names
|
247
242
|
pipe_array = []
|
248
|
-
PyCall::List.(@py_nlp.pipe_names).each do |pipe|
|
243
|
+
PyCall::List.call(@py_nlp.pipe_names).each do |pipe|
|
249
244
|
pipe_array << pipe
|
250
245
|
end
|
251
246
|
pipe_array
|
@@ -268,24 +263,23 @@ module Spacy
|
|
268
263
|
# Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
|
269
264
|
# @param vector [Object] A vector representation of a word (whether existing or non-existing)
|
270
265
|
# @return [Array<Hash{:key => Integer, :text => String, :best_row => Array<Float>, :score => Float}>] An array of hash objects, each containing the `key`, `text`, `best_row` and similarity `score` of a lexeme
|
271
|
-
def most_similar(vector,
|
266
|
+
def most_similar(vector, num)
|
272
267
|
vec_array = Numpy.asarray([vector])
|
273
|
-
py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n:
|
274
|
-
key_texts = PyCall.eval("[[str(
|
275
|
-
keys = key_texts.map{|kt| kt[0]}
|
276
|
-
texts = key_texts.map{|kt| kt[1]}
|
277
|
-
best_rows = PyCall::List.(py_result[1])[0]
|
278
|
-
scores = PyCall::List.(py_result[2])[0]
|
268
|
+
py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: num)
|
269
|
+
key_texts = PyCall.eval("[[str(num), #{@spacy_nlp_id}.vocab[num].text] for num in #{py_result[0][0].tolist}]")
|
270
|
+
keys = key_texts.map { |kt| kt[0] }
|
271
|
+
texts = key_texts.map { |kt| kt[1] }
|
272
|
+
best_rows = PyCall::List.call(py_result[1])[0]
|
273
|
+
scores = PyCall::List.call(py_result[2])[0]
|
279
274
|
|
280
275
|
results = []
|
281
|
-
|
282
|
-
result = {key: keys[i].to_i,
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
}
|
276
|
+
num.times do |i|
|
277
|
+
result = { key: keys[i].to_i,
|
278
|
+
text: texts[i],
|
279
|
+
best_row: best_rows[i],
|
280
|
+
score: scores[i] }
|
287
281
|
result.each_key do |key|
|
288
|
-
result.define_singleton_method(key){ result[key] }
|
282
|
+
result.define_singleton_method(key) { result[key] }
|
289
283
|
end
|
290
284
|
results << result
|
291
285
|
end
|
@@ -297,9 +291,9 @@ module Spacy
|
|
297
291
|
# @param disable [Array<String>]
|
298
292
|
# @param batch_size [Integer]
|
299
293
|
# @return [Array<Doc>]
|
300
|
-
def pipe(texts, disable: [], batch_size: 50)
|
294
|
+
def pipe(texts, disable: [], batch_size: 50)
|
301
295
|
docs = []
|
302
|
-
PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
|
296
|
+
PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
|
303
297
|
docs << Doc.new(@py_nlp, py_doc: py_doc)
|
304
298
|
end
|
305
299
|
docs
|
@@ -309,18 +303,21 @@ module Spacy
|
|
309
303
|
def method_missing(name, *args)
|
310
304
|
@py_nlp.send(name, *args)
|
311
305
|
end
|
306
|
+
|
307
|
+
def respond_to_missing?(sym)
|
308
|
+
sym ? true : super
|
309
|
+
end
|
312
310
|
end
|
313
311
|
|
314
312
|
# See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
|
315
313
|
class Matcher
|
316
|
-
|
317
314
|
# @return [Object] a Python `Matcher` instance accessible via `PyCall`
|
318
315
|
attr_reader :py_matcher
|
319
316
|
|
320
317
|
# Creates a {Matcher} instance
|
321
318
|
# @param nlp [Language] an instance of {Language} class
|
322
319
|
def initialize(nlp)
|
323
|
-
@py_matcher = PyMatcher.(nlp.vocab)
|
320
|
+
@py_matcher = PyMatcher.call(nlp.vocab)
|
324
321
|
end
|
325
322
|
|
326
323
|
# Adds a label string and a text pattern.
|
@@ -334,16 +331,17 @@ module Spacy
|
|
334
331
|
# @param doc [Doc] a {Doc} instance
|
335
332
|
# @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
|
336
333
|
def match(doc)
|
337
|
-
str_results = @py_matcher.(doc.py_doc).to_s
|
334
|
+
str_results = @py_matcher.call(doc.py_doc).to_s
|
338
335
|
s = StringScanner.new(str_results[1..-2])
|
339
336
|
results = []
|
340
337
|
while s.scan_until(/(\d+), (\d+), (\d+)/)
|
341
338
|
next unless s.matched
|
339
|
+
|
342
340
|
triple = s.matched.split(", ")
|
343
341
|
match_id = triple[0].to_i
|
344
342
|
start_index = triple[1].to_i
|
345
343
|
end_index = triple[2].to_i - 1
|
346
|
-
results << {match_id: match_id, start_index: start_index, end_index: end_index}
|
344
|
+
results << { match_id: match_id, start_index: start_index, end_index: end_index }
|
347
345
|
end
|
348
346
|
results
|
349
347
|
end
|
@@ -351,7 +349,6 @@ module Spacy
|
|
351
349
|
|
352
350
|
# See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
|
353
351
|
class Span
|
354
|
-
|
355
352
|
# @return [Object] a Python `Span` instance accessible via `PyCall`
|
356
353
|
attr_reader :py_span
|
357
354
|
|
@@ -360,11 +357,11 @@ module Spacy
|
|
360
357
|
|
361
358
|
include Enumerable
|
362
359
|
|
363
|
-
|
364
|
-
|
365
|
-
|
360
|
+
alias length count
|
361
|
+
alias len count
|
362
|
+
alias size count
|
366
363
|
|
367
|
-
# It is recommended to use {Doc#span} method to create a span. If you need to
|
364
|
+
# It is recommended to use {Doc#span} method to create a span. If you need to
|
368
365
|
# create one using {Span#initialize}, there are two method signatures:
|
369
366
|
# `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
|
370
367
|
# @param doc [Doc] the document to which this span belongs
|
@@ -373,18 +370,14 @@ module Spacy
|
|
373
370
|
# @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
|
374
371
|
def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
|
375
372
|
@doc = doc
|
376
|
-
|
377
|
-
@py_span = py_span
|
378
|
-
else
|
379
|
-
@py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
|
380
|
-
end
|
373
|
+
@py_span = py_span || @py_span = PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
|
381
374
|
end
|
382
375
|
|
383
376
|
# Returns an array of tokens contained in the span.
|
384
377
|
# @return [Array<Token>]
|
385
378
|
def tokens
|
386
379
|
results = []
|
387
|
-
PyCall::List.(@py_span).each do |py_token|
|
380
|
+
PyCall::List.call(@py_span).each do |py_token|
|
388
381
|
results << Token.new(py_token)
|
389
382
|
end
|
390
383
|
results
|
@@ -392,7 +385,7 @@ module Spacy
|
|
392
385
|
|
393
386
|
# Iterates over the elements in the span yielding a token instance each time.
|
394
387
|
def each
|
395
|
-
PyCall::List.(@py_span).each do |py_token|
|
388
|
+
PyCall::List.call(@py_span).each do |py_token|
|
396
389
|
yield Token.new(py_token)
|
397
390
|
end
|
398
391
|
end
|
@@ -401,7 +394,7 @@ module Spacy
|
|
401
394
|
# @return [Array<Span>]
|
402
395
|
def noun_chunks
|
403
396
|
chunk_array = []
|
404
|
-
py_chunks = PyCall::List.(@py_span.noun_chunks)
|
397
|
+
py_chunks = PyCall::List.call(@py_span.noun_chunks)
|
405
398
|
py_chunks.each do |py_span|
|
406
399
|
chunk_array << Span.new(@doc, py_span: py_span)
|
407
400
|
end
|
@@ -410,7 +403,7 @@ module Spacy
|
|
410
403
|
|
411
404
|
# Returns the head token
|
412
405
|
# @return [Token]
|
413
|
-
def root
|
406
|
+
def root
|
414
407
|
Token.new(@py_span.root)
|
415
408
|
end
|
416
409
|
|
@@ -418,7 +411,7 @@ module Spacy
|
|
418
411
|
# @return [Array<Span>]
|
419
412
|
def sents
|
420
413
|
sentence_array = []
|
421
|
-
py_sentences = PyCall::List.(@py_span.sents)
|
414
|
+
py_sentences = PyCall::List.call(@py_span.sents)
|
422
415
|
py_sentences.each do |py_span|
|
423
416
|
sentence_array << Span.new(@doc, py_span: py_span)
|
424
417
|
end
|
@@ -429,7 +422,7 @@ module Spacy
|
|
429
422
|
# @return [Array<Span>]
|
430
423
|
def ents
|
431
424
|
ent_array = []
|
432
|
-
PyCall::List.(@py_span.ents).each do |py_span|
|
425
|
+
PyCall::List.call(@py_span.ents).each do |py_span|
|
433
426
|
ent_array << Span.new(@doc, py_span: py_span)
|
434
427
|
end
|
435
428
|
ent_array
|
@@ -438,8 +431,8 @@ module Spacy
|
|
438
431
|
# Returns a span that represents the sentence that the given span is part of.
|
439
432
|
# @return [Span]
|
440
433
|
def sent
|
441
|
-
py_span = @py_span.sent
|
442
|
-
|
434
|
+
py_span = @py_span.sent
|
435
|
+
Span.new(@doc, py_span: py_span)
|
443
436
|
end
|
444
437
|
|
445
438
|
# Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
|
@@ -447,67 +440,67 @@ module Spacy
|
|
447
440
|
def [](range)
|
448
441
|
if range.is_a?(Range)
|
449
442
|
py_span = @py_span[range]
|
450
|
-
|
443
|
+
Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
|
451
444
|
else
|
452
|
-
|
445
|
+
Token.new(@py_span[range])
|
453
446
|
end
|
454
447
|
end
|
455
448
|
|
456
449
|
# Returns a semantic similarity estimate.
|
457
450
|
# @param other [Span] the other span to which a similarity estimation is conducted
|
458
|
-
# @return [Float]
|
451
|
+
# @return [Float]
|
459
452
|
def similarity(other)
|
460
453
|
py_span.similarity(other.py_span)
|
461
454
|
end
|
462
455
|
|
463
456
|
# Creates a document instance from the span
|
464
|
-
# @return [Doc]
|
457
|
+
# @return [Doc]
|
465
458
|
def as_doc
|
466
|
-
Doc.new(@doc.py_nlp, text:
|
459
|
+
Doc.new(@doc.py_nlp, text: text)
|
467
460
|
end
|
468
461
|
|
469
462
|
# Returns tokens conjugated to the root of the span.
|
470
463
|
# @return [Array<Token>] an array of tokens
|
471
464
|
def conjuncts
|
472
465
|
conjunct_array = []
|
473
|
-
PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
|
466
|
+
PyCall::List.call(@py_span.conjuncts).each do |py_conjunct|
|
474
467
|
conjunct_array << Token.new(py_conjunct)
|
475
468
|
end
|
476
469
|
conjunct_array
|
477
470
|
end
|
478
471
|
|
479
472
|
# Returns tokens that are to the left of the span, whose heads are within the span.
|
480
|
-
# @return [Array<Token>] an array of tokens
|
473
|
+
# @return [Array<Token>] an array of tokens
|
481
474
|
def lefts
|
482
475
|
left_array = []
|
483
|
-
PyCall::List.(@py_span.lefts).each do |py_left|
|
476
|
+
PyCall::List.call(@py_span.lefts).each do |py_left|
|
484
477
|
left_array << Token.new(py_left)
|
485
478
|
end
|
486
479
|
left_array
|
487
480
|
end
|
488
481
|
|
489
482
|
# Returns Tokens that are to the right of the span, whose heads are within the span.
|
490
|
-
# @return [Array<Token>] an array of Tokens
|
483
|
+
# @return [Array<Token>] an array of Tokens
|
491
484
|
def rights
|
492
485
|
right_array = []
|
493
|
-
PyCall::List.(@py_span.rights).each do |py_right|
|
486
|
+
PyCall::List.call(@py_span.rights).each do |py_right|
|
494
487
|
right_array << Token.new(py_right)
|
495
488
|
end
|
496
489
|
right_array
|
497
490
|
end
|
498
491
|
|
499
492
|
# Returns Tokens that are within the span and tokens that descend from them.
|
500
|
-
# @return [Array<Token>] an array of tokens
|
493
|
+
# @return [Array<Token>] an array of tokens
|
501
494
|
def subtree
|
502
495
|
subtree_array = []
|
503
|
-
PyCall::List.(@py_span.subtree).each do |py_subtree|
|
496
|
+
PyCall::List.call(@py_span.subtree).each do |py_subtree|
|
504
497
|
subtree_array << Token.new(py_subtree)
|
505
498
|
end
|
506
499
|
subtree_array
|
507
500
|
end
|
508
501
|
|
509
502
|
# Returns the label
|
510
|
-
# @return [String]
|
503
|
+
# @return [String]
|
511
504
|
def label
|
512
505
|
@py_span.label_
|
513
506
|
end
|
@@ -516,11 +509,14 @@ module Spacy
|
|
516
509
|
def method_missing(name, *args)
|
517
510
|
@py_span.send(name, *args)
|
518
511
|
end
|
512
|
+
|
513
|
+
def respond_to_missing?(sym)
|
514
|
+
sym ? true : super
|
515
|
+
end
|
519
516
|
end
|
520
517
|
|
521
518
|
# See also spaCy Python API document for [`Token`](https://spacy.io/api/token).
|
522
519
|
class Token
|
523
|
-
|
524
520
|
# @return [Object] a Python `Token` instance accessible via `PyCall`
|
525
521
|
attr_reader :py_token
|
526
522
|
|
@@ -528,17 +524,16 @@ module Spacy
|
|
528
524
|
attr_reader :text
|
529
525
|
|
530
526
|
# It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
|
531
|
-
# There is no way to generate a token from scratch but relying on a pre-existing Python
|
527
|
+
# There is no way to generate a token from scratch but relying on a pre-existing Python `Token` object.
|
532
528
|
# @param py_token [Object] Python `Token` object
|
533
529
|
def initialize(py_token)
|
534
530
|
@py_token = py_token
|
535
531
|
@text = @py_token.text
|
536
532
|
end
|
537
533
|
|
538
|
-
|
539
534
|
# Returns the head token
|
540
535
|
# @return [Token]
|
541
|
-
def head
|
536
|
+
def head
|
542
537
|
Token.new(@py_token.head)
|
543
538
|
end
|
544
539
|
|
@@ -546,7 +541,7 @@ module Spacy
|
|
546
541
|
# @return [Array<Token>] an array of tokens
|
547
542
|
def subtree
|
548
543
|
descendant_array = []
|
549
|
-
PyCall::List.(@py_token.subtree).each do |descendant|
|
544
|
+
PyCall::List.call(@py_token.subtree).each do |descendant|
|
550
545
|
descendant_array << Token.new(descendant)
|
551
546
|
end
|
552
547
|
descendant_array
|
@@ -556,7 +551,7 @@ module Spacy
|
|
556
551
|
# @return [Array<Token>] an array of tokens
|
557
552
|
def ancestors
|
558
553
|
ancestor_array = []
|
559
|
-
PyCall::List.(@py_token.ancestors).each do |ancestor|
|
554
|
+
PyCall::List.call(@py_token.ancestors).each do |ancestor|
|
560
555
|
ancestor_array << Token.new(ancestor)
|
561
556
|
end
|
562
557
|
ancestor_array
|
@@ -566,7 +561,7 @@ module Spacy
|
|
566
561
|
# @return [Array<Token>] an array of tokens
|
567
562
|
def children
|
568
563
|
child_array = []
|
569
|
-
PyCall::List.(@py_token.children).each do |child|
|
564
|
+
PyCall::List.call(@py_token.children).each do |child|
|
570
565
|
child_array << Token.new(child)
|
571
566
|
end
|
572
567
|
child_array
|
@@ -576,7 +571,7 @@ module Spacy
|
|
576
571
|
# @return [Array<Token>] an array of tokens
|
577
572
|
def lefts
|
578
573
|
token_array = []
|
579
|
-
PyCall::List.(@py_token.lefts).each do |token|
|
574
|
+
PyCall::List.call(@py_token.lefts).each do |token|
|
580
575
|
token_array << Token.new(token)
|
581
576
|
end
|
582
577
|
token_array
|
@@ -586,89 +581,87 @@ module Spacy
|
|
586
581
|
# @return [Array<Token>] an array of tokens
|
587
582
|
def rights
|
588
583
|
token_array = []
|
589
|
-
PyCall::List.(@py_token.rights).each do |token|
|
584
|
+
PyCall::List.call(@py_token.rights).each do |token|
|
590
585
|
token_array << Token.new(token)
|
591
586
|
end
|
592
587
|
token_array
|
593
588
|
end
|
594
589
|
|
595
590
|
# String representation of the token.
|
596
|
-
# @return [String]
|
591
|
+
# @return [String]
|
597
592
|
def to_s
|
598
593
|
@text
|
599
594
|
end
|
600
595
|
|
601
596
|
# Returns a hash or string of morphological information
|
602
597
|
# @param hash [Boolean] if true, a hash will be returned instead of a string
|
603
|
-
# @return [Hash, String]
|
604
|
-
def morphology(hash
|
598
|
+
# @return [Hash, String]
|
599
|
+
def morphology(hash: true)
|
605
600
|
if @py_token.has_morph
|
606
601
|
morph_analysis = @py_token.morph
|
607
|
-
if hash
|
608
|
-
return morph_analysis.to_dict
|
609
|
-
else
|
610
|
-
return morph_analysis.to_s
|
611
|
-
end
|
612
|
-
else
|
613
602
|
if hash
|
614
|
-
|
603
|
+
morph_analysis.to_dict
|
615
604
|
else
|
616
|
-
|
605
|
+
morph_analysis.to_s
|
617
606
|
end
|
607
|
+
elsif hash
|
608
|
+
{}
|
609
|
+
else
|
610
|
+
""
|
618
611
|
end
|
619
612
|
end
|
620
613
|
|
621
614
|
# Returns the lemma by calling `lemma_` of `@py_token` object
|
622
|
-
# @return [String]
|
615
|
+
# @return [String]
|
623
616
|
def lemma
|
624
617
|
@py_token.lemma_
|
625
618
|
end
|
626
619
|
|
627
620
|
# Returns the lowercase form by calling `lower_` of `@py_token` object
|
628
|
-
# @return [String]
|
621
|
+
# @return [String]
|
629
622
|
def lower
|
630
623
|
@py_token.lower_
|
631
624
|
end
|
632
625
|
|
633
626
|
# Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_token` object
|
634
|
-
# @return [String]
|
627
|
+
# @return [String]
|
635
628
|
def shape
|
636
629
|
@py_token.shape_
|
637
630
|
end
|
638
631
|
|
639
632
|
# Returns the pos by calling `pos_` of `@py_token` object
|
640
|
-
# @return [String]
|
633
|
+
# @return [String]
|
641
634
|
def pos
|
642
635
|
@py_token.pos_
|
643
636
|
end
|
644
637
|
|
645
638
|
# Returns the fine-grained pos by calling `tag_` of `@py_token` object
|
646
|
-
# @return [String]
|
647
|
-
def tag
|
639
|
+
# @return [String]
|
640
|
+
def tag
|
648
641
|
@py_token.tag_
|
649
642
|
end
|
650
643
|
|
651
644
|
# Returns the dependency relation by calling `dep_` of `@py_token` object
|
652
|
-
# @return [String]
|
645
|
+
# @return [String]
|
653
646
|
def dep
|
654
647
|
@py_token.dep_
|
655
648
|
end
|
656
|
-
|
649
|
+
|
657
650
|
# Returns the language by calling `lang_` of `@py_token` object
|
658
|
-
# @return [String]
|
659
|
-
def lang
|
651
|
+
# @return [String]
|
652
|
+
def lang
|
660
653
|
@py_token.lang_
|
661
654
|
end
|
662
655
|
|
663
656
|
# Returns the trailing space character if present by calling `whitespace_` of `@py_token` object
|
664
|
-
# @return [String]
|
665
|
-
def whitespace
|
657
|
+
# @return [String]
|
658
|
+
def whitespace
|
666
659
|
@py_token.whitespace_
|
667
660
|
end
|
668
661
|
|
669
662
|
# Returns the named entity type by calling `ent_type_` of `@py_token` object
|
670
|
-
# @return [String]
|
671
|
-
def ent_type
|
663
|
+
# @return [String]
|
664
|
+
def ent_type
|
672
665
|
@py_token.ent_type_
|
673
666
|
end
|
674
667
|
|
@@ -682,11 +675,14 @@ module Spacy
|
|
682
675
|
def method_missing(name, *args)
|
683
676
|
@py_token.send(name, *args)
|
684
677
|
end
|
678
|
+
|
679
|
+
def respond_to_missing?(sym)
|
680
|
+
sym ? true : super
|
681
|
+
end
|
685
682
|
end
|
686
683
|
|
687
684
|
# See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
|
688
|
-
class Lexeme
|
689
|
-
|
685
|
+
class Lexeme
|
690
686
|
# @return [Object] a Python `Lexeme` instance accessible via `PyCall`
|
691
687
|
attr_reader :py_lexeme
|
692
688
|
|
@@ -702,50 +698,50 @@ module Spacy
|
|
702
698
|
end
|
703
699
|
|
704
700
|
# String representation of the lexeme.
|
705
|
-
# @return [String]
|
701
|
+
# @return [String]
|
706
702
|
def to_s
|
707
703
|
@text
|
708
704
|
end
|
709
705
|
|
710
706
|
# Returns the lowercase form by calling `lower_` of `@py_lexeme` object
|
711
|
-
# @return [String]
|
707
|
+
# @return [String]
|
712
708
|
def lower
|
713
709
|
@py_lexeme.lower_
|
714
710
|
end
|
715
711
|
|
716
712
|
# Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_lexeme` object
|
717
|
-
# @return [String]
|
713
|
+
# @return [String]
|
718
714
|
def shape
|
719
715
|
@py_lexeme.shape_
|
720
716
|
end
|
721
717
|
|
722
718
|
# Returns the language by calling `lang_` of `@py_lexeme` object
|
723
|
-
# @return [String]
|
724
|
-
def lang
|
719
|
+
# @return [String]
|
720
|
+
def lang
|
725
721
|
@py_lexeme.lang_
|
726
722
|
end
|
727
723
|
|
728
724
|
# Returns the length-N substring from the start of the word by calling `prefix_` of `@py_lexeme` object
|
729
|
-
# @return [String]
|
730
|
-
def prefix
|
725
|
+
# @return [String]
|
726
|
+
def prefix
|
731
727
|
@py_lexeme.prefix_
|
732
728
|
end
|
733
|
-
|
729
|
+
|
734
730
|
# Returns the length-N substring from the end of the word by calling `suffix_` of `@py_lexeme` object
|
735
|
-
# @return [String]
|
731
|
+
# @return [String]
|
736
732
|
def suffix
|
737
733
|
@py_lexeme.suffix_
|
738
734
|
end
|
739
735
|
|
740
736
|
# Returns the lexeme's norm, i.e. a normalized form of the lexeme, by calling `norm_` of `@py_lexeme` object
|
741
|
-
# @return [String]
|
737
|
+
# @return [String]
|
742
738
|
def norm
|
743
739
|
@py_lexeme.norm_
|
744
740
|
end
|
745
741
|
|
746
742
|
# Returns a semantic similarity estimate.
|
747
|
-
# @param other [Lexeme] the other
|
748
|
-
# @return [Float]
|
743
|
+
# @param other [Lexeme] the other lexeme to which a similarity estimation is made
|
744
|
+
# @return [Float]
|
749
745
|
def similarity(other)
|
750
746
|
@py_lexeme.similarity(other.py_lexeme)
|
751
747
|
end
|
@@ -754,7 +750,9 @@ module Spacy
|
|
754
750
|
def method_missing(name, *args)
|
755
751
|
@py_lexeme.send(name, *args)
|
756
752
|
end
|
757
|
-
end
|
758
753
|
|
754
|
+
def respond_to_missing?(sym)
|
755
|
+
sym ? true : super
|
756
|
+
end
|
757
|
+
end
|
759
758
|
end
|
760
|
-
|