ruby-spacy 0.1.4.1 → 0.1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +48 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +7 -7
- data/Gemfile.lock +2 -2
- data/README.md +7 -10
- data/examples/get_started/lexeme.rb +3 -1
- data/examples/get_started/linguistic_annotations.rb +3 -1
- data/examples/get_started/morphology.rb +3 -1
- data/examples/get_started/most_similar.rb +3 -1
- data/examples/get_started/named_entities.rb +4 -2
- data/examples/get_started/pos_tags_and_dependencies.rb +3 -1
- data/examples/get_started/similarity.rb +4 -2
- data/examples/get_started/tokenization.rb +3 -1
- data/examples/get_started/visualizing_dependencies.rb +2 -2
- data/examples/get_started/visualizing_dependencies_compact.rb +2 -0
- data/examples/get_started/visualizing_named_entities.rb +4 -2
- data/examples/get_started/vocab.rb +3 -1
- data/examples/get_started/word_vectors.rb +3 -1
- data/examples/japanese/ancestors.rb +6 -4
- data/examples/japanese/entity_annotations_and_labels.rb +4 -2
- data/examples/japanese/information_extraction.rb +6 -6
- data/examples/japanese/lemmatization.rb +3 -1
- data/examples/japanese/most_similar.rb +3 -1
- data/examples/japanese/named_entity_recognition.rb +3 -2
- data/examples/japanese/navigating_parse_tree.rb +19 -17
- data/examples/japanese/noun_chunks.rb +2 -0
- data/examples/japanese/pos_tagging.rb +3 -1
- data/examples/japanese/sentence_segmentation.rb +3 -2
- data/examples/japanese/similarity.rb +2 -0
- data/examples/japanese/tokenization.rb +2 -0
- data/examples/japanese/visualizing_dependencies.rb +3 -1
- data/examples/japanese/visualizing_named_entities.rb +4 -2
- data/examples/linguistic_features/ancestors.rb +7 -5
- data/examples/linguistic_features/entity_annotations_and_labels.rb +4 -2
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +3 -5
- data/examples/linguistic_features/information_extraction.rb +9 -9
- data/examples/linguistic_features/iterating_children.rb +6 -8
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +7 -5
- data/examples/linguistic_features/lemmatization.rb +3 -1
- data/examples/linguistic_features/named_entity_recognition.rb +3 -1
- data/examples/linguistic_features/navigating_parse_tree.rb +3 -1
- data/examples/linguistic_features/noun_chunks.rb +3 -1
- data/examples/linguistic_features/pos_tagging.rb +3 -1
- data/examples/linguistic_features/retokenize_1.rb +2 -0
- data/examples/linguistic_features/retokenize_2.rb +4 -2
- data/examples/linguistic_features/rule_based_morphology.rb +4 -2
- data/examples/linguistic_features/sentence_segmentation.rb +3 -2
- data/examples/linguistic_features/similarity.rb +4 -2
- data/examples/linguistic_features/similarity_between_lexemes.rb +2 -0
- data/examples/linguistic_features/similarity_between_spans.rb +7 -5
- data/examples/linguistic_features/tokenization.rb +3 -2
- data/examples/rule_based_matching/creating_spans_from_matches.rb +5 -3
- data/examples/rule_based_matching/matcher.rb +4 -2
- data/lib/ruby-spacy/version.rb +1 -1
- data/lib/ruby-spacy.rb +139 -141
- data/ruby-spacy.gemspec +15 -17
- data/tags +132 -0
- metadata +69 -10
data/lib/ruby-spacy.rb
CHANGED
@@ -1,17 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative "ruby-spacy/version"
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require 'pycall/import'
|
8
|
-
include PyCall::Import
|
4
|
+
require "strscan"
|
5
|
+
require "numpy"
|
6
|
+
require "pycall/import"
|
9
7
|
|
10
8
|
# This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
|
11
9
|
module Spacy
|
12
|
-
|
13
10
|
extend PyCall::Import
|
14
|
-
spacy = PyCall.import_module(
|
11
|
+
spacy = PyCall.import_module("spacy")
|
15
12
|
|
16
13
|
# Python `Language` class
|
17
14
|
PyLanguage = spacy.language.Language
|
@@ -24,23 +21,22 @@ module Spacy
|
|
24
21
|
|
25
22
|
# Python `Token` class object
|
26
23
|
PyToken = spacy.tokens.Token
|
27
|
-
|
24
|
+
|
28
25
|
# Python `Matcher` class object
|
29
26
|
PyMatcher = spacy.matcher.Matcher
|
30
27
|
|
31
28
|
# Python `displacy` object
|
32
29
|
PyDisplacy = spacy.displacy
|
33
30
|
|
34
|
-
# A utility module method to convert Python's generator object to a Ruby array,
|
31
|
+
# A utility module method to convert Python's generator object to a Ruby array,
|
35
32
|
# mainly used on the items inside the array returned from dependency-related methods
|
36
33
|
# such as {Span#rights}, {Span#lefts} and {Span#subtree}.
|
37
34
|
def self.generator_to_array(py_generator)
|
38
|
-
PyCall::List.(py_generator)
|
35
|
+
PyCall::List.call(py_generator)
|
39
36
|
end
|
40
37
|
|
41
38
|
# See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
|
42
39
|
class Doc
|
43
|
-
|
44
40
|
# @return [Object] a Python `Language` instance accessible via `PyCall`
|
45
41
|
attr_reader :py_nlp
|
46
42
|
|
@@ -52,23 +48,19 @@ module Spacy
|
|
52
48
|
|
53
49
|
include Enumerable
|
54
50
|
|
55
|
-
|
56
|
-
|
57
|
-
|
51
|
+
alias length count
|
52
|
+
alias len count
|
53
|
+
alias size count
|
58
54
|
|
59
|
-
# It is recommended to use {Language#read} method to create a doc. If you need to
|
60
|
-
# create one using {Doc#initialize}, there are two method signatures:
|
55
|
+
# It is recommended to use {Language#read} method to create a doc. If you need to
|
56
|
+
# create one using {Doc#initialize}, there are two method signatures:
|
61
57
|
# `Spacy::Doc.new(nlp_id, py_doc: Object)` and `Spacy::Doc.new(nlp_id, text: String)`.
|
62
58
|
# @param nlp [Language] an instance of {Language} class
|
63
59
|
# @param py_doc [Object] an instance of Python `Doc` class
|
64
60
|
# @param text [String] the text string to be analyzed
|
65
61
|
def initialize(nlp, py_doc: nil, text: nil)
|
66
62
|
@py_nlp = nlp
|
67
|
-
|
68
|
-
@py_doc = py_doc
|
69
|
-
else
|
70
|
-
@py_doc = nlp.(text)
|
71
|
-
end
|
63
|
+
@py_doc = py_doc || @py_doc = nlp.call(text)
|
72
64
|
@text = @py_doc.text
|
73
65
|
end
|
74
66
|
|
@@ -77,25 +69,25 @@ module Spacy
|
|
77
69
|
# @param end_index [Integer] the end position of the span to be retokenized in the document
|
78
70
|
# @param attributes [Hash] attributes to set on the merged token
|
79
71
|
def retokenize(start_index, end_index, attributes = {})
|
80
|
-
PyCall.with(@py_doc.retokenize
|
81
|
-
retokenizer.merge(@py_doc[start_index
|
72
|
+
PyCall.with(@py_doc.retokenize) do |retokenizer|
|
73
|
+
retokenizer.merge(@py_doc[start_index..end_index], attrs: attributes)
|
82
74
|
end
|
83
75
|
end
|
84
76
|
|
85
77
|
# Retokenizes the text splitting the specified token.
|
86
78
|
# @param pos_in_doc [Integer] the position of the span to be retokenized in the document
|
87
|
-
# @param split_array [Array<String>] text strings of the split results
|
79
|
+
# @param split_array [Array<String>] text strings of the split results
|
88
80
|
# @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
|
89
81
|
# @param attributes [Hash] the attributes of the split elements
|
90
82
|
def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
|
91
|
-
PyCall.with(@py_doc.retokenize
|
83
|
+
PyCall.with(@py_doc.retokenize) do |retokenizer|
|
92
84
|
heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
|
93
85
|
retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
|
94
86
|
end
|
95
87
|
end
|
96
88
|
|
97
89
|
# String representation of the document.
|
98
|
-
# @return [String]
|
90
|
+
# @return [String]
|
99
91
|
def to_s
|
100
92
|
@text
|
101
93
|
end
|
@@ -104,7 +96,7 @@ module Spacy
|
|
104
96
|
# @return [Array<Token>]
|
105
97
|
def tokens
|
106
98
|
results = []
|
107
|
-
PyCall::List.(@py_doc).each do |py_token|
|
99
|
+
PyCall::List.call(@py_doc).each do |py_token|
|
108
100
|
results << Token.new(py_token)
|
109
101
|
end
|
110
102
|
results
|
@@ -112,12 +104,12 @@ module Spacy
|
|
112
104
|
|
113
105
|
# Iterates over the elements in the doc yielding a token instance each time.
|
114
106
|
def each
|
115
|
-
PyCall::List.(@py_doc).each do |py_token|
|
107
|
+
PyCall::List.call(@py_doc).each do |py_token|
|
116
108
|
yield Token.new(py_token)
|
117
109
|
end
|
118
110
|
end
|
119
111
|
|
120
|
-
# Returns a span of the specified range within the doc.
|
112
|
+
# Returns a span of the specified range within the doc.
|
121
113
|
# The method should be used in either of two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
|
122
114
|
# @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
|
123
115
|
# @param optional_size [Integer] an integer representing the size of the span
|
@@ -125,7 +117,7 @@ module Spacy
|
|
125
117
|
def span(range_or_start, optional_size = nil)
|
126
118
|
if optional_size
|
127
119
|
start_index = range_or_start
|
128
|
-
temp = tokens[start_index
|
120
|
+
temp = tokens[start_index...start_index + optional_size]
|
129
121
|
else
|
130
122
|
start_index = range_or_start.first
|
131
123
|
range = range_or_start
|
@@ -141,7 +133,7 @@ module Spacy
|
|
141
133
|
# @return [Array<Span>]
|
142
134
|
def noun_chunks
|
143
135
|
chunk_array = []
|
144
|
-
py_chunks = PyCall::List.(@py_doc.noun_chunks)
|
136
|
+
py_chunks = PyCall::List.call(@py_doc.noun_chunks)
|
145
137
|
py_chunks.each do |py_chunk|
|
146
138
|
chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
|
147
139
|
end
|
@@ -152,7 +144,7 @@ module Spacy
|
|
152
144
|
# @return [Array<Span>]
|
153
145
|
def sents
|
154
146
|
sentence_array = []
|
155
|
-
py_sentences = PyCall::List.(@py_doc.sents)
|
147
|
+
py_sentences = PyCall::List.call(@py_doc.sents)
|
156
148
|
py_sentences.each do |py_sent|
|
157
149
|
sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
|
158
150
|
end
|
@@ -164,9 +156,9 @@ module Spacy
|
|
164
156
|
def ents
|
165
157
|
# so that ents can be "each"-ed in Ruby
|
166
158
|
ent_array = []
|
167
|
-
PyCall::List.(@py_doc.ents).each do |ent|
|
159
|
+
PyCall::List.call(@py_doc.ents).each do |ent|
|
168
160
|
ent.define_singleton_method :label do
|
169
|
-
|
161
|
+
label_
|
170
162
|
end
|
171
163
|
ent_array << ent
|
172
164
|
end
|
@@ -178,15 +170,15 @@ module Spacy
|
|
178
170
|
def [](range)
|
179
171
|
if range.is_a?(Range)
|
180
172
|
py_span = @py_doc[range]
|
181
|
-
|
173
|
+
Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
|
182
174
|
else
|
183
|
-
|
175
|
+
Token.new(@py_doc[range])
|
184
176
|
end
|
185
177
|
end
|
186
178
|
|
187
179
|
# Returns a semantic similarity estimate.
|
188
180
|
# @param other [Doc] the other doc to which a similarity estimation is made
|
189
|
-
# @return [Float]
|
181
|
+
# @return [Float]
|
190
182
|
def similarity(other)
|
191
183
|
py_doc.similarity(other.py_doc)
|
192
184
|
end
|
@@ -196,18 +188,21 @@ module Spacy
|
|
196
188
|
# @param compact [Boolean] only relevant to the `dep` style
|
197
189
|
# @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be an HTML.
|
198
190
|
def displacy(style: "dep", compact: false)
|
199
|
-
PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
|
191
|
+
PyDisplacy.render(py_doc, style: style, options: { compact: compact }, jupyter: false)
|
200
192
|
end
|
201
193
|
|
202
194
|
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
203
195
|
def method_missing(name, *args)
|
204
196
|
@py_doc.send(name, *args)
|
205
197
|
end
|
198
|
+
|
199
|
+
def respond_to_missing?(sym)
|
200
|
+
sym ? true : super
|
201
|
+
end
|
206
202
|
end
|
207
203
|
|
208
204
|
# See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
|
209
205
|
class Language
|
210
|
-
|
211
206
|
# @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
|
212
207
|
attr_reader :spacy_nlp_id
|
213
208
|
|
@@ -245,7 +240,7 @@ module Spacy
|
|
245
240
|
# @return [Array<String>] An array of text strings representing pipeline components
|
246
241
|
def pipe_names
|
247
242
|
pipe_array = []
|
248
|
-
PyCall::List.(@py_nlp.pipe_names).each do |pipe|
|
243
|
+
PyCall::List.call(@py_nlp.pipe_names).each do |pipe|
|
249
244
|
pipe_array << pipe
|
250
245
|
end
|
251
246
|
pipe_array
|
@@ -268,24 +263,23 @@ module Spacy
|
|
268
263
|
# Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
|
269
264
|
# @param vector [Object] A vector representation of a word (whether existing or non-existing)
|
270
265
|
# @return [Array<Hash{:key => Integer, :text => String, :best_row => Array<Float>, :score => Float}>] An array of hash objects, each containing the `key`, `text`, `best_row` and similarity `score` of a lexeme
|
271
|
-
def most_similar(vector,
|
266
|
+
def most_similar(vector, num)
|
272
267
|
vec_array = Numpy.asarray([vector])
|
273
|
-
py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n:
|
274
|
-
key_texts = PyCall.eval("[[str(
|
275
|
-
keys = key_texts.map{|kt| kt[0]}
|
276
|
-
texts = key_texts.map{|kt| kt[1]}
|
277
|
-
best_rows = PyCall::List.(py_result[1])[0]
|
278
|
-
scores = PyCall::List.(py_result[2])[0]
|
268
|
+
py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: num)
|
269
|
+
key_texts = PyCall.eval("[[str(num), #{@spacy_nlp_id}.vocab[num].text] for num in #{py_result[0][0].tolist}]")
|
270
|
+
keys = key_texts.map { |kt| kt[0] }
|
271
|
+
texts = key_texts.map { |kt| kt[1] }
|
272
|
+
best_rows = PyCall::List.call(py_result[1])[0]
|
273
|
+
scores = PyCall::List.call(py_result[2])[0]
|
279
274
|
|
280
275
|
results = []
|
281
|
-
|
282
|
-
result = {key: keys[i].to_i,
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
}
|
276
|
+
num.times do |i|
|
277
|
+
result = { key: keys[i].to_i,
|
278
|
+
text: texts[i],
|
279
|
+
best_row: best_rows[i],
|
280
|
+
score: scores[i] }
|
287
281
|
result.each_key do |key|
|
288
|
-
result.define_singleton_method(key){ result[key] }
|
282
|
+
result.define_singleton_method(key) { result[key] }
|
289
283
|
end
|
290
284
|
results << result
|
291
285
|
end
|
@@ -297,9 +291,9 @@ module Spacy
|
|
297
291
|
# @param disable [Array<String>]
|
298
292
|
# @param batch_size [Integer]
|
299
293
|
# @return [Array<Doc>]
|
300
|
-
def pipe(texts, disable: [], batch_size: 50)
|
294
|
+
def pipe(texts, disable: [], batch_size: 50)
|
301
295
|
docs = []
|
302
|
-
PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
|
296
|
+
PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
|
303
297
|
docs << Doc.new(@py_nlp, py_doc: py_doc)
|
304
298
|
end
|
305
299
|
docs
|
@@ -309,18 +303,21 @@ module Spacy
|
|
309
303
|
def method_missing(name, *args)
|
310
304
|
@py_nlp.send(name, *args)
|
311
305
|
end
|
306
|
+
|
307
|
+
def respond_to_missing?(sym)
|
308
|
+
sym ? true : super
|
309
|
+
end
|
312
310
|
end
|
313
311
|
|
314
312
|
# See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
|
315
313
|
class Matcher
|
316
|
-
|
317
314
|
# @return [Object] a Python `Matcher` instance accessible via `PyCall`
|
318
315
|
attr_reader :py_matcher
|
319
316
|
|
320
317
|
# Creates a {Matcher} instance
|
321
318
|
# @param nlp [Language] an instance of {Language} class
|
322
319
|
def initialize(nlp)
|
323
|
-
@py_matcher = PyMatcher.(nlp.vocab)
|
320
|
+
@py_matcher = PyMatcher.call(nlp.vocab)
|
324
321
|
end
|
325
322
|
|
326
323
|
# Adds a label string and a text pattern.
|
@@ -334,16 +331,17 @@ module Spacy
|
|
334
331
|
# @param doc [Doc] a {Doc} instance
|
335
332
|
# @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
|
336
333
|
def match(doc)
|
337
|
-
str_results = @py_matcher.(doc.py_doc).to_s
|
334
|
+
str_results = @py_matcher.call(doc.py_doc).to_s
|
338
335
|
s = StringScanner.new(str_results[1..-2])
|
339
336
|
results = []
|
340
337
|
while s.scan_until(/(\d+), (\d+), (\d+)/)
|
341
338
|
next unless s.matched
|
339
|
+
|
342
340
|
triple = s.matched.split(", ")
|
343
341
|
match_id = triple[0].to_i
|
344
342
|
start_index = triple[1].to_i
|
345
343
|
end_index = triple[2].to_i - 1
|
346
|
-
results << {match_id: match_id, start_index: start_index, end_index: end_index}
|
344
|
+
results << { match_id: match_id, start_index: start_index, end_index: end_index }
|
347
345
|
end
|
348
346
|
results
|
349
347
|
end
|
@@ -351,7 +349,6 @@ module Spacy
|
|
351
349
|
|
352
350
|
# See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
|
353
351
|
class Span
|
354
|
-
|
355
352
|
# @return [Object] a Python `Span` instance accessible via `PyCall`
|
356
353
|
attr_reader :py_span
|
357
354
|
|
@@ -360,11 +357,11 @@ module Spacy
|
|
360
357
|
|
361
358
|
include Enumerable
|
362
359
|
|
363
|
-
|
364
|
-
|
365
|
-
|
360
|
+
alias length count
|
361
|
+
alias len count
|
362
|
+
alias size count
|
366
363
|
|
367
|
-
# It is recommended to use {Doc#span} method to create a span. If you need to
|
364
|
+
# It is recommended to use {Doc#span} method to create a span. If you need to
|
368
365
|
# create one using {Span#initialize}, there are two method signatures:
|
369
366
|
# `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
|
370
367
|
# @param doc [Doc] the document to which this span belongs
|
@@ -373,18 +370,14 @@ module Spacy
|
|
373
370
|
# @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
|
374
371
|
def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
|
375
372
|
@doc = doc
|
376
|
-
|
377
|
-
@py_span = py_span
|
378
|
-
else
|
379
|
-
@py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
|
380
|
-
end
|
373
|
+
@py_span = py_span || @py_span = PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
|
381
374
|
end
|
382
375
|
|
383
376
|
# Returns an array of tokens contained in the span.
|
384
377
|
# @return [Array<Token>]
|
385
378
|
def tokens
|
386
379
|
results = []
|
387
|
-
PyCall::List.(@py_span).each do |py_token|
|
380
|
+
PyCall::List.call(@py_span).each do |py_token|
|
388
381
|
results << Token.new(py_token)
|
389
382
|
end
|
390
383
|
results
|
@@ -392,7 +385,7 @@ module Spacy
|
|
392
385
|
|
393
386
|
# Iterates over the elements in the span yielding a token instance each time.
|
394
387
|
def each
|
395
|
-
PyCall::List.(@py_span).each do |py_token|
|
388
|
+
PyCall::List.call(@py_span).each do |py_token|
|
396
389
|
yield Token.new(py_token)
|
397
390
|
end
|
398
391
|
end
|
@@ -401,7 +394,7 @@ module Spacy
|
|
401
394
|
# @return [Array<Span>]
|
402
395
|
def noun_chunks
|
403
396
|
chunk_array = []
|
404
|
-
py_chunks = PyCall::List.(@py_span.noun_chunks)
|
397
|
+
py_chunks = PyCall::List.call(@py_span.noun_chunks)
|
405
398
|
py_chunks.each do |py_span|
|
406
399
|
chunk_array << Span.new(@doc, py_span: py_span)
|
407
400
|
end
|
@@ -410,7 +403,7 @@ module Spacy
|
|
410
403
|
|
411
404
|
# Returns the head token
|
412
405
|
# @return [Token]
|
413
|
-
def root
|
406
|
+
def root
|
414
407
|
Token.new(@py_span.root)
|
415
408
|
end
|
416
409
|
|
@@ -418,7 +411,7 @@ module Spacy
|
|
418
411
|
# @return [Array<Span>]
|
419
412
|
def sents
|
420
413
|
sentence_array = []
|
421
|
-
py_sentences = PyCall::List.(@py_span.sents)
|
414
|
+
py_sentences = PyCall::List.call(@py_span.sents)
|
422
415
|
py_sentences.each do |py_span|
|
423
416
|
sentence_array << Span.new(@doc, py_span: py_span)
|
424
417
|
end
|
@@ -429,7 +422,7 @@ module Spacy
|
|
429
422
|
# @return [Array<Span>]
|
430
423
|
def ents
|
431
424
|
ent_array = []
|
432
|
-
PyCall::List.(@py_span.ents).each do |py_span|
|
425
|
+
PyCall::List.call(@py_span.ents).each do |py_span|
|
433
426
|
ent_array << Span.new(@doc, py_span: py_span)
|
434
427
|
end
|
435
428
|
ent_array
|
@@ -438,8 +431,8 @@ module Spacy
|
|
438
431
|
# Returns a span that represents the sentence that the given span is part of.
|
439
432
|
# @return [Span]
|
440
433
|
def sent
|
441
|
-
py_span = @py_span.sent
|
442
|
-
|
434
|
+
py_span = @py_span.sent
|
435
|
+
Span.new(@doc, py_span: py_span)
|
443
436
|
end
|
444
437
|
|
445
438
|
# Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
|
@@ -447,67 +440,67 @@ module Spacy
|
|
447
440
|
def [](range)
|
448
441
|
if range.is_a?(Range)
|
449
442
|
py_span = @py_span[range]
|
450
|
-
|
443
|
+
Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
|
451
444
|
else
|
452
|
-
|
445
|
+
Token.new(@py_span[range])
|
453
446
|
end
|
454
447
|
end
|
455
448
|
|
456
449
|
# Returns a semantic similarity estimate.
|
457
450
|
# @param other [Span] the other span to which a similarity estimation is conducted
|
458
|
-
# @return [Float]
|
451
|
+
# @return [Float]
|
459
452
|
def similarity(other)
|
460
453
|
py_span.similarity(other.py_span)
|
461
454
|
end
|
462
455
|
|
463
456
|
# Creates a document instance from the span
|
464
|
-
# @return [Doc]
|
457
|
+
# @return [Doc]
|
465
458
|
def as_doc
|
466
|
-
Doc.new(@doc.py_nlp, text:
|
459
|
+
Doc.new(@doc.py_nlp, text: text)
|
467
460
|
end
|
468
461
|
|
469
462
|
# Returns tokens conjoined to the root of the span.
|
470
463
|
# @return [Array<Token>] an array of tokens
|
471
464
|
def conjuncts
|
472
465
|
conjunct_array = []
|
473
|
-
PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
|
466
|
+
PyCall::List.call(@py_span.conjuncts).each do |py_conjunct|
|
474
467
|
conjunct_array << Token.new(py_conjunct)
|
475
468
|
end
|
476
469
|
conjunct_array
|
477
470
|
end
|
478
471
|
|
479
472
|
# Returns tokens that are to the left of the span, whose heads are within the span.
|
480
|
-
# @return [Array<Token>] an array of tokens
|
473
|
+
# @return [Array<Token>] an array of tokens
|
481
474
|
def lefts
|
482
475
|
left_array = []
|
483
|
-
PyCall::List.(@py_span.lefts).each do |py_left|
|
476
|
+
PyCall::List.call(@py_span.lefts).each do |py_left|
|
484
477
|
left_array << Token.new(py_left)
|
485
478
|
end
|
486
479
|
left_array
|
487
480
|
end
|
488
481
|
|
489
482
|
# Returns Tokens that are to the right of the span, whose heads are within the span.
|
490
|
-
# @return [Array<Token>] an array of Tokens
|
483
|
+
# @return [Array<Token>] an array of Tokens
|
491
484
|
def rights
|
492
485
|
right_array = []
|
493
|
-
PyCall::List.(@py_span.rights).each do |py_right|
|
486
|
+
PyCall::List.call(@py_span.rights).each do |py_right|
|
494
487
|
right_array << Token.new(py_right)
|
495
488
|
end
|
496
489
|
right_array
|
497
490
|
end
|
498
491
|
|
499
492
|
# Returns Tokens that are within the span and tokens that descend from them.
|
500
|
-
# @return [Array<Token>] an array of tokens
|
493
|
+
# @return [Array<Token>] an array of tokens
|
501
494
|
def subtree
|
502
495
|
subtree_array = []
|
503
|
-
PyCall::List.(@py_span.subtree).each do |py_subtree|
|
496
|
+
PyCall::List.call(@py_span.subtree).each do |py_subtree|
|
504
497
|
subtree_array << Token.new(py_subtree)
|
505
498
|
end
|
506
499
|
subtree_array
|
507
500
|
end
|
508
501
|
|
509
502
|
# Returns the label
|
510
|
-
# @return [String]
|
503
|
+
# @return [String]
|
511
504
|
def label
|
512
505
|
@py_span.label_
|
513
506
|
end
|
@@ -516,11 +509,14 @@ module Spacy
|
|
516
509
|
def method_missing(name, *args)
|
517
510
|
@py_span.send(name, *args)
|
518
511
|
end
|
512
|
+
|
513
|
+
def respond_to_missing?(sym)
|
514
|
+
sym ? true : super
|
515
|
+
end
|
519
516
|
end
|
520
517
|
|
521
518
|
# See also spaCy Python API document for [`Token`](https://spacy.io/api/token).
|
522
519
|
class Token
|
523
|
-
|
524
520
|
# @return [Object] a Python `Token` instance accessible via `PyCall`
|
525
521
|
attr_reader :py_token
|
526
522
|
|
@@ -528,17 +524,16 @@ module Spacy
|
|
528
524
|
attr_reader :text
|
529
525
|
|
530
526
|
# It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
|
531
|
-
# There is no way to generate a token from scratch but relying on a pre-exising Python
|
527
|
+
# There is no way to generate a token from scratch other than relying on a pre-existing Python `Token` object.
|
532
528
|
# @param py_token [Object] Python `Token` object
|
533
529
|
def initialize(py_token)
|
534
530
|
@py_token = py_token
|
535
531
|
@text = @py_token.text
|
536
532
|
end
|
537
533
|
|
538
|
-
|
539
534
|
# Returns the head token
|
540
535
|
# @return [Token]
|
541
|
-
def head
|
536
|
+
def head
|
542
537
|
Token.new(@py_token.head)
|
543
538
|
end
|
544
539
|
|
@@ -546,7 +541,7 @@ module Spacy
|
|
546
541
|
# @return [Array<Token>] an array of tokens
|
547
542
|
def subtree
|
548
543
|
descendant_array = []
|
549
|
-
PyCall::List.(@py_token.subtree).each do |descendant|
|
544
|
+
PyCall::List.call(@py_token.subtree).each do |descendant|
|
550
545
|
descendant_array << Token.new(descendant)
|
551
546
|
end
|
552
547
|
descendant_array
|
@@ -556,7 +551,7 @@ module Spacy
|
|
556
551
|
# @return [Array<Token>] an array of tokens
|
557
552
|
def ancestors
|
558
553
|
ancestor_array = []
|
559
|
-
PyCall::List.(@py_token.ancestors).each do |ancestor|
|
554
|
+
PyCall::List.call(@py_token.ancestors).each do |ancestor|
|
560
555
|
ancestor_array << Token.new(ancestor)
|
561
556
|
end
|
562
557
|
ancestor_array
|
@@ -566,7 +561,7 @@ module Spacy
|
|
566
561
|
# @return [Array<Token>] an array of tokens
|
567
562
|
def children
|
568
563
|
child_array = []
|
569
|
-
PyCall::List.(@py_token.children).each do |child|
|
564
|
+
PyCall::List.call(@py_token.children).each do |child|
|
570
565
|
child_array << Token.new(child)
|
571
566
|
end
|
572
567
|
child_array
|
@@ -576,7 +571,7 @@ module Spacy
|
|
576
571
|
# @return [Array<Token>] an array of tokens
|
577
572
|
def lefts
|
578
573
|
token_array = []
|
579
|
-
PyCall::List.(@py_token.lefts).each do |token|
|
574
|
+
PyCall::List.call(@py_token.lefts).each do |token|
|
580
575
|
token_array << Token.new(token)
|
581
576
|
end
|
582
577
|
token_array
|
@@ -586,89 +581,87 @@ module Spacy
|
|
586
581
|
# @return [Array<Token>] an array of tokens
|
587
582
|
def rights
|
588
583
|
token_array = []
|
589
|
-
PyCall::List.(@py_token.rights).each do |token|
|
584
|
+
PyCall::List.call(@py_token.rights).each do |token|
|
590
585
|
token_array << Token.new(token)
|
591
586
|
end
|
592
587
|
token_array
|
593
588
|
end
|
594
589
|
|
595
590
|
# String representation of the token.
|
596
|
-
# @return [String]
|
591
|
+
# @return [String]
|
597
592
|
def to_s
|
598
593
|
@text
|
599
594
|
end
|
600
595
|
|
601
596
|
# Returns a hash or string of morphological information
|
602
597
|
# @param hash [Boolean] if true, a hash will be returned instead of a string
|
603
|
-
# @return [Hash, String]
|
604
|
-
def morphology(hash
|
598
|
+
# @return [Hash, String]
|
599
|
+
def morphology(hash: true)
|
605
600
|
if @py_token.has_morph
|
606
601
|
morph_analysis = @py_token.morph
|
607
|
-
if hash
|
608
|
-
return morph_analysis.to_dict
|
609
|
-
else
|
610
|
-
return morph_analysis.to_s
|
611
|
-
end
|
612
|
-
else
|
613
602
|
if hash
|
614
|
-
|
603
|
+
morph_analysis.to_dict
|
615
604
|
else
|
616
|
-
|
605
|
+
morph_analysis.to_s
|
617
606
|
end
|
607
|
+
elsif hash
|
608
|
+
{}
|
609
|
+
else
|
610
|
+
""
|
618
611
|
end
|
619
612
|
end
|
620
613
|
|
621
614
|
# Returns the lemma by calling `lemma_` of `@py_token` object
|
622
|
-
# @return [String]
|
615
|
+
# @return [String]
|
623
616
|
def lemma
|
624
617
|
@py_token.lemma_
|
625
618
|
end
|
626
619
|
|
627
620
|
# Returns the lowercase form by calling `lower_` of `@py_token` object
|
628
|
-
# @return [String]
|
621
|
+
# @return [String]
|
629
622
|
def lower
|
630
623
|
@py_token.lower_
|
631
624
|
end
|
632
625
|
|
633
626
|
# Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_token` object
|
634
|
-
# @return [String]
|
627
|
+
# @return [String]
|
635
628
|
def shape
|
636
629
|
@py_token.shape_
|
637
630
|
end
|
638
631
|
|
639
632
|
# Returns the pos by calling `pos_` of `@py_token` object
|
640
|
-
# @return [String]
|
633
|
+
# @return [String]
|
641
634
|
def pos
|
642
635
|
@py_token.pos_
|
643
636
|
end
|
644
637
|
|
645
638
|
# Returns the fine-grained pos by calling `tag_` of `@py_token` object
|
646
|
-
# @return [String]
|
647
|
-
def tag
|
639
|
+
# @return [String]
|
640
|
+
def tag
|
648
641
|
@py_token.tag_
|
649
642
|
end
|
650
643
|
|
651
644
|
# Returns the dependency relation by calling `dep_` of `@py_token` object
|
652
|
-
# @return [String]
|
645
|
+
# @return [String]
|
653
646
|
def dep
|
654
647
|
@py_token.dep_
|
655
648
|
end
|
656
|
-
|
649
|
+
|
657
650
|
# Returns the language by calling `lang_` of `@py_token` object
|
658
|
-
# @return [String]
|
659
|
-
def lang
|
651
|
+
# @return [String]
|
652
|
+
def lang
|
660
653
|
@py_token.lang_
|
661
654
|
end
|
662
655
|
|
663
656
|
# Returns the trailing space character if present by calling `whitespace_` of `@py_token` object
|
664
|
-
# @return [String]
|
665
|
-
def whitespace
|
657
|
+
# @return [String]
|
658
|
+
def whitespace
|
666
659
|
@py_token.whitespace_
|
667
660
|
end
|
668
661
|
|
669
662
|
# Returns the named entity type by calling `ent_type_` of `@py_token` object
|
670
|
-
# @return [String]
|
671
|
-
def ent_type
|
663
|
+
# @return [String]
|
664
|
+
def ent_type
|
672
665
|
@py_token.ent_type_
|
673
666
|
end
|
674
667
|
|
@@ -682,11 +675,14 @@ module Spacy
|
|
682
675
|
def method_missing(name, *args)
|
683
676
|
@py_token.send(name, *args)
|
684
677
|
end
|
678
|
+
|
679
|
+
def respond_to_missing?(sym)
|
680
|
+
sym ? true : super
|
681
|
+
end
|
685
682
|
end
|
686
683
|
|
687
684
|
# See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
|
688
|
-
class Lexeme
|
689
|
-
|
685
|
+
class Lexeme
|
690
686
|
# @return [Object] a Python `Lexeme` instance accessible via `PyCall`
|
691
687
|
attr_reader :py_lexeme
|
692
688
|
|
@@ -702,50 +698,50 @@ module Spacy
|
|
702
698
|
end
|
703
699
|
|
704
700
|
# String representation of the lexeme.
|
705
|
-
# @return [String]
|
701
|
+
# @return [String]
|
706
702
|
def to_s
|
707
703
|
@text
|
708
704
|
end
|
709
705
|
|
710
706
|
# Returns the lowercase form by calling `lower_` of `@py_lexeme` object
|
711
|
-
# @return [String]
|
707
|
+
# @return [String]
|
712
708
|
def lower
|
713
709
|
@py_lexeme.lower_
|
714
710
|
end
|
715
711
|
|
716
712
|
# Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_lexeme` object
|
717
|
-
# @return [String]
|
713
|
+
# @return [String]
|
718
714
|
def shape
|
719
715
|
@py_lexeme.shape_
|
720
716
|
end
|
721
717
|
|
722
718
|
# Returns the language by calling `lang_` of `@py_lexeme` object
|
723
|
-
# @return [String]
|
724
|
-
def lang
|
719
|
+
# @return [String]
|
720
|
+
def lang
|
725
721
|
@py_lexeme.lang_
|
726
722
|
end
|
727
723
|
|
728
724
|
# Returns the length-N substring from the start of the word by calling `prefix_` of `@py_lexeme` object
|
729
|
-
# @return [String]
|
730
|
-
def prefix
|
725
|
+
# @return [String]
|
726
|
+
def prefix
|
731
727
|
@py_lexeme.prefix_
|
732
728
|
end
|
733
|
-
|
729
|
+
|
734
730
|
# Returns the length-N substring from the end of the word by calling `suffix_` of `@py_lexeme` object
|
735
|
-
# @return [String]
|
731
|
+
# @return [String]
|
736
732
|
def suffix
|
737
733
|
@py_lexeme.suffix_
|
738
734
|
end
|
739
735
|
|
740
736
|
# Returns the lexeme's norm, i.e. a normalized form of the lexeme, by calling `norm_` of `@py_lexeme` object
|
741
|
-
# @return [String]
|
737
|
+
# @return [String]
|
742
738
|
def norm
|
743
739
|
@py_lexeme.norm_
|
744
740
|
end
|
745
741
|
|
746
742
|
# Returns a semantic similarity estimate.
|
747
|
-
# @param other [Lexeme] the other
|
748
|
-
# @return [Float]
|
743
|
+
# @param other [Lexeme] the other lexeme to which a similarity estimation is made
|
744
|
+
# @return [Float]
|
749
745
|
def similarity(other)
|
750
746
|
@py_lexeme.similarity(other.py_lexeme)
|
751
747
|
end
|
@@ -754,7 +750,9 @@ module Spacy
|
|
754
750
|
def method_missing(name, *args)
|
755
751
|
@py_lexeme.send(name, *args)
|
756
752
|
end
|
757
|
-
end
|
758
753
|
|
754
|
+
def respond_to_missing?(sym)
|
755
|
+
sym ? true : super
|
756
|
+
end
|
757
|
+
end
|
759
758
|
end
|
760
|
-
|