ruby-spacy 0.1.4 → 0.1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +48 -0
- data/.solargraph.yml +22 -0
- data/CHANGELOG.md +5 -1
- data/Gemfile +7 -7
- data/Gemfile.lock +3 -3
- data/README.md +40 -39
- data/examples/get_started/lexeme.rb +3 -1
- data/examples/get_started/linguistic_annotations.rb +3 -1
- data/examples/get_started/morphology.rb +3 -1
- data/examples/get_started/most_similar.rb +30 -27
- data/examples/get_started/named_entities.rb +4 -2
- data/examples/get_started/pos_tags_and_dependencies.rb +3 -1
- data/examples/get_started/similarity.rb +4 -2
- data/examples/get_started/tokenization.rb +3 -1
- data/examples/get_started/visualizing_dependencies.rb +2 -2
- data/examples/get_started/visualizing_dependencies_compact.rb +2 -0
- data/examples/get_started/visualizing_named_entities.rb +4 -2
- data/examples/get_started/vocab.rb +3 -1
- data/examples/get_started/word_vectors.rb +3 -1
- data/examples/japanese/ancestors.rb +6 -4
- data/examples/japanese/entity_annotations_and_labels.rb +4 -2
- data/examples/japanese/information_extraction.rb +6 -6
- data/examples/japanese/lemmatization.rb +3 -1
- data/examples/japanese/most_similar.rb +30 -27
- data/examples/japanese/named_entity_recognition.rb +3 -2
- data/examples/japanese/navigating_parse_tree.rb +19 -17
- data/examples/japanese/noun_chunks.rb +2 -0
- data/examples/japanese/pos_tagging.rb +3 -1
- data/examples/japanese/sentence_segmentation.rb +3 -2
- data/examples/japanese/similarity.rb +2 -0
- data/examples/japanese/tokenization.rb +2 -0
- data/examples/japanese/visualizing_dependencies.rb +3 -1
- data/examples/japanese/visualizing_named_entities.rb +4 -2
- data/examples/linguistic_features/ancestors.rb +7 -5
- data/examples/linguistic_features/entity_annotations_and_labels.rb +4 -2
- data/examples/linguistic_features/finding_a_verb_with_a_subject.rb +3 -5
- data/examples/linguistic_features/information_extraction.rb +9 -9
- data/examples/linguistic_features/iterating_children.rb +6 -8
- data/examples/linguistic_features/iterating_lefts_and_rights.rb +7 -5
- data/examples/linguistic_features/lemmatization.rb +3 -1
- data/examples/linguistic_features/named_entity_recognition.rb +3 -1
- data/examples/linguistic_features/navigating_parse_tree.rb +3 -1
- data/examples/linguistic_features/noun_chunks.rb +3 -1
- data/examples/linguistic_features/pos_tagging.rb +3 -1
- data/examples/linguistic_features/retokenize_1.rb +2 -0
- data/examples/linguistic_features/retokenize_2.rb +4 -2
- data/examples/linguistic_features/rule_based_morphology.rb +4 -2
- data/examples/linguistic_features/sentence_segmentation.rb +3 -2
- data/examples/linguistic_features/similarity.rb +4 -2
- data/examples/linguistic_features/similarity_between_lexemes.rb +2 -0
- data/examples/linguistic_features/similarity_between_spans.rb +7 -5
- data/examples/linguistic_features/tokenization.rb +3 -2
- data/examples/rule_based_matching/creating_spans_from_matches.rb +5 -3
- data/examples/rule_based_matching/matcher.rb +4 -2
- data/lib/ruby-spacy/version.rb +1 -1
- data/lib/ruby-spacy.rb +142 -136
- data/ruby-spacy.gemspec +15 -17
- data/tags +132 -0
- metadata +69 -10
data/lib/ruby-spacy.rb
CHANGED
@@ -1,17 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative "ruby-spacy/version"
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require 'pycall/import'
|
8
|
-
include PyCall::Import
|
4
|
+
require "strscan"
|
5
|
+
require "numpy"
|
6
|
+
require "pycall/import"
|
9
7
|
|
10
8
|
# This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
|
11
9
|
module Spacy
|
12
|
-
|
13
10
|
extend PyCall::Import
|
14
|
-
spacy = PyCall.import_module(
|
11
|
+
spacy = PyCall.import_module("spacy")
|
15
12
|
|
16
13
|
# Python `Language` class
|
17
14
|
PyLanguage = spacy.language.Language
|
@@ -24,23 +21,22 @@ module Spacy
|
|
24
21
|
|
25
22
|
# Python `Token` class object
|
26
23
|
PyToken = spacy.tokens.Token
|
27
|
-
|
24
|
+
|
28
25
|
# Python `Matcher` class object
|
29
26
|
PyMatcher = spacy.matcher.Matcher
|
30
27
|
|
31
28
|
# Python `displacy` object
|
32
29
|
PyDisplacy = spacy.displacy
|
33
30
|
|
34
|
-
# A utility module method to convert Python's generator object to a Ruby array,
|
31
|
+
# A utility module method to convert Python's generator object to a Ruby array,
|
35
32
|
# mainly used on the items inside the array returned from dependency-related methods
|
36
33
|
# such as {Span#rights}, {Span#lefts} and {Span#subtree}.
|
37
34
|
def self.generator_to_array(py_generator)
|
38
|
-
PyCall::List.(py_generator)
|
35
|
+
PyCall::List.call(py_generator)
|
39
36
|
end
|
40
37
|
|
41
38
|
# See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
|
42
39
|
class Doc
|
43
|
-
|
44
40
|
# @return [Object] a Python `Language` instance accessible via `PyCall`
|
45
41
|
attr_reader :py_nlp
|
46
42
|
|
@@ -52,23 +48,19 @@ module Spacy
|
|
52
48
|
|
53
49
|
include Enumerable
|
54
50
|
|
55
|
-
|
56
|
-
|
57
|
-
|
51
|
+
alias length count
|
52
|
+
alias len count
|
53
|
+
alias size count
|
58
54
|
|
59
|
-
# It is recommended to use {Language#read} method to create a doc. If you need to
|
60
|
-
# create one using {Doc#initialize}, there are two method signatures:
|
55
|
+
# It is recommended to use {Language#read} method to create a doc. If you need to
|
56
|
+
# create one using {Doc#initialize}, there are two method signatures:
|
61
57
|
# `Spacy::Doc.new(nlp_id, py_doc: Object)` and `Spacy::Doc.new(nlp_id, text: String)`.
|
62
58
|
# @param nlp [Language] an instance of {Language} class
|
63
59
|
# @param py_doc [Object] an instance of Python `Doc` class
|
64
60
|
# @param text [String] the text string to be analyzed
|
65
61
|
def initialize(nlp, py_doc: nil, text: nil)
|
66
62
|
@py_nlp = nlp
|
67
|
-
|
68
|
-
@py_doc = py_doc
|
69
|
-
else
|
70
|
-
@py_doc = nlp.(text)
|
71
|
-
end
|
63
|
+
@py_doc = py_doc || @py_doc = nlp.call(text)
|
72
64
|
@text = @py_doc.text
|
73
65
|
end
|
74
66
|
|
@@ -77,25 +69,25 @@ module Spacy
|
|
77
69
|
# @param end_index [Integer] the end position of the span to be retokenized in the document
|
78
70
|
# @param attributes [Hash] attributes to set on the merged token
|
79
71
|
def retokenize(start_index, end_index, attributes = {})
|
80
|
-
PyCall.with(@py_doc.retokenize
|
81
|
-
retokenizer.merge(@py_doc[start_index
|
72
|
+
PyCall.with(@py_doc.retokenize) do |retokenizer|
|
73
|
+
retokenizer.merge(@py_doc[start_index..end_index], attrs: attributes)
|
82
74
|
end
|
83
75
|
end
|
84
76
|
|
85
77
|
# Retokenizes the text splitting the specified token.
|
86
78
|
# @param pos_in_doc [Integer] the position of the span to be retokenized in the document
|
87
|
-
# @param split_array [Array<String>] text strings of the split results
|
79
|
+
# @param split_array [Array<String>] text strings of the split results
|
88
80
|
# @param ancestor_pos [Integer] the position of the immediate ancestor element of the split elements in the document
|
89
81
|
# @param attributes [Hash] the attributes of the split elements
|
90
82
|
def retokenize_split(pos_in_doc, split_array, head_pos_in_split, ancestor_pos, attributes = {})
|
91
|
-
PyCall.with(@py_doc.retokenize
|
83
|
+
PyCall.with(@py_doc.retokenize) do |retokenizer|
|
92
84
|
heads = [[@py_doc[pos_in_doc], head_pos_in_split], @py_doc[ancestor_pos]]
|
93
85
|
retokenizer.split(@py_doc[pos_in_doc], split_array, heads: heads, attrs: attributes)
|
94
86
|
end
|
95
87
|
end
|
96
88
|
|
97
89
|
# String representation of the document.
|
98
|
-
# @return [String]
|
90
|
+
# @return [String]
|
99
91
|
def to_s
|
100
92
|
@text
|
101
93
|
end
|
@@ -104,7 +96,7 @@ module Spacy
|
|
104
96
|
# @return [Array<Token>]
|
105
97
|
def tokens
|
106
98
|
results = []
|
107
|
-
PyCall::List.(@py_doc).each do |py_token|
|
99
|
+
PyCall::List.call(@py_doc).each do |py_token|
|
108
100
|
results << Token.new(py_token)
|
109
101
|
end
|
110
102
|
results
|
@@ -112,12 +104,12 @@ module Spacy
|
|
112
104
|
|
113
105
|
# Iterates over the elements in the doc yielding a token instance each time.
|
114
106
|
def each
|
115
|
-
PyCall::List.(@py_doc).each do |py_token|
|
107
|
+
PyCall::List.call(@py_doc).each do |py_token|
|
116
108
|
yield Token.new(py_token)
|
117
109
|
end
|
118
110
|
end
|
119
111
|
|
120
|
-
# Returns a span of the specified range within the doc.
|
112
|
+
# Returns a span of the specified range within the doc.
|
121
113
|
# The method should be used in either of two ways: `Doc#span(range)` or `Doc#span(start_pos, size_of_span)`.
|
122
114
|
# @param range_or_start [Range, Integer] a range object, or, alternatively, an integer that represents the start position of the span
|
123
115
|
# @param optional_size [Integer] an integer representing the size of the span
|
@@ -125,7 +117,7 @@ module Spacy
|
|
125
117
|
def span(range_or_start, optional_size = nil)
|
126
118
|
if optional_size
|
127
119
|
start_index = range_or_start
|
128
|
-
temp = tokens[start_index
|
120
|
+
temp = tokens[start_index...start_index + optional_size]
|
129
121
|
else
|
130
122
|
start_index = range_or_start.first
|
131
123
|
range = range_or_start
|
@@ -141,7 +133,7 @@ module Spacy
|
|
141
133
|
# @return [Array<Span>]
|
142
134
|
def noun_chunks
|
143
135
|
chunk_array = []
|
144
|
-
py_chunks = PyCall::List.(@py_doc.noun_chunks)
|
136
|
+
py_chunks = PyCall::List.call(@py_doc.noun_chunks)
|
145
137
|
py_chunks.each do |py_chunk|
|
146
138
|
chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
|
147
139
|
end
|
@@ -152,7 +144,7 @@ module Spacy
|
|
152
144
|
# @return [Array<Span>]
|
153
145
|
def sents
|
154
146
|
sentence_array = []
|
155
|
-
py_sentences = PyCall::List.(@py_doc.sents)
|
147
|
+
py_sentences = PyCall::List.call(@py_doc.sents)
|
156
148
|
py_sentences.each do |py_sent|
|
157
149
|
sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
|
158
150
|
end
|
@@ -164,9 +156,9 @@ module Spacy
|
|
164
156
|
def ents
|
165
157
|
# so that ents can be "each"-ed in Ruby
|
166
158
|
ent_array = []
|
167
|
-
PyCall::List.(@py_doc.ents).each do |ent|
|
159
|
+
PyCall::List.call(@py_doc.ents).each do |ent|
|
168
160
|
ent.define_singleton_method :label do
|
169
|
-
|
161
|
+
label_
|
170
162
|
end
|
171
163
|
ent_array << ent
|
172
164
|
end
|
@@ -178,15 +170,15 @@ module Spacy
|
|
178
170
|
def [](range)
|
179
171
|
if range.is_a?(Range)
|
180
172
|
py_span = @py_doc[range]
|
181
|
-
|
173
|
+
Span.new(self, start_index: py_span.start, end_index: py_span.end - 1)
|
182
174
|
else
|
183
|
-
|
175
|
+
Token.new(@py_doc[range])
|
184
176
|
end
|
185
177
|
end
|
186
178
|
|
187
179
|
# Returns a semantic similarity estimate.
|
188
180
|
# @param other [Doc] the other doc to which a similarity estimation is made
|
189
|
-
# @return [Float]
|
181
|
+
# @return [Float]
|
190
182
|
def similarity(other)
|
191
183
|
py_doc.similarity(other.py_doc)
|
192
184
|
end
|
@@ -196,18 +188,21 @@ module Spacy
|
|
196
188
|
# @param compact [Boolean] only relevant to the `dep` style
|
197
189
|
# @return [String] in the case of `dep`, the output text will be an SVG, whereas in the `ent` style, the output text will be an HTML.
|
198
190
|
def displacy(style: "dep", compact: false)
|
199
|
-
PyDisplacy.render(py_doc, style: style, options: {compact: compact}, jupyter: false)
|
191
|
+
PyDisplacy.render(py_doc, style: style, options: { compact: compact }, jupyter: false)
|
200
192
|
end
|
201
193
|
|
202
194
|
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
203
195
|
def method_missing(name, *args)
|
204
196
|
@py_doc.send(name, *args)
|
205
197
|
end
|
198
|
+
|
199
|
+
def respond_to_missing?(sym)
|
200
|
+
sym ? true : super
|
201
|
+
end
|
206
202
|
end
|
207
203
|
|
208
204
|
# See also spaCy Python API document for [`Language`](https://spacy.io/api/language).
|
209
205
|
class Language
|
210
|
-
|
211
206
|
# @return [String] an identifier string that can be used to refer to the Python `Language` object inside `PyCall::exec` or `PyCall::eval`
|
212
207
|
attr_reader :spacy_nlp_id
|
213
208
|
|
@@ -245,7 +240,7 @@ module Spacy
|
|
245
240
|
# @return [Array<String>] An array of text strings representing pipeline components
|
246
241
|
def pipe_names
|
247
242
|
pipe_array = []
|
248
|
-
PyCall::List.(@py_nlp.pipe_names).each do |pipe|
|
243
|
+
PyCall::List.call(@py_nlp.pipe_names).each do |pipe|
|
249
244
|
pipe_array << pipe
|
250
245
|
end
|
251
246
|
pipe_array
|
@@ -268,18 +263,25 @@ module Spacy
|
|
268
263
|
# Returns _n_ lexemes having the vector representations that are the most similar to a given vector representation of a word.
|
269
264
|
# @param vector [Object] A vector representation of a word (whether existing or non-existing)
|
270
265
|
# @return [Array<Hash{:key => Integer, :text => String, :best_row => Array<Float>, :score => Float}>] An array of hash objects, each containing the `key`, `text`, `best_row` and similarity `score` of a lexeme
|
271
|
-
def most_similar(vector,
|
266
|
+
def most_similar(vector, num)
|
272
267
|
vec_array = Numpy.asarray([vector])
|
273
|
-
py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n:
|
274
|
-
key_texts = PyCall.eval("[[str(
|
275
|
-
keys = key_texts.map{|kt| kt[0]}
|
276
|
-
texts = key_texts.map{|kt| kt[1]}
|
277
|
-
best_rows = PyCall::List.(py_result[1])[0]
|
278
|
-
scores = PyCall::List.(py_result[2])[0]
|
268
|
+
py_result = @py_nlp.vocab.vectors.most_similar(vec_array, n: num)
|
269
|
+
key_texts = PyCall.eval("[[str(num), #{@spacy_nlp_id}.vocab[num].text] for num in #{py_result[0][0].tolist}]")
|
270
|
+
keys = key_texts.map { |kt| kt[0] }
|
271
|
+
texts = key_texts.map { |kt| kt[1] }
|
272
|
+
best_rows = PyCall::List.call(py_result[1])[0]
|
273
|
+
scores = PyCall::List.call(py_result[2])[0]
|
279
274
|
|
280
275
|
results = []
|
281
|
-
|
282
|
-
|
276
|
+
num.times do |i|
|
277
|
+
result = { key: keys[i].to_i,
|
278
|
+
text: texts[i],
|
279
|
+
best_row: best_rows[i],
|
280
|
+
score: scores[i] }
|
281
|
+
result.each_key do |key|
|
282
|
+
result.define_singleton_method(key) { result[key] }
|
283
|
+
end
|
284
|
+
results << result
|
283
285
|
end
|
284
286
|
results
|
285
287
|
end
|
@@ -289,9 +291,9 @@ module Spacy
|
|
289
291
|
# @param disable [Array<String>]
|
290
292
|
# @param batch_size [Integer]
|
291
293
|
# @return [Array<Doc>]
|
292
|
-
def pipe(texts, disable: [], batch_size: 50)
|
294
|
+
def pipe(texts, disable: [], batch_size: 50)
|
293
295
|
docs = []
|
294
|
-
PyCall::List.(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
|
296
|
+
PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
|
295
297
|
docs << Doc.new(@py_nlp, py_doc: py_doc)
|
296
298
|
end
|
297
299
|
docs
|
@@ -301,18 +303,21 @@ module Spacy
|
|
301
303
|
def method_missing(name, *args)
|
302
304
|
@py_nlp.send(name, *args)
|
303
305
|
end
|
306
|
+
|
307
|
+
def respond_to_missing?(sym)
|
308
|
+
sym ? true : super
|
309
|
+
end
|
304
310
|
end
|
305
311
|
|
306
312
|
# See also spaCy Python API document for [`Matcher`](https://spacy.io/api/matcher).
|
307
313
|
class Matcher
|
308
|
-
|
309
314
|
# @return [Object] a Python `Matcher` instance accessible via `PyCall`
|
310
315
|
attr_reader :py_matcher
|
311
316
|
|
312
317
|
# Creates a {Matcher} instance
|
313
318
|
# @param nlp [Language] an instance of {Language} class
|
314
319
|
def initialize(nlp)
|
315
|
-
@py_matcher = PyMatcher.(nlp.vocab)
|
320
|
+
@py_matcher = PyMatcher.call(nlp.vocab)
|
316
321
|
end
|
317
322
|
|
318
323
|
# Adds a label string and a text pattern.
|
@@ -326,16 +331,17 @@ module Spacy
|
|
326
331
|
# @param doc [Doc] a {Doc} instance
|
327
332
|
# @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
|
328
333
|
def match(doc)
|
329
|
-
str_results = @py_matcher.(doc.py_doc).to_s
|
334
|
+
str_results = @py_matcher.call(doc.py_doc).to_s
|
330
335
|
s = StringScanner.new(str_results[1..-2])
|
331
336
|
results = []
|
332
337
|
while s.scan_until(/(\d+), (\d+), (\d+)/)
|
333
338
|
next unless s.matched
|
339
|
+
|
334
340
|
triple = s.matched.split(", ")
|
335
341
|
match_id = triple[0].to_i
|
336
342
|
start_index = triple[1].to_i
|
337
343
|
end_index = triple[2].to_i - 1
|
338
|
-
results << {match_id: match_id, start_index: start_index, end_index: end_index}
|
344
|
+
results << { match_id: match_id, start_index: start_index, end_index: end_index }
|
339
345
|
end
|
340
346
|
results
|
341
347
|
end
|
@@ -343,7 +349,6 @@ module Spacy
|
|
343
349
|
|
344
350
|
# See also spaCy Python API document for [`Span`](https://spacy.io/api/span).
|
345
351
|
class Span
|
346
|
-
|
347
352
|
# @return [Object] a Python `Span` instance accessible via `PyCall`
|
348
353
|
attr_reader :py_span
|
349
354
|
|
@@ -352,11 +357,11 @@ module Spacy
|
|
352
357
|
|
353
358
|
include Enumerable
|
354
359
|
|
355
|
-
|
356
|
-
|
357
|
-
|
360
|
+
alias length count
|
361
|
+
alias len count
|
362
|
+
alias size count
|
358
363
|
|
359
|
-
# It is recommended to use {Doc#span} method to create a span. If you need to
|
364
|
+
# It is recommended to use {Doc#span} method to create a span. If you need to
|
360
365
|
# create one using {Span#initialize}, there are two method signatures:
|
361
366
|
# `Span.new(doc, py_span: Object)` or `Span.new(doc, start_index: Integer, end_index: Integer, options: Hash)`.
|
362
367
|
# @param doc [Doc] the document to which this span belongs
|
@@ -365,18 +370,14 @@ module Spacy
|
|
365
370
|
# @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
|
366
371
|
def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
|
367
372
|
@doc = doc
|
368
|
-
|
369
|
-
@py_span = py_span
|
370
|
-
else
|
371
|
-
@py_span = PySpan.(@doc.py_doc, start_index, end_index + 1, options)
|
372
|
-
end
|
373
|
+
@py_span = py_span || @py_span = PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
|
373
374
|
end
|
374
375
|
|
375
376
|
# Returns an array of tokens contained in the span.
|
376
377
|
# @return [Array<Token>]
|
377
378
|
def tokens
|
378
379
|
results = []
|
379
|
-
PyCall::List.(@py_span).each do |py_token|
|
380
|
+
PyCall::List.call(@py_span).each do |py_token|
|
380
381
|
results << Token.new(py_token)
|
381
382
|
end
|
382
383
|
results
|
@@ -384,7 +385,7 @@ module Spacy
|
|
384
385
|
|
385
386
|
# Iterates over the elements in the span yielding a token instance each time.
|
386
387
|
def each
|
387
|
-
PyCall::List.(@py_span).each do |py_token|
|
388
|
+
PyCall::List.call(@py_span).each do |py_token|
|
388
389
|
yield Token.new(py_token)
|
389
390
|
end
|
390
391
|
end
|
@@ -393,7 +394,7 @@ module Spacy
|
|
393
394
|
# @return [Array<Span>]
|
394
395
|
def noun_chunks
|
395
396
|
chunk_array = []
|
396
|
-
py_chunks = PyCall::List.(@py_span.noun_chunks)
|
397
|
+
py_chunks = PyCall::List.call(@py_span.noun_chunks)
|
397
398
|
py_chunks.each do |py_span|
|
398
399
|
chunk_array << Span.new(@doc, py_span: py_span)
|
399
400
|
end
|
@@ -402,7 +403,7 @@ module Spacy
|
|
402
403
|
|
403
404
|
# Returns the head token
|
404
405
|
# @return [Token]
|
405
|
-
def root
|
406
|
+
def root
|
406
407
|
Token.new(@py_span.root)
|
407
408
|
end
|
408
409
|
|
@@ -410,7 +411,7 @@ module Spacy
|
|
410
411
|
# @return [Array<Span>]
|
411
412
|
def sents
|
412
413
|
sentence_array = []
|
413
|
-
py_sentences = PyCall::List.(@py_span.sents)
|
414
|
+
py_sentences = PyCall::List.call(@py_span.sents)
|
414
415
|
py_sentences.each do |py_span|
|
415
416
|
sentence_array << Span.new(@doc, py_span: py_span)
|
416
417
|
end
|
@@ -421,7 +422,7 @@ module Spacy
|
|
421
422
|
# @return [Array<Span>]
|
422
423
|
def ents
|
423
424
|
ent_array = []
|
424
|
-
PyCall::List.(@py_span.ents).each do |py_span|
|
425
|
+
PyCall::List.call(@py_span.ents).each do |py_span|
|
425
426
|
ent_array << Span.new(@doc, py_span: py_span)
|
426
427
|
end
|
427
428
|
ent_array
|
@@ -430,8 +431,8 @@ module Spacy
|
|
430
431
|
# Returns a span that represents the sentence that the given span is part of.
|
431
432
|
# @return [Span]
|
432
433
|
def sent
|
433
|
-
py_span = @py_span.sent
|
434
|
-
|
434
|
+
py_span = @py_span.sent
|
435
|
+
Span.new(@doc, py_span: py_span)
|
435
436
|
end
|
436
437
|
|
437
438
|
# Returns a span if a range object is given or a token if an integer representing the position of the doc is given.
|
@@ -439,67 +440,67 @@ module Spacy
|
|
439
440
|
def [](range)
|
440
441
|
if range.is_a?(Range)
|
441
442
|
py_span = @py_span[range]
|
442
|
-
|
443
|
+
Span.new(@doc, start_index: py_span.start, end_index: py_span.end - 1)
|
443
444
|
else
|
444
|
-
|
445
|
+
Token.new(@py_span[range])
|
445
446
|
end
|
446
447
|
end
|
447
448
|
|
448
449
|
# Returns a semantic similarity estimate.
|
449
450
|
# @param other [Span] the other span to which a similarity estimation is conducted
|
450
|
-
# @return [Float]
|
451
|
+
# @return [Float]
|
451
452
|
def similarity(other)
|
452
453
|
py_span.similarity(other.py_span)
|
453
454
|
end
|
454
455
|
|
455
456
|
# Creates a document instance from the span
|
456
|
-
# @return [Doc]
|
457
|
+
# @return [Doc]
|
457
458
|
def as_doc
|
458
|
-
Doc.new(@doc.py_nlp, text:
|
459
|
+
Doc.new(@doc.py_nlp, text: text)
|
459
460
|
end
|
460
461
|
|
461
462
|
# Returns tokens conjoined to the root of the span.
|
462
463
|
# @return [Array<Token>] an array of tokens
|
463
464
|
def conjuncts
|
464
465
|
conjunct_array = []
|
465
|
-
PyCall::List.(@py_span.conjuncts).each do |py_conjunct|
|
466
|
+
PyCall::List.call(@py_span.conjuncts).each do |py_conjunct|
|
466
467
|
conjunct_array << Token.new(py_conjunct)
|
467
468
|
end
|
468
469
|
conjunct_array
|
469
470
|
end
|
470
471
|
|
471
472
|
# Returns tokens that are to the left of the span, whose heads are within the span.
|
472
|
-
# @return [Array<Token>] an array of tokens
|
473
|
+
# @return [Array<Token>] an array of tokens
|
473
474
|
def lefts
|
474
475
|
left_array = []
|
475
|
-
PyCall::List.(@py_span.lefts).each do |py_left|
|
476
|
+
PyCall::List.call(@py_span.lefts).each do |py_left|
|
476
477
|
left_array << Token.new(py_left)
|
477
478
|
end
|
478
479
|
left_array
|
479
480
|
end
|
480
481
|
|
481
482
|
# Returns Tokens that are to the right of the span, whose heads are within the span.
|
482
|
-
# @return [Array<Token>] an array of Tokens
|
483
|
+
# @return [Array<Token>] an array of Tokens
|
483
484
|
def rights
|
484
485
|
right_array = []
|
485
|
-
PyCall::List.(@py_span.rights).each do |py_right|
|
486
|
+
PyCall::List.call(@py_span.rights).each do |py_right|
|
486
487
|
right_array << Token.new(py_right)
|
487
488
|
end
|
488
489
|
right_array
|
489
490
|
end
|
490
491
|
|
491
492
|
# Returns Tokens that are within the span and tokens that descend from them.
|
492
|
-
# @return [Array<Token>] an array of tokens
|
493
|
+
# @return [Array<Token>] an array of tokens
|
493
494
|
def subtree
|
494
495
|
subtree_array = []
|
495
|
-
PyCall::List.(@py_span.subtree).each do |py_subtree|
|
496
|
+
PyCall::List.call(@py_span.subtree).each do |py_subtree|
|
496
497
|
subtree_array << Token.new(py_subtree)
|
497
498
|
end
|
498
499
|
subtree_array
|
499
500
|
end
|
500
501
|
|
501
502
|
# Returns the label
|
502
|
-
# @return [String]
|
503
|
+
# @return [String]
|
503
504
|
def label
|
504
505
|
@py_span.label_
|
505
506
|
end
|
@@ -508,11 +509,14 @@ module Spacy
|
|
508
509
|
def method_missing(name, *args)
|
509
510
|
@py_span.send(name, *args)
|
510
511
|
end
|
512
|
+
|
513
|
+
def respond_to_missing?(sym)
|
514
|
+
sym ? true : super
|
515
|
+
end
|
511
516
|
end
|
512
517
|
|
513
518
|
# See also spaCy Python API document for [`Token`](https://spacy.io/api/token).
|
514
519
|
class Token
|
515
|
-
|
516
520
|
# @return [Object] a Python `Token` instance accessible via `PyCall`
|
517
521
|
attr_reader :py_token
|
518
522
|
|
@@ -520,17 +524,16 @@ module Spacy
|
|
520
524
|
attr_reader :text
|
521
525
|
|
522
526
|
# It is recommended to use {Doc#tokens} or {Span#tokens} methods to create tokens.
|
523
|
-
# There is no way to generate a token from scratch but relying on a pre-exising Python
|
527
|
+
# There is no way to generate a token from scratch but relying on a pre-existing Python `Token` object.
|
524
528
|
# @param py_token [Object] Python `Token` object
|
525
529
|
def initialize(py_token)
|
526
530
|
@py_token = py_token
|
527
531
|
@text = @py_token.text
|
528
532
|
end
|
529
533
|
|
530
|
-
|
531
534
|
# Returns the head token
|
532
535
|
# @return [Token]
|
533
|
-
def head
|
536
|
+
def head
|
534
537
|
Token.new(@py_token.head)
|
535
538
|
end
|
536
539
|
|
@@ -538,7 +541,7 @@ module Spacy
|
|
538
541
|
# @return [Array<Token>] an array of tokens
|
539
542
|
def subtree
|
540
543
|
descendant_array = []
|
541
|
-
PyCall::List.(@py_token.subtree).each do |descendant|
|
544
|
+
PyCall::List.call(@py_token.subtree).each do |descendant|
|
542
545
|
descendant_array << Token.new(descendant)
|
543
546
|
end
|
544
547
|
descendant_array
|
@@ -548,7 +551,7 @@ module Spacy
|
|
548
551
|
# @return [Array<Token>] an array of tokens
|
549
552
|
def ancestors
|
550
553
|
ancestor_array = []
|
551
|
-
PyCall::List.(@py_token.ancestors).each do |ancestor|
|
554
|
+
PyCall::List.call(@py_token.ancestors).each do |ancestor|
|
552
555
|
ancestor_array << Token.new(ancestor)
|
553
556
|
end
|
554
557
|
ancestor_array
|
@@ -558,7 +561,7 @@ module Spacy
|
|
558
561
|
# @return [Array<Token>] an array of tokens
|
559
562
|
def children
|
560
563
|
child_array = []
|
561
|
-
PyCall::List.(@py_token.children).each do |child|
|
564
|
+
PyCall::List.call(@py_token.children).each do |child|
|
562
565
|
child_array << Token.new(child)
|
563
566
|
end
|
564
567
|
child_array
|
@@ -568,7 +571,7 @@ module Spacy
|
|
568
571
|
# @return [Array<Token>] an array of tokens
|
569
572
|
def lefts
|
570
573
|
token_array = []
|
571
|
-
PyCall::List.(@py_token.lefts).each do |token|
|
574
|
+
PyCall::List.call(@py_token.lefts).each do |token|
|
572
575
|
token_array << Token.new(token)
|
573
576
|
end
|
574
577
|
token_array
|
@@ -578,89 +581,87 @@ module Spacy
|
|
578
581
|
# @return [Array<Token>] an array of tokens
|
579
582
|
def rights
|
580
583
|
token_array = []
|
581
|
-
PyCall::List.(@py_token.rights).each do |token|
|
584
|
+
PyCall::List.call(@py_token.rights).each do |token|
|
582
585
|
token_array << Token.new(token)
|
583
586
|
end
|
584
587
|
token_array
|
585
588
|
end
|
586
589
|
|
587
590
|
# String representation of the token.
|
588
|
-
# @return [String]
|
591
|
+
# @return [String]
|
589
592
|
def to_s
|
590
593
|
@text
|
591
594
|
end
|
592
595
|
|
593
596
|
# Returns a hash or string of morphological information
|
594
597
|
# @param hash [Boolean] if true, a hash will be returned instead of a string
|
595
|
-
# @return [Hash, String]
|
596
|
-
def morphology(hash
|
598
|
+
# @return [Hash, String]
|
599
|
+
def morphology(hash: true)
|
597
600
|
if @py_token.has_morph
|
598
601
|
morph_analysis = @py_token.morph
|
599
|
-
if hash
|
600
|
-
return morph_analysis.to_dict
|
601
|
-
else
|
602
|
-
return morph_analysis.to_s
|
603
|
-
end
|
604
|
-
else
|
605
602
|
if hash
|
606
|
-
|
603
|
+
morph_analysis.to_dict
|
607
604
|
else
|
608
|
-
|
605
|
+
morph_analysis.to_s
|
609
606
|
end
|
607
|
+
elsif hash
|
608
|
+
{}
|
609
|
+
else
|
610
|
+
""
|
610
611
|
end
|
611
612
|
end
|
612
613
|
|
613
614
|
# Returns the lemma by calling `lemma_` of `@py_token` object
|
614
|
-
# @return [String]
|
615
|
+
# @return [String]
|
615
616
|
def lemma
|
616
617
|
@py_token.lemma_
|
617
618
|
end
|
618
619
|
|
619
620
|
# Returns the lowercase form by calling `lower_` of `@py_token` object
|
620
|
-
# @return [String]
|
621
|
+
# @return [String]
|
621
622
|
def lower
|
622
623
|
@py_token.lower_
|
623
624
|
end
|
624
625
|
|
625
626
|
# Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_token` object
|
626
|
-
# @return [String]
|
627
|
+
# @return [String]
|
627
628
|
def shape
|
628
629
|
@py_token.shape_
|
629
630
|
end
|
630
631
|
|
631
632
|
# Returns the pos by calling `pos_` of `@py_token` object
|
632
|
-
# @return [String]
|
633
|
+
# @return [String]
|
633
634
|
def pos
|
634
635
|
@py_token.pos_
|
635
636
|
end
|
636
637
|
|
637
638
|
# Returns the fine-grained pos by calling `tag_` of `@py_token` object
|
638
|
-
# @return [String]
|
639
|
-
def tag
|
639
|
+
# @return [String]
|
640
|
+
def tag
|
640
641
|
@py_token.tag_
|
641
642
|
end
|
642
643
|
|
643
644
|
# Returns the dependency relation by calling `dep_` of `@py_token` object
|
644
|
-
# @return [String]
|
645
|
+
# @return [String]
|
645
646
|
def dep
|
646
647
|
@py_token.dep_
|
647
648
|
end
|
648
|
-
|
649
|
+
|
649
650
|
# Returns the language by calling `lang_` of `@py_token` object
|
650
|
-
# @return [String]
|
651
|
-
def lang
|
651
|
+
# @return [String]
|
652
|
+
def lang
|
652
653
|
@py_token.lang_
|
653
654
|
end
|
654
655
|
|
655
656
|
# Returns the trailing space character if present by calling `whitespace_` of `@py_token` object
|
656
|
-
# @return [String]
|
657
|
-
def whitespace
|
657
|
+
# @return [String]
|
658
|
+
def whitespace
|
658
659
|
@py_token.whitespace_
|
659
660
|
end
|
660
661
|
|
661
662
|
# Returns the named entity type by calling `ent_type_` of `@py_token` object
|
662
|
-
# @return [String]
|
663
|
-
def ent_type
|
663
|
+
# @return [String]
|
664
|
+
def ent_type
|
664
665
|
@py_token.ent_type_
|
665
666
|
end
|
666
667
|
|
@@ -674,11 +675,14 @@ module Spacy
|
|
674
675
|
def method_missing(name, *args)
|
675
676
|
@py_token.send(name, *args)
|
676
677
|
end
|
678
|
+
|
679
|
+
def respond_to_missing?(sym)
|
680
|
+
sym ? true : super
|
681
|
+
end
|
677
682
|
end
|
678
683
|
|
679
684
|
# See also spaCy Python API document for [`Lexeme`](https://spacy.io/api/lexeme).
|
680
|
-
class Lexeme
|
681
|
-
|
685
|
+
class Lexeme
|
682
686
|
# @return [Object] a Python `Lexeme` instance accessible via `PyCall`
|
683
687
|
attr_reader :py_lexeme
|
684
688
|
|
@@ -694,50 +698,50 @@ module Spacy
|
|
694
698
|
end
|
695
699
|
|
696
700
|
# String representation of the lexeme.
|
697
|
-
# @return [String]
|
701
|
+
# @return [String]
|
698
702
|
def to_s
|
699
703
|
@text
|
700
704
|
end
|
701
705
|
|
702
706
|
# Returns the lowercase form by calling `lower_` of `@py_lexeme` object
|
703
|
-
# @return [String]
|
707
|
+
# @return [String]
|
704
708
|
def lower
|
705
709
|
@py_lexeme.lower_
|
706
710
|
end
|
707
711
|
|
708
712
|
# Returns the shape (e.g. "Xxxxx") by calling `shape_` of `@py_lexeme` object
|
709
|
-
# @return [String]
|
713
|
+
# @return [String]
|
710
714
|
def shape
|
711
715
|
@py_lexeme.shape_
|
712
716
|
end
|
713
717
|
|
714
718
|
# Returns the language by calling `lang_` of `@py_lexeme` object
|
715
|
-
# @return [String]
|
716
|
-
def lang
|
719
|
+
# @return [String]
|
720
|
+
def lang
|
717
721
|
@py_lexeme.lang_
|
718
722
|
end
|
719
723
|
|
720
724
|
# Returns the length-N substring from the start of the word by calling `prefix_` of `@py_lexeme` object
|
721
|
-
# @return [String]
|
722
|
-
def prefix
|
725
|
+
# @return [String]
|
726
|
+
def prefix
|
723
727
|
@py_lexeme.prefix_
|
724
728
|
end
|
725
|
-
|
729
|
+
|
726
730
|
# Returns the length-N substring from the end of the word by calling `suffix_` of `@py_lexeme` object
|
727
|
-
# @return [String]
|
731
|
+
# @return [String]
|
728
732
|
def suffix
|
729
733
|
@py_lexeme.suffix_
|
730
734
|
end
|
731
735
|
|
732
736
|
# Returns the lexeme's norm, i.e. a normalized form of the lexeme, by calling `norm_` of `@py_lexeme` object
|
733
|
-
# @return [String]
|
737
|
+
# @return [String]
|
734
738
|
def norm
|
735
739
|
@py_lexeme.norm_
|
736
740
|
end
|
737
741
|
|
738
742
|
# Returns a semantic similarity estimate.
|
739
|
-
# @param other [Lexeme] the other
|
740
|
-
# @return [Float]
|
743
|
+
# @param other [Lexeme] the other lexeme to which a similarity estimation is made
|
744
|
+
# @return [Float]
|
741
745
|
def similarity(other)
|
742
746
|
@py_lexeme.similarity(other.py_lexeme)
|
743
747
|
end
|
@@ -746,7 +750,9 @@ module Spacy
|
|
746
750
|
def method_missing(name, *args)
|
747
751
|
@py_lexeme.send(name, *args)
|
748
752
|
end
|
749
|
-
end
|
750
753
|
|
754
|
+
def respond_to_missing?(sym)
|
755
|
+
sym ? true : super
|
756
|
+
end
|
757
|
+
end
|
751
758
|
end
|
752
|
-
|