ruby-spacy 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/FUNDING.yml +6 -0
- data/.gitignore +1 -0
- data/CHANGELOG.md +24 -7
- data/Gemfile +1 -1
- data/README.md +120 -22
- data/lib/ruby-spacy/openai_client.rb +166 -0
- data/lib/ruby-spacy/openai_helper.rb +91 -0
- data/lib/ruby-spacy/version.rb +1 -1
- data/lib/ruby-spacy.rb +455 -248
- data/ruby-spacy.gemspec +3 -2
- metadata +34 -20
data/lib/ruby-spacy.rb
CHANGED
|
@@ -1,27 +1,29 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "ruby-spacy/version"
|
|
4
|
+
require_relative "ruby-spacy/openai_client"
|
|
5
|
+
require_relative "ruby-spacy/openai_helper"
|
|
4
6
|
require "numpy"
|
|
5
|
-
require "openai"
|
|
6
7
|
require "pycall"
|
|
7
|
-
require "strscan"
|
|
8
8
|
require "timeout"
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
PyCall.init
|
|
12
|
-
_spacy = PyCall.import_module("spacy")
|
|
13
|
-
rescue PyCall::PyError => e
|
|
14
|
-
puts "Failed to initialize PyCall or import spacy: #{e.message}"
|
|
15
|
-
puts "Python traceback:"
|
|
16
|
-
puts e.traceback
|
|
17
|
-
raise
|
|
18
|
-
end
|
|
9
|
+
require "json"
|
|
10
|
+
require "base64"
|
|
19
11
|
|
|
20
12
|
# This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
|
|
21
13
|
module Spacy
|
|
22
14
|
MAX_RETRIAL = 5
|
|
23
15
|
|
|
24
|
-
|
|
16
|
+
begin
|
|
17
|
+
PyCall.init
|
|
18
|
+
spacy = PyCall.import_module("spacy")
|
|
19
|
+
rescue PyCall::PyError => e
|
|
20
|
+
puts "Failed to initialize PyCall or import spacy: #{e.message}"
|
|
21
|
+
puts "Python traceback:"
|
|
22
|
+
puts e.traceback
|
|
23
|
+
raise
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
Builtins = PyCall.import_module("builtins")
|
|
25
27
|
SpacyVersion = spacy.__version__
|
|
26
28
|
|
|
27
29
|
# Python `Language` class
|
|
@@ -39,6 +41,9 @@ module Spacy
|
|
|
39
41
|
# Python `Matcher` class object
|
|
40
42
|
PyMatcher = spacy.matcher.Matcher
|
|
41
43
|
|
|
44
|
+
# Python `PhraseMatcher` class object
|
|
45
|
+
PyPhraseMatcher = spacy.matcher.PhraseMatcher
|
|
46
|
+
|
|
42
47
|
# Python `displacy` object
|
|
43
48
|
PyDisplacy = PyCall.import_module('spacy.displacy')
|
|
44
49
|
|
|
@@ -49,16 +54,15 @@ module Spacy
|
|
|
49
54
|
PyCall::List.call(py_generator)
|
|
50
55
|
end
|
|
51
56
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
@openai_client
|
|
57
|
+
# Checks if a Python object has a given attribute using builtins.hasattr.
|
|
58
|
+
# Falls back to true if the check itself fails (e.g. due to PyCall issues).
|
|
59
|
+
# @param py_obj [Object] a Python object
|
|
60
|
+
# @param attr [String, Symbol] the attribute name to check
|
|
61
|
+
# @return [Boolean]
|
|
62
|
+
def self.py_hasattr?(py_obj, attr)
|
|
63
|
+
Builtins.hasattr(py_obj, attr.to_s)
|
|
64
|
+
rescue StandardError
|
|
65
|
+
true
|
|
62
66
|
end
|
|
63
67
|
|
|
64
68
|
# See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
|
|
@@ -84,17 +88,19 @@ module Spacy
|
|
|
84
88
|
# @param nlp [Language] an instance of {Language} class
|
|
85
89
|
# @param py_doc [Object] an instance of Python `Doc` class
|
|
86
90
|
# @param text [String] the text string to be analyzed
|
|
87
|
-
def initialize(nlp, py_doc: nil, text: nil, max_retrial: MAX_RETRIAL
|
|
88
|
-
retrial: 0)
|
|
91
|
+
def initialize(nlp, py_doc: nil, text: nil, max_retrial: MAX_RETRIAL)
|
|
89
92
|
@py_nlp = nlp
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
93
|
+
retrial = 0
|
|
94
|
+
begin
|
|
95
|
+
@py_doc = py_doc || nlp.call(text)
|
|
96
|
+
@text = @py_doc.text
|
|
97
|
+
rescue StandardError
|
|
98
|
+
retrial += 1
|
|
99
|
+
raise "Error: Failed to construct a Doc object" unless retrial <= max_retrial
|
|
95
100
|
|
|
96
|
-
|
|
97
|
-
|
|
101
|
+
sleep 0.5
|
|
102
|
+
retry
|
|
103
|
+
end
|
|
98
104
|
end
|
|
99
105
|
|
|
100
106
|
# Retokenizes the text merging a span into a single token.
|
|
@@ -128,11 +134,7 @@ module Spacy
|
|
|
128
134
|
# Returns an array of tokens contained in the doc.
|
|
129
135
|
# @return [Array<Token>]
|
|
130
136
|
def tokens
|
|
131
|
-
|
|
132
|
-
PyCall::List.call(@py_doc).each do |py_token|
|
|
133
|
-
results << Token.new(py_token)
|
|
134
|
-
end
|
|
135
|
-
results
|
|
137
|
+
PyCall::List.call(@py_doc).map { |py_token| Token.new(py_token) }
|
|
136
138
|
end
|
|
137
139
|
|
|
138
140
|
# Iterates over the elements in the doc yielding a token instance each time.
|
|
@@ -148,54 +150,50 @@ module Spacy
|
|
|
148
150
|
# @param optional_size [Integer] an integer representing the size of the span
|
|
149
151
|
# @return [Span]
|
|
150
152
|
def span(range_or_start, optional_size = nil)
|
|
153
|
+
doc_len = PyCall.len(@py_doc)
|
|
154
|
+
|
|
151
155
|
if optional_size
|
|
152
156
|
start_index = range_or_start
|
|
153
|
-
|
|
157
|
+
start_index += doc_len if start_index < 0
|
|
158
|
+
end_index = start_index + optional_size - 1
|
|
154
159
|
else
|
|
155
|
-
start_index = range_or_start.first
|
|
156
160
|
range = range_or_start
|
|
157
|
-
|
|
161
|
+
start_index = range.first
|
|
162
|
+
start_index += doc_len if start_index < 0
|
|
163
|
+
end_val = range.end
|
|
164
|
+
if end_val.nil?
|
|
165
|
+
end_index = doc_len - 1
|
|
166
|
+
else
|
|
167
|
+
end_val += doc_len if end_val < 0
|
|
168
|
+
end_index = range.exclude_end? ? end_val - 1 : end_val
|
|
169
|
+
end
|
|
158
170
|
end
|
|
159
171
|
|
|
160
|
-
end_index = start_index + temp.size - 1
|
|
161
|
-
|
|
162
172
|
Span.new(self, start_index: start_index, end_index: end_index)
|
|
163
173
|
end
|
|
164
174
|
|
|
165
175
|
# Returns an array of spans representing noun chunks.
|
|
166
176
|
# @return [Array<Span>]
|
|
167
177
|
def noun_chunks
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
py_chunks.each do |py_chunk|
|
|
171
|
-
chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
|
|
178
|
+
PyCall::List.call(@py_doc.noun_chunks).map do |py_chunk|
|
|
179
|
+
Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
|
|
172
180
|
end
|
|
173
|
-
chunk_array
|
|
174
181
|
end
|
|
175
182
|
|
|
176
183
|
# Returns an array of spans each representing a sentence.
|
|
177
184
|
# @return [Array<Span>]
|
|
178
185
|
def sents
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
py_sentences.each do |py_sent|
|
|
182
|
-
sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
|
|
186
|
+
PyCall::List.call(@py_doc.sents).map do |py_sent|
|
|
187
|
+
Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
|
|
183
188
|
end
|
|
184
|
-
sentence_array
|
|
185
189
|
end
|
|
186
190
|
|
|
187
191
|
# Returns an array of spans each representing a named entity.
|
|
188
192
|
# @return [Array<Span>]
|
|
189
193
|
def ents
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
PyCall::List.call(@py_doc.ents).each do |ent|
|
|
193
|
-
ent.define_singleton_method :label do
|
|
194
|
-
label_
|
|
195
|
-
end
|
|
196
|
-
ent_array << ent
|
|
194
|
+
PyCall::List.call(@py_doc.ents).map do |py_span|
|
|
195
|
+
Span.new(self, py_span: py_span)
|
|
197
196
|
end
|
|
198
|
-
ent_array
|
|
199
197
|
end
|
|
200
198
|
|
|
201
199
|
# Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
|
|
@@ -216,6 +214,31 @@ module Spacy
|
|
|
216
214
|
py_doc.similarity(other.py_doc)
|
|
217
215
|
end
|
|
218
216
|
|
|
217
|
+
# Serializes the doc to a binary string.
|
|
218
|
+
# The binary data includes all annotations (tokens, entities, etc.) and can be
|
|
219
|
+
# used to restore the doc later without re-processing.
|
|
220
|
+
# @return [String] binary representation of the doc
|
|
221
|
+
# @example Save doc to file
|
|
222
|
+
# doc = nlp.read("Hello world")
|
|
223
|
+
# File.binwrite("doc.bin", doc.to_bytes)
|
|
224
|
+
def to_bytes
|
|
225
|
+
@py_doc.to_bytes.force_encoding(Encoding::BINARY)
|
|
226
|
+
end
|
|
227
|
+
|
|
228
|
+
# Restores a doc from binary data created by {#to_bytes}.
|
|
229
|
+
# This is useful for caching processed documents to avoid re-processing.
|
|
230
|
+
# @param byte_string [String] binary data from {#to_bytes}
|
|
231
|
+
# @return [Doc] the restored doc
|
|
232
|
+
# @example Load doc from file
|
|
233
|
+
# bytes = File.binread("doc.bin")
|
|
234
|
+
# doc = Spacy::Doc.from_bytes(nlp, bytes)
|
|
235
|
+
def self.from_bytes(nlp, byte_string)
|
|
236
|
+
b64 = Base64.strict_encode64(byte_string)
|
|
237
|
+
py_bytes = PyCall.eval("__import__('base64').b64decode('#{b64}')")
|
|
238
|
+
py_doc = nlp.py_nlp.call("").from_bytes(py_bytes)
|
|
239
|
+
new(nlp.py_nlp, py_doc: py_doc)
|
|
240
|
+
end
|
|
241
|
+
|
|
219
242
|
# Visualize the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
|
|
220
243
|
# @param style [String] either `dep` or `ent`
|
|
221
244
|
# @param compact [Boolean] only relevant to the `dep` style
|
|
@@ -224,12 +247,86 @@ module Spacy
|
|
|
224
247
|
PyDisplacy.render(py_doc, style: style, options: { compact: compact }, jupyter: false)
|
|
225
248
|
end
|
|
226
249
|
|
|
250
|
+
# Generates a JSON string summarizing the linguistic analysis of the document.
|
|
251
|
+
# Designed to be passed as context to an LLM (e.g., via {OpenAIHelper#chat}).
|
|
252
|
+
#
|
|
253
|
+
# @param sections [Array<Symbol>] which sections to include
|
|
254
|
+
# (:text, :tokens, :entities, :noun_chunks, :sentences)
|
|
255
|
+
# @param token_attributes [Array<Symbol>] which token attributes to include
|
|
256
|
+
# (:text, :lemma, :pos, :tag, :dep, :head, :ent_type, :morphology)
|
|
257
|
+
# @return [String] a JSON string of the linguistic summary
|
|
258
|
+
def linguistic_summary(sections: [:text, :tokens, :entities, :noun_chunks],
|
|
259
|
+
token_attributes: [:text, :lemma, :pos, :dep, :head])
|
|
260
|
+
result = {}
|
|
261
|
+
|
|
262
|
+
sections.each do |section|
|
|
263
|
+
case section
|
|
264
|
+
when :text
|
|
265
|
+
result[:text] = @text
|
|
266
|
+
when :tokens
|
|
267
|
+
result[:tokens] = tokens.map do |token|
|
|
268
|
+
token_hash = {}
|
|
269
|
+
token_attributes.each do |attr|
|
|
270
|
+
case attr
|
|
271
|
+
when :head
|
|
272
|
+
token_hash[:head] = token.head.text
|
|
273
|
+
when :morphology
|
|
274
|
+
# Use string form and parse to ensure a plain Ruby Hash for JSON serialization
|
|
275
|
+
morph_str = token.morphology(hash: false)
|
|
276
|
+
token_hash[:morphology] = if morph_str.empty?
|
|
277
|
+
{}
|
|
278
|
+
else
|
|
279
|
+
morph_str.split("|").each_with_object({}) do |pair, h|
|
|
280
|
+
k, v = pair.split("=", 2)
|
|
281
|
+
h[k] = v
|
|
282
|
+
end
|
|
283
|
+
end
|
|
284
|
+
else
|
|
285
|
+
token_hash[attr] = token.send(attr)
|
|
286
|
+
end
|
|
287
|
+
end
|
|
288
|
+
token_hash
|
|
289
|
+
end
|
|
290
|
+
when :entities
|
|
291
|
+
ent_list = ents
|
|
292
|
+
result[:entities] = ent_list.map do |ent|
|
|
293
|
+
{ text: ent.text, label: ent.label }
|
|
294
|
+
end
|
|
295
|
+
when :noun_chunks
|
|
296
|
+
result[:noun_chunks] = noun_chunks.map do |chunk|
|
|
297
|
+
{ text: chunk.text, root: chunk.root.text }
|
|
298
|
+
end
|
|
299
|
+
when :sentences
|
|
300
|
+
result[:sentences] = sents.map(&:text)
|
|
301
|
+
end
|
|
302
|
+
end
|
|
303
|
+
|
|
304
|
+
result.to_json
|
|
305
|
+
end
|
|
306
|
+
|
|
307
|
+
# Sends a query to OpenAI's chat completion API with optional tool support.
|
|
308
|
+
# The get_tokens tool allows the model to request token-level linguistic analysis.
|
|
309
|
+
#
|
|
310
|
+
# @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
|
|
311
|
+
# @param max_completion_tokens [Integer] Maximum tokens in the response
|
|
312
|
+
# @param max_tokens [Integer] Alias for max_completion_tokens (deprecated, for backward compatibility)
|
|
313
|
+
# @param temperature [Float] Sampling temperature (ignored for GPT-5 models)
|
|
314
|
+
# @param model [String] The model to use (default: gpt-5-mini)
|
|
315
|
+
# @param messages [Array<Hash>] Conversation history (for recursive tool calls). Note: this array is modified in place when tool calls occur.
|
|
316
|
+
# @param prompt [String, nil] System prompt for the query
|
|
317
|
+
# @return [String, nil] The model's response content
|
|
227
318
|
def openai_query(access_token: nil,
|
|
228
|
-
|
|
319
|
+
max_completion_tokens: nil,
|
|
320
|
+
max_tokens: nil,
|
|
229
321
|
temperature: 0.7,
|
|
230
|
-
model: "gpt-
|
|
322
|
+
model: "gpt-5-mini",
|
|
231
323
|
messages: [],
|
|
232
|
-
prompt: nil
|
|
324
|
+
prompt: nil,
|
|
325
|
+
response_format: nil,
|
|
326
|
+
max_tool_call_depth: 5,
|
|
327
|
+
_tool_call_depth: 0)
|
|
328
|
+
# Support both max_completion_tokens and max_tokens for backward compatibility
|
|
329
|
+
max_completion_tokens ||= max_tokens || 1000
|
|
233
330
|
if messages.empty?
|
|
234
331
|
messages = [
|
|
235
332
|
{ role: "system", content: prompt },
|
|
@@ -237,122 +334,164 @@ module Spacy
|
|
|
237
334
|
]
|
|
238
335
|
end
|
|
239
336
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
"description": "text to be tokenized"
|
|
262
|
-
}
|
|
263
|
-
},
|
|
264
|
-
"required": ["text"]
|
|
265
|
-
}
|
|
337
|
+
client = openai_client(access_token)
|
|
338
|
+
|
|
339
|
+
# Tool definition for token analysis (GPT-5 tools API format)
|
|
340
|
+
tools = nil
|
|
341
|
+
tool_choice = nil
|
|
342
|
+
if _tool_call_depth < max_tool_call_depth
|
|
343
|
+
tools = [
|
|
344
|
+
{
|
|
345
|
+
type: "function",
|
|
346
|
+
function: {
|
|
347
|
+
name: "get_tokens",
|
|
348
|
+
description: "Tokenize given text and return a list of tokens with their attributes: surface, lemma, tag, pos (part-of-speech), dep (dependency), ent_type (entity type), and morphology",
|
|
349
|
+
parameters: {
|
|
350
|
+
type: "object",
|
|
351
|
+
properties: {
|
|
352
|
+
text: {
|
|
353
|
+
type: "string",
|
|
354
|
+
description: "text to be tokenized"
|
|
355
|
+
}
|
|
356
|
+
},
|
|
357
|
+
required: ["text"]
|
|
266
358
|
}
|
|
267
|
-
|
|
359
|
+
}
|
|
268
360
|
}
|
|
269
|
-
|
|
361
|
+
]
|
|
362
|
+
tool_choice = "auto"
|
|
363
|
+
end
|
|
364
|
+
|
|
365
|
+
response = client.chat(
|
|
366
|
+
model: model,
|
|
367
|
+
messages: messages,
|
|
368
|
+
max_completion_tokens: max_completion_tokens,
|
|
369
|
+
temperature: temperature,
|
|
370
|
+
tools: tools,
|
|
371
|
+
tool_choice: tool_choice,
|
|
372
|
+
response_format: response_format
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
message = response.dig("choices", 0, "message")
|
|
270
376
|
|
|
271
|
-
|
|
377
|
+
# Handle tool calls (GPT-5 format)
|
|
378
|
+
if message["tool_calls"] && !message["tool_calls"].empty?
|
|
379
|
+
messages << message
|
|
380
|
+
|
|
381
|
+
message["tool_calls"].each do |tool_call|
|
|
382
|
+
function_name = tool_call.dig("function", "name")
|
|
383
|
+
tool_call_id = tool_call["id"]
|
|
272
384
|
|
|
273
|
-
if message["role"] == "assistant" && message["function_call"]
|
|
274
|
-
messages << message
|
|
275
|
-
function_name = message.dig("function_call", "name")
|
|
276
|
-
_args = JSON.parse(message.dig("function_call", "arguments"))
|
|
277
385
|
case function_name
|
|
278
386
|
when "get_tokens"
|
|
279
|
-
|
|
387
|
+
result = tokens.map do |t|
|
|
280
388
|
{
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
389
|
+
surface: t.text,
|
|
390
|
+
lemma: t.lemma,
|
|
391
|
+
pos: t.pos,
|
|
392
|
+
tag: t.tag,
|
|
393
|
+
dep: t.dep,
|
|
394
|
+
ent_type: t.ent_type,
|
|
395
|
+
morphology: t.morphology
|
|
288
396
|
}
|
|
289
397
|
end.to_json
|
|
398
|
+
|
|
399
|
+
messages << {
|
|
400
|
+
role: "tool",
|
|
401
|
+
tool_call_id: tool_call_id,
|
|
402
|
+
content: result
|
|
403
|
+
}
|
|
290
404
|
end
|
|
291
|
-
messages << { role: "system", content: res }
|
|
292
|
-
openai_query(access_token: access_token, max_tokens: max_tokens,
|
|
293
|
-
temperature: temperature, model: model,
|
|
294
|
-
messages: messages, prompt: prompt)
|
|
295
|
-
else
|
|
296
|
-
message["content"]
|
|
297
405
|
end
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
406
|
+
|
|
407
|
+
# Recursive call to get final response after tool execution
|
|
408
|
+
openai_query(
|
|
409
|
+
access_token: access_token,
|
|
410
|
+
max_completion_tokens: max_completion_tokens,
|
|
411
|
+
temperature: temperature,
|
|
412
|
+
model: model,
|
|
413
|
+
messages: messages,
|
|
414
|
+
prompt: prompt,
|
|
415
|
+
response_format: response_format,
|
|
416
|
+
max_tool_call_depth: max_tool_call_depth,
|
|
417
|
+
_tool_call_depth: _tool_call_depth + 1
|
|
418
|
+
)
|
|
419
|
+
else
|
|
420
|
+
message["content"]
|
|
302
421
|
end
|
|
303
|
-
|
|
422
|
+
rescue OpenAIClient::APIError => e
|
|
423
|
+
puts "Error: OpenAI API call failed - #{e.message}"
|
|
424
|
+
nil
|
|
425
|
+
end
|
|
426
|
+
|
|
427
|
+
# Sends a text completion request to OpenAI's chat API.
|
|
428
|
+
#
|
|
429
|
+
# @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
|
|
430
|
+
# @param max_completion_tokens [Integer] Maximum tokens in the response
|
|
431
|
+
# @param max_tokens [Integer] Alias for max_completion_tokens (deprecated, for backward compatibility)
|
|
432
|
+
# @param temperature [Float] Sampling temperature (ignored for GPT-5 models)
|
|
433
|
+
# @param model [String] The model to use (default: gpt-5-mini)
|
|
434
|
+
# @return [String, nil] The completed text
|
|
435
|
+
def openai_completion(access_token: nil, max_completion_tokens: nil, max_tokens: nil, temperature: 0.7, model: "gpt-5-mini")
|
|
436
|
+
# Support both max_completion_tokens and max_tokens for backward compatibility
|
|
437
|
+
max_completion_tokens ||= max_tokens || 1000
|
|
304
438
|
|
|
305
|
-
def openai_completion(access_token: nil, max_tokens: 1000, temperature: 0.7, model: "gpt-4o-mini")
|
|
306
439
|
messages = [
|
|
307
440
|
{ role: "system", content: "Complete the text input by the user." },
|
|
308
441
|
{ role: "user", content: @text }
|
|
309
442
|
]
|
|
310
|
-
access_token ||= ENV["OPENAI_API_KEY"]
|
|
311
|
-
raise "Error: OPENAI_API_KEY is not set" unless access_token
|
|
312
|
-
|
|
313
|
-
begin
|
|
314
|
-
response = Spacy.openai_client(access_token: access_token).chat(
|
|
315
|
-
parameters: {
|
|
316
|
-
model: model,
|
|
317
|
-
messages: messages,
|
|
318
|
-
max_tokens: max_tokens,
|
|
319
|
-
temperature: temperature
|
|
320
|
-
}
|
|
321
|
-
)
|
|
322
|
-
response.dig("choices", 0, "message", "content")
|
|
323
|
-
rescue StandardError => e
|
|
324
|
-
puts "Error: OpenAI API call failed."
|
|
325
|
-
pp e.message
|
|
326
|
-
pp e.backtrace
|
|
327
|
-
end
|
|
328
|
-
end
|
|
329
443
|
|
|
330
|
-
|
|
444
|
+
client = openai_client(access_token)
|
|
445
|
+
response = client.chat(
|
|
446
|
+
model: model,
|
|
447
|
+
messages: messages,
|
|
448
|
+
max_completion_tokens: max_completion_tokens,
|
|
449
|
+
temperature: temperature
|
|
450
|
+
)
|
|
451
|
+
response.dig("choices", 0, "message", "content")
|
|
452
|
+
rescue OpenAIClient::APIError => e
|
|
453
|
+
puts "Error: OpenAI API call failed - #{e.message}"
|
|
454
|
+
nil
|
|
455
|
+
end
|
|
456
|
+
|
|
457
|
+
# Generates text embeddings using OpenAI's embeddings API.
|
|
458
|
+
#
|
|
459
|
+
# @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
|
|
460
|
+
# @param model [String] The embeddings model (default: text-embedding-3-small)
|
|
461
|
+
# @param dimensions [Integer, nil] The number of dimensions for the output embeddings (nil uses model default)
|
|
462
|
+
# @return [Array<Float>, nil] The embedding vector
|
|
463
|
+
def openai_embeddings(access_token: nil, model: "text-embedding-3-small", dimensions: nil)
|
|
464
|
+
client = openai_client(access_token)
|
|
465
|
+
response = client.embeddings(model: model, input: @text, dimensions: dimensions)
|
|
466
|
+
response.dig("data", 0, "embedding")
|
|
467
|
+
rescue OpenAIClient::APIError => e
|
|
468
|
+
puts "Error: OpenAI API call failed - #{e.message}"
|
|
469
|
+
nil
|
|
470
|
+
end
|
|
471
|
+
|
|
472
|
+
private
|
|
473
|
+
|
|
474
|
+
def openai_client(access_token)
|
|
331
475
|
access_token ||= ENV["OPENAI_API_KEY"]
|
|
332
476
|
raise "Error: OPENAI_API_KEY is not set" unless access_token
|
|
333
477
|
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
parameters: {
|
|
337
|
-
model: model,
|
|
338
|
-
input: @text
|
|
339
|
-
}
|
|
340
|
-
)
|
|
341
|
-
response.dig("data", 0, "embedding")
|
|
342
|
-
rescue StandardError => e
|
|
343
|
-
puts "Error: OpenAI API call failed."
|
|
344
|
-
pp e.message
|
|
345
|
-
pp e.backtrace
|
|
346
|
-
end
|
|
478
|
+
@openai_clients ||= {}
|
|
479
|
+
@openai_clients[access_token] ||= OpenAIClient.new(access_token: access_token)
|
|
347
480
|
end
|
|
348
481
|
|
|
482
|
+
public
|
|
483
|
+
|
|
349
484
|
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
|
350
485
|
def method_missing(name, *args)
|
|
351
486
|
@py_doc.send(name, *args)
|
|
352
487
|
end
|
|
353
488
|
|
|
354
|
-
def respond_to_missing?(sym)
|
|
355
|
-
|
|
489
|
+
def respond_to_missing?(sym, include_private = false)
|
|
490
|
+
Spacy.py_hasattr?(@py_doc, sym) || super
|
|
491
|
+
end
|
|
492
|
+
|
|
493
|
+
def instance_variables_to_inspect
|
|
494
|
+
[:@text]
|
|
356
495
|
end
|
|
357
496
|
end
|
|
358
497
|
|
|
@@ -366,8 +505,13 @@ module Spacy
|
|
|
366
505
|
|
|
367
506
|
# Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
|
|
368
507
|
# @param model [String] A language model installed in the system
|
|
369
|
-
def initialize(model = "en_core_web_sm", max_retrial: MAX_RETRIAL,
|
|
508
|
+
def initialize(model = "en_core_web_sm", max_retrial: MAX_RETRIAL, timeout: 60)
|
|
509
|
+
unless model.to_s.match?(/\A[a-zA-Z0-9_\-\.\/]+\z/)
|
|
510
|
+
raise ArgumentError, "Invalid model name: #{model.inspect}"
|
|
511
|
+
end
|
|
512
|
+
|
|
370
513
|
@spacy_nlp_id = "nlp_#{model.object_id}"
|
|
514
|
+
retrial = 0
|
|
371
515
|
begin
|
|
372
516
|
Timeout.timeout(timeout) do
|
|
373
517
|
PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
|
|
@@ -398,21 +542,29 @@ module Spacy
|
|
|
398
542
|
Matcher.new(@py_nlp)
|
|
399
543
|
end
|
|
400
544
|
|
|
545
|
+
# Generates a phrase matcher for the current language model.
|
|
546
|
+
# PhraseMatcher is more efficient than {Matcher} for matching large terminology lists.
|
|
547
|
+
# @param attr [String] the token attribute to match on (default: "ORTH").
|
|
548
|
+
# Use "LOWER" for case-insensitive matching.
|
|
549
|
+
# @return [PhraseMatcher]
|
|
550
|
+
# @example
|
|
551
|
+
# matcher = nlp.phrase_matcher(attr: "LOWER")
|
|
552
|
+
# matcher.add("PRODUCT", ["iPhone", "MacBook Pro"])
|
|
553
|
+
def phrase_matcher(attr: "ORTH")
|
|
554
|
+
PhraseMatcher.new(self, attr: attr)
|
|
555
|
+
end
|
|
556
|
+
|
|
401
557
|
# A utility method to lookup a vocabulary item of the given id.
|
|
402
558
|
# @param id [Integer] a vocabulary id
|
|
403
559
|
# @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
|
|
404
560
|
def vocab_string_lookup(id)
|
|
405
|
-
PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
|
|
561
|
+
PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{Integer(id)}]")
|
|
406
562
|
end
|
|
407
563
|
|
|
408
564
|
# A utility method to list pipeline components.
|
|
409
565
|
# @return [Array<String>] An array of text strings representing pipeline components
|
|
410
566
|
def pipe_names
|
|
411
|
-
|
|
412
|
-
PyCall::List.call(@py_nlp.pipe_names).each do |pipe|
|
|
413
|
-
pipe_array << pipe
|
|
414
|
-
end
|
|
415
|
-
pipe_array
|
|
567
|
+
PyCall::List.call(@py_nlp.pipe_names).to_a
|
|
416
568
|
end
|
|
417
569
|
|
|
418
570
|
# A utility method to get a Python `Lexeme` object.
|
|
@@ -461,20 +613,62 @@ module Spacy
|
|
|
461
613
|
# @param batch_size [Integer]
|
|
462
614
|
# @return [Array<Doc>]
|
|
463
615
|
def pipe(texts, disable: [], batch_size: 50)
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
docs << Doc.new(@py_nlp, py_doc: py_doc)
|
|
616
|
+
PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).map do |py_doc|
|
|
617
|
+
Doc.new(@py_nlp, py_doc: py_doc)
|
|
467
618
|
end
|
|
468
|
-
docs
|
|
469
619
|
end
|
|
470
620
|
|
|
471
|
-
#
|
|
621
|
+
# Yields an {OpenAIHelper} instance for making OpenAI API calls within a block.
|
|
622
|
+
# The helper is configured once and reused for all calls within the block,
|
|
623
|
+
# making it efficient for batch processing with {#pipe}.
|
|
624
|
+
#
|
|
625
|
+
# @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
|
|
626
|
+
# @param model [String] the default model for chat requests
|
|
627
|
+
# @param max_completion_tokens [Integer] default maximum tokens in responses
|
|
628
|
+
# @param temperature [Float] default sampling temperature
|
|
629
|
+
# @yield [OpenAIHelper] the helper instance for making API calls
|
|
630
|
+
# @return [Object] the block's return value
|
|
631
|
+
# @example Batch processing with pipe
|
|
632
|
+
# nlp.with_openai(model: "gpt-5-mini") do |ai|
|
|
633
|
+
# nlp.pipe(texts).map do |doc|
|
|
634
|
+
# ai.chat(system: "Analyze.", user: doc.linguistic_summary)
|
|
635
|
+
# end
|
|
636
|
+
# end
|
|
637
|
+
def with_openai(access_token: nil, model: "gpt-5-mini",
|
|
638
|
+
max_completion_tokens: 1000, temperature: 0.7)
|
|
639
|
+
helper = OpenAIHelper.new(
|
|
640
|
+
access_token: access_token,
|
|
641
|
+
model: model,
|
|
642
|
+
max_completion_tokens: max_completion_tokens,
|
|
643
|
+
temperature: temperature
|
|
644
|
+
)
|
|
645
|
+
yield helper
|
|
646
|
+
end
|
|
647
|
+
|
|
648
|
+
# Executes a block within spaCy's memory zone for efficient memory management.
|
|
649
|
+
# Requires spaCy >= 3.8.
|
|
650
|
+
# @yield the block to execute within the memory zone
|
|
651
|
+
# @raise [NotImplementedError] if spaCy version does not support memory zones
|
|
652
|
+
def memory_zone(&block)
|
|
653
|
+
major, minor = SpacyVersion.split(".").map(&:to_i)
|
|
654
|
+
unless major > 3 || (major == 3 && minor >= 8)
|
|
655
|
+
raise NotImplementedError, "memory_zone requires spaCy >= 3.8 (current: #{SpacyVersion})"
|
|
656
|
+
end
|
|
657
|
+
|
|
658
|
+
PyCall.with(@py_nlp.memory_zone, &block)
|
|
659
|
+
end
|
|
660
|
+
|
|
661
|
+
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
|
472
662
|
def method_missing(name, *args)
|
|
473
663
|
@py_nlp.send(name, *args)
|
|
474
664
|
end
|
|
475
665
|
|
|
476
|
-
def respond_to_missing?(sym)
|
|
477
|
-
|
|
666
|
+
def respond_to_missing?(sym, include_private = false)
|
|
667
|
+
Spacy.py_hasattr?(@py_nlp, sym) || super
|
|
668
|
+
end
|
|
669
|
+
|
|
670
|
+
def instance_variables_to_inspect
|
|
671
|
+
[:@spacy_nlp_id]
|
|
478
672
|
end
|
|
479
673
|
end
|
|
480
674
|
|
|
@@ -500,19 +694,52 @@ module Spacy
|
|
|
500
694
|
# @param doc [Doc] a {Doc} instance
|
|
501
695
|
# @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
|
|
502
696
|
def match(doc)
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
results = []
|
|
506
|
-
while s.scan_until(/(\d+), (\d+), (\d+)/)
|
|
507
|
-
next unless s.matched
|
|
508
|
-
|
|
509
|
-
triple = s.matched.split(", ")
|
|
510
|
-
match_id = triple[0].to_i
|
|
511
|
-
start_index = triple[1].to_i
|
|
512
|
-
end_index = triple[2].to_i - 1
|
|
513
|
-
results << { match_id: match_id, start_index: start_index, end_index: end_index }
|
|
697
|
+
PyCall::List.call(@py_matcher.call(doc.py_doc)).map do |py_match|
|
|
698
|
+
{ match_id: py_match[0].to_i, start_index: py_match[1].to_i, end_index: py_match[2].to_i - 1 }
|
|
514
699
|
end
|
|
515
|
-
|
|
700
|
+
end
|
|
701
|
+
end
|
|
702
|
+
|
|
703
|
+
# See also spaCy Python API document for [`PhraseMatcher`](https://spacy.io/api/phrasematcher).
|
|
704
|
+
# PhraseMatcher is useful for efficiently matching large terminology lists.
|
|
705
|
+
# It's faster than {Matcher} when matching many phrase patterns.
|
|
706
|
+
class PhraseMatcher
|
|
707
|
+
# @return [Object] a Python `PhraseMatcher` instance accessible via `PyCall`
|
|
708
|
+
attr_reader :py_matcher
|
|
709
|
+
|
|
710
|
+
# @return [Language] the language model used by this matcher
|
|
711
|
+
attr_reader :nlp
|
|
712
|
+
|
|
713
|
+
# Creates a {PhraseMatcher} instance.
|
|
714
|
+
# @param nlp [Language] an instance of {Language} class
|
|
715
|
+
# @param attr [String] the token attribute to match on (default: "ORTH").
|
|
716
|
+
# Use "LOWER" for case-insensitive matching.
|
|
717
|
+
# @example Case-insensitive matching
|
|
718
|
+
# matcher = Spacy::PhraseMatcher.new(nlp, attr: "LOWER")
|
|
719
|
+
def initialize(nlp, attr: "ORTH")
|
|
720
|
+
@nlp = nlp
|
|
721
|
+
@py_matcher = PyPhraseMatcher.call(nlp.py_nlp.vocab, attr: attr)
|
|
722
|
+
end
|
|
723
|
+
|
|
724
|
+
# Adds phrase patterns to the matcher.
|
|
725
|
+
# @param label [String] a label string given to the patterns
|
|
726
|
+
# @param phrases [Array<String>] an array of phrase strings to match
|
|
727
|
+
# @example Add product names
|
|
728
|
+
# matcher.add("PRODUCT", ["iPhone", "MacBook Pro", "iPad"])
|
|
729
|
+
def add(label, phrases)
|
|
730
|
+
patterns = phrases.map { |phrase| @nlp.py_nlp.call(phrase) }
|
|
731
|
+
@py_matcher.add(label, patterns)
|
|
732
|
+
end
|
|
733
|
+
|
|
734
|
+
# Execute the phrase match and return matching spans.
|
|
735
|
+
# @param doc [Doc] a {Doc} instance to search
|
|
736
|
+
# @return [Array<Span>] an array of {Span} objects with labels
|
|
737
|
+
# @example Find matches
|
|
738
|
+
# matches = matcher.match(doc)
|
|
739
|
+
# matches.each { |span| puts "#{span.text} => #{span.label}" }
|
|
740
|
+
def match(doc)
|
|
741
|
+
py_matches = @py_matcher.call(doc.py_doc, as_spans: true)
|
|
742
|
+
PyCall::List.call(py_matches).map { |py_span| Span.new(doc, py_span: py_span) }
|
|
516
743
|
end
|
|
517
744
|
end
|
|
518
745
|
|
|
@@ -524,6 +751,9 @@ module Spacy
|
|
|
524
751
|
# @return [Doc] the document to which the span belongs
|
|
525
752
|
attr_reader :doc
|
|
526
753
|
|
|
754
|
+
# @return [String] a text string of the span
|
|
755
|
+
attr_reader :text
|
|
756
|
+
|
|
527
757
|
include Enumerable
|
|
528
758
|
|
|
529
759
|
alias length count
|
|
@@ -539,17 +769,14 @@ module Spacy
|
|
|
539
769
|
# @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
|
|
540
770
|
def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
|
|
541
771
|
@doc = doc
|
|
542
|
-
@py_span = py_span ||
|
|
772
|
+
@py_span = py_span || PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
|
|
773
|
+
@text = @py_span.text
|
|
543
774
|
end
|
|
544
775
|
|
|
545
776
|
# Returns an array of tokens contained in the span.
|
|
546
777
|
# @return [Array<Token>]
|
|
547
778
|
def tokens
|
|
548
|
-
|
|
549
|
-
PyCall::List.call(@py_span).each do |py_token|
|
|
550
|
-
results << Token.new(py_token)
|
|
551
|
-
end
|
|
552
|
-
results
|
|
779
|
+
PyCall::List.call(@py_span).map { |py_token| Token.new(py_token) }
|
|
553
780
|
end
|
|
554
781
|
|
|
555
782
|
# Iterates over the elements in the span yielding a token instance each time.
|
|
@@ -562,12 +789,9 @@ module Spacy
|
|
|
562
789
|
# Returns an array of spans of noun chunks.
|
|
563
790
|
# @return [Array<Span>]
|
|
564
791
|
def noun_chunks
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
py_chunks.each do |py_span|
|
|
568
|
-
chunk_array << Span.new(@doc, py_span: py_span)
|
|
792
|
+
PyCall::List.call(@py_span.noun_chunks).map do |py_span|
|
|
793
|
+
Span.new(@doc, py_span: py_span)
|
|
569
794
|
end
|
|
570
|
-
chunk_array
|
|
571
795
|
end
|
|
572
796
|
|
|
573
797
|
# Returns the head token
|
|
@@ -579,22 +803,17 @@ module Spacy
|
|
|
579
803
|
# Returns an array of spans that represents sentences.
|
|
580
804
|
# @return [Array<Span>]
|
|
581
805
|
def sents
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
py_sentences.each do |py_span|
|
|
585
|
-
sentence_array << Span.new(@doc, py_span: py_span)
|
|
806
|
+
PyCall::List.call(@py_span.sents).map do |py_span|
|
|
807
|
+
Span.new(@doc, py_span: py_span)
|
|
586
808
|
end
|
|
587
|
-
sentence_array
|
|
588
809
|
end
|
|
589
810
|
|
|
590
811
|
# Returns an array of spans that represents named entities.
|
|
591
812
|
# @return [Array<Span>]
|
|
592
813
|
def ents
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
ent_array << Span.new(@doc, py_span: py_span)
|
|
814
|
+
PyCall::List.call(@py_span.ents).map do |py_span|
|
|
815
|
+
Span.new(@doc, py_span: py_span)
|
|
596
816
|
end
|
|
597
|
-
ent_array
|
|
598
817
|
end
|
|
599
818
|
|
|
600
819
|
# Returns a span that represents the sentence that the given span is part of.
|
|
@@ -631,41 +850,25 @@ module Spacy
|
|
|
631
850
|
# Returns tokens conjugated to the root of the span.
|
|
632
851
|
# @return [Array<Token>] an array of tokens
|
|
633
852
|
def conjuncts
|
|
634
|
-
|
|
635
|
-
PyCall::List.call(@py_span.conjuncts).each do |py_conjunct|
|
|
636
|
-
conjunct_array << Token.new(py_conjunct)
|
|
637
|
-
end
|
|
638
|
-
conjunct_array
|
|
853
|
+
PyCall::List.call(@py_span.conjuncts).map { |py_conjunct| Token.new(py_conjunct) }
|
|
639
854
|
end
|
|
640
855
|
|
|
641
856
|
# Returns tokens that are to the left of the span, whose heads are within the span.
|
|
642
857
|
# @return [Array<Token>] an array of tokens
|
|
643
858
|
def lefts
|
|
644
|
-
|
|
645
|
-
PyCall::List.call(@py_span.lefts).each do |py_left|
|
|
646
|
-
left_array << Token.new(py_left)
|
|
647
|
-
end
|
|
648
|
-
left_array
|
|
859
|
+
PyCall::List.call(@py_span.lefts).map { |py_left| Token.new(py_left) }
|
|
649
860
|
end
|
|
650
861
|
|
|
651
862
|
# Returns Tokens that are to the right of the span, whose heads are within the span.
|
|
652
863
|
# @return [Array<Token>] an array of Tokens
|
|
653
864
|
def rights
|
|
654
|
-
|
|
655
|
-
PyCall::List.call(@py_span.rights).each do |py_right|
|
|
656
|
-
right_array << Token.new(py_right)
|
|
657
|
-
end
|
|
658
|
-
right_array
|
|
865
|
+
PyCall::List.call(@py_span.rights).map { |py_right| Token.new(py_right) }
|
|
659
866
|
end
|
|
660
867
|
|
|
661
868
|
# Returns Tokens that are within the span and tokens that descend from them.
|
|
662
869
|
# @return [Array<Token>] an array of tokens
|
|
663
870
|
def subtree
|
|
664
|
-
|
|
665
|
-
PyCall::List.call(@py_span.subtree).each do |py_subtree|
|
|
666
|
-
subtree_array << Token.new(py_subtree)
|
|
667
|
-
end
|
|
668
|
-
subtree_array
|
|
871
|
+
PyCall::List.call(@py_span.subtree).map { |py_subtree| Token.new(py_subtree) }
|
|
669
872
|
end
|
|
670
873
|
|
|
671
874
|
# Returns the label
|
|
@@ -674,13 +877,23 @@ module Spacy
|
|
|
674
877
|
@py_span.label_
|
|
675
878
|
end
|
|
676
879
|
|
|
880
|
+
# String representation of the span.
|
|
881
|
+
# @return [String]
|
|
882
|
+
def to_s
|
|
883
|
+
@text
|
|
884
|
+
end
|
|
885
|
+
|
|
677
886
|
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
|
678
887
|
def method_missing(name, *args)
|
|
679
888
|
@py_span.send(name, *args)
|
|
680
889
|
end
|
|
681
890
|
|
|
682
|
-
def respond_to_missing?(sym)
|
|
683
|
-
|
|
891
|
+
def respond_to_missing?(sym, include_private = false)
|
|
892
|
+
Spacy.py_hasattr?(@py_span, sym) || super
|
|
893
|
+
end
|
|
894
|
+
|
|
895
|
+
def instance_variables_to_inspect
|
|
896
|
+
[:@text]
|
|
684
897
|
end
|
|
685
898
|
end
|
|
686
899
|
|
|
@@ -700,6 +913,12 @@ module Spacy
|
|
|
700
913
|
@text = @py_token.text
|
|
701
914
|
end
|
|
702
915
|
|
|
916
|
+
# Returns the character offset of the token within the parent document.
|
|
917
|
+
# @return [Integer]
|
|
918
|
+
def idx
|
|
919
|
+
@py_token.idx
|
|
920
|
+
end
|
|
921
|
+
|
|
703
922
|
# Returns the head token
|
|
704
923
|
# @return [Token]
|
|
705
924
|
def head
|
|
@@ -709,51 +928,31 @@ module Spacy
|
|
|
709
928
|
# Returns the token in question and the tokens that descend from it.
|
|
710
929
|
# @return [Array<Token>] an array of tokens
|
|
711
930
|
def subtree
|
|
712
|
-
|
|
713
|
-
PyCall::List.call(@py_token.subtree).each do |descendant|
|
|
714
|
-
descendant_array << Token.new(descendant)
|
|
715
|
-
end
|
|
716
|
-
descendant_array
|
|
931
|
+
PyCall::List.call(@py_token.subtree).map { |descendant| Token.new(descendant) }
|
|
717
932
|
end
|
|
718
933
|
|
|
719
934
|
# Returns the token's ancestors.
|
|
720
935
|
# @return [Array<Token>] an array of tokens
|
|
721
936
|
def ancestors
|
|
722
|
-
|
|
723
|
-
PyCall::List.call(@py_token.ancestors).each do |ancestor|
|
|
724
|
-
ancestor_array << Token.new(ancestor)
|
|
725
|
-
end
|
|
726
|
-
ancestor_array
|
|
937
|
+
PyCall::List.call(@py_token.ancestors).map { |ancestor| Token.new(ancestor) }
|
|
727
938
|
end
|
|
728
939
|
|
|
729
940
|
# Returns a sequence of the token's immediate syntactic children.
|
|
730
941
|
# @return [Array<Token>] an array of tokens
|
|
731
942
|
def children
|
|
732
|
-
|
|
733
|
-
PyCall::List.call(@py_token.children).each do |child|
|
|
734
|
-
child_array << Token.new(child)
|
|
735
|
-
end
|
|
736
|
-
child_array
|
|
943
|
+
PyCall::List.call(@py_token.children).map { |child| Token.new(child) }
|
|
737
944
|
end
|
|
738
945
|
|
|
739
946
|
# The leftward immediate children of the word in the syntactic dependency parse.
|
|
740
947
|
# @return [Array<Token>] an array of tokens
|
|
741
948
|
def lefts
|
|
742
|
-
|
|
743
|
-
PyCall::List.call(@py_token.lefts).each do |token|
|
|
744
|
-
token_array << Token.new(token)
|
|
745
|
-
end
|
|
746
|
-
token_array
|
|
949
|
+
PyCall::List.call(@py_token.lefts).map { |token| Token.new(token) }
|
|
747
950
|
end
|
|
748
951
|
|
|
749
952
|
# The rightward immediate children of the word in the syntactic dependency parse.
|
|
750
953
|
# @return [Array<Token>] an array of tokens
|
|
751
954
|
def rights
|
|
752
|
-
|
|
753
|
-
PyCall::List.call(@py_token.rights).each do |token|
|
|
754
|
-
token_array << Token.new(token)
|
|
755
|
-
end
|
|
756
|
-
token_array
|
|
955
|
+
PyCall::List.call(@py_token.rights).map { |token| Token.new(token) }
|
|
757
956
|
end
|
|
758
957
|
|
|
759
958
|
# String representation of the token.
|
|
@@ -845,8 +1044,12 @@ module Spacy
|
|
|
845
1044
|
@py_token.send(name, *args)
|
|
846
1045
|
end
|
|
847
1046
|
|
|
848
|
-
def respond_to_missing?(sym)
|
|
849
|
-
|
|
1047
|
+
def respond_to_missing?(sym, include_private = false)
|
|
1048
|
+
Spacy.py_hasattr?(@py_token, sym) || super
|
|
1049
|
+
end
|
|
1050
|
+
|
|
1051
|
+
def instance_variables_to_inspect
|
|
1052
|
+
[:@text]
|
|
850
1053
|
end
|
|
851
1054
|
end
|
|
852
1055
|
|
|
@@ -920,8 +1123,12 @@ module Spacy
|
|
|
920
1123
|
@py_lexeme.send(name, *args)
|
|
921
1124
|
end
|
|
922
1125
|
|
|
923
|
-
def respond_to_missing?(sym)
|
|
924
|
-
|
|
1126
|
+
def respond_to_missing?(sym, include_private = false)
|
|
1127
|
+
Spacy.py_hasattr?(@py_lexeme, sym) || super
|
|
1128
|
+
end
|
|
1129
|
+
|
|
1130
|
+
def instance_variables_to_inspect
|
|
1131
|
+
[:@text]
|
|
925
1132
|
end
|
|
926
1133
|
end
|
|
927
1134
|
end
|