ruby-spacy 0.2.3 → 0.4.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/lib/ruby-spacy.rb CHANGED
@@ -1,27 +1,29 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "ruby-spacy/version"
4
+ require_relative "ruby-spacy/openai_client"
5
+ require_relative "ruby-spacy/openai_helper"
4
6
  require "numpy"
5
- require "openai"
6
7
  require "pycall"
7
- require "strscan"
8
8
  require "timeout"
9
-
10
- begin
11
- PyCall.init
12
- _spacy = PyCall.import_module("spacy")
13
- rescue PyCall::PyError => e
14
- puts "Failed to initialize PyCall or import spacy: #{e.message}"
15
- puts "Python traceback:"
16
- puts e.traceback
17
- raise
18
- end
9
+ require "json"
10
+ require "base64"
19
11
 
20
12
  # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
21
13
  module Spacy
22
14
  MAX_RETRIAL = 5
23
15
 
24
- spacy = PyCall.import_module("spacy")
16
+ begin
17
+ PyCall.init
18
+ spacy = PyCall.import_module("spacy")
19
+ rescue PyCall::PyError => e
20
+ puts "Failed to initialize PyCall or import spacy: #{e.message}"
21
+ puts "Python traceback:"
22
+ puts e.traceback
23
+ raise
24
+ end
25
+
26
+ Builtins = PyCall.import_module("builtins")
25
27
  SpacyVersion = spacy.__version__
26
28
 
27
29
  # Python `Language` class
@@ -39,6 +41,9 @@ module Spacy
39
41
  # Python `Matcher` class object
40
42
  PyMatcher = spacy.matcher.Matcher
41
43
 
44
+ # Python `PhraseMatcher` class object
45
+ PyPhraseMatcher = spacy.matcher.PhraseMatcher
46
+
42
47
  # Python `displacy` object
43
48
  PyDisplacy = PyCall.import_module('spacy.displacy')
44
49
 
@@ -49,16 +54,15 @@ module Spacy
49
54
  PyCall::List.call(py_generator)
50
55
  end
51
56
 
52
- @openai_client = nil
53
-
54
- def self.openai_client(access_token:)
55
- # If @client is already set, just return it. Otherwise, create a new instance.
56
- @openai_client ||= OpenAI::Client.new(access_token: access_token)
57
- end
58
-
59
- # Provide an accessor method to get the client (optional)
60
- def self.client
61
- @openai_client
57
+ # Checks if a Python object has a given attribute using builtins.hasattr.
58
+ # Falls back to true if the check itself fails (e.g. due to PyCall issues).
59
+ # @param py_obj [Object] a Python object
60
+ # @param attr [String, Symbol] the attribute name to check
61
+ # @return [Boolean]
62
+ def self.py_hasattr?(py_obj, attr)
63
+ Builtins.hasattr(py_obj, attr.to_s)
64
+ rescue StandardError
65
+ true
62
66
  end
63
67
 
64
68
  # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
@@ -84,17 +88,19 @@ module Spacy
84
88
  # @param nlp [Language] an instance of {Language} class
85
89
  # @param py_doc [Object] an instance of Python `Doc` class
86
90
  # @param text [String] the text string to be analyzed
87
- def initialize(nlp, py_doc: nil, text: nil, max_retrial: MAX_RETRIAL,
88
- retrial: 0)
91
+ def initialize(nlp, py_doc: nil, text: nil, max_retrial: MAX_RETRIAL)
89
92
  @py_nlp = nlp
90
- @py_doc = py_doc || @py_doc = nlp.call(text)
91
- @text = @py_doc.text
92
- rescue StandardError
93
- retrial += 1
94
- raise "Error: Failed to construct a Doc object" unless retrial <= max_retrial
93
+ retrial = 0
94
+ begin
95
+ @py_doc = py_doc || nlp.call(text)
96
+ @text = @py_doc.text
97
+ rescue StandardError
98
+ retrial += 1
99
+ raise "Error: Failed to construct a Doc object" unless retrial <= max_retrial
95
100
 
96
- sleep 0.5
97
- initialize(nlp, py_doc: py_doc, text: text, max_retrial: max_retrial, retrial: retrial)
101
+ sleep 0.5
102
+ retry
103
+ end
98
104
  end
99
105
 
100
106
  # Retokenizes the text merging a span into a single token.
@@ -128,11 +134,7 @@ module Spacy
128
134
  # Returns an array of tokens contained in the doc.
129
135
  # @return [Array<Token>]
130
136
  def tokens
131
- results = []
132
- PyCall::List.call(@py_doc).each do |py_token|
133
- results << Token.new(py_token)
134
- end
135
- results
137
+ PyCall::List.call(@py_doc).map { |py_token| Token.new(py_token) }
136
138
  end
137
139
 
138
140
  # Iterates over the elements in the doc yielding a token instance each time.
@@ -148,54 +150,50 @@ module Spacy
148
150
  # @param optional_size [Integer] an integer representing the size of the span
149
151
  # @return [Span]
150
152
  def span(range_or_start, optional_size = nil)
153
+ doc_len = PyCall.len(@py_doc)
154
+
151
155
  if optional_size
152
156
  start_index = range_or_start
153
- temp = tokens[start_index...start_index + optional_size]
157
+ start_index += doc_len if start_index < 0
158
+ end_index = start_index + optional_size - 1
154
159
  else
155
- start_index = range_or_start.first
156
160
  range = range_or_start
157
- temp = tokens[range]
161
+ start_index = range.first
162
+ start_index += doc_len if start_index < 0
163
+ end_val = range.end
164
+ if end_val.nil?
165
+ end_index = doc_len - 1
166
+ else
167
+ end_val += doc_len if end_val < 0
168
+ end_index = range.exclude_end? ? end_val - 1 : end_val
169
+ end
158
170
  end
159
171
 
160
- end_index = start_index + temp.size - 1
161
-
162
172
  Span.new(self, start_index: start_index, end_index: end_index)
163
173
  end
164
174
 
165
175
  # Returns an array of spans representing noun chunks.
166
176
  # @return [Array<Span>]
167
177
  def noun_chunks
168
- chunk_array = []
169
- py_chunks = PyCall::List.call(@py_doc.noun_chunks)
170
- py_chunks.each do |py_chunk|
171
- chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
178
+ PyCall::List.call(@py_doc.noun_chunks).map do |py_chunk|
179
+ Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
172
180
  end
173
- chunk_array
174
181
  end
175
182
 
176
183
  # Returns an array of spans each representing a sentence.
177
184
  # @return [Array<Span>]
178
185
  def sents
179
- sentence_array = []
180
- py_sentences = PyCall::List.call(@py_doc.sents)
181
- py_sentences.each do |py_sent|
182
- sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
186
+ PyCall::List.call(@py_doc.sents).map do |py_sent|
187
+ Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
183
188
  end
184
- sentence_array
185
189
  end
186
190
 
187
191
  # Returns an array of spans each representing a named entity.
188
192
  # @return [Array<Span>]
189
193
  def ents
190
- # so that ents canbe "each"-ed in Ruby
191
- ent_array = []
192
- PyCall::List.call(@py_doc.ents).each do |ent|
193
- ent.define_singleton_method :label do
194
- label_
195
- end
196
- ent_array << ent
194
+ PyCall::List.call(@py_doc.ents).map do |py_span|
195
+ Span.new(self, py_span: py_span)
197
196
  end
198
- ent_array
199
197
  end
200
198
 
201
199
  # Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
@@ -216,6 +214,31 @@ module Spacy
216
214
  py_doc.similarity(other.py_doc)
217
215
  end
218
216
 
217
+ # Serializes the doc to a binary string.
218
+ # The binary data includes all annotations (tokens, entities, etc.) and can be
219
+ # used to restore the doc later without re-processing.
220
+ # @return [String] binary representation of the doc
221
+ # @example Save doc to file
222
+ # doc = nlp.read("Hello world")
223
+ # File.binwrite("doc.bin", doc.to_bytes)
224
+ def to_bytes
225
+ @py_doc.to_bytes.force_encoding(Encoding::BINARY)
226
+ end
227
+
228
+ # Restores a doc from binary data created by {#to_bytes}.
229
+ # This is useful for caching processed documents to avoid re-processing.
230
+ # @param byte_string [String] binary data from {#to_bytes}
231
+ # @return [Doc] the restored doc
232
+ # @example Load doc from file
233
+ # bytes = File.binread("doc.bin")
234
+ # doc = Spacy::Doc.from_bytes(nlp, bytes)
235
+ def self.from_bytes(nlp, byte_string)
236
+ b64 = Base64.strict_encode64(byte_string)
237
+ py_bytes = PyCall.eval("__import__('base64').b64decode('#{b64}')")
238
+ py_doc = nlp.py_nlp.call("").from_bytes(py_bytes)
239
+ new(nlp.py_nlp, py_doc: py_doc)
240
+ end
241
+
219
242
  # Visualize the document in one of two styles: "dep" (dependencies) or "ent" (named entities).
220
243
  # @param style [String] either `dep` or `ent`
221
244
  # @param compact [Boolean] only relevant to the `dep` style
@@ -224,12 +247,86 @@ module Spacy
224
247
  PyDisplacy.render(py_doc, style: style, options: { compact: compact }, jupyter: false)
225
248
  end
226
249
 
250
+ # Generates a JSON string summarizing the linguistic analysis of the document.
251
+ # Designed to be passed as context to an LLM (e.g., via {OpenAIHelper#chat}).
252
+ #
253
+ # @param sections [Array<Symbol>] which sections to include
254
+ # (:text, :tokens, :entities, :noun_chunks, :sentences)
255
+ # @param token_attributes [Array<Symbol>] which token attributes to include
256
+ # (:text, :lemma, :pos, :tag, :dep, :head, :ent_type, :morphology)
257
+ # @return [String] a JSON string of the linguistic summary
258
+ def linguistic_summary(sections: [:text, :tokens, :entities, :noun_chunks],
259
+ token_attributes: [:text, :lemma, :pos, :dep, :head])
260
+ result = {}
261
+
262
+ sections.each do |section|
263
+ case section
264
+ when :text
265
+ result[:text] = @text
266
+ when :tokens
267
+ result[:tokens] = tokens.map do |token|
268
+ token_hash = {}
269
+ token_attributes.each do |attr|
270
+ case attr
271
+ when :head
272
+ token_hash[:head] = token.head.text
273
+ when :morphology
274
+ # Use string form and parse to ensure a plain Ruby Hash for JSON serialization
275
+ morph_str = token.morphology(hash: false)
276
+ token_hash[:morphology] = if morph_str.empty?
277
+ {}
278
+ else
279
+ morph_str.split("|").each_with_object({}) do |pair, h|
280
+ k, v = pair.split("=", 2)
281
+ h[k] = v
282
+ end
283
+ end
284
+ else
285
+ token_hash[attr] = token.send(attr)
286
+ end
287
+ end
288
+ token_hash
289
+ end
290
+ when :entities
291
+ ent_list = ents
292
+ result[:entities] = ent_list.map do |ent|
293
+ { text: ent.text, label: ent.label }
294
+ end
295
+ when :noun_chunks
296
+ result[:noun_chunks] = noun_chunks.map do |chunk|
297
+ { text: chunk.text, root: chunk.root.text }
298
+ end
299
+ when :sentences
300
+ result[:sentences] = sents.map(&:text)
301
+ end
302
+ end
303
+
304
+ result.to_json
305
+ end
306
+
307
+ # Sends a query to OpenAI's chat completion API with optional tool support.
308
+ # The get_tokens tool allows the model to request token-level linguistic analysis.
309
+ #
310
+ # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
311
+ # @param max_completion_tokens [Integer] Maximum tokens in the response
312
+ # @param max_tokens [Integer] Alias for max_completion_tokens (deprecated, for backward compatibility)
313
+ # @param temperature [Float] Sampling temperature (ignored for GPT-5 models)
314
+ # @param model [String] The model to use (default: gpt-5-mini)
315
+ # @param messages [Array<Hash>] Conversation history (for recursive tool calls). Note: this array is modified in place when tool calls occur.
316
+ # @param prompt [String, nil] System prompt for the query
317
+ # @return [String, nil] The model's response content
227
318
  def openai_query(access_token: nil,
228
- max_tokens: 1000,
319
+ max_completion_tokens: nil,
320
+ max_tokens: nil,
229
321
  temperature: 0.7,
230
- model: "gpt-4o-mini",
322
+ model: "gpt-5-mini",
231
323
  messages: [],
232
- prompt: nil)
324
+ prompt: nil,
325
+ response_format: nil,
326
+ max_tool_call_depth: 5,
327
+ _tool_call_depth: 0)
328
+ # Support both max_completion_tokens and max_tokens for backward compatibility
329
+ max_completion_tokens ||= max_tokens || 1000
233
330
  if messages.empty?
234
331
  messages = [
235
332
  { role: "system", content: prompt },
@@ -237,122 +334,164 @@ module Spacy
237
334
  ]
238
335
  end
239
336
 
240
- access_token ||= ENV["OPENAI_API_KEY"]
241
- raise "Error: OPENAI_API_KEY is not set" unless access_token
242
-
243
- begin
244
- response = Spacy.openai_client(access_token: access_token).chat(
245
- parameters: {
246
- model: model,
247
- messages: messages,
248
- max_tokens: max_tokens,
249
- temperature: temperature,
250
- function_call: "auto",
251
- stream: false,
252
- functions: [
253
- {
254
- name: "get_tokens",
255
- description: "Tokenize given text and return a list of tokens with their attributes: surface, lemma, tag, pos (part-of-speech), dep (dependency), ent_type (entity type), and morphology",
256
- "parameters": {
257
- "type": "object",
258
- "properties": {
259
- "text": {
260
- "type": "string",
261
- "description": "text to be tokenized"
262
- }
263
- },
264
- "required": ["text"]
265
- }
337
+ client = openai_client(access_token)
338
+
339
+ # Tool definition for token analysis (GPT-5 tools API format)
340
+ tools = nil
341
+ tool_choice = nil
342
+ if _tool_call_depth < max_tool_call_depth
343
+ tools = [
344
+ {
345
+ type: "function",
346
+ function: {
347
+ name: "get_tokens",
348
+ description: "Tokenize given text and return a list of tokens with their attributes: surface, lemma, tag, pos (part-of-speech), dep (dependency), ent_type (entity type), and morphology",
349
+ parameters: {
350
+ type: "object",
351
+ properties: {
352
+ text: {
353
+ type: "string",
354
+ description: "text to be tokenized"
355
+ }
356
+ },
357
+ required: ["text"]
266
358
  }
267
- ]
359
+ }
268
360
  }
269
- )
361
+ ]
362
+ tool_choice = "auto"
363
+ end
364
+
365
+ response = client.chat(
366
+ model: model,
367
+ messages: messages,
368
+ max_completion_tokens: max_completion_tokens,
369
+ temperature: temperature,
370
+ tools: tools,
371
+ tool_choice: tool_choice,
372
+ response_format: response_format
373
+ )
374
+
375
+ message = response.dig("choices", 0, "message")
270
376
 
271
- message = response.dig("choices", 0, "message")
377
+ # Handle tool calls (GPT-5 format)
378
+ if message["tool_calls"] && !message["tool_calls"].empty?
379
+ messages << message
380
+
381
+ message["tool_calls"].each do |tool_call|
382
+ function_name = tool_call.dig("function", "name")
383
+ tool_call_id = tool_call["id"]
272
384
 
273
- if message["role"] == "assistant" && message["function_call"]
274
- messages << message
275
- function_name = message.dig("function_call", "name")
276
- _args = JSON.parse(message.dig("function_call", "arguments"))
277
385
  case function_name
278
386
  when "get_tokens"
279
- res = tokens.map do |t|
387
+ result = tokens.map do |t|
280
388
  {
281
- "surface": t.text,
282
- "lemma": t.lemma,
283
- "pos": t.pos,
284
- "tag": t.tag,
285
- "dep": t.dep,
286
- "ent_type": t.ent_type,
287
- "morphology": t.morphology
389
+ surface: t.text,
390
+ lemma: t.lemma,
391
+ pos: t.pos,
392
+ tag: t.tag,
393
+ dep: t.dep,
394
+ ent_type: t.ent_type,
395
+ morphology: t.morphology
288
396
  }
289
397
  end.to_json
398
+
399
+ messages << {
400
+ role: "tool",
401
+ tool_call_id: tool_call_id,
402
+ content: result
403
+ }
290
404
  end
291
- messages << { role: "system", content: res }
292
- openai_query(access_token: access_token, max_tokens: max_tokens,
293
- temperature: temperature, model: model,
294
- messages: messages, prompt: prompt)
295
- else
296
- message["content"]
297
405
  end
298
- rescue StandardError => e
299
- puts "Error: OpenAI API call failed."
300
- pp e.message
301
- pp e.backtrace
406
+
407
+ # Recursive call to get final response after tool execution
408
+ openai_query(
409
+ access_token: access_token,
410
+ max_completion_tokens: max_completion_tokens,
411
+ temperature: temperature,
412
+ model: model,
413
+ messages: messages,
414
+ prompt: prompt,
415
+ response_format: response_format,
416
+ max_tool_call_depth: max_tool_call_depth,
417
+ _tool_call_depth: _tool_call_depth + 1
418
+ )
419
+ else
420
+ message["content"]
302
421
  end
303
- end
422
+ rescue OpenAIClient::APIError => e
423
+ puts "Error: OpenAI API call failed - #{e.message}"
424
+ nil
425
+ end
426
+
427
+ # Sends a text completion request to OpenAI's chat API.
428
+ #
429
+ # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
430
+ # @param max_completion_tokens [Integer] Maximum tokens in the response
431
+ # @param max_tokens [Integer] Alias for max_completion_tokens (deprecated, for backward compatibility)
432
+ # @param temperature [Float] Sampling temperature (ignored for GPT-5 models)
433
+ # @param model [String] The model to use (default: gpt-5-mini)
434
+ # @return [String, nil] The completed text
435
+ def openai_completion(access_token: nil, max_completion_tokens: nil, max_tokens: nil, temperature: 0.7, model: "gpt-5-mini")
436
+ # Support both max_completion_tokens and max_tokens for backward compatibility
437
+ max_completion_tokens ||= max_tokens || 1000
304
438
 
305
- def openai_completion(access_token: nil, max_tokens: 1000, temperature: 0.7, model: "gpt-4o-mini")
306
439
  messages = [
307
440
  { role: "system", content: "Complete the text input by the user." },
308
441
  { role: "user", content: @text }
309
442
  ]
310
- access_token ||= ENV["OPENAI_API_KEY"]
311
- raise "Error: OPENAI_API_KEY is not set" unless access_token
312
-
313
- begin
314
- response = Spacy.openai_client(access_token: access_token).chat(
315
- parameters: {
316
- model: model,
317
- messages: messages,
318
- max_tokens: max_tokens,
319
- temperature: temperature
320
- }
321
- )
322
- response.dig("choices", 0, "message", "content")
323
- rescue StandardError => e
324
- puts "Error: OpenAI API call failed."
325
- pp e.message
326
- pp e.backtrace
327
- end
328
- end
329
443
 
330
- def openai_embeddings(access_token: nil, model: "text-embedding-ada-002")
444
+ client = openai_client(access_token)
445
+ response = client.chat(
446
+ model: model,
447
+ messages: messages,
448
+ max_completion_tokens: max_completion_tokens,
449
+ temperature: temperature
450
+ )
451
+ response.dig("choices", 0, "message", "content")
452
+ rescue OpenAIClient::APIError => e
453
+ puts "Error: OpenAI API call failed - #{e.message}"
454
+ nil
455
+ end
456
+
457
+ # Generates text embeddings using OpenAI's embeddings API.
458
+ #
459
+ # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
460
+ # @param model [String] The embeddings model (default: text-embedding-3-small)
461
+ # @param dimensions [Integer, nil] The number of dimensions for the output embeddings (nil uses model default)
462
+ # @return [Array<Float>, nil] The embedding vector
463
+ def openai_embeddings(access_token: nil, model: "text-embedding-3-small", dimensions: nil)
464
+ client = openai_client(access_token)
465
+ response = client.embeddings(model: model, input: @text, dimensions: dimensions)
466
+ response.dig("data", 0, "embedding")
467
+ rescue OpenAIClient::APIError => e
468
+ puts "Error: OpenAI API call failed - #{e.message}"
469
+ nil
470
+ end
471
+
472
+ private
473
+
474
+ def openai_client(access_token)
331
475
  access_token ||= ENV["OPENAI_API_KEY"]
332
476
  raise "Error: OPENAI_API_KEY is not set" unless access_token
333
477
 
334
- begin
335
- response = Spacy.openai_client(access_token: access_token).embeddings(
336
- parameters: {
337
- model: model,
338
- input: @text
339
- }
340
- )
341
- response.dig("data", 0, "embedding")
342
- rescue StandardError => e
343
- puts "Error: OpenAI API call failed."
344
- pp e.message
345
- pp e.backtrace
346
- end
478
+ @openai_clients ||= {}
479
+ @openai_clients[access_token] ||= OpenAIClient.new(access_token: access_token)
347
480
  end
348
481
 
482
+ public
483
+
349
484
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
350
485
  def method_missing(name, *args)
351
486
  @py_doc.send(name, *args)
352
487
  end
353
488
 
354
- def respond_to_missing?(sym)
355
- sym ? true : super
489
+ def respond_to_missing?(sym, include_private = false)
490
+ Spacy.py_hasattr?(@py_doc, sym) || super
491
+ end
492
+
493
+ def instance_variables_to_inspect
494
+ [:@text]
356
495
  end
357
496
  end
358
497
 
@@ -366,8 +505,13 @@ module Spacy
366
505
 
367
506
  # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
368
507
  # @param model [String] A language model installed in the system
369
- def initialize(model = "en_core_web_sm", max_retrial: MAX_RETRIAL, retrial: 0, timeout: 60)
508
+ def initialize(model = "en_core_web_sm", max_retrial: MAX_RETRIAL, timeout: 60)
509
+ unless model.to_s.match?(/\A[a-zA-Z0-9_\-\.\/]+\z/)
510
+ raise ArgumentError, "Invalid model name: #{model.inspect}"
511
+ end
512
+
370
513
  @spacy_nlp_id = "nlp_#{model.object_id}"
514
+ retrial = 0
371
515
  begin
372
516
  Timeout.timeout(timeout) do
373
517
  PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
@@ -398,21 +542,29 @@ module Spacy
398
542
  Matcher.new(@py_nlp)
399
543
  end
400
544
 
545
+ # Generates a phrase matcher for the current language model.
546
+ # PhraseMatcher is more efficient than {Matcher} for matching large terminology lists.
547
+ # @param attr [String] the token attribute to match on (default: "ORTH").
548
+ # Use "LOWER" for case-insensitive matching.
549
+ # @return [PhraseMatcher]
550
+ # @example
551
+ # matcher = nlp.phrase_matcher(attr: "LOWER")
552
+ # matcher.add("PRODUCT", ["iPhone", "MacBook Pro"])
553
+ def phrase_matcher(attr: "ORTH")
554
+ PhraseMatcher.new(self, attr: attr)
555
+ end
556
+
401
557
  # A utility method to lookup a vocabulary item of the given id.
402
558
  # @param id [Integer] a vocabulary id
403
559
  # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
404
560
  def vocab_string_lookup(id)
405
- PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
561
+ PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{Integer(id)}]")
406
562
  end
407
563
 
408
564
  # A utility method to list pipeline components.
409
565
  # @return [Array<String>] An array of text strings representing pipeline components
410
566
  def pipe_names
411
- pipe_array = []
412
- PyCall::List.call(@py_nlp.pipe_names).each do |pipe|
413
- pipe_array << pipe
414
- end
415
- pipe_array
567
+ PyCall::List.call(@py_nlp.pipe_names).to_a
416
568
  end
417
569
 
418
570
  # A utility method to get a Python `Lexeme` object.
@@ -461,20 +613,62 @@ module Spacy
461
613
  # @param batch_size [Integer]
462
614
  # @return [Array<Doc>]
463
615
  def pipe(texts, disable: [], batch_size: 50)
464
- docs = []
465
- PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
466
- docs << Doc.new(@py_nlp, py_doc: py_doc)
616
+ PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).map do |py_doc|
617
+ Doc.new(@py_nlp, py_doc: py_doc)
467
618
  end
468
- docs
469
619
  end
470
620
 
471
- # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism....
621
+ # Yields an {OpenAIHelper} instance for making OpenAI API calls within a block.
622
+ # The helper is configured once and reused for all calls within the block,
623
+ # making it efficient for batch processing with {#pipe}.
624
+ #
625
+ # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
626
+ # @param model [String] the default model for chat requests
627
+ # @param max_completion_tokens [Integer] default maximum tokens in responses
628
+ # @param temperature [Float] default sampling temperature
629
+ # @yield [OpenAIHelper] the helper instance for making API calls
630
+ # @return [Object] the block's return value
631
+ # @example Batch processing with pipe
632
+ # nlp.with_openai(model: "gpt-5-mini") do |ai|
633
+ # nlp.pipe(texts).map do |doc|
634
+ # ai.chat(system: "Analyze.", user: doc.linguistic_summary)
635
+ # end
636
+ # end
637
+ def with_openai(access_token: nil, model: "gpt-5-mini",
638
+ max_completion_tokens: 1000, temperature: 0.7)
639
+ helper = OpenAIHelper.new(
640
+ access_token: access_token,
641
+ model: model,
642
+ max_completion_tokens: max_completion_tokens,
643
+ temperature: temperature
644
+ )
645
+ yield helper
646
+ end
647
+
648
+ # Executes a block within spaCy's memory zone for efficient memory management.
649
+ # Requires spaCy >= 3.8.
650
+ # @yield the block to execute within the memory zone
651
+ # @raise [NotImplementedError] if spaCy version does not support memory zones
652
+ def memory_zone(&block)
653
+ major, minor = SpacyVersion.split(".").map(&:to_i)
654
+ unless major > 3 || (major == 3 && minor >= 8)
655
+ raise NotImplementedError, "memory_zone requires spaCy >= 3.8 (current: #{SpacyVersion})"
656
+ end
657
+
658
+ PyCall.with(@py_nlp.memory_zone, &block)
659
+ end
660
+
661
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
472
662
  def method_missing(name, *args)
473
663
  @py_nlp.send(name, *args)
474
664
  end
475
665
 
476
- def respond_to_missing?(sym)
477
- sym ? true : super
666
+ def respond_to_missing?(sym, include_private = false)
667
+ Spacy.py_hasattr?(@py_nlp, sym) || super
668
+ end
669
+
670
+ def instance_variables_to_inspect
671
+ [:@spacy_nlp_id]
478
672
  end
479
673
  end
480
674
 
@@ -500,19 +694,52 @@ module Spacy
500
694
  # @param doc [Doc] an {Doc} instance
501
695
  # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
502
696
  def match(doc)
503
- str_results = @py_matcher.call(doc.py_doc).to_s
504
- s = StringScanner.new(str_results[1..-2])
505
- results = []
506
- while s.scan_until(/(\d+), (\d+), (\d+)/)
507
- next unless s.matched
508
-
509
- triple = s.matched.split(", ")
510
- match_id = triple[0].to_i
511
- start_index = triple[1].to_i
512
- end_index = triple[2].to_i - 1
513
- results << { match_id: match_id, start_index: start_index, end_index: end_index }
697
+ PyCall::List.call(@py_matcher.call(doc.py_doc)).map do |py_match|
698
+ { match_id: py_match[0].to_i, start_index: py_match[1].to_i, end_index: py_match[2].to_i - 1 }
514
699
  end
515
- results
700
+ end
701
+ end
702
+
703
+ # See also spaCy Python API document for [`PhraseMatcher`](https://spacy.io/api/phrasematcher).
704
+ # PhraseMatcher is useful for efficiently matching large terminology lists.
705
+ # It's faster than {Matcher} when matching many phrase patterns.
706
+ class PhraseMatcher
707
+ # @return [Object] a Python `PhraseMatcher` instance accessible via `PyCall`
708
+ attr_reader :py_matcher
709
+
710
+ # @return [Language] the language model used by this matcher
711
+ attr_reader :nlp
712
+
713
+ # Creates a {PhraseMatcher} instance.
714
+ # @param nlp [Language] an instance of {Language} class
715
+ # @param attr [String] the token attribute to match on (default: "ORTH").
716
+ # Use "LOWER" for case-insensitive matching.
717
+ # @example Case-insensitive matching
718
+ # matcher = Spacy::PhraseMatcher.new(nlp, attr: "LOWER")
719
+ def initialize(nlp, attr: "ORTH")
720
+ @nlp = nlp
721
+ @py_matcher = PyPhraseMatcher.call(nlp.py_nlp.vocab, attr: attr)
722
+ end
723
+
724
+ # Adds phrase patterns to the matcher.
725
+ # @param label [String] a label string given to the patterns
726
+ # @param phrases [Array<String>] an array of phrase strings to match
727
+ # @example Add product names
728
+ # matcher.add("PRODUCT", ["iPhone", "MacBook Pro", "iPad"])
729
+ def add(label, phrases)
730
+ patterns = phrases.map { |phrase| @nlp.py_nlp.call(phrase) }
731
+ @py_matcher.add(label, patterns)
732
+ end
733
+
734
+ # Execute the phrase match and return matching spans.
735
+ # @param doc [Doc] a {Doc} instance to search
736
+ # @return [Array<Span>] an array of {Span} objects with labels
737
+ # @example Find matches
738
+ # matches = matcher.match(doc)
739
+ # matches.each { |span| puts "#{span.text} => #{span.label}" }
740
+ def match(doc)
741
+ py_matches = @py_matcher.call(doc.py_doc, as_spans: true)
742
+ PyCall::List.call(py_matches).map { |py_span| Span.new(doc, py_span: py_span) }
516
743
  end
517
744
  end
518
745
 
@@ -524,6 +751,9 @@ module Spacy
524
751
  # @return [Doc] the document to which the span belongs
525
752
  attr_reader :doc
526
753
 
754
+ # @return [String] a text string of the span
755
+ attr_reader :text
756
+
527
757
  include Enumerable
528
758
 
529
759
  alias length count
@@ -539,17 +769,14 @@ module Spacy
539
769
  # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
540
770
  def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
541
771
  @doc = doc
542
- @py_span = py_span || @py_span = PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
772
+ @py_span = py_span || PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
773
+ @text = @py_span.text
543
774
  end
544
775
 
545
776
  # Returns an array of tokens contained in the span.
546
777
  # @return [Array<Token>]
547
778
  def tokens
548
- results = []
549
- PyCall::List.call(@py_span).each do |py_token|
550
- results << Token.new(py_token)
551
- end
552
- results
779
+ PyCall::List.call(@py_span).map { |py_token| Token.new(py_token) }
553
780
  end
554
781
 
555
782
  # Iterates over the elements in the span yielding a token instance each time.
@@ -562,12 +789,9 @@ module Spacy
562
789
  # Returns an array of spans of noun chunks.
563
790
  # @return [Array<Span>]
564
791
  def noun_chunks
565
- chunk_array = []
566
- py_chunks = PyCall::List.call(@py_span.noun_chunks)
567
- py_chunks.each do |py_span|
568
- chunk_array << Span.new(@doc, py_span: py_span)
792
+ PyCall::List.call(@py_span.noun_chunks).map do |py_span|
793
+ Span.new(@doc, py_span: py_span)
569
794
  end
570
- chunk_array
571
795
  end
572
796
 
573
797
  # Returns the head token
@@ -579,22 +803,17 @@ module Spacy
579
803
  # Returns an array of spans that represents sentences.
580
804
  # @return [Array<Span>]
581
805
  def sents
582
- sentence_array = []
583
- py_sentences = PyCall::List.call(@py_span.sents)
584
- py_sentences.each do |py_span|
585
- sentence_array << Span.new(@doc, py_span: py_span)
806
+ PyCall::List.call(@py_span.sents).map do |py_span|
807
+ Span.new(@doc, py_span: py_span)
586
808
  end
587
- sentence_array
588
809
  end
589
810
 
590
811
  # Returns an array of spans that represents named entities.
591
812
  # @return [Array<Span>]
592
813
  def ents
593
- ent_array = []
594
- PyCall::List.call(@py_span.ents).each do |py_span|
595
- ent_array << Span.new(@doc, py_span: py_span)
814
+ PyCall::List.call(@py_span.ents).map do |py_span|
815
+ Span.new(@doc, py_span: py_span)
596
816
  end
597
- ent_array
598
817
  end
599
818
 
600
819
  # Returns a span that represents the sentence that the given span is part of.
@@ -631,41 +850,25 @@ module Spacy
631
850
  # Returns tokens conjugated to the root of the span.
632
851
  # @return [Array<Token>] an array of tokens
633
852
  def conjuncts
634
- conjunct_array = []
635
- PyCall::List.call(@py_span.conjuncts).each do |py_conjunct|
636
- conjunct_array << Token.new(py_conjunct)
637
- end
638
- conjunct_array
853
+ PyCall::List.call(@py_span.conjuncts).map { |py_conjunct| Token.new(py_conjunct) }
639
854
  end
640
855
 
641
856
  # Returns tokens that are to the left of the span, whose heads are within the span.
642
857
  # @return [Array<Token>] an array of tokens
643
858
  def lefts
644
- left_array = []
645
- PyCall::List.call(@py_span.lefts).each do |py_left|
646
- left_array << Token.new(py_left)
647
- end
648
- left_array
859
+ PyCall::List.call(@py_span.lefts).map { |py_left| Token.new(py_left) }
649
860
  end
650
861
 
651
862
  # Returns Tokens that are to the right of the span, whose heads are within the span.
652
863
  # @return [Array<Token>] an array of Tokens
653
864
  def rights
654
- right_array = []
655
- PyCall::List.call(@py_span.rights).each do |py_right|
656
- right_array << Token.new(py_right)
657
- end
658
- right_array
865
+ PyCall::List.call(@py_span.rights).map { |py_right| Token.new(py_right) }
659
866
  end
660
867
 
661
868
  # Returns Tokens that are within the span and tokens that descend from them.
662
869
  # @return [Array<Token>] an array of tokens
663
870
  def subtree
664
- subtree_array = []
665
- PyCall::List.call(@py_span.subtree).each do |py_subtree|
666
- subtree_array << Token.new(py_subtree)
667
- end
668
- subtree_array
871
+ PyCall::List.call(@py_span.subtree).map { |py_subtree| Token.new(py_subtree) }
669
872
  end
670
873
 
671
874
  # Returns the label
@@ -674,13 +877,23 @@ module Spacy
674
877
  @py_span.label_
675
878
  end
676
879
 
880
+ # String representation of the span.
881
+ # @return [String]
882
+ def to_s
883
+ @text
884
+ end
885
+
677
886
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
678
887
  def method_missing(name, *args)
679
888
  @py_span.send(name, *args)
680
889
  end
681
890
 
682
- def respond_to_missing?(sym)
683
- sym ? true : super
891
+ def respond_to_missing?(sym, include_private = false)
892
+ Spacy.py_hasattr?(@py_span, sym) || super
893
+ end
894
+
895
+ def instance_variables_to_inspect
896
+ [:@text]
684
897
  end
685
898
  end
686
899
 
@@ -700,6 +913,12 @@ module Spacy
700
913
  @text = @py_token.text
701
914
  end
702
915
 
916
+ # Returns the character offset of the token within the parent document.
917
+ # @return [Integer]
918
+ def idx
919
+ @py_token.idx
920
+ end
921
+
703
922
  # Returns the head token
704
923
  # @return [Token]
705
924
  def head
@@ -709,51 +928,31 @@ module Spacy
709
928
  # Returns the token in question and the tokens that descend from it.
710
929
  # @return [Array<Token>] an array of tokens
711
930
  def subtree
712
- descendant_array = []
713
- PyCall::List.call(@py_token.subtree).each do |descendant|
714
- descendant_array << Token.new(descendant)
715
- end
716
- descendant_array
931
+ PyCall::List.call(@py_token.subtree).map { |descendant| Token.new(descendant) }
717
932
  end
718
933
 
719
934
  # Returns the token's ancestors.
720
935
  # @return [Array<Token>] an array of tokens
721
936
  def ancestors
722
- ancestor_array = []
723
- PyCall::List.call(@py_token.ancestors).each do |ancestor|
724
- ancestor_array << Token.new(ancestor)
725
- end
726
- ancestor_array
937
+ PyCall::List.call(@py_token.ancestors).map { |ancestor| Token.new(ancestor) }
727
938
  end
728
939
 
729
940
  # Returns a sequence of the token's immediate syntactic children.
730
941
  # @return [Array<Token>] an array of tokens
731
942
  def children
732
- child_array = []
733
- PyCall::List.call(@py_token.children).each do |child|
734
- child_array << Token.new(child)
735
- end
736
- child_array
943
+ PyCall::List.call(@py_token.children).map { |child| Token.new(child) }
737
944
  end
738
945
 
739
946
  # The leftward immediate children of the word in the syntactic dependency parse.
740
947
  # @return [Array<Token>] an array of tokens
741
948
  def lefts
742
- token_array = []
743
- PyCall::List.call(@py_token.lefts).each do |token|
744
- token_array << Token.new(token)
745
- end
746
- token_array
949
+ PyCall::List.call(@py_token.lefts).map { |token| Token.new(token) }
747
950
  end
748
951
 
749
952
  # The rightward immediate children of the word in the syntactic dependency parse.
750
953
  # @return [Array<Token>] an array of tokens
751
954
  def rights
752
- token_array = []
753
- PyCall::List.call(@py_token.rights).each do |token|
754
- token_array << Token.new(token)
755
- end
756
- token_array
955
+ PyCall::List.call(@py_token.rights).map { |token| Token.new(token) }
757
956
  end
758
957
 
759
958
  # String representation of the token.
@@ -845,8 +1044,12 @@ module Spacy
845
1044
  @py_token.send(name, *args)
846
1045
  end
847
1046
 
848
- def respond_to_missing?(sym)
849
- sym ? true : super
1047
+ def respond_to_missing?(sym, include_private = false)
1048
+ Spacy.py_hasattr?(@py_token, sym) || super
1049
+ end
1050
+
1051
+ def instance_variables_to_inspect
1052
+ [:@text]
850
1053
  end
851
1054
  end
852
1055
 
@@ -920,8 +1123,12 @@ module Spacy
920
1123
  @py_lexeme.send(name, *args)
921
1124
  end
922
1125
 
923
- def respond_to_missing?(sym)
924
- sym ? true : super
1126
+ def respond_to_missing?(sym, include_private = false)
1127
+ Spacy.py_hasattr?(@py_lexeme, sym) || super
1128
+ end
1129
+
1130
+ def instance_variables_to_inspect
1131
+ [:@text]
925
1132
  end
926
1133
  end
927
1134
  end