ruby-spacy 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d6005c638c2b268fe162b288e124439be6a525952557a48b0b50685bbd2a6ea1
4
- data.tar.gz: 41dbc057c9ec51ffa8d6f1149fb8acde3fb52a251299d0209b4e2d351942eac0
3
+ metadata.gz: 6185c586feb32fa51efcd4349398cd4ca9541280a5cc8a1b6a73eb93a987d4ac
4
+ data.tar.gz: a146a9c40e2d5293e2401cb16b8ac6866cbb577e11a10d9657c406f933e7a3aa
5
5
  SHA512:
6
- metadata.gz: 5be0efa9e649b3d46da859472ce403adaa3cdaa34d4158e7a531680eb2830ae64779ec6ada8f0f6e324cc9cb314fb1fcbc617daa26e37e91a7d14f703caeec2d
7
- data.tar.gz: b8f56b4842fea3bec1b35366624c7ab9297c3a3b25c9a8502dc32c623593e511d9da538bf3e5cac272baf854cf4c2c97d4129790b492329183d88873467f8dbb
6
+ metadata.gz: bf558d4e9a7a6765fd7d088bbf8324a6ee0e4f4186962551d71e5a991e0aefd1e51a186f19c2824fabcc6afd0c83960771f082237febece52c2a522ccb39a5cf
7
+ data.tar.gz: 3a64559cf8c169d1ac1ecdef526d26e5776989b9cc203a8ed30e0dd5d87ff62a4d1b741aff30c8cb49e5ffb716c6068f9af3a12d50d0d4de8ad6f22ebe80ea0d
@@ -0,0 +1,6 @@
1
+ # These are supported funding model platforms
2
+
3
+ github: [yohasebe]
4
+ ko_fi: yohasebe
5
+ buy_me_a_coffee: yohasebe
6
+ # custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
@@ -13,7 +13,7 @@ module Spacy
13
13
  API_ENDPOINT = "https://api.openai.com/v1"
14
14
  DEFAULT_TIMEOUT = 120
15
15
  MAX_RETRIES = 3
16
- RETRY_DELAY = 1
16
+ BASE_RETRY_DELAY = 1
17
17
 
18
18
  class APIError < StandardError
19
19
  attr_reader :status_code, :response_body
@@ -31,24 +31,25 @@ module Spacy
31
31
  end
32
32
 
33
33
  # Sends a chat completion request with optional tools support.
34
- # Note: GPT-5 series models do not support the temperature parameter.
34
+ # Note: GPT-5 series and o-series models do not support the temperature parameter.
35
35
  #
36
36
  # @param model [String] The model to use (e.g., "gpt-5-mini")
37
37
  # @param messages [Array<Hash>] The conversation messages
38
38
  # @param max_completion_tokens [Integer] Maximum tokens in the response
39
- # @param temperature [Float, nil] Sampling temperature (ignored for GPT-5 models)
39
+ # @param temperature [Float, nil] Sampling temperature (ignored for models that don't support it)
40
40
  # @param tools [Array<Hash>, nil] Tool definitions for function calling
41
41
  # @param tool_choice [String, Hash, nil] Tool selection strategy
42
+ # @param response_format [Hash, nil] Response format specification (e.g., { type: "json_object" })
42
43
  # @return [Hash] The API response
43
- def chat(model:, messages:, max_completion_tokens: 1000, temperature: nil, tools: nil, tool_choice: nil)
44
+ def chat(model:, messages:, max_completion_tokens: 1000, temperature: nil, tools: nil, tool_choice: nil, response_format: nil)
44
45
  body = {
45
46
  model: model,
46
47
  messages: messages,
47
48
  max_completion_tokens: max_completion_tokens
48
49
  }
49
50
 
50
- # GPT-5 series models do not support temperature parameter
51
- unless gpt5_model?(model)
51
+ # GPT-5 series and o-series models do not support temperature parameter
52
+ unless temperature_unsupported?(model)
52
53
  body[:temperature] = temperature || 0.7
53
54
  end
54
55
 
@@ -57,25 +58,32 @@ module Spacy
57
58
  body[:tool_choice] = tool_choice || "auto"
58
59
  end
59
60
 
61
+ body[:response_format] = response_format if response_format
62
+
60
63
  post("/chat/completions", body)
61
64
  end
62
65
 
63
- # Checks if the model is a GPT-5 series model.
64
- # GPT-5 models have different parameter requirements (no temperature support).
65
- def gpt5_model?(model)
66
- model.to_s.start_with?("gpt-5")
66
+ # Checks if the model does not support the temperature parameter.
67
+ # This includes GPT-5 series and o-series (o1, o3, o4-mini, etc.) models.
68
+ # @param model [String] The model name
69
+ # @return [Boolean]
70
+ def temperature_unsupported?(model)
71
+ name = model.to_s
72
+ name.start_with?("gpt-5") || name.match?(/\Ao\d/)
67
73
  end
68
74
 
69
75
  # Sends an embeddings request.
70
76
  #
71
77
  # @param model [String] The embeddings model (e.g., "text-embedding-3-small")
72
78
  # @param input [String] The text to embed
79
+ # @param dimensions [Integer, nil] The number of dimensions for the output embeddings
73
80
  # @return [Hash] The API response
74
- def embeddings(model:, input:)
81
+ def embeddings(model:, input:, dimensions: nil)
75
82
  body = {
76
83
  model: model,
77
84
  input: input
78
85
  }
86
+ body[:dimensions] = dimensions if dimensions
79
87
 
80
88
  post("/embeddings", body)
81
89
  end
@@ -94,36 +102,45 @@ module Spacy
94
102
  uri = URI.parse("#{API_ENDPOINT}#{path}")
95
103
  retries = 0
96
104
 
97
- begin
98
- http = Net::HTTP.new(uri.host, uri.port)
99
- http.use_ssl = true
100
- http.verify_mode = OpenSSL::SSL::VERIFY_PEER
101
- http.cert_store = default_cert_store
102
- http.open_timeout = @timeout
103
- http.read_timeout = @timeout
104
-
105
- request = Net::HTTP::Post.new(uri.path)
106
- request["Content-Type"] = "application/json"
107
- request["Authorization"] = "Bearer #{@access_token}"
108
- request.body = body.to_json
109
-
110
- response = http.request(request)
111
-
112
- handle_response(response)
113
- rescue Net::OpenTimeout, Net::ReadTimeout => e
114
- retries += 1
115
- if retries <= MAX_RETRIES
116
- sleep RETRY_DELAY
117
- retry
118
- end
119
- raise APIError.new("Request timed out after #{MAX_RETRIES} retries: #{e.message}")
120
- rescue Errno::ECONNREFUSED, Errno::ECONNRESET, SocketError => e
121
- retries += 1
122
- if retries <= MAX_RETRIES
123
- sleep RETRY_DELAY
124
- retry
105
+ loop do
106
+ begin
107
+ http = Net::HTTP.new(uri.host, uri.port)
108
+ http.use_ssl = true
109
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
110
+ http.cert_store = default_cert_store
111
+ http.open_timeout = @timeout
112
+ http.read_timeout = @timeout
113
+
114
+ request = Net::HTTP::Post.new(uri.path)
115
+ request["Content-Type"] = "application/json"
116
+ request["Authorization"] = "Bearer #{@access_token}"
117
+ request.body = body.to_json
118
+
119
+ response = http.request(request)
120
+
121
+ # Handle 429 rate limiting before general response handling
122
+ if response.code.to_i == 429
123
+ retries += 1
124
+ if retries <= MAX_RETRIES
125
+ retry_after = response["Retry-After"]&.to_f
126
+ delay = retry_after || (BASE_RETRY_DELAY * (2**(retries - 1)) + rand * 0.5)
127
+ sleep delay
128
+ next
129
+ end
130
+ raise APIError.new("Rate limited after #{MAX_RETRIES} retries",
131
+ status_code: 429, response_body: response.body)
132
+ end
133
+
134
+ return handle_response(response)
135
+ rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED, Errno::ECONNRESET, SocketError => e
136
+ retries += 1
137
+ if retries <= MAX_RETRIES
138
+ delay = BASE_RETRY_DELAY * (2**(retries - 1)) + rand * 0.5
139
+ sleep delay
140
+ next
141
+ end
142
+ raise APIError.new("Network error after #{MAX_RETRIES} retries: #{e.message}")
125
143
  end
126
- raise APIError.new("Network error after #{MAX_RETRIES} retries: #{e.message}")
127
144
  end
128
145
  end
129
146
 
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Spacy
4
+ # A helper class for OpenAI API interactions, designed to work with spaCy's
5
+ # linguistic analysis via the block-based {Language#with_openai} API.
6
+ #
7
+ # @example Basic usage with linguistic_summary
8
+ # nlp = Spacy::Language.new("en_core_web_sm")
9
+ # nlp.with_openai(model: "gpt-5-mini") do |ai|
10
+ # doc = nlp.read("Apple Inc. was founded by Steve Jobs.")
11
+ # ai.chat(system: "Analyze the linguistic data.", user: doc.linguistic_summary)
12
+ # end
13
+ class OpenAIHelper
14
+ # @return [String] the default model for chat requests
15
+ attr_reader :model
16
+
17
+ # Creates a new OpenAIHelper instance.
18
+ # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
19
+ # @param model [String] the default model for chat requests
20
+ # @param max_completion_tokens [Integer] default maximum tokens in responses
21
+ # @param temperature [Float] default sampling temperature
22
+ def initialize(access_token: nil, model: "gpt-5-mini",
23
+ max_completion_tokens: 1000, temperature: 0.7)
24
+ @access_token = access_token || ENV["OPENAI_API_KEY"]
25
+ raise "Error: OPENAI_API_KEY is not set" unless @access_token
26
+
27
+ @model = model
28
+ @default_max_completion_tokens = max_completion_tokens
29
+ @default_temperature = temperature
30
+ @client = OpenAIClient.new(access_token: @access_token)
31
+ end
32
+
33
+ # Sends a chat completion request to OpenAI.
34
+ #
35
+ # Provides convenient `system:` and `user:` keyword arguments as shortcuts
36
+ # for building simple message arrays. For more complex conversations, pass
37
+ # a full `messages:` array directly.
38
+ #
39
+ # @param system [String, nil] system message content (shortcut)
40
+ # @param user [String, nil] user message content (shortcut)
41
+ # @param messages [Array<Hash>, nil] full message array (overrides system:/user:)
42
+ # @param model [String, nil] model override (defaults to instance model)
43
+ # @param max_completion_tokens [Integer, nil] token limit override
44
+ # @param temperature [Float, nil] temperature override
45
+ # @param response_format [Hash, nil] response format (e.g., { type: "json_object" })
46
+ # @param raw [Boolean] if true, returns the full API response Hash instead of text
47
+ # @return [String, Hash, nil] the response text, full response Hash (if raw:), or nil on error
48
+ def chat(system: nil, user: nil, messages: nil,
49
+ model: nil, max_completion_tokens: nil,
50
+ temperature: nil, response_format: nil, raw: false)
51
+ msgs = messages || build_messages(system: system, user: user)
52
+ raise ArgumentError, "No messages provided. Use system:/user: or messages:" if msgs.empty?
53
+
54
+ response = @client.chat(
55
+ model: model || @model,
56
+ messages: msgs,
57
+ max_completion_tokens: max_completion_tokens || @default_max_completion_tokens,
58
+ temperature: temperature || @default_temperature,
59
+ response_format: response_format
60
+ )
61
+
62
+ raw ? response : response.dig("choices", 0, "message", "content")
63
+ rescue OpenAIClient::APIError => e
64
+ puts "Error: OpenAI API call failed - #{e.message}"
65
+ nil
66
+ end
67
+
68
+ # Generates text embeddings using OpenAI's embeddings API.
69
+ #
70
+ # @param text [String] the text to embed
71
+ # @param model [String] the embeddings model
72
+ # @param dimensions [Integer, nil] number of dimensions (nil uses model default)
73
+ # @return [Array<Float>, nil] the embedding vector, or nil on error
74
+ def embeddings(text, model: "text-embedding-3-small", dimensions: nil)
75
+ response = @client.embeddings(model: model, input: text, dimensions: dimensions)
76
+ response.dig("data", 0, "embedding")
77
+ rescue OpenAIClient::APIError => e
78
+ puts "Error: OpenAI API call failed - #{e.message}"
79
+ nil
80
+ end
81
+
82
+ private
83
+
84
+ def build_messages(system: nil, user: nil)
85
+ msgs = []
86
+ msgs << { role: "system", content: system } if system
87
+ msgs << { role: "user", content: user } if user
88
+ msgs
89
+ end
90
+ end
91
+ end
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Spacy
4
4
  # The version number of the module
5
- VERSION = "0.3.0"
5
+ VERSION = "0.4.0"
6
6
  end
data/lib/ruby-spacy.rb CHANGED
@@ -2,27 +2,28 @@
2
2
 
3
3
  require_relative "ruby-spacy/version"
4
4
  require_relative "ruby-spacy/openai_client"
5
+ require_relative "ruby-spacy/openai_helper"
5
6
  require "numpy"
6
7
  require "pycall"
7
- require "strscan"
8
8
  require "timeout"
9
9
  require "json"
10
-
11
- begin
12
- PyCall.init
13
- _spacy = PyCall.import_module("spacy")
14
- rescue PyCall::PyError => e
15
- puts "Failed to initialize PyCall or import spacy: #{e.message}"
16
- puts "Python traceback:"
17
- puts e.traceback
18
- raise
19
- end
10
+ require "base64"
20
11
 
21
12
  # This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
22
13
  module Spacy
23
14
  MAX_RETRIAL = 5
24
15
 
25
- spacy = PyCall.import_module("spacy")
16
+ begin
17
+ PyCall.init
18
+ spacy = PyCall.import_module("spacy")
19
+ rescue PyCall::PyError => e
20
+ puts "Failed to initialize PyCall or import spacy: #{e.message}"
21
+ puts "Python traceback:"
22
+ puts e.traceback
23
+ raise
24
+ end
25
+
26
+ Builtins = PyCall.import_module("builtins")
26
27
  SpacyVersion = spacy.__version__
27
28
 
28
29
  # Python `Language` class
@@ -53,6 +54,17 @@ module Spacy
53
54
  PyCall::List.call(py_generator)
54
55
  end
55
56
 
57
+ # Checks if a Python object has a given attribute using builtins.hasattr.
58
+ # Falls back to true if the check itself fails (e.g. due to PyCall issues).
59
+ # @param py_obj [Object] a Python object
60
+ # @param attr [String, Symbol] the attribute name to check
61
+ # @return [Boolean]
62
+ def self.py_hasattr?(py_obj, attr)
63
+ Builtins.hasattr(py_obj, attr.to_s)
64
+ rescue StandardError
65
+ true
66
+ end
67
+
56
68
  # See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
57
69
  class Doc
58
70
  # @return [Object] a Python `Language` instance accessible via `PyCall`
@@ -76,17 +88,19 @@ module Spacy
76
88
  # @param nlp [Language] an instance of {Language} class
77
89
  # @param py_doc [Object] an instance of Python `Doc` class
78
90
  # @param text [String] the text string to be analyzed
79
- def initialize(nlp, py_doc: nil, text: nil, max_retrial: MAX_RETRIAL,
80
- retrial: 0)
91
+ def initialize(nlp, py_doc: nil, text: nil, max_retrial: MAX_RETRIAL)
81
92
  @py_nlp = nlp
82
- @py_doc = py_doc || @py_doc = nlp.call(text)
83
- @text = @py_doc.text
84
- rescue StandardError
85
- retrial += 1
86
- raise "Error: Failed to construct a Doc object" unless retrial <= max_retrial
93
+ retrial = 0
94
+ begin
95
+ @py_doc = py_doc || nlp.call(text)
96
+ @text = @py_doc.text
97
+ rescue StandardError
98
+ retrial += 1
99
+ raise "Error: Failed to construct a Doc object" unless retrial <= max_retrial
87
100
 
88
- sleep 0.5
89
- initialize(nlp, py_doc: py_doc, text: text, max_retrial: max_retrial, retrial: retrial)
101
+ sleep 0.5
102
+ retry
103
+ end
90
104
  end
91
105
 
92
106
  # Retokenizes the text merging a span into a single token.
@@ -120,11 +134,7 @@ module Spacy
120
134
  # Returns an array of tokens contained in the doc.
121
135
  # @return [Array<Token>]
122
136
  def tokens
123
- results = []
124
- PyCall::List.call(@py_doc).each do |py_token|
125
- results << Token.new(py_token)
126
- end
127
- results
137
+ PyCall::List.call(@py_doc).map { |py_token| Token.new(py_token) }
128
138
  end
129
139
 
130
140
  # Iterates over the elements in the doc yielding a token instance each time.
@@ -140,54 +150,50 @@ module Spacy
140
150
  # @param optional_size [Integer] an integer representing the size of the span
141
151
  # @return [Span]
142
152
  def span(range_or_start, optional_size = nil)
153
+ doc_len = PyCall.len(@py_doc)
154
+
143
155
  if optional_size
144
156
  start_index = range_or_start
145
- temp = tokens[start_index...start_index + optional_size]
157
+ start_index += doc_len if start_index < 0
158
+ end_index = start_index + optional_size - 1
146
159
  else
147
- start_index = range_or_start.first
148
160
  range = range_or_start
149
- temp = tokens[range]
161
+ start_index = range.first
162
+ start_index += doc_len if start_index < 0
163
+ end_val = range.end
164
+ if end_val.nil?
165
+ end_index = doc_len - 1
166
+ else
167
+ end_val += doc_len if end_val < 0
168
+ end_index = range.exclude_end? ? end_val - 1 : end_val
169
+ end
150
170
  end
151
171
 
152
- end_index = start_index + temp.size - 1
153
-
154
172
  Span.new(self, start_index: start_index, end_index: end_index)
155
173
  end
156
174
 
157
175
  # Returns an array of spans representing noun chunks.
158
176
  # @return [Array<Span>]
159
177
  def noun_chunks
160
- chunk_array = []
161
- py_chunks = PyCall::List.call(@py_doc.noun_chunks)
162
- py_chunks.each do |py_chunk|
163
- chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
178
+ PyCall::List.call(@py_doc.noun_chunks).map do |py_chunk|
179
+ Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
164
180
  end
165
- chunk_array
166
181
  end
167
182
 
168
183
  # Returns an array of spans each representing a sentence.
169
184
  # @return [Array<Span>]
170
185
  def sents
171
- sentence_array = []
172
- py_sentences = PyCall::List.call(@py_doc.sents)
173
- py_sentences.each do |py_sent|
174
- sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
186
+ PyCall::List.call(@py_doc.sents).map do |py_sent|
187
+ Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
175
188
  end
176
- sentence_array
177
189
  end
178
190
 
179
191
  # Returns an array of spans each representing a named entity.
180
192
  # @return [Array<Span>]
181
193
  def ents
182
- # so that ents canbe "each"-ed in Ruby
183
- ent_array = []
184
- PyCall::List.call(@py_doc.ents).each do |ent|
185
- ent.define_singleton_method :label do
186
- label_
187
- end
188
- ent_array << ent
194
+ PyCall::List.call(@py_doc.ents).map do |py_span|
195
+ Span.new(self, py_span: py_span)
189
196
  end
190
- ent_array
191
197
  end
192
198
 
193
199
  # Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
@@ -227,7 +233,8 @@ module Spacy
227
233
  # bytes = File.binread("doc.bin")
228
234
  # doc = Spacy::Doc.from_bytes(nlp, bytes)
229
235
  def self.from_bytes(nlp, byte_string)
230
- py_bytes = PyCall.eval("bytes(#{byte_string.bytes})")
236
+ b64 = Base64.strict_encode64(byte_string)
237
+ py_bytes = PyCall.eval("__import__('base64').b64decode('#{b64}')")
231
238
  py_doc = nlp.py_nlp.call("").from_bytes(py_bytes)
232
239
  new(nlp.py_nlp, py_doc: py_doc)
233
240
  end
@@ -240,6 +247,63 @@ module Spacy
240
247
  PyDisplacy.render(py_doc, style: style, options: { compact: compact }, jupyter: false)
241
248
  end
242
249
 
250
+ # Generates a JSON string summarizing the linguistic analysis of the document.
251
+ # Designed to be passed as context to an LLM (e.g., via {OpenAIHelper#chat}).
252
+ #
253
+ # @param sections [Array<Symbol>] which sections to include
254
+ # (:text, :tokens, :entities, :noun_chunks, :sentences)
255
+ # @param token_attributes [Array<Symbol>] which token attributes to include
256
+ # (:text, :lemma, :pos, :tag, :dep, :head, :ent_type, :morphology)
257
+ # @return [String] a JSON string of the linguistic summary
258
+ def linguistic_summary(sections: [:text, :tokens, :entities, :noun_chunks],
259
+ token_attributes: [:text, :lemma, :pos, :dep, :head])
260
+ result = {}
261
+
262
+ sections.each do |section|
263
+ case section
264
+ when :text
265
+ result[:text] = @text
266
+ when :tokens
267
+ result[:tokens] = tokens.map do |token|
268
+ token_hash = {}
269
+ token_attributes.each do |attr|
270
+ case attr
271
+ when :head
272
+ token_hash[:head] = token.head.text
273
+ when :morphology
274
+ # Use string form and parse to ensure a plain Ruby Hash for JSON serialization
275
+ morph_str = token.morphology(hash: false)
276
+ token_hash[:morphology] = if morph_str.empty?
277
+ {}
278
+ else
279
+ morph_str.split("|").each_with_object({}) do |pair, h|
280
+ k, v = pair.split("=", 2)
281
+ h[k] = v
282
+ end
283
+ end
284
+ else
285
+ token_hash[attr] = token.send(attr)
286
+ end
287
+ end
288
+ token_hash
289
+ end
290
+ when :entities
291
+ ent_list = ents
292
+ result[:entities] = ent_list.map do |ent|
293
+ { text: ent.text, label: ent.label }
294
+ end
295
+ when :noun_chunks
296
+ result[:noun_chunks] = noun_chunks.map do |chunk|
297
+ { text: chunk.text, root: chunk.root.text }
298
+ end
299
+ when :sentences
300
+ result[:sentences] = sents.map(&:text)
301
+ end
302
+ end
303
+
304
+ result.to_json
305
+ end
306
+
243
307
  # Sends a query to OpenAI's chat completion API with optional tool support.
244
308
  # The get_tokens tool allows the model to request token-level linguistic analysis.
245
309
  #
@@ -248,7 +312,7 @@ module Spacy
248
312
  # @param max_tokens [Integer] Alias for max_completion_tokens (deprecated, for backward compatibility)
249
313
  # @param temperature [Float] Sampling temperature (ignored for GPT-5 models)
250
314
  # @param model [String] The model to use (default: gpt-5-mini)
251
- # @param messages [Array<Hash>] Conversation history (for recursive tool calls)
315
+ # @param messages [Array<Hash>] Conversation history (for recursive tool calls). Note: this array is modified in place when tool calls occur.
252
316
  # @param prompt [String, nil] System prompt for the query
253
317
  # @return [String, nil] The model's response content
254
318
  def openai_query(access_token: nil,
@@ -257,7 +321,10 @@ module Spacy
257
321
  temperature: 0.7,
258
322
  model: "gpt-5-mini",
259
323
  messages: [],
260
- prompt: nil)
324
+ prompt: nil,
325
+ response_format: nil,
326
+ max_tool_call_depth: 5,
327
+ _tool_call_depth: 0)
261
328
  # Support both max_completion_tokens and max_tokens for backward compatibility
262
329
  max_completion_tokens ||= max_tokens || 1000
263
330
  if messages.empty?
@@ -267,38 +334,42 @@ module Spacy
267
334
  ]
268
335
  end
269
336
 
270
- access_token ||= ENV["OPENAI_API_KEY"]
271
- raise "Error: OPENAI_API_KEY is not set" unless access_token
337
+ client = openai_client(access_token)
272
338
 
273
339
  # Tool definition for token analysis (GPT-5 tools API format)
274
- tools = [
275
- {
276
- type: "function",
277
- function: {
278
- name: "get_tokens",
279
- description: "Tokenize given text and return a list of tokens with their attributes: surface, lemma, tag, pos (part-of-speech), dep (dependency), ent_type (entity type), and morphology",
280
- parameters: {
281
- type: "object",
282
- properties: {
283
- text: {
284
- type: "string",
285
- description: "text to be tokenized"
286
- }
287
- },
288
- required: ["text"]
340
+ tools = nil
341
+ tool_choice = nil
342
+ if _tool_call_depth < max_tool_call_depth
343
+ tools = [
344
+ {
345
+ type: "function",
346
+ function: {
347
+ name: "get_tokens",
348
+ description: "Tokenize given text and return a list of tokens with their attributes: surface, lemma, tag, pos (part-of-speech), dep (dependency), ent_type (entity type), and morphology",
349
+ parameters: {
350
+ type: "object",
351
+ properties: {
352
+ text: {
353
+ type: "string",
354
+ description: "text to be tokenized"
355
+ }
356
+ },
357
+ required: ["text"]
358
+ }
289
359
  }
290
360
  }
291
- }
292
- ]
361
+ ]
362
+ tool_choice = "auto"
363
+ end
293
364
 
294
- client = OpenAIClient.new(access_token: access_token)
295
365
  response = client.chat(
296
366
  model: model,
297
367
  messages: messages,
298
368
  max_completion_tokens: max_completion_tokens,
299
369
  temperature: temperature,
300
370
  tools: tools,
301
- tool_choice: "auto"
371
+ tool_choice: tool_choice,
372
+ response_format: response_format
302
373
  )
303
374
 
304
375
  message = response.dig("choices", 0, "message")
@@ -340,7 +411,10 @@ module Spacy
340
411
  temperature: temperature,
341
412
  model: model,
342
413
  messages: messages,
343
- prompt: prompt
414
+ prompt: prompt,
415
+ response_format: response_format,
416
+ max_tool_call_depth: max_tool_call_depth,
417
+ _tool_call_depth: _tool_call_depth + 1
344
418
  )
345
419
  else
346
420
  message["content"]
@@ -367,10 +441,7 @@ module Spacy
367
441
  { role: "user", content: @text }
368
442
  ]
369
443
 
370
- access_token ||= ENV["OPENAI_API_KEY"]
371
- raise "Error: OPENAI_API_KEY is not set" unless access_token
372
-
373
- client = OpenAIClient.new(access_token: access_token)
444
+ client = openai_client(access_token)
374
445
  response = client.chat(
375
446
  model: model,
376
447
  messages: messages,
@@ -387,26 +458,40 @@ module Spacy
387
458
  #
388
459
  # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
389
460
  # @param model [String] The embeddings model (default: text-embedding-3-small)
461
+ # @param dimensions [Integer, nil] The number of dimensions for the output embeddings (nil uses model default)
390
462
  # @return [Array<Float>, nil] The embedding vector
391
- def openai_embeddings(access_token: nil, model: "text-embedding-3-small")
392
- access_token ||= ENV["OPENAI_API_KEY"]
393
- raise "Error: OPENAI_API_KEY is not set" unless access_token
394
-
395
- client = OpenAIClient.new(access_token: access_token)
396
- response = client.embeddings(model: model, input: @text)
463
+ def openai_embeddings(access_token: nil, model: "text-embedding-3-small", dimensions: nil)
464
+ client = openai_client(access_token)
465
+ response = client.embeddings(model: model, input: @text, dimensions: dimensions)
397
466
  response.dig("data", 0, "embedding")
398
467
  rescue OpenAIClient::APIError => e
399
468
  puts "Error: OpenAI API call failed - #{e.message}"
400
469
  nil
401
470
  end
402
471
 
472
+ private
473
+
474
+ def openai_client(access_token)
475
+ access_token ||= ENV["OPENAI_API_KEY"]
476
+ raise "Error: OPENAI_API_KEY is not set" unless access_token
477
+
478
+ @openai_clients ||= {}
479
+ @openai_clients[access_token] ||= OpenAIClient.new(access_token: access_token)
480
+ end
481
+
482
+ public
483
+
403
484
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
404
485
  def method_missing(name, *args)
405
486
  @py_doc.send(name, *args)
406
487
  end
407
488
 
408
- def respond_to_missing?(sym, *args)
409
- sym ? true : super
489
+ def respond_to_missing?(sym, include_private = false)
490
+ Spacy.py_hasattr?(@py_doc, sym) || super
491
+ end
492
+
493
+ def instance_variables_to_inspect
494
+ [:@text]
410
495
  end
411
496
  end
412
497
 
@@ -420,8 +505,13 @@ module Spacy
420
505
 
421
506
  # Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
422
507
  # @param model [String] A language model installed in the system
423
- def initialize(model = "en_core_web_sm", max_retrial: MAX_RETRIAL, retrial: 0, timeout: 60)
508
+ def initialize(model = "en_core_web_sm", max_retrial: MAX_RETRIAL, timeout: 60)
509
+ unless model.to_s.match?(/\A[a-zA-Z0-9_\-\.\/]+\z/)
510
+ raise ArgumentError, "Invalid model name: #{model.inspect}"
511
+ end
512
+
424
513
  @spacy_nlp_id = "nlp_#{model.object_id}"
514
+ retrial = 0
425
515
  begin
426
516
  Timeout.timeout(timeout) do
427
517
  PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
@@ -468,17 +558,13 @@ module Spacy
468
558
  # @param id [Integer] a vocabulary id
469
559
  # @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
470
560
  def vocab_string_lookup(id)
471
- PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
561
+ PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{Integer(id)}]")
472
562
  end
473
563
 
474
564
  # A utility method to list pipeline components.
475
565
  # @return [Array<String>] An array of text strings representing pipeline components
476
566
  def pipe_names
477
- pipe_array = []
478
- PyCall::List.call(@py_nlp.pipe_names).each do |pipe|
479
- pipe_array << pipe
480
- end
481
- pipe_array
567
+ PyCall::List.call(@py_nlp.pipe_names).to_a
482
568
  end
483
569
 
484
570
  # A utility method to get a Python `Lexeme` object.
@@ -527,20 +613,62 @@ module Spacy
527
613
  # @param batch_size [Integer]
528
614
  # @return [Array<Doc>]
529
615
  def pipe(texts, disable: [], batch_size: 50)
530
- docs = []
531
- PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).each do |py_doc|
532
- docs << Doc.new(@py_nlp, py_doc: py_doc)
616
+ PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).map do |py_doc|
617
+ Doc.new(@py_nlp, py_doc: py_doc)
533
618
  end
534
- docs
535
619
  end
536
620
 
537
- # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism....
621
+ # Yields an {OpenAIHelper} instance for making OpenAI API calls within a block.
622
+ # The helper is configured once and reused for all calls within the block,
623
+ # making it efficient for batch processing with {#pipe}.
624
+ #
625
+ # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
626
+ # @param model [String] the default model for chat requests
627
+ # @param max_completion_tokens [Integer] default maximum tokens in responses
628
+ # @param temperature [Float] default sampling temperature
629
+ # @yield [OpenAIHelper] the helper instance for making API calls
630
+ # @return [Object] the block's return value
631
+ # @example Batch processing with pipe
632
+ # nlp.with_openai(model: "gpt-5-mini") do |ai|
633
+ # nlp.pipe(texts).map do |doc|
634
+ # ai.chat(system: "Analyze.", user: doc.linguistic_summary)
635
+ # end
636
+ # end
637
+ def with_openai(access_token: nil, model: "gpt-5-mini",
638
+ max_completion_tokens: 1000, temperature: 0.7)
639
+ helper = OpenAIHelper.new(
640
+ access_token: access_token,
641
+ model: model,
642
+ max_completion_tokens: max_completion_tokens,
643
+ temperature: temperature
644
+ )
645
+ yield helper
646
+ end
647
+
648
+ # Executes a block within spaCy's memory zone for efficient memory management.
649
+ # Requires spaCy >= 3.8.
650
+ # @yield the block to execute within the memory zone
651
+ # @raise [NotImplementedError] if spaCy version does not support memory zones
652
+ def memory_zone(&block)
653
+ major, minor = SpacyVersion.split(".").map(&:to_i)
654
+ unless major > 3 || (major == 3 && minor >= 8)
655
+ raise NotImplementedError, "memory_zone requires spaCy >= 3.8 (current: #{SpacyVersion})"
656
+ end
657
+
658
+ PyCall.with(@py_nlp.memory_zone, &block)
659
+ end
660
+
661
+ # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
538
662
  def method_missing(name, *args)
539
663
  @py_nlp.send(name, *args)
540
664
  end
541
665
 
542
- def respond_to_missing?(sym, *args)
543
- sym ? true : super
666
+ def respond_to_missing?(sym, include_private = false)
667
+ Spacy.py_hasattr?(@py_nlp, sym) || super
668
+ end
669
+
670
+ def instance_variables_to_inspect
671
+ [:@spacy_nlp_id]
544
672
  end
545
673
  end
546
674
 
@@ -566,19 +694,9 @@ module Spacy
566
694
  # @param doc [Doc] an {Doc} instance
567
695
  # @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
568
696
  def match(doc)
569
- str_results = @py_matcher.call(doc.py_doc).to_s
570
- s = StringScanner.new(str_results[1..-2])
571
- results = []
572
- while s.scan_until(/(\d+), (\d+), (\d+)/)
573
- next unless s.matched
574
-
575
- triple = s.matched.split(", ")
576
- match_id = triple[0].to_i
577
- start_index = triple[1].to_i
578
- end_index = triple[2].to_i - 1
579
- results << { match_id: match_id, start_index: start_index, end_index: end_index }
697
+ PyCall::List.call(@py_matcher.call(doc.py_doc)).map do |py_match|
698
+ { match_id: py_match[0].to_i, start_index: py_match[1].to_i, end_index: py_match[2].to_i - 1 }
580
699
  end
581
- results
582
700
  end
583
701
  end
584
702
 
@@ -621,12 +739,7 @@ module Spacy
621
739
  # matches.each { |span| puts "#{span.text} => #{span.label}" }
622
740
  def match(doc)
623
741
  py_matches = @py_matcher.call(doc.py_doc, as_spans: true)
624
- results = []
625
- PyCall::List.call(py_matches).each do |py_span|
626
- span = Span.new(doc, py_span: py_span)
627
- results << span
628
- end
629
- results
742
+ PyCall::List.call(py_matches).map { |py_span| Span.new(doc, py_span: py_span) }
630
743
  end
631
744
  end
632
745
 
@@ -638,6 +751,9 @@ module Spacy
638
751
  # @return [Doc] the document to which the span belongs
639
752
  attr_reader :doc
640
753
 
754
+ # @return [String] a text string of the span
755
+ attr_reader :text
756
+
641
757
  include Enumerable
642
758
 
643
759
  alias length count
@@ -653,17 +769,14 @@ module Spacy
653
769
  # @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
654
770
  def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
655
771
  @doc = doc
656
- @py_span = py_span || @py_span = PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
772
+ @py_span = py_span || PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
773
+ @text = @py_span.text
657
774
  end
658
775
 
659
776
  # Returns an array of tokens contained in the span.
660
777
  # @return [Array<Token>]
661
778
  def tokens
662
- results = []
663
- PyCall::List.call(@py_span).each do |py_token|
664
- results << Token.new(py_token)
665
- end
666
- results
779
+ PyCall::List.call(@py_span).map { |py_token| Token.new(py_token) }
667
780
  end
668
781
 
669
782
  # Iterates over the elements in the span yielding a token instance each time.
@@ -676,12 +789,9 @@ module Spacy
676
789
  # Returns an array of spans of noun chunks.
677
790
  # @return [Array<Span>]
678
791
  def noun_chunks
679
- chunk_array = []
680
- py_chunks = PyCall::List.call(@py_span.noun_chunks)
681
- py_chunks.each do |py_span|
682
- chunk_array << Span.new(@doc, py_span: py_span)
792
+ PyCall::List.call(@py_span.noun_chunks).map do |py_span|
793
+ Span.new(@doc, py_span: py_span)
683
794
  end
684
- chunk_array
685
795
  end
686
796
 
687
797
  # Returns the head token
@@ -693,22 +803,17 @@ module Spacy
693
803
  # Returns an array of spans that represents sentences.
694
804
  # @return [Array<Span>]
695
805
  def sents
696
- sentence_array = []
697
- py_sentences = PyCall::List.call(@py_span.sents)
698
- py_sentences.each do |py_span|
699
- sentence_array << Span.new(@doc, py_span: py_span)
806
+ PyCall::List.call(@py_span.sents).map do |py_span|
807
+ Span.new(@doc, py_span: py_span)
700
808
  end
701
- sentence_array
702
809
  end
703
810
 
704
811
  # Returns an array of spans that represents named entities.
705
812
  # @return [Array<Span>]
706
813
  def ents
707
- ent_array = []
708
- PyCall::List.call(@py_span.ents).each do |py_span|
709
- ent_array << Span.new(@doc, py_span: py_span)
814
+ PyCall::List.call(@py_span.ents).map do |py_span|
815
+ Span.new(@doc, py_span: py_span)
710
816
  end
711
- ent_array
712
817
  end
713
818
 
714
819
  # Returns a span that represents the sentence that the given span is part of.
@@ -745,41 +850,25 @@ module Spacy
745
850
  # Returns tokens conjoined to the root of the span.
746
851
  # @return [Array<Token>] an array of tokens
747
852
  def conjuncts
748
- conjunct_array = []
749
- PyCall::List.call(@py_span.conjuncts).each do |py_conjunct|
750
- conjunct_array << Token.new(py_conjunct)
751
- end
752
- conjunct_array
853
+ PyCall::List.call(@py_span.conjuncts).map { |py_conjunct| Token.new(py_conjunct) }
753
854
  end
754
855
 
755
856
  # Returns tokens that are to the left of the span, whose heads are within the span.
756
857
  # @return [Array<Token>] an array of tokens
757
858
  def lefts
758
- left_array = []
759
- PyCall::List.call(@py_span.lefts).each do |py_left|
760
- left_array << Token.new(py_left)
761
- end
762
- left_array
859
+ PyCall::List.call(@py_span.lefts).map { |py_left| Token.new(py_left) }
763
860
  end
764
861
 
765
862
  # Returns tokens that are to the right of the span, whose heads are within the span.
766
863
  # @return [Array<Token>] an array of tokens
767
864
  def rights
768
- right_array = []
769
- PyCall::List.call(@py_span.rights).each do |py_right|
770
- right_array << Token.new(py_right)
771
- end
772
- right_array
865
+ PyCall::List.call(@py_span.rights).map { |py_right| Token.new(py_right) }
773
866
  end
774
867
 
775
868
  # Returns tokens that are within the span and tokens that descend from them.
776
869
  # @return [Array<Token>] an array of tokens
777
870
  def subtree
778
- subtree_array = []
779
- PyCall::List.call(@py_span.subtree).each do |py_subtree|
780
- subtree_array << Token.new(py_subtree)
781
- end
782
- subtree_array
871
+ PyCall::List.call(@py_span.subtree).map { |py_subtree| Token.new(py_subtree) }
783
872
  end
784
873
 
785
874
  # Returns the label
@@ -788,13 +877,23 @@ module Spacy
788
877
  @py_span.label_
789
878
  end
790
879
 
880
+ # String representation of the span.
881
+ # @return [String]
882
+ def to_s
883
+ @text
884
+ end
885
+
791
886
  # Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
792
887
  def method_missing(name, *args)
793
888
  @py_span.send(name, *args)
794
889
  end
795
890
 
796
- def respond_to_missing?(sym, *args)
797
- sym ? true : super
891
+ def respond_to_missing?(sym, include_private = false)
892
+ Spacy.py_hasattr?(@py_span, sym) || super
893
+ end
894
+
895
+ def instance_variables_to_inspect
896
+ [:@text]
798
897
  end
799
898
  end
800
899
 
@@ -814,6 +913,12 @@ module Spacy
814
913
  @text = @py_token.text
815
914
  end
816
915
 
916
+ # Returns the character offset of the token within the parent document.
917
+ # @return [Integer]
918
+ def idx
919
+ @py_token.idx
920
+ end
921
+
817
922
  # Returns the head token
818
923
  # @return [Token]
819
924
  def head
@@ -823,51 +928,31 @@ module Spacy
823
928
  # Returns the token in question and the tokens that descend from it.
824
929
  # @return [Array<Token>] an array of tokens
825
930
  def subtree
826
- descendant_array = []
827
- PyCall::List.call(@py_token.subtree).each do |descendant|
828
- descendant_array << Token.new(descendant)
829
- end
830
- descendant_array
931
+ PyCall::List.call(@py_token.subtree).map { |descendant| Token.new(descendant) }
831
932
  end
832
933
 
833
934
  # Returns the token's ancestors.
834
935
  # @return [Array<Token>] an array of tokens
835
936
  def ancestors
836
- ancestor_array = []
837
- PyCall::List.call(@py_token.ancestors).each do |ancestor|
838
- ancestor_array << Token.new(ancestor)
839
- end
840
- ancestor_array
937
+ PyCall::List.call(@py_token.ancestors).map { |ancestor| Token.new(ancestor) }
841
938
  end
842
939
 
843
940
  # Returns a sequence of the token's immediate syntactic children.
844
941
  # @return [Array<Token>] an array of tokens
845
942
  def children
846
- child_array = []
847
- PyCall::List.call(@py_token.children).each do |child|
848
- child_array << Token.new(child)
849
- end
850
- child_array
943
+ PyCall::List.call(@py_token.children).map { |child| Token.new(child) }
851
944
  end
852
945
 
853
946
  # The leftward immediate children of the word in the syntactic dependency parse.
854
947
  # @return [Array<Token>] an array of tokens
855
948
  def lefts
856
- token_array = []
857
- PyCall::List.call(@py_token.lefts).each do |token|
858
- token_array << Token.new(token)
859
- end
860
- token_array
949
+ PyCall::List.call(@py_token.lefts).map { |token| Token.new(token) }
861
950
  end
862
951
 
863
952
  # The rightward immediate children of the word in the syntactic dependency parse.
864
953
  # @return [Array<Token>] an array of tokens
865
954
  def rights
866
- token_array = []
867
- PyCall::List.call(@py_token.rights).each do |token|
868
- token_array << Token.new(token)
869
- end
870
- token_array
955
+ PyCall::List.call(@py_token.rights).map { |token| Token.new(token) }
871
956
  end
872
957
 
873
958
  # String representation of the token.
@@ -959,8 +1044,12 @@ module Spacy
959
1044
  @py_token.send(name, *args)
960
1045
  end
961
1046
 
962
- def respond_to_missing?(sym, *args)
963
- sym ? true : super
1047
+ def respond_to_missing?(sym, include_private = false)
1048
+ Spacy.py_hasattr?(@py_token, sym) || super
1049
+ end
1050
+
1051
+ def instance_variables_to_inspect
1052
+ [:@text]
964
1053
  end
965
1054
  end
966
1055
 
@@ -1034,8 +1123,12 @@ module Spacy
1034
1123
  @py_lexeme.send(name, *args)
1035
1124
  end
1036
1125
 
1037
- def respond_to_missing?(sym, *args)
1038
- sym ? true : super
1126
+ def respond_to_missing?(sym, include_private = false)
1127
+ Spacy.py_hasattr?(@py_lexeme, sym) || super
1128
+ end
1129
+
1130
+ def instance_variables_to_inspect
1131
+ [:@text]
1039
1132
  end
1040
1133
  end
1041
1134
  end
data/ruby-spacy.gemspec CHANGED
@@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.add_development_dependency "rspec"
32
32
  spec.add_development_dependency "solargraph"
33
33
 
34
+ spec.add_dependency "base64" # Required for Ruby 3.4+ (moved from default to bundled gem)
34
35
  spec.add_dependency "fiddle" # Required for Ruby 4.0+ (moved from default to bundled gem)
35
36
  spec.add_dependency "numpy", "~> 0.4.0"
36
37
  spec.add_dependency "pycall", "~> 1.5.1"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-spacy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
@@ -65,6 +65,20 @@ dependencies:
65
65
  - - ">="
66
66
  - !ruby/object:Gem::Version
67
67
  version: '0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: base64
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ type: :runtime
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
68
82
  - !ruby/object:Gem::Dependency
69
83
  name: fiddle
70
84
  requirement: !ruby/object:Gem::Requirement
@@ -133,6 +147,7 @@ executables: []
133
147
  extensions: []
134
148
  extra_rdoc_files: []
135
149
  files:
150
+ - ".github/FUNDING.yml"
136
151
  - ".gitignore"
137
152
  - CHANGELOG.md
138
153
  - Gemfile
@@ -203,6 +218,7 @@ files:
203
218
  - examples/rule_based_matching/matcher.rb
204
219
  - lib/ruby-spacy.rb
205
220
  - lib/ruby-spacy/openai_client.rb
221
+ - lib/ruby-spacy/openai_helper.rb
206
222
  - lib/ruby-spacy/version.rb
207
223
  - ruby-spacy.gemspec
208
224
  homepage: https://github.com/yohasebe/ruby-spacy