ruby-spacy 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/FUNDING.yml +6 -0
- data/lib/ruby-spacy/openai_client.rb +57 -40
- data/lib/ruby-spacy/openai_helper.rb +91 -0
- data/lib/ruby-spacy/version.rb +1 -1
- data/lib/ruby-spacy.rb +282 -189
- data/ruby-spacy.gemspec +1 -0
- metadata +17 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6185c586feb32fa51efcd4349398cd4ca9541280a5cc8a1b6a73eb93a987d4ac
|
|
4
|
+
data.tar.gz: a146a9c40e2d5293e2401cb16b8ac6866cbb577e11a10d9657c406f933e7a3aa
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bf558d4e9a7a6765fd7d088bbf8324a6ee0e4f4186962551d71e5a991e0aefd1e51a186f19c2824fabcc6afd0c83960771f082237febece52c2a522ccb39a5cf
|
|
7
|
+
data.tar.gz: 3a64559cf8c169d1ac1ecdef526d26e5776989b9cc203a8ed30e0dd5d87ff62a4d1b741aff30c8cb49e5ffb716c6068f9af3a12d50d0d4de8ad6f22ebe80ea0d
|
data/.github/FUNDING.yml
ADDED
|
@@ -13,7 +13,7 @@ module Spacy
|
|
|
13
13
|
API_ENDPOINT = "https://api.openai.com/v1"
|
|
14
14
|
DEFAULT_TIMEOUT = 120
|
|
15
15
|
MAX_RETRIES = 3
|
|
16
|
-
|
|
16
|
+
BASE_RETRY_DELAY = 1
|
|
17
17
|
|
|
18
18
|
class APIError < StandardError
|
|
19
19
|
attr_reader :status_code, :response_body
|
|
@@ -31,24 +31,25 @@ module Spacy
|
|
|
31
31
|
end
|
|
32
32
|
|
|
33
33
|
# Sends a chat completion request with optional tools support.
|
|
34
|
-
# Note: GPT-5 series models do not support the temperature parameter.
|
|
34
|
+
# Note: GPT-5 series and o-series models do not support the temperature parameter.
|
|
35
35
|
#
|
|
36
36
|
# @param model [String] The model to use (e.g., "gpt-5-mini")
|
|
37
37
|
# @param messages [Array<Hash>] The conversation messages
|
|
38
38
|
# @param max_completion_tokens [Integer] Maximum tokens in the response
|
|
39
|
-
# @param temperature [Float, nil] Sampling temperature (ignored for
|
|
39
|
+
# @param temperature [Float, nil] Sampling temperature (ignored for models that don't support it)
|
|
40
40
|
# @param tools [Array<Hash>, nil] Tool definitions for function calling
|
|
41
41
|
# @param tool_choice [String, Hash, nil] Tool selection strategy
|
|
42
|
+
# @param response_format [Hash, nil] Response format specification (e.g., { type: "json_object" })
|
|
42
43
|
# @return [Hash] The API response
|
|
43
|
-
def chat(model:, messages:, max_completion_tokens: 1000, temperature: nil, tools: nil, tool_choice: nil)
|
|
44
|
+
def chat(model:, messages:, max_completion_tokens: 1000, temperature: nil, tools: nil, tool_choice: nil, response_format: nil)
|
|
44
45
|
body = {
|
|
45
46
|
model: model,
|
|
46
47
|
messages: messages,
|
|
47
48
|
max_completion_tokens: max_completion_tokens
|
|
48
49
|
}
|
|
49
50
|
|
|
50
|
-
# GPT-5 series models do not support temperature parameter
|
|
51
|
-
unless
|
|
51
|
+
# GPT-5 series and o-series models do not support temperature parameter
|
|
52
|
+
unless temperature_unsupported?(model)
|
|
52
53
|
body[:temperature] = temperature || 0.7
|
|
53
54
|
end
|
|
54
55
|
|
|
@@ -57,25 +58,32 @@ module Spacy
|
|
|
57
58
|
body[:tool_choice] = tool_choice || "auto"
|
|
58
59
|
end
|
|
59
60
|
|
|
61
|
+
body[:response_format] = response_format if response_format
|
|
62
|
+
|
|
60
63
|
post("/chat/completions", body)
|
|
61
64
|
end
|
|
62
65
|
|
|
63
|
-
# Checks if the model
|
|
64
|
-
# GPT-5
|
|
65
|
-
|
|
66
|
-
|
|
66
|
+
# Checks if the model does not support the temperature parameter.
|
|
67
|
+
# This includes GPT-5 series and o-series (o1, o3, o4-mini, etc.) models.
|
|
68
|
+
# @param model [String] The model name
|
|
69
|
+
# @return [Boolean]
|
|
70
|
+
def temperature_unsupported?(model)
|
|
71
|
+
name = model.to_s
|
|
72
|
+
name.start_with?("gpt-5") || name.match?(/\Ao\d/)
|
|
67
73
|
end
|
|
68
74
|
|
|
69
75
|
# Sends an embeddings request.
|
|
70
76
|
#
|
|
71
77
|
# @param model [String] The embeddings model (e.g., "text-embedding-3-small")
|
|
72
78
|
# @param input [String] The text to embed
|
|
79
|
+
# @param dimensions [Integer, nil] The number of dimensions for the output embeddings
|
|
73
80
|
# @return [Hash] The API response
|
|
74
|
-
def embeddings(model:, input:)
|
|
81
|
+
def embeddings(model:, input:, dimensions: nil)
|
|
75
82
|
body = {
|
|
76
83
|
model: model,
|
|
77
84
|
input: input
|
|
78
85
|
}
|
|
86
|
+
body[:dimensions] = dimensions if dimensions
|
|
79
87
|
|
|
80
88
|
post("/embeddings", body)
|
|
81
89
|
end
|
|
@@ -94,36 +102,45 @@ module Spacy
|
|
|
94
102
|
uri = URI.parse("#{API_ENDPOINT}#{path}")
|
|
95
103
|
retries = 0
|
|
96
104
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
105
|
+
loop do
|
|
106
|
+
begin
|
|
107
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
|
108
|
+
http.use_ssl = true
|
|
109
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_PEER
|
|
110
|
+
http.cert_store = default_cert_store
|
|
111
|
+
http.open_timeout = @timeout
|
|
112
|
+
http.read_timeout = @timeout
|
|
113
|
+
|
|
114
|
+
request = Net::HTTP::Post.new(uri.path)
|
|
115
|
+
request["Content-Type"] = "application/json"
|
|
116
|
+
request["Authorization"] = "Bearer #{@access_token}"
|
|
117
|
+
request.body = body.to_json
|
|
118
|
+
|
|
119
|
+
response = http.request(request)
|
|
120
|
+
|
|
121
|
+
# Handle 429 rate limiting before general response handling
|
|
122
|
+
if response.code.to_i == 429
|
|
123
|
+
retries += 1
|
|
124
|
+
if retries <= MAX_RETRIES
|
|
125
|
+
retry_after = response["Retry-After"]&.to_f
|
|
126
|
+
delay = retry_after || (BASE_RETRY_DELAY * (2**(retries - 1)) + rand * 0.5)
|
|
127
|
+
sleep delay
|
|
128
|
+
next
|
|
129
|
+
end
|
|
130
|
+
raise APIError.new("Rate limited after #{MAX_RETRIES} retries",
|
|
131
|
+
status_code: 429, response_body: response.body)
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
return handle_response(response)
|
|
135
|
+
rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED, Errno::ECONNRESET, SocketError => e
|
|
136
|
+
retries += 1
|
|
137
|
+
if retries <= MAX_RETRIES
|
|
138
|
+
delay = BASE_RETRY_DELAY * (2**(retries - 1)) + rand * 0.5
|
|
139
|
+
sleep delay
|
|
140
|
+
next
|
|
141
|
+
end
|
|
142
|
+
raise APIError.new("Network error after #{MAX_RETRIES} retries: #{e.message}")
|
|
125
143
|
end
|
|
126
|
-
raise APIError.new("Network error after #{MAX_RETRIES} retries: #{e.message}")
|
|
127
144
|
end
|
|
128
145
|
end
|
|
129
146
|
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Spacy
|
|
4
|
+
# A helper class for OpenAI API interactions, designed to work with spaCy's
|
|
5
|
+
# linguistic analysis via the block-based {Language#with_openai} API.
|
|
6
|
+
#
|
|
7
|
+
# @example Basic usage with linguistic_summary
|
|
8
|
+
# nlp = Spacy::Language.new("en_core_web_sm")
|
|
9
|
+
# nlp.with_openai(model: "gpt-5-mini") do |ai|
|
|
10
|
+
# doc = nlp.read("Apple Inc. was founded by Steve Jobs.")
|
|
11
|
+
# ai.chat(system: "Analyze the linguistic data.", user: doc.linguistic_summary)
|
|
12
|
+
# end
|
|
13
|
+
class OpenAIHelper
|
|
14
|
+
# @return [String] the default model for chat requests
|
|
15
|
+
attr_reader :model
|
|
16
|
+
|
|
17
|
+
# Creates a new OpenAIHelper instance.
|
|
18
|
+
# @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
|
|
19
|
+
# @param model [String] the default model for chat requests
|
|
20
|
+
# @param max_completion_tokens [Integer] default maximum tokens in responses
|
|
21
|
+
# @param temperature [Float] default sampling temperature
|
|
22
|
+
def initialize(access_token: nil, model: "gpt-5-mini",
|
|
23
|
+
max_completion_tokens: 1000, temperature: 0.7)
|
|
24
|
+
@access_token = access_token || ENV["OPENAI_API_KEY"]
|
|
25
|
+
raise "Error: OPENAI_API_KEY is not set" unless @access_token
|
|
26
|
+
|
|
27
|
+
@model = model
|
|
28
|
+
@default_max_completion_tokens = max_completion_tokens
|
|
29
|
+
@default_temperature = temperature
|
|
30
|
+
@client = OpenAIClient.new(access_token: @access_token)
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
# Sends a chat completion request to OpenAI.
|
|
34
|
+
#
|
|
35
|
+
# Provides convenient `system:` and `user:` keyword arguments as shortcuts
|
|
36
|
+
# for building simple message arrays. For more complex conversations, pass
|
|
37
|
+
# a full `messages:` array directly.
|
|
38
|
+
#
|
|
39
|
+
# @param system [String, nil] system message content (shortcut)
|
|
40
|
+
# @param user [String, nil] user message content (shortcut)
|
|
41
|
+
# @param messages [Array<Hash>, nil] full message array (overrides system:/user:)
|
|
42
|
+
# @param model [String, nil] model override (defaults to instance model)
|
|
43
|
+
# @param max_completion_tokens [Integer, nil] token limit override
|
|
44
|
+
# @param temperature [Float, nil] temperature override
|
|
45
|
+
# @param response_format [Hash, nil] response format (e.g., { type: "json_object" })
|
|
46
|
+
# @param raw [Boolean] if true, returns the full API response Hash instead of text
|
|
47
|
+
# @return [String, Hash, nil] the response text, full response Hash (if raw:), or nil on error
|
|
48
|
+
def chat(system: nil, user: nil, messages: nil,
|
|
49
|
+
model: nil, max_completion_tokens: nil,
|
|
50
|
+
temperature: nil, response_format: nil, raw: false)
|
|
51
|
+
msgs = messages || build_messages(system: system, user: user)
|
|
52
|
+
raise ArgumentError, "No messages provided. Use system:/user: or messages:" if msgs.empty?
|
|
53
|
+
|
|
54
|
+
response = @client.chat(
|
|
55
|
+
model: model || @model,
|
|
56
|
+
messages: msgs,
|
|
57
|
+
max_completion_tokens: max_completion_tokens || @default_max_completion_tokens,
|
|
58
|
+
temperature: temperature || @default_temperature,
|
|
59
|
+
response_format: response_format
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
raw ? response : response.dig("choices", 0, "message", "content")
|
|
63
|
+
rescue OpenAIClient::APIError => e
|
|
64
|
+
puts "Error: OpenAI API call failed - #{e.message}"
|
|
65
|
+
nil
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
# Generates text embeddings using OpenAI's embeddings API.
|
|
69
|
+
#
|
|
70
|
+
# @param text [String] the text to embed
|
|
71
|
+
# @param model [String] the embeddings model
|
|
72
|
+
# @param dimensions [Integer, nil] number of dimensions (nil uses model default)
|
|
73
|
+
# @return [Array<Float>, nil] the embedding vector, or nil on error
|
|
74
|
+
def embeddings(text, model: "text-embedding-3-small", dimensions: nil)
|
|
75
|
+
response = @client.embeddings(model: model, input: text, dimensions: dimensions)
|
|
76
|
+
response.dig("data", 0, "embedding")
|
|
77
|
+
rescue OpenAIClient::APIError => e
|
|
78
|
+
puts "Error: OpenAI API call failed - #{e.message}"
|
|
79
|
+
nil
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
private
|
|
83
|
+
|
|
84
|
+
def build_messages(system: nil, user: nil)
|
|
85
|
+
msgs = []
|
|
86
|
+
msgs << { role: "system", content: system } if system
|
|
87
|
+
msgs << { role: "user", content: user } if user
|
|
88
|
+
msgs
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
end
|
data/lib/ruby-spacy/version.rb
CHANGED
data/lib/ruby-spacy.rb
CHANGED
|
@@ -2,27 +2,28 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative "ruby-spacy/version"
|
|
4
4
|
require_relative "ruby-spacy/openai_client"
|
|
5
|
+
require_relative "ruby-spacy/openai_helper"
|
|
5
6
|
require "numpy"
|
|
6
7
|
require "pycall"
|
|
7
|
-
require "strscan"
|
|
8
8
|
require "timeout"
|
|
9
9
|
require "json"
|
|
10
|
-
|
|
11
|
-
begin
|
|
12
|
-
PyCall.init
|
|
13
|
-
_spacy = PyCall.import_module("spacy")
|
|
14
|
-
rescue PyCall::PyError => e
|
|
15
|
-
puts "Failed to initialize PyCall or import spacy: #{e.message}"
|
|
16
|
-
puts "Python traceback:"
|
|
17
|
-
puts e.traceback
|
|
18
|
-
raise
|
|
19
|
-
end
|
|
10
|
+
require "base64"
|
|
20
11
|
|
|
21
12
|
# This module covers the areas of spaCy functionality for _using_ many varieties of its language models, not for _building_ ones.
|
|
22
13
|
module Spacy
|
|
23
14
|
MAX_RETRIAL = 5
|
|
24
15
|
|
|
25
|
-
|
|
16
|
+
begin
|
|
17
|
+
PyCall.init
|
|
18
|
+
spacy = PyCall.import_module("spacy")
|
|
19
|
+
rescue PyCall::PyError => e
|
|
20
|
+
puts "Failed to initialize PyCall or import spacy: #{e.message}"
|
|
21
|
+
puts "Python traceback:"
|
|
22
|
+
puts e.traceback
|
|
23
|
+
raise
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
Builtins = PyCall.import_module("builtins")
|
|
26
27
|
SpacyVersion = spacy.__version__
|
|
27
28
|
|
|
28
29
|
# Python `Language` class
|
|
@@ -53,6 +54,17 @@ module Spacy
|
|
|
53
54
|
PyCall::List.call(py_generator)
|
|
54
55
|
end
|
|
55
56
|
|
|
57
|
+
# Checks if a Python object has a given attribute using builtins.hasattr.
|
|
58
|
+
# Falls back to true if the check itself fails (e.g. due to PyCall issues).
|
|
59
|
+
# @param py_obj [Object] a Python object
|
|
60
|
+
# @param attr [String, Symbol] the attribute name to check
|
|
61
|
+
# @return [Boolean]
|
|
62
|
+
def self.py_hasattr?(py_obj, attr)
|
|
63
|
+
Builtins.hasattr(py_obj, attr.to_s)
|
|
64
|
+
rescue StandardError
|
|
65
|
+
true
|
|
66
|
+
end
|
|
67
|
+
|
|
56
68
|
# See also spaCy Python API document for [`Doc`](https://spacy.io/api/doc).
|
|
57
69
|
class Doc
|
|
58
70
|
# @return [Object] a Python `Language` instance accessible via `PyCall`
|
|
@@ -76,17 +88,19 @@ module Spacy
|
|
|
76
88
|
# @param nlp [Language] an instance of {Language} class
|
|
77
89
|
# @param py_doc [Object] an instance of Python `Doc` class
|
|
78
90
|
# @param text [String] the text string to be analyzed
|
|
79
|
-
def initialize(nlp, py_doc: nil, text: nil, max_retrial: MAX_RETRIAL
|
|
80
|
-
retrial: 0)
|
|
91
|
+
def initialize(nlp, py_doc: nil, text: nil, max_retrial: MAX_RETRIAL)
|
|
81
92
|
@py_nlp = nlp
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
93
|
+
retrial = 0
|
|
94
|
+
begin
|
|
95
|
+
@py_doc = py_doc || nlp.call(text)
|
|
96
|
+
@text = @py_doc.text
|
|
97
|
+
rescue StandardError
|
|
98
|
+
retrial += 1
|
|
99
|
+
raise "Error: Failed to construct a Doc object" unless retrial <= max_retrial
|
|
87
100
|
|
|
88
|
-
|
|
89
|
-
|
|
101
|
+
sleep 0.5
|
|
102
|
+
retry
|
|
103
|
+
end
|
|
90
104
|
end
|
|
91
105
|
|
|
92
106
|
# Retokenizes the text merging a span into a single token.
|
|
@@ -120,11 +134,7 @@ module Spacy
|
|
|
120
134
|
# Returns an array of tokens contained in the doc.
|
|
121
135
|
# @return [Array<Token>]
|
|
122
136
|
def tokens
|
|
123
|
-
|
|
124
|
-
PyCall::List.call(@py_doc).each do |py_token|
|
|
125
|
-
results << Token.new(py_token)
|
|
126
|
-
end
|
|
127
|
-
results
|
|
137
|
+
PyCall::List.call(@py_doc).map { |py_token| Token.new(py_token) }
|
|
128
138
|
end
|
|
129
139
|
|
|
130
140
|
# Iterates over the elements in the doc yielding a token instance each time.
|
|
@@ -140,54 +150,50 @@ module Spacy
|
|
|
140
150
|
# @param optional_size [Integer] an integer representing the size of the span
|
|
141
151
|
# @return [Span]
|
|
142
152
|
def span(range_or_start, optional_size = nil)
|
|
153
|
+
doc_len = PyCall.len(@py_doc)
|
|
154
|
+
|
|
143
155
|
if optional_size
|
|
144
156
|
start_index = range_or_start
|
|
145
|
-
|
|
157
|
+
start_index += doc_len if start_index < 0
|
|
158
|
+
end_index = start_index + optional_size - 1
|
|
146
159
|
else
|
|
147
|
-
start_index = range_or_start.first
|
|
148
160
|
range = range_or_start
|
|
149
|
-
|
|
161
|
+
start_index = range.first
|
|
162
|
+
start_index += doc_len if start_index < 0
|
|
163
|
+
end_val = range.end
|
|
164
|
+
if end_val.nil?
|
|
165
|
+
end_index = doc_len - 1
|
|
166
|
+
else
|
|
167
|
+
end_val += doc_len if end_val < 0
|
|
168
|
+
end_index = range.exclude_end? ? end_val - 1 : end_val
|
|
169
|
+
end
|
|
150
170
|
end
|
|
151
171
|
|
|
152
|
-
end_index = start_index + temp.size - 1
|
|
153
|
-
|
|
154
172
|
Span.new(self, start_index: start_index, end_index: end_index)
|
|
155
173
|
end
|
|
156
174
|
|
|
157
175
|
# Returns an array of spans representing noun chunks.
|
|
158
176
|
# @return [Array<Span>]
|
|
159
177
|
def noun_chunks
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
py_chunks.each do |py_chunk|
|
|
163
|
-
chunk_array << Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
|
|
178
|
+
PyCall::List.call(@py_doc.noun_chunks).map do |py_chunk|
|
|
179
|
+
Span.new(self, start_index: py_chunk.start, end_index: py_chunk.end - 1)
|
|
164
180
|
end
|
|
165
|
-
chunk_array
|
|
166
181
|
end
|
|
167
182
|
|
|
168
183
|
# Returns an array of spans each representing a sentence.
|
|
169
184
|
# @return [Array<Span>]
|
|
170
185
|
def sents
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
py_sentences.each do |py_sent|
|
|
174
|
-
sentence_array << Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
|
|
186
|
+
PyCall::List.call(@py_doc.sents).map do |py_sent|
|
|
187
|
+
Span.new(self, start_index: py_sent.start, end_index: py_sent.end - 1)
|
|
175
188
|
end
|
|
176
|
-
sentence_array
|
|
177
189
|
end
|
|
178
190
|
|
|
179
191
|
# Returns an array of spans each representing a named entity.
|
|
180
192
|
# @return [Array<Span>]
|
|
181
193
|
def ents
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
PyCall::List.call(@py_doc.ents).each do |ent|
|
|
185
|
-
ent.define_singleton_method :label do
|
|
186
|
-
label_
|
|
187
|
-
end
|
|
188
|
-
ent_array << ent
|
|
194
|
+
PyCall::List.call(@py_doc.ents).map do |py_span|
|
|
195
|
+
Span.new(self, py_span: py_span)
|
|
189
196
|
end
|
|
190
|
-
ent_array
|
|
191
197
|
end
|
|
192
198
|
|
|
193
199
|
# Returns a span if given a range object; or returns a token if given an integer representing a position in the doc.
|
|
@@ -227,7 +233,8 @@ module Spacy
|
|
|
227
233
|
# bytes = File.binread("doc.bin")
|
|
228
234
|
# doc = Spacy::Doc.from_bytes(nlp, bytes)
|
|
229
235
|
def self.from_bytes(nlp, byte_string)
|
|
230
|
-
|
|
236
|
+
b64 = Base64.strict_encode64(byte_string)
|
|
237
|
+
py_bytes = PyCall.eval("__import__('base64').b64decode('#{b64}')")
|
|
231
238
|
py_doc = nlp.py_nlp.call("").from_bytes(py_bytes)
|
|
232
239
|
new(nlp.py_nlp, py_doc: py_doc)
|
|
233
240
|
end
|
|
@@ -240,6 +247,63 @@ module Spacy
|
|
|
240
247
|
PyDisplacy.render(py_doc, style: style, options: { compact: compact }, jupyter: false)
|
|
241
248
|
end
|
|
242
249
|
|
|
250
|
+
# Generates a JSON string summarizing the linguistic analysis of the document.
|
|
251
|
+
# Designed to be passed as context to an LLM (e.g., via {OpenAIHelper#chat}).
|
|
252
|
+
#
|
|
253
|
+
# @param sections [Array<Symbol>] which sections to include
|
|
254
|
+
# (:text, :tokens, :entities, :noun_chunks, :sentences)
|
|
255
|
+
# @param token_attributes [Array<Symbol>] which token attributes to include
|
|
256
|
+
# (:text, :lemma, :pos, :tag, :dep, :head, :ent_type, :morphology)
|
|
257
|
+
# @return [String] a JSON string of the linguistic summary
|
|
258
|
+
def linguistic_summary(sections: [:text, :tokens, :entities, :noun_chunks],
|
|
259
|
+
token_attributes: [:text, :lemma, :pos, :dep, :head])
|
|
260
|
+
result = {}
|
|
261
|
+
|
|
262
|
+
sections.each do |section|
|
|
263
|
+
case section
|
|
264
|
+
when :text
|
|
265
|
+
result[:text] = @text
|
|
266
|
+
when :tokens
|
|
267
|
+
result[:tokens] = tokens.map do |token|
|
|
268
|
+
token_hash = {}
|
|
269
|
+
token_attributes.each do |attr|
|
|
270
|
+
case attr
|
|
271
|
+
when :head
|
|
272
|
+
token_hash[:head] = token.head.text
|
|
273
|
+
when :morphology
|
|
274
|
+
# Use string form and parse to ensure a plain Ruby Hash for JSON serialization
|
|
275
|
+
morph_str = token.morphology(hash: false)
|
|
276
|
+
token_hash[:morphology] = if morph_str.empty?
|
|
277
|
+
{}
|
|
278
|
+
else
|
|
279
|
+
morph_str.split("|").each_with_object({}) do |pair, h|
|
|
280
|
+
k, v = pair.split("=", 2)
|
|
281
|
+
h[k] = v
|
|
282
|
+
end
|
|
283
|
+
end
|
|
284
|
+
else
|
|
285
|
+
token_hash[attr] = token.send(attr)
|
|
286
|
+
end
|
|
287
|
+
end
|
|
288
|
+
token_hash
|
|
289
|
+
end
|
|
290
|
+
when :entities
|
|
291
|
+
ent_list = ents
|
|
292
|
+
result[:entities] = ent_list.map do |ent|
|
|
293
|
+
{ text: ent.text, label: ent.label }
|
|
294
|
+
end
|
|
295
|
+
when :noun_chunks
|
|
296
|
+
result[:noun_chunks] = noun_chunks.map do |chunk|
|
|
297
|
+
{ text: chunk.text, root: chunk.root.text }
|
|
298
|
+
end
|
|
299
|
+
when :sentences
|
|
300
|
+
result[:sentences] = sents.map(&:text)
|
|
301
|
+
end
|
|
302
|
+
end
|
|
303
|
+
|
|
304
|
+
result.to_json
|
|
305
|
+
end
|
|
306
|
+
|
|
243
307
|
# Sends a query to OpenAI's chat completion API with optional tool support.
|
|
244
308
|
# The get_tokens tool allows the model to request token-level linguistic analysis.
|
|
245
309
|
#
|
|
@@ -248,7 +312,7 @@ module Spacy
|
|
|
248
312
|
# @param max_tokens [Integer] Alias for max_completion_tokens (deprecated, for backward compatibility)
|
|
249
313
|
# @param temperature [Float] Sampling temperature (ignored for GPT-5 models)
|
|
250
314
|
# @param model [String] The model to use (default: gpt-5-mini)
|
|
251
|
-
# @param messages [Array<Hash>] Conversation history (for recursive tool calls)
|
|
315
|
+
# @param messages [Array<Hash>] Conversation history (for recursive tool calls). Note: this array is modified in place when tool calls occur.
|
|
252
316
|
# @param prompt [String, nil] System prompt for the query
|
|
253
317
|
# @return [String, nil] The model's response content
|
|
254
318
|
def openai_query(access_token: nil,
|
|
@@ -257,7 +321,10 @@ module Spacy
|
|
|
257
321
|
temperature: 0.7,
|
|
258
322
|
model: "gpt-5-mini",
|
|
259
323
|
messages: [],
|
|
260
|
-
prompt: nil
|
|
324
|
+
prompt: nil,
|
|
325
|
+
response_format: nil,
|
|
326
|
+
max_tool_call_depth: 5,
|
|
327
|
+
_tool_call_depth: 0)
|
|
261
328
|
# Support both max_completion_tokens and max_tokens for backward compatibility
|
|
262
329
|
max_completion_tokens ||= max_tokens || 1000
|
|
263
330
|
if messages.empty?
|
|
@@ -267,38 +334,42 @@ module Spacy
|
|
|
267
334
|
]
|
|
268
335
|
end
|
|
269
336
|
|
|
270
|
-
|
|
271
|
-
raise "Error: OPENAI_API_KEY is not set" unless access_token
|
|
337
|
+
client = openai_client(access_token)
|
|
272
338
|
|
|
273
339
|
# Tool definition for token analysis (GPT-5 tools API format)
|
|
274
|
-
tools =
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
340
|
+
tools = nil
|
|
341
|
+
tool_choice = nil
|
|
342
|
+
if _tool_call_depth < max_tool_call_depth
|
|
343
|
+
tools = [
|
|
344
|
+
{
|
|
345
|
+
type: "function",
|
|
346
|
+
function: {
|
|
347
|
+
name: "get_tokens",
|
|
348
|
+
description: "Tokenize given text and return a list of tokens with their attributes: surface, lemma, tag, pos (part-of-speech), dep (dependency), ent_type (entity type), and morphology",
|
|
349
|
+
parameters: {
|
|
350
|
+
type: "object",
|
|
351
|
+
properties: {
|
|
352
|
+
text: {
|
|
353
|
+
type: "string",
|
|
354
|
+
description: "text to be tokenized"
|
|
355
|
+
}
|
|
356
|
+
},
|
|
357
|
+
required: ["text"]
|
|
358
|
+
}
|
|
289
359
|
}
|
|
290
360
|
}
|
|
291
|
-
|
|
292
|
-
|
|
361
|
+
]
|
|
362
|
+
tool_choice = "auto"
|
|
363
|
+
end
|
|
293
364
|
|
|
294
|
-
client = OpenAIClient.new(access_token: access_token)
|
|
295
365
|
response = client.chat(
|
|
296
366
|
model: model,
|
|
297
367
|
messages: messages,
|
|
298
368
|
max_completion_tokens: max_completion_tokens,
|
|
299
369
|
temperature: temperature,
|
|
300
370
|
tools: tools,
|
|
301
|
-
tool_choice:
|
|
371
|
+
tool_choice: tool_choice,
|
|
372
|
+
response_format: response_format
|
|
302
373
|
)
|
|
303
374
|
|
|
304
375
|
message = response.dig("choices", 0, "message")
|
|
@@ -340,7 +411,10 @@ module Spacy
|
|
|
340
411
|
temperature: temperature,
|
|
341
412
|
model: model,
|
|
342
413
|
messages: messages,
|
|
343
|
-
prompt: prompt
|
|
414
|
+
prompt: prompt,
|
|
415
|
+
response_format: response_format,
|
|
416
|
+
max_tool_call_depth: max_tool_call_depth,
|
|
417
|
+
_tool_call_depth: _tool_call_depth + 1
|
|
344
418
|
)
|
|
345
419
|
else
|
|
346
420
|
message["content"]
|
|
@@ -367,10 +441,7 @@ module Spacy
|
|
|
367
441
|
{ role: "user", content: @text }
|
|
368
442
|
]
|
|
369
443
|
|
|
370
|
-
|
|
371
|
-
raise "Error: OPENAI_API_KEY is not set" unless access_token
|
|
372
|
-
|
|
373
|
-
client = OpenAIClient.new(access_token: access_token)
|
|
444
|
+
client = openai_client(access_token)
|
|
374
445
|
response = client.chat(
|
|
375
446
|
model: model,
|
|
376
447
|
messages: messages,
|
|
@@ -387,26 +458,40 @@ module Spacy
|
|
|
387
458
|
#
|
|
388
459
|
# @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
|
|
389
460
|
# @param model [String] The embeddings model (default: text-embedding-3-small)
|
|
461
|
+
# @param dimensions [Integer, nil] The number of dimensions for the output embeddings (nil uses model default)
|
|
390
462
|
# @return [Array<Float>, nil] The embedding vector
|
|
391
|
-
def openai_embeddings(access_token: nil, model: "text-embedding-3-small")
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
client = OpenAIClient.new(access_token: access_token)
|
|
396
|
-
response = client.embeddings(model: model, input: @text)
|
|
463
|
+
def openai_embeddings(access_token: nil, model: "text-embedding-3-small", dimensions: nil)
|
|
464
|
+
client = openai_client(access_token)
|
|
465
|
+
response = client.embeddings(model: model, input: @text, dimensions: dimensions)
|
|
397
466
|
response.dig("data", 0, "embedding")
|
|
398
467
|
rescue OpenAIClient::APIError => e
|
|
399
468
|
puts "Error: OpenAI API call failed - #{e.message}"
|
|
400
469
|
nil
|
|
401
470
|
end
|
|
402
471
|
|
|
472
|
+
private
|
|
473
|
+
|
|
474
|
+
def openai_client(access_token)
|
|
475
|
+
access_token ||= ENV["OPENAI_API_KEY"]
|
|
476
|
+
raise "Error: OPENAI_API_KEY is not set" unless access_token
|
|
477
|
+
|
|
478
|
+
@openai_clients ||= {}
|
|
479
|
+
@openai_clients[access_token] ||= OpenAIClient.new(access_token: access_token)
|
|
480
|
+
end
|
|
481
|
+
|
|
482
|
+
public
|
|
483
|
+
|
|
403
484
|
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
|
404
485
|
def method_missing(name, *args)
|
|
405
486
|
@py_doc.send(name, *args)
|
|
406
487
|
end
|
|
407
488
|
|
|
408
|
-
def respond_to_missing?(sym,
|
|
409
|
-
|
|
489
|
+
def respond_to_missing?(sym, include_private = false)
|
|
490
|
+
Spacy.py_hasattr?(@py_doc, sym) || super
|
|
491
|
+
end
|
|
492
|
+
|
|
493
|
+
def instance_variables_to_inspect
|
|
494
|
+
[:@text]
|
|
410
495
|
end
|
|
411
496
|
end
|
|
412
497
|
|
|
@@ -420,8 +505,13 @@ module Spacy
|
|
|
420
505
|
|
|
421
506
|
# Creates a language model instance, which is conventionally referred to by a variable named `nlp`.
|
|
422
507
|
# @param model [String] A language model installed in the system
|
|
423
|
-
def initialize(model = "en_core_web_sm", max_retrial: MAX_RETRIAL,
|
|
508
|
+
def initialize(model = "en_core_web_sm", max_retrial: MAX_RETRIAL, timeout: 60)
|
|
509
|
+
unless model.to_s.match?(/\A[a-zA-Z0-9_\-\.\/]+\z/)
|
|
510
|
+
raise ArgumentError, "Invalid model name: #{model.inspect}"
|
|
511
|
+
end
|
|
512
|
+
|
|
424
513
|
@spacy_nlp_id = "nlp_#{model.object_id}"
|
|
514
|
+
retrial = 0
|
|
425
515
|
begin
|
|
426
516
|
Timeout.timeout(timeout) do
|
|
427
517
|
PyCall.exec("import spacy; #{@spacy_nlp_id} = spacy.load('#{model}')")
|
|
@@ -468,17 +558,13 @@ module Spacy
|
|
|
468
558
|
# @param id [Integer] a vocabulary id
|
|
469
559
|
# @return [Object] a Python `Lexeme` object (https://spacy.io/api/lexeme)
|
|
470
560
|
def vocab_string_lookup(id)
|
|
471
|
-
PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{id}]")
|
|
561
|
+
PyCall.eval("#{@spacy_nlp_id}.vocab.strings[#{Integer(id)}]")
|
|
472
562
|
end
|
|
473
563
|
|
|
474
564
|
# A utility method to list pipeline components.
|
|
475
565
|
# @return [Array<String>] An array of text strings representing pipeline components
|
|
476
566
|
def pipe_names
|
|
477
|
-
|
|
478
|
-
PyCall::List.call(@py_nlp.pipe_names).each do |pipe|
|
|
479
|
-
pipe_array << pipe
|
|
480
|
-
end
|
|
481
|
-
pipe_array
|
|
567
|
+
PyCall::List.call(@py_nlp.pipe_names).to_a
|
|
482
568
|
end
|
|
483
569
|
|
|
484
570
|
# A utility method to get a Python `Lexeme` object.
|
|
@@ -527,20 +613,62 @@ module Spacy
|
|
|
527
613
|
# @param batch_size [Integer]
|
|
528
614
|
# @return [Array<Doc>]
|
|
529
615
|
def pipe(texts, disable: [], batch_size: 50)
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
docs << Doc.new(@py_nlp, py_doc: py_doc)
|
|
616
|
+
PyCall::List.call(@py_nlp.pipe(texts, disable: disable, batch_size: batch_size)).map do |py_doc|
|
|
617
|
+
Doc.new(@py_nlp, py_doc: py_doc)
|
|
533
618
|
end
|
|
534
|
-
docs
|
|
535
619
|
end
|
|
536
620
|
|
|
537
|
-
#
|
|
621
|
+
# Yields an {OpenAIHelper} instance for making OpenAI API calls within a block.
|
|
622
|
+
# The helper is configured once and reused for all calls within the block,
|
|
623
|
+
# making it efficient for batch processing with {#pipe}.
|
|
624
|
+
#
|
|
625
|
+
# @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
|
|
626
|
+
# @param model [String] the default model for chat requests
|
|
627
|
+
# @param max_completion_tokens [Integer] default maximum tokens in responses
|
|
628
|
+
# @param temperature [Float] default sampling temperature
|
|
629
|
+
# @yield [OpenAIHelper] the helper instance for making API calls
|
|
630
|
+
# @return [Object] the block's return value
|
|
631
|
+
# @example Batch processing with pipe
|
|
632
|
+
# nlp.with_openai(model: "gpt-5-mini") do |ai|
|
|
633
|
+
# nlp.pipe(texts).map do |doc|
|
|
634
|
+
# ai.chat(system: "Analyze.", user: doc.linguistic_summary)
|
|
635
|
+
# end
|
|
636
|
+
# end
|
|
637
|
+
def with_openai(access_token: nil, model: "gpt-5-mini",
|
|
638
|
+
max_completion_tokens: 1000, temperature: 0.7)
|
|
639
|
+
helper = OpenAIHelper.new(
|
|
640
|
+
access_token: access_token,
|
|
641
|
+
model: model,
|
|
642
|
+
max_completion_tokens: max_completion_tokens,
|
|
643
|
+
temperature: temperature
|
|
644
|
+
)
|
|
645
|
+
yield helper
|
|
646
|
+
end
|
|
647
|
+
|
|
648
|
+
# Executes a block within spaCy's memory zone for efficient memory management.
|
|
649
|
+
# Requires spaCy >= 3.8.
|
|
650
|
+
# @yield the block to execute within the memory zone
|
|
651
|
+
# @raise [NotImplementedError] if spaCy version does not support memory zones
|
|
652
|
+
def memory_zone(&block)
|
|
653
|
+
major, minor = SpacyVersion.split(".").map(&:to_i)
|
|
654
|
+
unless major > 3 || (major == 3 && minor >= 8)
|
|
655
|
+
raise NotImplementedError, "memory_zone requires spaCy >= 3.8 (current: #{SpacyVersion})"
|
|
656
|
+
end
|
|
657
|
+
|
|
658
|
+
PyCall.with(@py_nlp.memory_zone, &block)
|
|
659
|
+
end
|
|
660
|
+
|
|
661
|
+
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
|
538
662
|
def method_missing(name, *args)
|
|
539
663
|
@py_nlp.send(name, *args)
|
|
540
664
|
end
|
|
541
665
|
|
|
542
|
-
def respond_to_missing?(sym,
|
|
543
|
-
|
|
666
|
+
def respond_to_missing?(sym, include_private = false)
|
|
667
|
+
Spacy.py_hasattr?(@py_nlp, sym) || super
|
|
668
|
+
end
|
|
669
|
+
|
|
670
|
+
def instance_variables_to_inspect
|
|
671
|
+
[:@spacy_nlp_id]
|
|
544
672
|
end
|
|
545
673
|
end
|
|
546
674
|
|
|
@@ -566,19 +694,9 @@ module Spacy
|
|
|
566
694
|
# @param doc [Doc] an {Doc} instance
|
|
567
695
|
# @return [Array<Hash{:match_id => Integer, :start_index => Integer, :end_index => Integer}>] the id of the matched pattern, the starting position, and the end position
|
|
568
696
|
def match(doc)
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
results = []
|
|
572
|
-
while s.scan_until(/(\d+), (\d+), (\d+)/)
|
|
573
|
-
next unless s.matched
|
|
574
|
-
|
|
575
|
-
triple = s.matched.split(", ")
|
|
576
|
-
match_id = triple[0].to_i
|
|
577
|
-
start_index = triple[1].to_i
|
|
578
|
-
end_index = triple[2].to_i - 1
|
|
579
|
-
results << { match_id: match_id, start_index: start_index, end_index: end_index }
|
|
697
|
+
PyCall::List.call(@py_matcher.call(doc.py_doc)).map do |py_match|
|
|
698
|
+
{ match_id: py_match[0].to_i, start_index: py_match[1].to_i, end_index: py_match[2].to_i - 1 }
|
|
580
699
|
end
|
|
581
|
-
results
|
|
582
700
|
end
|
|
583
701
|
end
|
|
584
702
|
|
|
@@ -621,12 +739,7 @@ module Spacy
|
|
|
621
739
|
# matches.each { |span| puts "#{span.text} => #{span.label}" }
|
|
622
740
|
def match(doc)
|
|
623
741
|
py_matches = @py_matcher.call(doc.py_doc, as_spans: true)
|
|
624
|
-
|
|
625
|
-
PyCall::List.call(py_matches).each do |py_span|
|
|
626
|
-
span = Span.new(doc, py_span: py_span)
|
|
627
|
-
results << span
|
|
628
|
-
end
|
|
629
|
-
results
|
|
742
|
+
PyCall::List.call(py_matches).map { |py_span| Span.new(doc, py_span: py_span) }
|
|
630
743
|
end
|
|
631
744
|
end
|
|
632
745
|
|
|
@@ -638,6 +751,9 @@ module Spacy
|
|
|
638
751
|
# @return [Doc] the document to which the span belongs
|
|
639
752
|
attr_reader :doc
|
|
640
753
|
|
|
754
|
+
# @return [String] a text string of the span
|
|
755
|
+
attr_reader :text
|
|
756
|
+
|
|
641
757
|
include Enumerable
|
|
642
758
|
|
|
643
759
|
alias length count
|
|
@@ -653,17 +769,14 @@ module Spacy
|
|
|
653
769
|
# @param options [Hash] options (`:label`, `:kb_id`, `:vector`)
|
|
654
770
|
def initialize(doc, py_span: nil, start_index: nil, end_index: nil, options: {})
|
|
655
771
|
@doc = doc
|
|
656
|
-
@py_span = py_span ||
|
|
772
|
+
@py_span = py_span || PySpan.call(@doc.py_doc, start_index, end_index + 1, options)
|
|
773
|
+
@text = @py_span.text
|
|
657
774
|
end
|
|
658
775
|
|
|
659
776
|
# Returns an array of tokens contained in the span.
|
|
660
777
|
# @return [Array<Token>]
|
|
661
778
|
def tokens
|
|
662
|
-
|
|
663
|
-
PyCall::List.call(@py_span).each do |py_token|
|
|
664
|
-
results << Token.new(py_token)
|
|
665
|
-
end
|
|
666
|
-
results
|
|
779
|
+
PyCall::List.call(@py_span).map { |py_token| Token.new(py_token) }
|
|
667
780
|
end
|
|
668
781
|
|
|
669
782
|
# Iterates over the elements in the span yielding a token instance each time.
|
|
@@ -676,12 +789,9 @@ module Spacy
|
|
|
676
789
|
# Returns an array of spans of noun chunks.
|
|
677
790
|
# @return [Array<Span>]
|
|
678
791
|
def noun_chunks
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
py_chunks.each do |py_span|
|
|
682
|
-
chunk_array << Span.new(@doc, py_span: py_span)
|
|
792
|
+
PyCall::List.call(@py_span.noun_chunks).map do |py_span|
|
|
793
|
+
Span.new(@doc, py_span: py_span)
|
|
683
794
|
end
|
|
684
|
-
chunk_array
|
|
685
795
|
end
|
|
686
796
|
|
|
687
797
|
# Returns the head token
|
|
@@ -693,22 +803,17 @@ module Spacy
|
|
|
693
803
|
# Returns an array of spans that represents sentences.
|
|
694
804
|
# @return [Array<Span>]
|
|
695
805
|
def sents
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
py_sentences.each do |py_span|
|
|
699
|
-
sentence_array << Span.new(@doc, py_span: py_span)
|
|
806
|
+
PyCall::List.call(@py_span.sents).map do |py_span|
|
|
807
|
+
Span.new(@doc, py_span: py_span)
|
|
700
808
|
end
|
|
701
|
-
sentence_array
|
|
702
809
|
end
|
|
703
810
|
|
|
704
811
|
# Returns an array of spans that represents named entities.
|
|
705
812
|
# @return [Array<Span>]
|
|
706
813
|
def ents
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
ent_array << Span.new(@doc, py_span: py_span)
|
|
814
|
+
PyCall::List.call(@py_span.ents).map do |py_span|
|
|
815
|
+
Span.new(@doc, py_span: py_span)
|
|
710
816
|
end
|
|
711
|
-
ent_array
|
|
712
817
|
end
|
|
713
818
|
|
|
714
819
|
# Returns a span that represents the sentence that the given span is part of.
|
|
@@ -745,41 +850,25 @@ module Spacy
|
|
|
745
850
|
# Returns tokens conjugated to the root of the span.
|
|
746
851
|
# @return [Array<Token>] an array of tokens
|
|
747
852
|
def conjuncts
|
|
748
|
-
|
|
749
|
-
PyCall::List.call(@py_span.conjuncts).each do |py_conjunct|
|
|
750
|
-
conjunct_array << Token.new(py_conjunct)
|
|
751
|
-
end
|
|
752
|
-
conjunct_array
|
|
853
|
+
PyCall::List.call(@py_span.conjuncts).map { |py_conjunct| Token.new(py_conjunct) }
|
|
753
854
|
end
|
|
754
855
|
|
|
755
856
|
# Returns tokens that are to the left of the span, whose heads are within the span.
|
|
756
857
|
# @return [Array<Token>] an array of tokens
|
|
757
858
|
def lefts
|
|
758
|
-
|
|
759
|
-
PyCall::List.call(@py_span.lefts).each do |py_left|
|
|
760
|
-
left_array << Token.new(py_left)
|
|
761
|
-
end
|
|
762
|
-
left_array
|
|
859
|
+
PyCall::List.call(@py_span.lefts).map { |py_left| Token.new(py_left) }
|
|
763
860
|
end
|
|
764
861
|
|
|
765
862
|
# Returns Tokens that are to the right of the span, whose heads are within the span.
|
|
766
863
|
# @return [Array<Token>] an array of Tokens
|
|
767
864
|
def rights
|
|
768
|
-
|
|
769
|
-
PyCall::List.call(@py_span.rights).each do |py_right|
|
|
770
|
-
right_array << Token.new(py_right)
|
|
771
|
-
end
|
|
772
|
-
right_array
|
|
865
|
+
PyCall::List.call(@py_span.rights).map { |py_right| Token.new(py_right) }
|
|
773
866
|
end
|
|
774
867
|
|
|
775
868
|
# Returns Tokens that are within the span and tokens that descend from them.
|
|
776
869
|
# @return [Array<Token>] an array of tokens
|
|
777
870
|
def subtree
|
|
778
|
-
|
|
779
|
-
PyCall::List.call(@py_span.subtree).each do |py_subtree|
|
|
780
|
-
subtree_array << Token.new(py_subtree)
|
|
781
|
-
end
|
|
782
|
-
subtree_array
|
|
871
|
+
PyCall::List.call(@py_span.subtree).map { |py_subtree| Token.new(py_subtree) }
|
|
783
872
|
end
|
|
784
873
|
|
|
785
874
|
# Returns the label
|
|
@@ -788,13 +877,23 @@ module Spacy
|
|
|
788
877
|
@py_span.label_
|
|
789
878
|
end
|
|
790
879
|
|
|
880
|
+
# String representation of the span.
|
|
881
|
+
# @return [String]
|
|
882
|
+
def to_s
|
|
883
|
+
@text
|
|
884
|
+
end
|
|
885
|
+
|
|
791
886
|
# Methods defined in Python but not wrapped in ruby-spacy can be called by this dynamic method handling mechanism.
|
|
792
887
|
def method_missing(name, *args)
|
|
793
888
|
@py_span.send(name, *args)
|
|
794
889
|
end
|
|
795
890
|
|
|
796
|
-
def respond_to_missing?(sym,
|
|
797
|
-
|
|
891
|
+
def respond_to_missing?(sym, include_private = false)
|
|
892
|
+
Spacy.py_hasattr?(@py_span, sym) || super
|
|
893
|
+
end
|
|
894
|
+
|
|
895
|
+
def instance_variables_to_inspect
|
|
896
|
+
[:@text]
|
|
798
897
|
end
|
|
799
898
|
end
|
|
800
899
|
|
|
@@ -814,6 +913,12 @@ module Spacy
|
|
|
814
913
|
@text = @py_token.text
|
|
815
914
|
end
|
|
816
915
|
|
|
916
|
+
# Returns the character offset of the token within the parent document.
|
|
917
|
+
# @return [Integer]
|
|
918
|
+
def idx
|
|
919
|
+
@py_token.idx
|
|
920
|
+
end
|
|
921
|
+
|
|
817
922
|
# Returns the head token
|
|
818
923
|
# @return [Token]
|
|
819
924
|
def head
|
|
@@ -823,51 +928,31 @@ module Spacy
|
|
|
823
928
|
# Returns the token in question and the tokens that descend from it.
|
|
824
929
|
# @return [Array<Token>] an array of tokens
|
|
825
930
|
def subtree
|
|
826
|
-
|
|
827
|
-
PyCall::List.call(@py_token.subtree).each do |descendant|
|
|
828
|
-
descendant_array << Token.new(descendant)
|
|
829
|
-
end
|
|
830
|
-
descendant_array
|
|
931
|
+
PyCall::List.call(@py_token.subtree).map { |descendant| Token.new(descendant) }
|
|
831
932
|
end
|
|
832
933
|
|
|
833
934
|
# Returns the token's ancestors.
|
|
834
935
|
# @return [Array<Token>] an array of tokens
|
|
835
936
|
def ancestors
|
|
836
|
-
|
|
837
|
-
PyCall::List.call(@py_token.ancestors).each do |ancestor|
|
|
838
|
-
ancestor_array << Token.new(ancestor)
|
|
839
|
-
end
|
|
840
|
-
ancestor_array
|
|
937
|
+
PyCall::List.call(@py_token.ancestors).map { |ancestor| Token.new(ancestor) }
|
|
841
938
|
end
|
|
842
939
|
|
|
843
940
|
# Returns a sequence of the token's immediate syntactic children.
|
|
844
941
|
# @return [Array<Token>] an array of tokens
|
|
845
942
|
def children
|
|
846
|
-
|
|
847
|
-
PyCall::List.call(@py_token.children).each do |child|
|
|
848
|
-
child_array << Token.new(child)
|
|
849
|
-
end
|
|
850
|
-
child_array
|
|
943
|
+
PyCall::List.call(@py_token.children).map { |child| Token.new(child) }
|
|
851
944
|
end
|
|
852
945
|
|
|
853
946
|
# The leftward immediate children of the word in the syntactic dependency parse.
|
|
854
947
|
# @return [Array<Token>] an array of tokens
|
|
855
948
|
def lefts
|
|
856
|
-
|
|
857
|
-
PyCall::List.call(@py_token.lefts).each do |token|
|
|
858
|
-
token_array << Token.new(token)
|
|
859
|
-
end
|
|
860
|
-
token_array
|
|
949
|
+
PyCall::List.call(@py_token.lefts).map { |token| Token.new(token) }
|
|
861
950
|
end
|
|
862
951
|
|
|
863
952
|
# The rightward immediate children of the word in the syntactic dependency parse.
|
|
864
953
|
# @return [Array<Token>] an array of tokens
|
|
865
954
|
def rights
|
|
866
|
-
|
|
867
|
-
PyCall::List.call(@py_token.rights).each do |token|
|
|
868
|
-
token_array << Token.new(token)
|
|
869
|
-
end
|
|
870
|
-
token_array
|
|
955
|
+
PyCall::List.call(@py_token.rights).map { |token| Token.new(token) }
|
|
871
956
|
end
|
|
872
957
|
|
|
873
958
|
# String representation of the token.
|
|
@@ -959,8 +1044,12 @@ module Spacy
|
|
|
959
1044
|
@py_token.send(name, *args)
|
|
960
1045
|
end
|
|
961
1046
|
|
|
962
|
-
def respond_to_missing?(sym,
|
|
963
|
-
|
|
1047
|
+
def respond_to_missing?(sym, include_private = false)
|
|
1048
|
+
Spacy.py_hasattr?(@py_token, sym) || super
|
|
1049
|
+
end
|
|
1050
|
+
|
|
1051
|
+
def instance_variables_to_inspect
|
|
1052
|
+
[:@text]
|
|
964
1053
|
end
|
|
965
1054
|
end
|
|
966
1055
|
|
|
@@ -1034,8 +1123,12 @@ module Spacy
|
|
|
1034
1123
|
@py_lexeme.send(name, *args)
|
|
1035
1124
|
end
|
|
1036
1125
|
|
|
1037
|
-
def respond_to_missing?(sym,
|
|
1038
|
-
|
|
1126
|
+
def respond_to_missing?(sym, include_private = false)
|
|
1127
|
+
Spacy.py_hasattr?(@py_lexeme, sym) || super
|
|
1128
|
+
end
|
|
1129
|
+
|
|
1130
|
+
def instance_variables_to_inspect
|
|
1131
|
+
[:@text]
|
|
1039
1132
|
end
|
|
1040
1133
|
end
|
|
1041
1134
|
end
|
data/ruby-spacy.gemspec
CHANGED
|
@@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
|
|
|
31
31
|
spec.add_development_dependency "rspec"
|
|
32
32
|
spec.add_development_dependency "solargraph"
|
|
33
33
|
|
|
34
|
+
spec.add_dependency "base64" # Required for Ruby 3.4+ (moved from default to bundled gem)
|
|
34
35
|
spec.add_dependency "fiddle" # Required for Ruby 4.0+ (moved from default to bundled gem)
|
|
35
36
|
spec.add_dependency "numpy", "~> 0.4.0"
|
|
36
37
|
spec.add_dependency "pycall", "~> 1.5.1"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby-spacy
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.4.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Yoichiro Hasebe
|
|
@@ -65,6 +65,20 @@ dependencies:
|
|
|
65
65
|
- - ">="
|
|
66
66
|
- !ruby/object:Gem::Version
|
|
67
67
|
version: '0'
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: base64
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '0'
|
|
75
|
+
type: :runtime
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '0'
|
|
68
82
|
- !ruby/object:Gem::Dependency
|
|
69
83
|
name: fiddle
|
|
70
84
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -133,6 +147,7 @@ executables: []
|
|
|
133
147
|
extensions: []
|
|
134
148
|
extra_rdoc_files: []
|
|
135
149
|
files:
|
|
150
|
+
- ".github/FUNDING.yml"
|
|
136
151
|
- ".gitignore"
|
|
137
152
|
- CHANGELOG.md
|
|
138
153
|
- Gemfile
|
|
@@ -203,6 +218,7 @@ files:
|
|
|
203
218
|
- examples/rule_based_matching/matcher.rb
|
|
204
219
|
- lib/ruby-spacy.rb
|
|
205
220
|
- lib/ruby-spacy/openai_client.rb
|
|
221
|
+
- lib/ruby-spacy/openai_helper.rb
|
|
206
222
|
- lib/ruby-spacy/version.rb
|
|
207
223
|
- ruby-spacy.gemspec
|
|
208
224
|
homepage: https://github.com/yohasebe/ruby-spacy
|