ruby-spacy 0.2.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/FUNDING.yml +6 -0
- data/.gitignore +1 -0
- data/CHANGELOG.md +24 -7
- data/Gemfile +1 -1
- data/README.md +120 -22
- data/lib/ruby-spacy/openai_client.rb +166 -0
- data/lib/ruby-spacy/openai_helper.rb +91 -0
- data/lib/ruby-spacy/version.rb +1 -1
- data/lib/ruby-spacy.rb +455 -248
- data/ruby-spacy.gemspec +3 -2
- metadata +34 -20
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6185c586feb32fa51efcd4349398cd4ca9541280a5cc8a1b6a73eb93a987d4ac
|
|
4
|
+
data.tar.gz: a146a9c40e2d5293e2401cb16b8ac6866cbb577e11a10d9657c406f933e7a3aa
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bf558d4e9a7a6765fd7d088bbf8324a6ee0e4f4186962551d71e5a991e0aefd1e51a186f19c2824fabcc6afd0c83960771f082237febece52c2a522ccb39a5cf
|
|
7
|
+
data.tar.gz: 3a64559cf8c169d1ac1ecdef526d26e5776989b9cc203a8ed30e0dd5d87ff62a4d1b741aff30c8cb49e5ffb716c6068f9af3a12d50d0d4de8ad6f22ebe80ea0d
|
data/.github/FUNDING.yml
ADDED
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,17 +1,34 @@
|
|
|
1
1
|
# Change Log
|
|
2
2
|
|
|
3
|
+
## 0.3.0 - 2025-01-06
|
|
4
|
+
### Added
|
|
5
|
+
- Ruby 4.0 support
|
|
6
|
+
- `Doc#to_bytes` for serializing documents to binary format
|
|
7
|
+
- `Doc.from_bytes` for restoring documents from binary data
|
|
8
|
+
- `PhraseMatcher` class for efficient phrase matching
|
|
9
|
+
- `Language#phrase_matcher` helper method
|
|
10
|
+
|
|
11
|
+
### Changed
|
|
12
|
+
- Replaced `ruby-openai` gem with custom `OpenAIClient` implementation
|
|
13
|
+
- Updated default OpenAI model to `gpt-5-mini`
|
|
14
|
+
- Updated embeddings model to `text-embedding-3-small`
|
|
15
|
+
- Changed `max_tokens` parameter to `max_completion_tokens` (backward compatible)
|
|
16
|
+
- Added `fiddle` gem dependency (required for Ruby 4.0)
|
|
17
|
+
|
|
18
|
+
## 0.2.4 - 2024-12-11
|
|
19
|
+
### Changed
|
|
20
|
+
- Timeout and retry feature for `Spacy::Language.new`
|
|
21
|
+
|
|
3
22
|
## 0.2.3 - 2024-08-27
|
|
4
23
|
- Timeout option added to `Spacy::Language.new`
|
|
5
|
-
- Default
|
|
6
|
-
|
|
7
|
-
## 0.2.0 - 2022-10-02
|
|
8
|
-
- spaCy 3.7.0 supported
|
|
24
|
+
- Default OpenAI models updated to `gpt-4o-mini`
|
|
9
25
|
|
|
10
26
|
## 0.2.0 - 2022-10-02
|
|
11
27
|
### Added
|
|
12
|
-
-
|
|
13
|
-
- `Doc
|
|
14
|
-
- `Doc
|
|
28
|
+
- spaCy 3.7.0 supported
|
|
29
|
+
- `Doc#openai_query`
|
|
30
|
+
- `Doc#openai_completion`
|
|
31
|
+
- `Doc#openai_embeddings`
|
|
15
32
|
|
|
16
33
|
## 0.1.4.1 - 2021-07-06
|
|
17
34
|
- Test code refined
|
data/Gemfile
CHANGED
|
@@ -5,9 +5,9 @@ source "https://rubygems.org"
|
|
|
5
5
|
# Specify your gem's dependencies in ruby-spacy.gemspec
|
|
6
6
|
gemspec
|
|
7
7
|
|
|
8
|
+
gem "fiddle" # Required for Ruby 4.0+ (moved from default to bundled gem)
|
|
8
9
|
gem "numpy"
|
|
9
10
|
gem "pycall", "~> 1.5.1"
|
|
10
|
-
gem "ruby-openai"
|
|
11
11
|
gem "terminal-table"
|
|
12
12
|
|
|
13
13
|
group :development do
|
data/README.md
CHANGED
|
@@ -13,10 +13,11 @@
|
|
|
13
13
|
| ✅ | Access to pre-trained word vectors |
|
|
14
14
|
| ✅ | OpenAI Chat/Completion/Embeddings API integration |
|
|
15
15
|
|
|
16
|
-
Current Version: `0.
|
|
16
|
+
Current Version: `0.3.0`
|
|
17
17
|
|
|
18
|
-
-
|
|
19
|
-
-
|
|
18
|
+
- Ruby 4.0 supported
|
|
19
|
+
- spaCy 3.8 supported
|
|
20
|
+
- OpenAI GPT-5 API integration
|
|
20
21
|
|
|
21
22
|
## Installation of Prerequisites
|
|
22
23
|
|
|
@@ -522,12 +523,73 @@ Output:
|
|
|
522
523
|
| 9 | アルザス | 0.5644999742507935 |
|
|
523
524
|
| 10 | 南仏 | 0.5547999739646912 |
|
|
524
525
|
|
|
526
|
+
### PhraseMatcher
|
|
527
|
+
|
|
528
|
+
`PhraseMatcher` is more efficient than `Matcher` for matching large terminology lists. It's ideal for extracting known entities like product names, company names, or domain-specific terms.
|
|
529
|
+
|
|
530
|
+
**Basic usage:**
|
|
531
|
+
|
|
532
|
+
```ruby
|
|
533
|
+
require "ruby-spacy"
|
|
534
|
+
|
|
535
|
+
nlp = Spacy::Language.new("en_core_web_sm")
|
|
536
|
+
|
|
537
|
+
# Create a phrase matcher
|
|
538
|
+
matcher = nlp.phrase_matcher
|
|
539
|
+
matcher.add("PRODUCT", ["iPhone", "MacBook Pro", "iPad"])
|
|
540
|
+
|
|
541
|
+
doc = nlp.read("I bought an iPhone and a MacBook Pro yesterday.")
|
|
542
|
+
matches = matcher.match(doc)
|
|
543
|
+
|
|
544
|
+
matches.each do |span|
|
|
545
|
+
puts "#{span.text} => #{span.label}"
|
|
546
|
+
end
|
|
547
|
+
# => iPhone => PRODUCT
|
|
548
|
+
# => MacBook Pro => PRODUCT
|
|
549
|
+
```
|
|
550
|
+
|
|
551
|
+
**Case-insensitive matching:**
|
|
552
|
+
|
|
553
|
+
```ruby
|
|
554
|
+
# Use attr: "LOWER" for case-insensitive matching
|
|
555
|
+
matcher = nlp.phrase_matcher(attr: "LOWER")
|
|
556
|
+
matcher.add("COMPANY", ["apple", "google", "microsoft"])
|
|
557
|
+
|
|
558
|
+
doc = nlp.read("Apple and GOOGLE are competitors of Microsoft.")
|
|
559
|
+
matches = matcher.match(doc)
|
|
560
|
+
|
|
561
|
+
matches.each do |span|
|
|
562
|
+
puts span.text
|
|
563
|
+
end
|
|
564
|
+
# => Apple
|
|
565
|
+
# => GOOGLE
|
|
566
|
+
# => Microsoft
|
|
567
|
+
```
|
|
568
|
+
|
|
569
|
+
**Multiple categories:**
|
|
570
|
+
|
|
571
|
+
```ruby
|
|
572
|
+
matcher = nlp.phrase_matcher(attr: "LOWER")
|
|
573
|
+
matcher.add("TECH_COMPANY", ["apple", "google", "microsoft", "amazon"])
|
|
574
|
+
matcher.add("PRODUCT", ["iphone", "pixel", "surface", "kindle"])
|
|
575
|
+
|
|
576
|
+
doc = nlp.read("Apple released the new iPhone while Google announced Pixel updates.")
|
|
577
|
+
matches = matcher.match(doc)
|
|
578
|
+
|
|
579
|
+
matches.each do |span|
|
|
580
|
+
puts "#{span.text}: #{span.label}"
|
|
581
|
+
end
|
|
582
|
+
# => Apple: TECH_COMPANY
|
|
583
|
+
# => iPhone: PRODUCT
|
|
584
|
+
# => Google: TECH_COMPANY
|
|
585
|
+
# => Pixel: PRODUCT
|
|
586
|
+
```
|
|
525
587
|
|
|
526
588
|
## OpenAI API Integration
|
|
527
589
|
|
|
528
|
-
> ⚠️ This feature
|
|
590
|
+
> ⚠️ This feature requires GPT-5 series models. Please refer to OpenAI's [API reference](https://platform.openai.com/docs/api-reference) for details. Note: GPT-5 models do not support the `temperature` parameter.
|
|
529
591
|
|
|
530
|
-
Easily leverage GPT models within ruby-spacy by using an OpenAI API key. When constructing prompts for the `Doc::openai_query` method, you can incorporate the following token properties of the document. These properties are retrieved through
|
|
592
|
+
Easily leverage GPT models within ruby-spacy by using an OpenAI API key. When constructing prompts for the `Doc#openai_query` method, you can incorporate the following token properties of the document. These properties are retrieved through tool calls (made internally by GPT when necessary) and seamlessly integrated into your prompt. The available properties include:
|
|
531
593
|
|
|
532
594
|
- `surface`
|
|
533
595
|
- `lemma`
|
|
@@ -550,9 +612,8 @@ nlp = Spacy::Language.new("en_core_web_sm")
|
|
|
550
612
|
doc = nlp.read("The Beatles released 12 studio albums")
|
|
551
613
|
|
|
552
614
|
# default parameter values
|
|
553
|
-
#
|
|
554
|
-
#
|
|
555
|
-
# model: "gpt-4o-mini"
|
|
615
|
+
# max_completion_tokens: 1000
|
|
616
|
+
# model: "gpt-5-mini"
|
|
556
617
|
res1 = doc.openai_query(
|
|
557
618
|
access_token: api_key,
|
|
558
619
|
prompt: "Translate the text to Japanese."
|
|
@@ -576,9 +637,8 @@ nlp = Spacy::Language.new("en_core_web_sm")
|
|
|
576
637
|
doc = nlp.read("The Beatles were an English rock band formed in Liverpool in 1960.")
|
|
577
638
|
|
|
578
639
|
# default parameter values
|
|
579
|
-
#
|
|
580
|
-
#
|
|
581
|
-
# model: "gpt-4o-mini"
|
|
640
|
+
# max_completion_tokens: 1000
|
|
641
|
+
# model: "gpt-5-mini"
|
|
582
642
|
res = doc.openai_query(
|
|
583
643
|
access_token: api_key,
|
|
584
644
|
prompt: "Extract the topic of the document and list 10 entities (names, concepts, locations, etc.) that are relevant to the topic."
|
|
@@ -614,9 +674,8 @@ nlp = Spacy::Language.new("en_core_web_sm")
|
|
|
614
674
|
doc = nlp.read("The Beatles released 12 studio albums")
|
|
615
675
|
|
|
616
676
|
# default parameter values
|
|
617
|
-
#
|
|
618
|
-
#
|
|
619
|
-
# model: "gpt-4o-mini"
|
|
677
|
+
# max_completion_tokens: 1000
|
|
678
|
+
# model: "gpt-5-mini"
|
|
620
679
|
res = doc.openai_query(
|
|
621
680
|
access_token: api_key,
|
|
622
681
|
prompt: "List token data of each of the words used in the sentence. Add 'meaning' property and value (brief semantic definition) to each token data. Output as a JSON object."
|
|
@@ -692,7 +751,7 @@ Output:
|
|
|
692
751
|
}
|
|
693
752
|
```
|
|
694
753
|
|
|
695
|
-
### GPT Prompting (Generate a
|
|
754
|
+
### GPT Prompting (Generate a Syntax Tree using Token Properties)
|
|
696
755
|
|
|
697
756
|
Ruby code:
|
|
698
757
|
|
|
@@ -704,11 +763,10 @@ nlp = Spacy::Language.new("en_core_web_sm")
|
|
|
704
763
|
doc = nlp.read("The Beatles released 12 studio albums")
|
|
705
764
|
|
|
706
765
|
# default parameter values
|
|
707
|
-
#
|
|
708
|
-
#
|
|
766
|
+
# max_completion_tokens: 1000
|
|
767
|
+
# model: "gpt-5-mini"
|
|
709
768
|
res = doc.openai_query(
|
|
710
769
|
access_token: api_key,
|
|
711
|
-
model: "gpt-4",
|
|
712
770
|
prompt: "Generate a tree diagram from the text using given token data. Use the following bracketing style: [S [NP [Det the] [N cat]] [VP [V sat] [PP [P on] [NP the mat]]]"
|
|
713
771
|
)
|
|
714
772
|
puts res
|
|
@@ -747,9 +805,8 @@ nlp = Spacy::Language.new("en_core_web_sm")
|
|
|
747
805
|
doc = nlp.read("Vladimir Nabokov was a")
|
|
748
806
|
|
|
749
807
|
# default parameter values
|
|
750
|
-
#
|
|
751
|
-
#
|
|
752
|
-
# model: "gpt-4o-mini"
|
|
808
|
+
# max_completion_tokens: 1000
|
|
809
|
+
# model: "gpt-5-mini"
|
|
753
810
|
res = doc.openai_completion(access_token: api_key)
|
|
754
811
|
puts res
|
|
755
812
|
```
|
|
@@ -769,7 +826,7 @@ api_key = ENV["OPENAI_API_KEY"]
|
|
|
769
826
|
nlp = Spacy::Language.new("en_core_web_sm")
|
|
770
827
|
doc = nlp.read("Vladimir Nabokov was a Russian-American novelist, poet, translator and entomologist.")
|
|
771
828
|
|
|
772
|
-
# default model: text-embedding-
|
|
829
|
+
# default model: text-embedding-3-small
|
|
773
830
|
res = doc.openai_embeddings(access_token: api_key)
|
|
774
831
|
|
|
775
832
|
puts res
|
|
@@ -796,6 +853,47 @@ You can set a timeout for the `Spacy::Language.new` method:
|
|
|
796
853
|
nlp = Spacy::Language.new("en_core_web_sm", timeout: 120) # Set timeout to 120 seconds
|
|
797
854
|
```
|
|
798
855
|
|
|
856
|
+
### Document Serialization
|
|
857
|
+
|
|
858
|
+
You can serialize processed documents to binary format for caching or storage. This is useful when you want to avoid re-processing the same text multiple times.
|
|
859
|
+
|
|
860
|
+
**Saving a document:**
|
|
861
|
+
|
|
862
|
+
```ruby
|
|
863
|
+
require "ruby-spacy"
|
|
864
|
+
|
|
865
|
+
nlp = Spacy::Language.new("en_core_web_sm")
|
|
866
|
+
doc = nlp.read("Apple Inc. was founded by Steve Jobs in California.")
|
|
867
|
+
|
|
868
|
+
# Serialize to binary
|
|
869
|
+
bytes = doc.to_bytes
|
|
870
|
+
|
|
871
|
+
# Save to file
|
|
872
|
+
File.binwrite("doc_cache.bin", bytes)
|
|
873
|
+
```
|
|
874
|
+
|
|
875
|
+
**Restoring a document:**
|
|
876
|
+
|
|
877
|
+
```ruby
|
|
878
|
+
nlp = Spacy::Language.new("en_core_web_sm")
|
|
879
|
+
|
|
880
|
+
# Load from file
|
|
881
|
+
bytes = File.binread("doc_cache.bin")
|
|
882
|
+
|
|
883
|
+
# Restore the document (all annotations are preserved)
|
|
884
|
+
restored_doc = Spacy::Doc.from_bytes(nlp, bytes)
|
|
885
|
+
|
|
886
|
+
puts restored_doc.text
|
|
887
|
+
# => "Apple Inc. was founded by Steve Jobs in California."
|
|
888
|
+
|
|
889
|
+
restored_doc.ents.each do |ent|
|
|
890
|
+
puts "#{ent.text} (#{ent.label})"
|
|
891
|
+
end
|
|
892
|
+
# => Apple Inc. (ORG)
|
|
893
|
+
# => Steve Jobs (PERSON)
|
|
894
|
+
# => California (GPE)
|
|
895
|
+
```
|
|
896
|
+
|
|
799
897
|
## Author
|
|
800
898
|
|
|
801
899
|
Yoichiro Hasebe [<yohasebe@gmail.com>]
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "net/http"
|
|
4
|
+
require "openssl"
|
|
5
|
+
require "uri"
|
|
6
|
+
require "json"
|
|
7
|
+
|
|
8
|
+
module Spacy
  # A lightweight OpenAI API client with tools support for GPT-5 series models.
  # It implements the chat completions and embeddings endpoints on top of
  # Net::HTTP, without external gem dependencies.
  #
  # Rate-limit responses (HTTP 429) and low-level connection failures are
  # retried up to MAX_RETRIES times; every failure that reaches the caller is
  # reported as an {APIError}.
  class OpenAIClient
    API_ENDPOINT = "https://api.openai.com/v1"
    DEFAULT_TIMEOUT = 120
    MAX_RETRIES = 3
    BASE_RETRY_DELAY = 1

    # Connection-level failures that are retried with backoff and, once the
    # retry budget is exhausted, wrapped in an APIError.
    RETRYABLE_ERRORS = [
      Net::OpenTimeout, Net::ReadTimeout,
      Errno::ECONNREFUSED, Errno::ECONNRESET, Errno::ETIMEDOUT,
      SocketError, EOFError
    ].freeze

    # Error raised for every failed API interaction.
    # Carries the HTTP status code and the response body when available.
    class APIError < StandardError
      attr_reader :status_code, :response_body

      # @param message [String] human-readable description of the failure
      # @param status_code [Integer, nil] HTTP status code, if a response was received
      # @param response_body [Object, nil] parsed or raw response body, if any
      def initialize(message, status_code: nil, response_body: nil)
        @status_code = status_code
        @response_body = response_body
        super(message)
      end
    end

    # @param access_token [String] OpenAI API key sent as a Bearer token
    # @param timeout [Numeric] open/read timeout in seconds for each HTTP attempt
    def initialize(access_token:, timeout: DEFAULT_TIMEOUT)
      @access_token = access_token
      @timeout = timeout
    end

    # Sends a chat completion request with optional tools support.
    # Note: GPT-5 series and o-series models do not support the temperature
    # parameter, so it is omitted from the request body for those models.
    #
    # @param model [String] The model to use (e.g., "gpt-5-mini")
    # @param messages [Array<Hash>] The conversation messages
    # @param max_completion_tokens [Integer] Maximum tokens in the response
    # @param temperature [Float, nil] Sampling temperature (0.7 when nil; ignored for models that don't support it)
    # @param tools [Array<Hash>, nil] Tool definitions for function calling
    # @param tool_choice [String, Hash, nil] Tool selection strategy ("auto" when tools are given and this is nil)
    # @param response_format [Hash, nil] Response format specification (e.g., { type: "json_object" })
    # @return [Hash] The parsed API response
    # @raise [APIError] on HTTP errors, invalid JSON, or exhausted retries
    def chat(model:, messages:, max_completion_tokens: 1000, temperature: nil, tools: nil, tool_choice: nil, response_format: nil)
      body = {
        model: model,
        messages: messages,
        max_completion_tokens: max_completion_tokens
      }

      body[:temperature] = temperature || 0.7 unless temperature_unsupported?(model)

      # tool_choice is only meaningful when at least one tool is supplied.
      if tools && !tools.empty?
        body[:tools] = tools
        body[:tool_choice] = tool_choice || "auto"
      end

      body[:response_format] = response_format if response_format

      post("/chat/completions", body)
    end

    # Checks if the model does not support the temperature parameter.
    # This includes GPT-5 series and o-series (o1, o3, o4-mini, etc.) models.
    #
    # @param model [String] The model name
    # @return [Boolean]
    def temperature_unsupported?(model)
      name = model.to_s
      name.start_with?("gpt-5") || name.match?(/\Ao\d/)
    end

    # Sends an embeddings request.
    #
    # @param model [String] The embeddings model (e.g., "text-embedding-3-small")
    # @param input [String] The text to embed
    # @param dimensions [Integer, nil] The number of dimensions for the output embeddings
    # @return [Hash] The parsed API response
    # @raise [APIError] on HTTP errors, invalid JSON, or exhausted retries
    def embeddings(model:, input:, dimensions: nil)
      body = {
        model: model,
        input: input
      }
      body[:dimensions] = dimensions if dimensions

      post("/embeddings", body)
    end

    private

    # Creates a certificate store loaded from the system default CA paths.
    # No CRL-checking flags are set on the store, which avoids
    # "unable to get certificate CRL" errors on some systems.
    def default_cert_store
      store = OpenSSL::X509::Store.new
      store.set_default_paths
      store
    end

    # Builds a fresh TLS-verifying connection object for one attempt.
    def build_http(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true
      http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      http.cert_store = default_cert_store
      http.open_timeout = @timeout
      http.read_timeout = @timeout
      http
    end

    # Builds the authenticated JSON POST request for the given payload.
    def build_request(uri, payload)
      request = Net::HTTP::Post.new(uri.path)
      request["Content-Type"] = "application/json"
      request["Authorization"] = "Bearer #{@access_token}"
      request.body = payload
      request
    end

    # Computes how long to wait before the next attempt.
    #
    # A numeric Retry-After header (delay in seconds) is honoured. Any other
    # value, including the HTTP-date form of the header, falls back to
    # exponential backoff with jitter instead of being read as zero seconds.
    #
    # @param retries [Integer] number of the retry about to be made (1-based)
    # @param retry_after [String, nil] raw Retry-After header value, if any
    # @return [Float] delay in seconds
    def retry_delay(retries, retry_after = nil)
      header = retry_after.to_s.strip
      return header.to_f if header.match?(/\A\d+(\.\d+)?\z/)

      BASE_RETRY_DELAY * (2**(retries - 1)) + rand * 0.5
    end

    # POSTs the body as JSON to the given API path, retrying on HTTP 429 and
    # on the connection failures listed in RETRYABLE_ERRORS. Both kinds of
    # failure share one retry budget of MAX_RETRIES.
    #
    # @param path [String] endpoint path appended to API_ENDPOINT
    # @param body [Hash] request payload
    # @return [Hash] the parsed response body
    # @raise [APIError] when retries are exhausted or the response is an error
    def post(path, body)
      uri = URI.parse("#{API_ENDPOINT}#{path}")
      payload = body.to_json
      retries = 0

      loop do
        begin
          response = build_http(uri).request(build_request(uri, payload))

          # Rate limiting is handled here; everything else goes to handle_response.
          return handle_response(response) unless response.code.to_i == 429

          retries += 1
          if retries > MAX_RETRIES
            raise APIError.new("Rate limited after #{MAX_RETRIES} retries",
                               status_code: 429, response_body: response.body)
          end
          sleep retry_delay(retries, response["Retry-After"])
        rescue *RETRYABLE_ERRORS => e
          retries += 1
          raise APIError.new("Network error after #{MAX_RETRIES} retries: #{e.message}") if retries > MAX_RETRIES

          sleep retry_delay(retries)
        end
      end
    end

    # Extracts a readable message from an error payload.
    # Accepts both { "error" => { "message" => "..." } } and
    # { "error" => "..." }; any other shape yields the fallback.
    def error_message(body, fallback)
      error = body.is_a?(Hash) ? body["error"] : nil
      case error
      when Hash then error["message"] || fallback
      when String then error.empty? ? fallback : error
      else fallback
      end
    end

    # Parses the response and maps non-200 statuses to APIError.
    #
    # @param response [Net::HTTPResponse]
    # @return [Hash] the parsed body for a 200 response
    # @raise [APIError] for 4xx/5xx/unexpected statuses or an unparsable body
    def handle_response(response)
      status = response.code.to_i
      # to_s: a nil body must surface as "Invalid JSON response", not a TypeError.
      body = JSON.parse(response.body.to_s)

      case status
      when 200
        body
      when 400..499
        raise APIError.new(error_message(body, "Client error"), status_code: status, response_body: body)
      when 500..599
        raise APIError.new(error_message(body, "Server error"), status_code: status, response_body: body)
      else
        raise APIError.new("Unexpected response: #{response.code}", status_code: status, response_body: body)
      end
    rescue JSON::ParserError
      raise APIError.new("Invalid JSON response", status_code: response.code.to_i, response_body: response.body)
    end
  end
end
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Spacy
  # A helper class for OpenAI API interactions, designed to work with spaCy's
  # linguistic analysis via the block-based {Language#with_openai} API.
  #
  # @example Basic usage with linguistic_summary
  #   nlp = Spacy::Language.new("en_core_web_sm")
  #   nlp.with_openai(model: "gpt-5-mini") do |ai|
  #     doc = nlp.read("Apple Inc. was founded by Steve Jobs.")
  #     ai.chat(system: "Analyze the linguistic data.", user: doc.linguistic_summary)
  #   end
  class OpenAIHelper
    # @return [String] the default model for chat requests
    attr_reader :model

    # Creates a new OpenAIHelper instance.
    #
    # The API key is taken from `access_token:` or, when that is nil or blank,
    # from the OPENAI_API_KEY environment variable. Surrounding whitespace
    # (such as a trailing newline) is stripped, and a blank key is treated
    # the same as a missing one.
    #
    # @param access_token [String, nil] OpenAI API key (defaults to OPENAI_API_KEY env var)
    # @param model [String] the default model for chat requests
    # @param max_completion_tokens [Integer] default maximum tokens in responses
    # @param temperature [Float] default sampling temperature
    # @raise [RuntimeError] if no non-blank API key is available
    def initialize(access_token: nil, model: "gpt-5-mini",
                   max_completion_tokens: 1000, temperature: 0.7)
      @access_token = [access_token, ENV["OPENAI_API_KEY"]]
                      .map { |token| token.to_s.strip }
                      .find { |token| !token.empty? }
      raise "Error: OPENAI_API_KEY is not set" unless @access_token

      @model = model
      @default_max_completion_tokens = max_completion_tokens
      @default_temperature = temperature
      @client = OpenAIClient.new(access_token: @access_token)
    end

    # Sends a chat completion request to OpenAI.
    #
    # Provides convenient `system:` and `user:` keyword arguments as shortcuts
    # for building simple message arrays. For more complex conversations, pass
    # a full `messages:` array directly.
    #
    # API failures are reported on standard output and yield nil rather than
    # raising.
    #
    # @param system [String, nil] system message content (shortcut)
    # @param user [String, nil] user message content (shortcut)
    # @param messages [Array<Hash>, nil] full message array (overrides system:/user:)
    # @param model [String, nil] model override (defaults to instance model)
    # @param max_completion_tokens [Integer, nil] token limit override
    # @param temperature [Float, nil] temperature override
    # @param response_format [Hash, nil] response format (e.g., { type: "json_object" })
    # @param raw [Boolean] if true, returns the full API response Hash instead of text
    # @return [String, Hash, nil] the response text, full response Hash (if raw:), or nil on error
    # @raise [ArgumentError] if no messages can be built from the arguments
    def chat(system: nil, user: nil, messages: nil,
             model: nil, max_completion_tokens: nil,
             temperature: nil, response_format: nil, raw: false)
      msgs = messages || build_messages(system: system, user: user)
      raise ArgumentError, "No messages provided. Use system:/user: or messages:" if msgs.empty?

      response = @client.chat(
        model: model || @model,
        messages: msgs,
        max_completion_tokens: max_completion_tokens || @default_max_completion_tokens,
        temperature: temperature || @default_temperature,
        response_format: response_format
      )

      raw ? response : response.dig("choices", 0, "message", "content")
    rescue OpenAIClient::APIError => e
      puts "Error: OpenAI API call failed - #{e.message}"
      nil
    end

    # Generates text embeddings using OpenAI's embeddings API.
    #
    # API failures are reported on standard output and yield nil rather than
    # raising.
    #
    # @param text [String] the text to embed
    # @param model [String] the embeddings model
    # @param dimensions [Integer, nil] number of dimensions (nil uses model default)
    # @return [Array<Float>, nil] the embedding vector, or nil on error
    def embeddings(text, model: "text-embedding-3-small", dimensions: nil)
      response = @client.embeddings(model: model, input: text, dimensions: dimensions)
      response.dig("data", 0, "embedding")
    rescue OpenAIClient::APIError => e
      puts "Error: OpenAI API call failed - #{e.message}"
      nil
    end

    private

    # Builds a message array from the optional system/user shortcuts,
    # keeping the system message first and skipping whichever is nil.
    def build_messages(system: nil, user: nil)
      msgs = []
      msgs << { role: "system", content: system } if system
      msgs << { role: "user", content: user } if user
      msgs
    end
  end
end
|
data/lib/ruby-spacy/version.rb
CHANGED