vector_amp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +201 -0
- data/NOTICE +8 -0
- data/README.md +377 -0
- data/lib/vector_amp/client.rb +61 -0
- data/lib/vector_amp/connections.rb +49 -0
- data/lib/vector_amp/dataset.rb +182 -0
- data/lib/vector_amp/datasets.rb +237 -0
- data/lib/vector_amp/embedding.rb +67 -0
- data/lib/vector_amp/error.rb +17 -0
- data/lib/vector_amp/ingestion.rb +416 -0
- data/lib/vector_amp/intelligence.rb +101 -0
- data/lib/vector_amp/schedules.rb +81 -0
- data/lib/vector_amp/source.rb +366 -0
- data/lib/vector_amp/transport/base.rb +11 -0
- data/lib/vector_amp/transport/http.rb +149 -0
- data/lib/vector_amp/utils.rb +54 -0
- data/lib/vector_amp/version.rb +5 -0
- data/lib/vector_amp.rb +10 -0
- metadata +150 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "utils"
|
|
4
|
+
|
|
5
|
+
module VectorAmp
|
|
6
|
+
# Managed OAuth/credential connections used by ingestion sources.
|
|
7
|
+
#
|
|
8
|
+
# Connections live at the gateway root (`/connections`), not under
|
|
9
|
+
# `/ingestion`. A connection captures a provider authorization (e.g. Google
|
|
10
|
+
# Drive, Confluence) that ingestion sources can then reference via
|
|
11
|
+
# `connection_id` instead of embedding raw credentials.
|
|
12
|
+
class ConnectionsResource
  # Wrap an API transport; every method below issues exactly one request
  # through it and returns whatever the transport returns.
  # @param transport [#request] API transport.
  # @return [ConnectionsResource]
  def initialize(transport)
    @transport = transport
  end

  # List connections, optionally narrowed to a single provider.
  #
  # The query hash is passed through Utils.compact_hash (defined elsewhere;
  # presumably drops nil entries so an omitted provider sends no filter).
  # @param provider [String, nil] optional provider filter (e.g. `google_drive`, `confluence`).
  # @return [Hash] response envelope with `connections`.
  def list(provider: nil)
    filters = Utils.compact_hash(provider: provider)
    @transport.request(:get, "/connections", query: filters)
  end

  # Create a connection and begin the provider authorization flow.
  # @param provider [String] provider identifier (e.g. `google_drive`, `confluence`).
  # @param source_type [String, nil] optional source type the connection will be used with.
  # @return [Hash] created connection with `id`, `provider`, `status`, and `authorization_url`.
  def create(provider, source_type: nil)
    @transport.request(
      :post,
      "/connections",
      body: Utils.compact_hash(provider: provider, source_type: source_type)
    )
  end

  # Fetch a single connection.
  # @param connection_id [String] connection id.
  # @return [Hash] connection resource.
  def get(connection_id)
    @transport.request(:get, connection_path(connection_id))
  end

  # Delete a single connection.
  # @param connection_id [String] connection id.
  # @return [Hash] delete response.
  def delete(connection_id)
    @transport.request(:delete, connection_path(connection_id))
  end

  private

  # Path of one connection; the id is interpolated verbatim (no escaping).
  def connection_path(connection_id)
    "/connections/#{connection_id}"
  end
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module VectorAmp
|
|
4
|
+
# Rich dataset resource returned by DatasetsResource#create/get/list.
|
|
5
|
+
#
|
|
6
|
+
# It keeps the raw API payload while adding convenient instance methods that
|
|
7
|
+
# delegate to the existing service-style APIs.
|
|
8
|
+
class Dataset
  # @return [DatasetsResource] backing dataset service.
  # @return [Client, nil] client that created this object, required for convenience ingestion/ask helpers.
  # @return [String] dataset id.
  # @return [Hash] raw API payload with every top-level key converted to a String.
  attr_reader :service, :client, :id, :data

  alias_method :raw_data, :data

  # @param data [Hash] dataset API payload; `id` or `dataset_id` is required.
  #   Anything that is not a Hash is treated as an empty payload and therefore rejected.
  # @param service [DatasetsResource] backing dataset service.
  # @param client [Client, nil] optional client for convenience helpers.
  # @return [Dataset]
  # @raise [ArgumentError] when no non-empty id can be found in the payload.
  def initialize(data, service:, client: nil)
    @service = service
    @client = client
    @data = normalize_data(data)
    @id = extract_id(@data)
    # nil.to_s is "", so this single check covers both a missing and a blank id.
    raise ArgumentError, "dataset id is required" if @id.to_s.empty?
  end

  # Read a raw dataset field by string or symbol key.
  # @param key [String, Symbol] field name.
  # @return [Object, nil]
  def [](key)
    @data[key.to_s]
  end

  # Fetch a raw dataset field by string or symbol key, with the same
  # default/block semantics as Hash#fetch.
  # @param key [String, Symbol] field name.
  # @return [Object]
  def fetch(key, *args, &block)
    @data.fetch(key.to_s, *args, &block)
  end

  # @param key [String, Symbol] field name.
  # @return [Boolean] whether the raw payload has the field.
  def key?(key)
    @data.key?(key.to_s)
  end
  alias_method :has_key?, :key?

  # @return [Hash] shallow copy of the raw API payload.
  def to_h
    @data.dup
  end
  alias_method :to_hash, :to_h

  # @return [String] class name, id and full payload, for debugging.
  def inspect
    format("#<%s id=%s data=%s>", self.class, id.inspect, @data.inspect)
  end

  # Search this dataset.
  # @param query_text [String, nil] optional text query; alternatively pass `query:` vector/options.
  # @param options [Hash] forwarded to {DatasetsResource#search}.
  # @return [Hash] search response.
  def search(query_text = nil, **options)
    service.search(id, query_text, **options)
  end

  # Insert vectors into this dataset.
  # @param vectors [Array<Hash>] vector records with ids, values, and optional metadata.
  # @return [Hash] insert response.
  def insert(vectors:)
    service.insert(id, vectors: vectors)
  end

  # Embed and insert texts into this dataset; all arguments are forwarded
  # unchanged to the service's `add_texts`.
  # @param texts_arg [Array<String>, nil] positional texts for convenience.
  # @param texts [Array<String>, nil] keyword texts.
  # @param ids [Array<String>, nil] optional ids; generated UUIDs when omitted.
  # @param metadata [Hash, Array<Hash>, nil] metadata applied to all texts or per-text.
  # @return [Hash] insert response.
  def add_texts(texts_arg = nil, texts: nil, ids: nil, metadata: nil)
    service.add_texts(id, texts_arg, texts: texts, ids: ids, metadata: metadata)
  end

  # Delete this dataset.
  # @return [Hash] delete response.
  def delete
    service.delete(id)
  end

  # Fetch stats for this dataset.
  # @return [Hash] dataset statistics.
  def stats
    service.stats(id)
  end

  # List retained source documents for this dataset using cursor pagination.
  # @param limit [Integer, nil] maximum documents to return; defaults to 50.
  # @param cursor [String, nil] cursor from a previous response's `next_cursor`.
  # @param status [String, nil] optional document status filter.
  # @return [Hash] response envelope with `documents` and `next_cursor`.
  def list_documents(limit: 50, cursor: nil, status: nil)
    service.list_documents(id, limit: limit, cursor: cursor, status: status)
  end

  # Download retained original bytes for a dataset source document.
  # @param document_id [String] document id returned by {#list_documents}.
  # @return [String] raw document bytes.
  def download_document(document_id)
    service.download_document(id, document_id)
  end

  # Ask an intelligence question constrained to this dataset.
  # @param query [String] natural-language question.
  # @param options [Hash] forwarded to {Client#ask}; `dataset_id` is always
  #   overwritten with this dataset's id, even when the caller supplies one.
  # @return [Hash] intelligence response.
  # @raise [ArgumentError] when this object was built without a client.
  def ask(query, **options)
    require_client!("ask")
    scoped_options = options.merge(dataset_id: id)
    client.ask(query, **scoped_options)
  end

  # Upload local files by auto-creating a `file_upload` source, initializing
  # presigned uploads, and completing the upload job. The work itself is done
  # by the client's ingestion resource; this method only pins `dataset_id`.
  # @param paths [String, Array<String>] local file paths to upload.
  # @param source_name [String, nil] optional source name; defaults to timestamped Ruby SDK file-upload name.
  # @param description [String, nil] optional source description.
  # @param metadata [Hash] optional source metadata; dataset_id is added automatically.
  # @return [Hash] upload completion/job response.
  # @raise [ArgumentError] when this object was built without a client.
  def ingest_files(paths:, source_name: nil, description: nil, metadata: {})
    require_client!("ingest_files")
    uploader = client.ingestion
    uploader.ingest_files(
      dataset_id: id,
      paths: paths,
      source_name: source_name,
      description: description,
      metadata: metadata
    )
  end

  # Start an ingestion job from an existing source into this dataset.
  # @param source_id [String, Source, Hash, nil] source id or object/hash containing an id.
  # @param source [Source, Hash, nil] alternate source object/hash containing an id;
  #   only consulted when `source_id` is nil/false.
  # @param pipeline_id [String, nil] optional pipeline id.
  # @return [Hash] ingestion job response.
  # @raise [ArgumentError] when there is no client or no usable source id.
  def ingest_source(source_id = nil, source: nil, pipeline_id: nil)
    require_client!("ingest_source")

    resolved_source_id = extract_source_id(source_id || source)
    raise ArgumentError, "source_id is required" if resolved_source_id.to_s.empty?

    client.ingestion.start_job(source_id: resolved_source_id, dataset_id: id, pipeline_id: pipeline_id)
  end

  private

  # Copy a Hash payload with every top-level key stringified; any other input
  # (including nil) becomes an empty Hash. Nested values are not touched.
  def normalize_data(value)
    return {} unless value.is_a?(Hash)

    stringified = {}
    value.each_pair { |name, item| stringified[name.to_s] = item }
    stringified
  end

  # First truthy value among "id", :id, "dataset_id", :dataset_id.
  def extract_id(hash)
    return nil unless hash

    primary = hash["id"] || hash[:id]
    primary || hash["dataset_id"] || hash[:dataset_id]
  end

  # Resolve a source reference: Strings/Symbols are used as-is, objects that
  # respond to `id` contribute that id, anything else is read like a payload hash.
  def extract_source_id(value)
    case value
    when String, Symbol
      value
    else
      value.respond_to?(:id) ? value.id : extract_id(normalize_data(value))
    end
  end

  # Guard for helpers that need the owning client.
  def require_client!(method_name)
    raise ArgumentError, "#{method_name} requires a Dataset created by VectorAmp::Client" unless client
  end
end
|
|
182
|
+
end
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "securerandom"
|
|
4
|
+
require_relative "dataset"
|
|
5
|
+
require_relative "embedding"
|
|
6
|
+
require_relative "utils"
|
|
7
|
+
|
|
8
|
+
module VectorAmp
|
|
9
|
+
# Dataset API resource for create/list/get/search/insert operations.
|
|
10
|
+
class DatasetsResource
  # @param transport [#request] API transport.
  # @param client [Client, nil] optional client attached to returned Dataset objects.
  # @return [DatasetsResource]
  def initialize(transport, client: nil)
    @transport = transport
    @client = client
  end

  # List datasets.
  # @param limit [Integer] page size; defaults to 50.
  # @param offset [Integer] page offset; defaults to 0.
  # @return [Hash] response envelope with `datasets` wrapped as Dataset objects when present.
  def list(limit: 50, offset: 0)
    page = @transport.request(:get, "/datasets", query: { limit: limit, offset: offset })
    wrap_list(page)
  end

  # Fetch a dataset by id.
  # @param dataset_id [String] dataset id.
  # @return [Dataset]
  def get(dataset_id)
    payload = @transport.request(:get, dataset_path(dataset_id))
    wrap_dataset(payload)
  end

  # Delete a dataset by id.
  # @param dataset_id [String] dataset id.
  # @return [Hash] delete response.
  def delete(dataset_id)
    @transport.request(:delete, dataset_path(dataset_id))
  end

  # Fetch dataset statistics.
  # @param dataset_id [String] dataset id.
  # @return [Hash] stats response.
  def stats(dataset_id)
    @transport.request(:get, dataset_path(dataset_id, "/stats"))
  end

  # Create a SABLE dataset. `index_type` is managed by the SDK and always sent as `sable`.
  #
  # Only `name` is required. When `embedding:` is omitted the configuration
  # comes from {Embedding.normalize}, and when `dim:` is omitted the dimension
  # comes from {Embedding.infer_dim}; if that lookup yields nil the call is
  # rejected and the caller must pass `dim:` explicitly.
  #
  # @param name [String] dataset name.
  # @param dim [Integer, nil] embedding/vector dimension; inferred when omitted for known models.
  # @param embedding [Hash, String, nil] embedding configuration; defaults to the managed VectorAmp model.
  # @param metric [String] distance metric; defaults to `cosine`.
  # @param hybrid [Boolean, nil] enable hybrid (dense + sparse) search; sends `hybrid: true`.
  # @param filters [Hash, nil] optional filter schema/config.
  # @param metadata_schema [Hash, nil] optional metadata schema.
  # @param tuning [Hash, nil] optional SABLE tuning parameters.
  # @param metadata [Hash, nil] optional dataset metadata.
  # @return [Dataset] created dataset.
  # @raise [ArgumentError] when `index_type` is supplied, dim cannot be inferred, or unknown options are passed.
  def create(name:, dim: nil, embedding: nil, metric: "cosine", hybrid: nil, filters: nil, metadata_schema: nil, tuning: nil, metadata: nil, **unknown)
    # index_type gets its own message before the generic unknown-option check.
    if [:index_type, "index_type"].any? { |key| unknown.key?(key) }
      raise ArgumentError, "index_type is managed by VectorAmp Ruby SDK and is always 'sable'"
    end
    Utils.ensure_no_unknown!(unknown, "create")

    embedding_config = Embedding.normalize(embedding)
    dimension = dim || Embedding.infer_dim(embedding_config)
    if dimension.nil?
      model = embedding_config[:model] || embedding_config["model"]
      raise ArgumentError, "dim is required for embedding model #{model.inspect}; pass dim: explicitly"
    end

    payload = Utils.compact_hash(
      name: name,
      dim: dimension,
      metric: metric,
      embedding: embedding_config,
      index_type: "sable",
      hybrid: hybrid,
      filters: filters,
      metadata_schema: metadata_schema,
      tuning: tuning,
      metadata: metadata
    )
    created = @transport.request(:post, "/datasets", body: payload)
    wrap_dataset(created)
  end

  # List retained source documents for a dataset using cursor pagination.
  # @param dataset_id [String] dataset id.
  # @param limit [Integer, nil] maximum documents to return; defaults to 50.
  # @param cursor [String, nil] cursor from a previous response's `next_cursor`.
  # @param status [String, nil] optional document status filter, e.g. `ready`.
  # @return [Hash] response envelope with `documents` and `next_cursor`.
  def list_documents(dataset_id, limit: 50, cursor: nil, status: nil)
    paging = Utils.compact_hash(limit: limit, cursor: cursor, status: status)
    @transport.request(:get, dataset_path(dataset_id, "/documents"), query: paging)
  end

  # Download the retained original bytes for a dataset document.
  # The HTTP transport follows redirects so this returns the final raw object bytes.
  # @param dataset_id [String] dataset id.
  # @param document_id [String] document id returned by {#list_documents}.
  # @return [String] raw document bytes.
  def download_document(dataset_id, document_id)
    @transport.request(
      :get,
      dataset_path(dataset_id, "/documents/#{document_id}/download"),
      raw: true,
      headers: { Accept: "*/*" }
    )
  end

  # Search a dataset by text or vector query.
  #
  # Text precedence is `query_text`, then `search_text`, then the positional
  # argument. NOTE(review): a Hash passed positionally is not applied as
  # options; it is merged into the unrecognised options handed to
  # Utils.ensure_no_unknown! (defined elsewhere - presumably raises
  # ArgumentError when any remain; confirm).
  #
  # @param dataset_id [String] dataset id.
  # @param query_text_or_options [String, Hash, nil] text query or legacy options hash.
  # @param query [Array<Numeric>, nil] vector query.
  # @param query_text [String, nil] explicit text query; overrides positional text when provided.
  # @param search_text [String, nil] alias for query_text for one-field hybrid/BM25 search.
  # @param top_k [Integer] number of results; defaults to 10.
  # @param filters [Hash, nil] metadata filters.
  # @param advanced_filters [Hash, nil] advanced filter expression.
  # @param embedding_model [String, nil] optional embedding model override.
  # @param embedding_provider [String, nil] optional embedding provider override.
  # @param nprobe_override [Integer, nil] optional SABLE search probe override.
  # @param rerank_depth_override [Integer, nil] optional rerank depth override.
  # @param hybrid [Boolean, nil] enable hybrid search when supported.
  # @param sparse_query [Hash, nil] sparse query for hybrid search.
  # @param alpha [Numeric, nil] dense/sparse weighting for hybrid search.
  # @param include_embeddings [Boolean, nil] include vector values in results.
  # @param include_documents [Boolean, nil] include document fields in results.
  # @param include_metadata [Boolean, nil] include metadata; API default is true.
  # @param rerank [Boolean, Hash, nil] enable semantic reranking. `true` or `{ enabled: true }` uses vectoramp / VectorAmp-Rerank-v1.
  # @return [Hash] search response.
  # @raise [ArgumentError] when both `query_text` and `search_text` are given.
  def search(dataset_id, query_text_or_options = nil, query: nil, query_text: nil, search_text: nil, top_k: 10, filters: nil, advanced_filters: nil,
             embedding_model: nil, embedding_provider: nil, nprobe_override: nil, rerank_depth_override: nil,
             hybrid: nil, sparse_query: nil, alpha: nil, include_embeddings: nil, include_documents: nil,
             include_metadata: nil, rerank: nil, **unknown)
    extras = unknown
    positional_text = query_text_or_options
    if positional_text.is_a?(Hash)
      extras = positional_text.merge(extras)
      positional_text = nil
    end
    Utils.ensure_no_unknown!(extras, "search")

    if query_text && search_text
      raise ArgumentError, "provide query_text or search_text, not both"
    end

    payload = Utils.compact_hash(
      query: query,
      query_text: query_text || search_text || positional_text,
      top_k: top_k,
      filters: filters,
      advanced_filters: advanced_filters,
      embedding_model: embedding_model,
      embedding_provider: embedding_provider,
      nprobe_override: nprobe_override,
      rerank_depth_override: rerank_depth_override,
      hybrid: hybrid,
      sparse_query: sparse_query,
      alpha: alpha,
      include_embeddings: include_embeddings,
      include_documents: include_documents,
      include_metadata: include_metadata,
      rerank: rerank
    )
    @transport.request(:post, dataset_path(dataset_id, "/search"), body: payload)
  end

  # Insert vectors into a dataset.
  #
  # Vector record ids may be strings or integers. Integer ids are preserved as
  # JSON numbers (not coerced to strings) so the API does not rewrite them.
  # Records are passed through Utils.normalize_vectors before sending.
  # @param dataset_id [String] dataset id.
  # @param vectors [Array<Hash>] vector records with ids, values, and optional metadata.
  # @return [Hash] insert response.
  def insert(dataset_id, vectors:)
    records = Utils.normalize_vectors(vectors)
    @transport.request(:post, dataset_path(dataset_id, "/insert"), body: { vectors: records })
  end

  # Generate embeddings using the dataset embedding configuration.
  # @param dataset_id [String] dataset id.
  # @param text [String, nil] single text to embed.
  # @param texts [Array<String>, nil] multiple texts to embed.
  # @return [Hash] embed response.
  # @raise [ArgumentError] when neither `text` nor `texts` is given.
  def embed(dataset_id, text: nil, texts: nil)
    raise ArgumentError, "provide text or texts" if [text, texts].all?(&:nil?)

    inputs = Utils.compact_hash(text: text, texts: texts)
    @transport.request(:post, dataset_path(dataset_id, "/embed"), body: inputs)
  end

  # Embed and insert texts into a dataset.
  #
  # Accepts a single string or a list of strings. Ids are auto-generated when
  # omitted; supplied ids may be strings or integers, and integer ids are
  # preserved as JSON numbers. The source text is copied into `metadata.text`.
  # Embeddings are read from the `"embeddings"` key of the embed response and
  # paired with the texts by position.
  # @param dataset_id [String] dataset id.
  # @param texts_arg [String, Array<String>, nil] positional text(s) for convenience.
  # @param texts [String, Array<String>, nil] keyword text(s); wins over the positional form.
  # @param ids [Array, nil] optional ids; generated UUIDs when omitted.
  # @param metadata [Hash, Array<Hash>, nil] metadata applied to all texts or per-text.
  # @return [Hash] insert response.
  # @raise [ArgumentError] when there are no texts, or ids/per-text metadata do not match the text count.
  def add_texts(dataset_id, texts_arg = nil, texts: nil, ids: nil, metadata: nil)
    items = Array(texts || texts_arg)
    raise ArgumentError, "texts must not be empty" if items.empty?
    raise ArgumentError, "ids length must match texts length" if ids && ids.length != items.length

    per_item_metadata = metadata.is_a?(Array)
    if per_item_metadata && metadata.length != items.length
      raise ArgumentError, "metadata length must match texts length"
    end

    embeddings = embed(dataset_id, texts: items).fetch("embeddings")
    records = items.map.with_index do |text, position|
      own_metadata = per_item_metadata ? metadata[position] : metadata
      {
        id: ids ? Utils.coerce_vector_id(ids[position]) : SecureRandom.uuid,
        values: embeddings[position],
        metadata: Utils.compact_hash((own_metadata || {}).merge(text: text))
      }
    end
    insert(dataset_id, vectors: records)
  end

  private

  # Path under one dataset; the id is interpolated verbatim (no escaping).
  def dataset_path(dataset_id, suffix = nil)
    "/datasets/#{dataset_id}#{suffix}"
  end

  # Wrap a raw payload in a Dataset bound to this service and its client.
  def wrap_dataset(data)
    Dataset.new(data, service: self, client: @client)
  end

  # Replace the `datasets` array of a list envelope with Dataset objects.
  # Non-Hash responses and envelopes without an Array are returned untouched.
  # The wrapped list is always stored under the String key "datasets".
  def wrap_list(response)
    return response unless response.is_a?(Hash)

    raw_items = response["datasets"] || response[:datasets]
    return response unless raw_items.is_a?(Array)

    wrapped = raw_items.map { |item| wrap_dataset(item) }
    response.merge("datasets" => wrapped)
  end
end
|
|
237
|
+
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module VectorAmp
|
|
4
|
+
# Embedding configuration defaults and helpers.
|
|
5
|
+
#
|
|
6
|
+
# SDK callers rarely need to specify an embedding model. By default datasets use
|
|
7
|
+
# the managed VectorAmp embedding model and the SDK infers the vector dimension.
|
|
8
|
+
module Embedding
  # Default embedding provider used when none is supplied.
  DEFAULT_PROVIDER = "vectoramp"
  # Default embedding model used when none is supplied.
  DEFAULT_MODEL = "VectorAmp-Embedding-4B"

  # Built-in dimension inference, keyed by `[provider, model]` String pairs.
  DIM_TABLE = {
    ["vectoramp", "VectorAmp-Embedding-4B"] => 2560,
    ["openai", "text-embedding-3-small"] => 1536,
    ["openai", "text-embedding-3-large"] => 3072
  }.freeze

  module_function

  # The fully managed default embedding configuration.
  # @return [Hash] `{ provider:, model: }` built from the DEFAULT_* constants.
  def default_embedding
    { provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL }
  end

  # Build an OpenAI embedding configuration.
  # @param size [String, Symbol] `"small"` (text-embedding-3-small) or `"large"` (text-embedding-3-large).
  # @return [Hash] `{ provider: "openai", model: ... }`
  # @raise [ArgumentError] when size is not "small" or "large".
  def openai(size = "small")
    models_by_size = {
      "small" => "text-embedding-3-small",
      "large" => "text-embedding-3-large"
    }
    model = models_by_size.fetch(size.to_s) do
      raise ArgumentError, %(openai size must be "small" or "large", got #{size.inspect})
    end
    { provider: "openai", model: model }
  end

  # Normalize a user-supplied embedding value into a config hash.
  #
  # nil yields {default_embedding}; a String is taken as a model name under
  # the default provider. Both of those results use Symbol keys. A Hash
  # (`{ provider:, model:, secret_ref? }`) is returned as-is - same object,
  # keys untouched - so callers must tolerate Symbol or String keys.
  # @param embedding [Hash, String, nil]
  # @return [Hash] embedding config.
  # @raise [ArgumentError] for any other input type.
  def normalize(embedding)
    return default_embedding if embedding.nil?
    return { provider: DEFAULT_PROVIDER, model: embedding } if embedding.is_a?(String)
    return embedding if embedding.is_a?(Hash)

    raise ArgumentError, "embedding must be a Hash or String"
  end

  # Infer the vector dimension for an embedding config, or nil when unknown.
  # Provider and model are read under Symbol keys first, then String keys,
  # and compared as Strings against DIM_TABLE.
  # @param embedding [Hash] normalized embedding config.
  # @return [Integer, nil] inferred dimension or nil for custom/unknown models.
  def infer_dim(embedding)
    lookup_key = %i[provider model].map do |field|
      (embedding[field] || embedding[field.to_s]).to_s
    end
    DIM_TABLE[lookup_key]
  end
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module VectorAmp
|
|
4
|
+
# Root of the SDK's exception hierarchy.
class Error < StandardError
end

# Error subclass for configuration problems (it is not raised anywhere in this file).
class ConfigurationError < Error
end

# Error carrying the details of an API response alongside its message.
class APIError < Error
  # @return [Object, nil] `status` value given at construction.
  # @return [Object, nil] `body` value given at construction.
  # @return [Object] `headers` value given at construction; `{}` by default.
  attr_reader :status, :body, :headers

  # @param message [String] error message passed to StandardError.
  # @param status [Object, nil] stored as-is and exposed via {#status}.
  # @param body [Object, nil] stored as-is and exposed via {#body}.
  # @param headers [Object] stored as-is and exposed via {#headers}.
  # @return [APIError]
  def initialize(message, status: nil, body: nil, headers: {})
    super(message)
    @status, @body, @headers = status, body, headers
  end
end
|
|
17
|
+
end
|