carbon_ruby_sdk 0.2.24 → 0.2.26
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +8 -8
- data/lib/carbon_ruby_sdk/api/files_api.rb +8 -8
- data/lib/carbon_ruby_sdk/api/integrations_api.rb +4 -4
- data/lib/carbon_ruby_sdk/models/external_data_source_type.rb +49 -0
- data/lib/carbon_ruby_sdk/models/o_auth_url_request.rb +2 -2
- data/lib/carbon_ruby_sdk/version.rb +1 -1
- data/lib/carbon_ruby_sdk.rb +1 -0
- data/spec/api/files_api_spec.rb +2 -2
- data/spec/models/external_data_source_type_spec.rb +22 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c562b3f56aff93a861af81dd88f0793eec36d8d8a9fc82c3f636649795dd50e5
|
4
|
+
data.tar.gz: 2f13ed904f9cfac8e64a28e3334e6436c062b579b729abeea7c66bf983245d94
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 343fc3b835634fb8b21c4673067e63ef7b7912a4ea5afea96bdbefd924e82281a0f70ef283bed37e0ab535b1b1b0a6292aa0da93a1a93bd22770a25bc775f741
|
7
|
+
data.tar.gz: 95ad01e04435a5513e248d35cdd560c2635c604d8e70aa46cd173853afd211b120722e3d9e12138f349a28e0572c7287f24a72fd14a25a7c9ad3560240f33702
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
carbon_ruby_sdk (0.2.24)
|
4
|
+
carbon_ruby_sdk (0.2.26)
|
5
5
|
faraday (>= 1.0.1, < 3.0)
|
6
6
|
faraday-multipart (~> 1.0, >= 1.0.4)
|
7
7
|
|
@@ -52,7 +52,7 @@ GEM
|
|
52
52
|
rspec-mocks (~> 3.13.0)
|
53
53
|
rspec-core (3.13.0)
|
54
54
|
rspec-support (~> 3.13.0)
|
55
|
-
rspec-expectations (3.13.
|
55
|
+
rspec-expectations (3.13.2)
|
56
56
|
diff-lcs (>= 1.2.0, < 2.0)
|
57
57
|
rspec-support (~> 3.13.0)
|
58
58
|
rspec-mocks (3.13.1)
|
data/README.md
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
|
7
7
|
Connect external data to LLMs, no matter the source.
|
8
8
|
|
9
|
-
[![npm](https://img.shields.io/badge/gem-v0.2.24-blue)](https://rubygems.org/gems/carbon_ruby_sdk/versions/0.2.24)
|
9
|
+
[![npm](https://img.shields.io/badge/gem-v0.2.26-blue)](https://rubygems.org/gems/carbon_ruby_sdk/versions/0.2.26)
|
10
10
|
|
11
11
|
</div>
|
12
12
|
|
@@ -93,7 +93,7 @@ Connect external data to LLMs, no matter the source.
|
|
93
93
|
Add to Gemfile:
|
94
94
|
|
95
95
|
```ruby
|
96
|
-
gem 'carbon_ruby_sdk', '~> 0.2.24'
|
96
|
+
gem 'carbon_ruby_sdk', '~> 0.2.26'
|
97
97
|
```
|
98
98
|
|
99
99
|
## Getting Started<a id="getting-started"></a>
|
@@ -1114,7 +1114,7 @@ of all possible query parameters:
|
|
1114
1114
|
- `skip_embedding_generation`: whether or not to skip the generation of chunks and embeddings
|
1115
1115
|
- `set_page_as_boundary`: described above
|
1116
1116
|
- `embedding_model`: the model used to generate embeddings for the document chunks
|
1117
|
-
- `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks
|
1117
|
+
- `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks. Valid for PDFs, JPEGs, and PNGs
|
1118
1118
|
- `generate_sparse_vectors`: whether or not to generate sparse vectors for the file. Required for hybrid search.
|
1119
1119
|
- `prepend_filename_to_chunks`: whether or not to prepend the filename to the chunk text
|
1120
1120
|
|
@@ -1178,8 +1178,8 @@ description route description for more information.
|
|
1178
1178
|
Embedding model that will be used to embed file chunks.
|
1179
1179
|
|
1180
1180
|
##### use_ocr: `Boolean`<a id="use_ocr-boolean"></a>
|
1181
|
-
Whether or not to use OCR when processing files.
|
1182
|
-
documents with tables, images, and/or scanned text.
|
1181
|
+
Whether or not to use OCR when processing files. Valid for PDFs, JPEGs, and
|
1182
|
+
PNGs. Useful for documents with tables, images, and/or scanned text.
|
1183
1183
|
|
1184
1184
|
##### generate_sparse_vectors: `Boolean`<a id="generate_sparse_vectors-boolean"></a>
|
1185
1185
|
Whether or not to generate sparse vectors for the file. This is *required* for
|
@@ -1617,7 +1617,7 @@ success state.
|
|
1617
1617
|
|
1618
1618
|
```ruby
|
1619
1619
|
result = carbon.integrations.get_oauth_url(
|
1620
|
-
service: "
|
1620
|
+
service: "BOX",
|
1621
1621
|
tags: None,
|
1622
1622
|
scope: "string_example",
|
1623
1623
|
chunk_size: 1500,
|
@@ -1657,7 +1657,7 @@ p result
|
|
1657
1657
|
|
1658
1658
|
#### ⚙️ Parameters<a id="⚙️-parameters"></a>
|
1659
1659
|
|
1660
|
-
##### service: [`
|
1660
|
+
##### service: [`ExternalDataSourceType`](./lib/carbon_ruby_sdk/models/external_data_source_type.rb)<a id="service-externaldatasourcetypelibcarbon_ruby_sdkmodelsexternal_data_source_typerb"></a>
|
1661
1661
|
##### tags: `Object`<a id="tags-object"></a>
|
1662
1662
|
##### scope: `String`<a id="scope-string"></a>
|
1663
1663
|
##### chunk_size: `Integer`<a id="chunk_size-integer"></a>
|
@@ -1695,7 +1695,7 @@ This request id will be added to all files that get synced using the generated
|
|
1695
1695
|
OAuth URL
|
1696
1696
|
|
1697
1697
|
##### use_ocr: `Boolean`<a id="use_ocr-boolean"></a>
|
1698
|
-
Enable OCR for files that support it. Supported formats: png, jpg
|
1698
|
+
Enable OCR for files that support it. Supported formats: pdf, png, jpg
|
1699
1699
|
|
1700
1700
|
##### parse_pdf_tables_with_ocr: `Boolean`<a id="parse_pdf_tables_with_ocr-boolean"></a>
|
1701
1701
|
##### enable_file_picker: `Boolean`<a id="enable_file_picker-boolean"></a>
|
@@ -1341,7 +1341,7 @@ module Carbon
|
|
1341
1341
|
# - `skip_embedding_generation`: whether or not to skip the generation of chunks and embeddings
|
1342
1342
|
# - `set_page_as_boundary`: described above
|
1343
1343
|
# - `embedding_model`: the model used to generate embeddings for the document chunks
|
1344
|
-
# - `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks
|
1344
|
+
# - `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks. Valid for PDFs, JPEGs, and PNGs
|
1345
1345
|
# - `generate_sparse_vectors`: whether or not to generate sparse vectors for the file. Required for hybrid search.
|
1346
1346
|
# - `prepend_filename_to_chunks`: whether or not to prepend the filename to the chunk text
|
1347
1347
|
#
|
@@ -1363,7 +1363,7 @@ module Carbon
|
|
1363
1363
|
# @param skip_embedding_generation [Boolean] Flag to control whether or not embeddings should be generated and stored when processing file.
|
1364
1364
|
# @param set_page_as_boundary [Boolean] Flag to control whether or not to set the a page's worth of content as the maximum amount of content that can appear in a chunk. Only valid for PDFs. See description route description for more information.
|
1365
1365
|
# @param embedding_model [EmbeddingModel] Embedding model that will be used to embed file chunks.
|
1366
|
-
# @param use_ocr [Boolean] Whether or not to use OCR when processing files.
|
1366
|
+
# @param use_ocr [Boolean] Whether or not to use OCR when processing files. Valid for PDFs, JPEGs, and PNGs. Useful for documents with tables, images, and/or scanned text.
|
1367
1367
|
# @param generate_sparse_vectors [Boolean] Whether or not to generate sparse vectors for the file. This is *required* for the file to be a candidate for hybrid search.
|
1368
1368
|
# @param prepend_filename_to_chunks [Boolean] Whether or not to prepend the file's name to chunks.
|
1369
1369
|
# @param max_items_per_chunk [Integer] Number of objects per chunk. For csv, tsv, xlsx, and json files only.
|
@@ -1414,7 +1414,7 @@ module Carbon
|
|
1414
1414
|
# - `skip_embedding_generation`: whether or not to skip the generation of chunks and embeddings
|
1415
1415
|
# - `set_page_as_boundary`: described above
|
1416
1416
|
# - `embedding_model`: the model used to generate embeddings for the document chunks
|
1417
|
-
# - `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks
|
1417
|
+
# - `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks. Valid for PDFs, JPEGs, and PNGs
|
1418
1418
|
# - `generate_sparse_vectors`: whether or not to generate sparse vectors for the file. Required for hybrid search.
|
1419
1419
|
# - `prepend_filename_to_chunks`: whether or not to prepend the filename to the chunk text
|
1420
1420
|
#
|
@@ -1436,7 +1436,7 @@ module Carbon
|
|
1436
1436
|
# @param skip_embedding_generation [Boolean] Flag to control whether or not embeddings should be generated and stored when processing file.
|
1437
1437
|
# @param set_page_as_boundary [Boolean] Flag to control whether or not to set the a page's worth of content as the maximum amount of content that can appear in a chunk. Only valid for PDFs. See description route description for more information.
|
1438
1438
|
# @param embedding_model [EmbeddingModel] Embedding model that will be used to embed file chunks.
|
1439
|
-
# @param use_ocr [Boolean] Whether or not to use OCR when processing files.
|
1439
|
+
# @param use_ocr [Boolean] Whether or not to use OCR when processing files. Valid for PDFs, JPEGs, and PNGs. Useful for documents with tables, images, and/or scanned text.
|
1440
1440
|
# @param generate_sparse_vectors [Boolean] Whether or not to generate sparse vectors for the file. This is *required* for the file to be a candidate for hybrid search.
|
1441
1441
|
# @param prepend_filename_to_chunks [Boolean] Whether or not to prepend the file's name to chunks.
|
1442
1442
|
# @param max_items_per_chunk [Integer] Number of objects per chunk. For csv, tsv, xlsx, and json files only.
|
@@ -1475,7 +1475,7 @@ module Carbon
|
|
1475
1475
|
end
|
1476
1476
|
|
1477
1477
|
# Create Upload File
|
1478
|
-
# This endpoint is used to directly upload local files to Carbon. The `POST` request should be a multipart form request. Note that the `set_page_as_boundary` query parameter is applicable only to PDFs for now. When this value is set, PDF chunks are at most one page long. Additional information can be retrieved for each chunk, however, namely the coordinates of the bounding box around the chunk (this can be used for things like text highlighting). Following is a description of all possible query parameters: - `chunk_size`: the chunk size (in tokens) applied when splitting the document - `chunk_overlap`: the chunk overlap (in tokens) applied when splitting the document - `skip_embedding_generation`: whether or not to skip the generation of chunks and embeddings - `set_page_as_boundary`: described above - `embedding_model`: the model used to generate embeddings for the document chunks - `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks
|
1478
|
+
# This endpoint is used to directly upload local files to Carbon. The `POST` request should be a multipart form request. Note that the `set_page_as_boundary` query parameter is applicable only to PDFs for now. When this value is set, PDF chunks are at most one page long. Additional information can be retrieved for each chunk, however, namely the coordinates of the bounding box around the chunk (this can be used for things like text highlighting). Following is a description of all possible query parameters: - `chunk_size`: the chunk size (in tokens) applied when splitting the document - `chunk_overlap`: the chunk overlap (in tokens) applied when splitting the document - `skip_embedding_generation`: whether or not to skip the generation of chunks and embeddings - `set_page_as_boundary`: described above - `embedding_model`: the model used to generate embeddings for the document chunks - `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks. Valid for PDFs, JPEGs, and PNGs - `generate_sparse_vectors`: whether or not to generate sparse vectors for the file. Required for hybrid search. - `prepend_filename_to_chunks`: whether or not to prepend the filename to the chunk text Carbon supports multiple models for use in generating embeddings for files. For images, we support Vertex AI's multimodal model; for text, we support OpenAI's `text-embedding-ada-002` and Cohere's embed-multilingual-v3.0. The model can be specified via the `embedding_model` parameter (in the POST body for `/embeddings`, and a query parameter in `/uploadfile`). If no model is supplied, the `text-embedding-ada-002` is used by default. When performing embedding queries, embeddings from files that used the specified model will be considered in the query. For example, if files A and B have embeddings generated with `OPENAI`, and files C and D have embeddings generated with `COHERE_MULTILINGUAL_V3`, then by default, queries will only consider files A and B. 
If `COHERE_MULTILINGUAL_V3` is specified as the `embedding_model` in `/embeddings`, then only files C and D will be considered. Make sure that the set of all files you want considered for a query have embeddings generated via the same model. For now, **do not** set `VERTEX_MULTIMODAL` as an `embedding_model`. This model is used automatically by Carbon when it detects an image file.
|
1479
1479
|
# @param file [File]
|
1480
1480
|
# @param body_create_upload_file_uploadfile_post [BodyCreateUploadFileUploadfilePost]
|
1481
1481
|
# @param [Hash] opts the optional parameters
|
@@ -1484,7 +1484,7 @@ module Carbon
|
|
1484
1484
|
# @option opts [Boolean] :skip_embedding_generation Flag to control whether or not embeddings should be generated and stored when processing file. (default to false)
|
1485
1485
|
# @option opts [Boolean] :set_page_as_boundary Flag to control whether or not to set the a page's worth of content as the maximum amount of content that can appear in a chunk. Only valid for PDFs. See description route description for more information. (default to false)
|
1486
1486
|
# @option opts [EmbeddingModel] :embedding_model Embedding model that will be used to embed file chunks. (default to 'OPENAI')
|
1487
|
-
# @option opts [Boolean] :use_ocr Whether or not to use OCR when processing files.
|
1487
|
+
# @option opts [Boolean] :use_ocr Whether or not to use OCR when processing files. Valid for PDFs, JPEGs, and PNGs. Useful for documents with tables, images, and/or scanned text. (default to false)
|
1488
1488
|
# @option opts [Boolean] :generate_sparse_vectors Whether or not to generate sparse vectors for the file. This is *required* for the file to be a candidate for hybrid search. (default to false)
|
1489
1489
|
# @option opts [Boolean] :prepend_filename_to_chunks Whether or not to prepend the file's name to chunks. (default to false)
|
1490
1490
|
# @option opts [Integer] :max_items_per_chunk Number of objects per chunk. For csv, tsv, xlsx, and json files only.
|
@@ -1503,7 +1503,7 @@ module Carbon
|
|
1503
1503
|
end
|
1504
1504
|
|
1505
1505
|
# Create Upload File
|
1506
|
-
# This endpoint is used to directly upload local files to Carbon. The `POST` request should be a multipart form request. Note that the `set_page_as_boundary` query parameter is applicable only to PDFs for now. When this value is set, PDF chunks are at most one page long. Additional information can be retrieved for each chunk, however, namely the coordinates of the bounding box around the chunk (this can be used for things like text highlighting). Following is a description of all possible query parameters: - `chunk_size`: the chunk size (in tokens) applied when splitting the document - `chunk_overlap`: the chunk overlap (in tokens) applied when splitting the document - `skip_embedding_generation`: whether or not to skip the generation of chunks and embeddings - `set_page_as_boundary`: described above - `embedding_model`: the model used to generate embeddings for the document chunks - `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks
|
1506
|
+
# This endpoint is used to directly upload local files to Carbon. The `POST` request should be a multipart form request. Note that the `set_page_as_boundary` query parameter is applicable only to PDFs for now. When this value is set, PDF chunks are at most one page long. Additional information can be retrieved for each chunk, however, namely the coordinates of the bounding box around the chunk (this can be used for things like text highlighting). Following is a description of all possible query parameters: - `chunk_size`: the chunk size (in tokens) applied when splitting the document - `chunk_overlap`: the chunk overlap (in tokens) applied when splitting the document - `skip_embedding_generation`: whether or not to skip the generation of chunks and embeddings - `set_page_as_boundary`: described above - `embedding_model`: the model used to generate embeddings for the document chunks - `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks. Valid for PDFs, JPEGs, and PNGs - `generate_sparse_vectors`: whether or not to generate sparse vectors for the file. Required for hybrid search. - `prepend_filename_to_chunks`: whether or not to prepend the filename to the chunk text Carbon supports multiple models for use in generating embeddings for files. For images, we support Vertex AI's multimodal model; for text, we support OpenAI's `text-embedding-ada-002` and Cohere's embed-multilingual-v3.0. The model can be specified via the `embedding_model` parameter (in the POST body for `/embeddings`, and a query parameter in `/uploadfile`). If no model is supplied, the `text-embedding-ada-002` is used by default. When performing embedding queries, embeddings from files that used the specified model will be considered in the query. For example, if files A and B have embeddings generated with `OPENAI`, and files C and D have embeddings generated with `COHERE_MULTILINGUAL_V3`, then by default, queries will only consider files A and B. 
If `COHERE_MULTILINGUAL_V3` is specified as the `embedding_model` in `/embeddings`, then only files C and D will be considered. Make sure that the set of all files you want considered for a query have embeddings generated via the same model. For now, **do not** set `VERTEX_MULTIMODAL` as an `embedding_model`. This model is used automatically by Carbon when it detects an image file.
|
1507
1507
|
# @param file [File]
|
1508
1508
|
# @param body_create_upload_file_uploadfile_post [BodyCreateUploadFileUploadfilePost]
|
1509
1509
|
# @param [Hash] opts the optional parameters
|
@@ -1512,7 +1512,7 @@ module Carbon
|
|
1512
1512
|
# @option opts [Boolean] :skip_embedding_generation Flag to control whether or not embeddings should be generated and stored when processing file. (default to false)
|
1513
1513
|
# @option opts [Boolean] :set_page_as_boundary Flag to control whether or not to set the a page's worth of content as the maximum amount of content that can appear in a chunk. Only valid for PDFs. See description route description for more information. (default to false)
|
1514
1514
|
# @option opts [EmbeddingModel] :embedding_model Embedding model that will be used to embed file chunks. (default to 'OPENAI')
|
1515
|
-
# @option opts [Boolean] :use_ocr Whether or not to use OCR when processing files.
|
1515
|
+
# @option opts [Boolean] :use_ocr Whether or not to use OCR when processing files. Valid for PDFs, JPEGs, and PNGs. Useful for documents with tables, images, and/or scanned text. (default to false)
|
1516
1516
|
# @option opts [Boolean] :generate_sparse_vectors Whether or not to generate sparse vectors for the file. This is *required* for the file to be a candidate for hybrid search. (default to false)
|
1517
1517
|
# @option opts [Boolean] :prepend_filename_to_chunks Whether or not to prepend the file's name to chunks. (default to false)
|
1518
1518
|
# @option opts [Integer] :max_items_per_chunk Number of objects per chunk. For csv, tsv, xlsx, and json files only.
|
@@ -641,7 +641,7 @@ module Carbon
|
|
641
641
|
# - A file syncing URL which skips the OAuth flow if the user already has a valid access token and takes them to the
|
642
642
|
# success state.
|
643
643
|
#
|
644
|
-
# @param service [
|
644
|
+
# @param service [ExternalDataSourceType]
|
645
645
|
# @param tags [Object]
|
646
646
|
# @param scope [String]
|
647
647
|
# @param chunk_size [Integer]
|
@@ -661,7 +661,7 @@ module Carbon
|
|
661
661
|
# @param data_source_id [Integer] Used to specify a data source to sync from if you have multiple connected. It can be skipped if you only have one data source of that type connected or are connecting a new account.
|
662
662
|
# @param connecting_new_account [Boolean] Used to connect a new data source. If not specified, we will attempt to create a sync URL for an existing data source based on type and ID.
|
663
663
|
# @param request_id [String] This request id will be added to all files that get synced using the generated OAuth URL
|
664
|
-
# @param use_ocr [Boolean] Enable OCR for files that support it. Supported formats: png, jpg
|
664
|
+
# @param use_ocr [Boolean] Enable OCR for files that support it. Supported formats: pdf, png, jpg
|
665
665
|
# @param parse_pdf_tables_with_ocr [Boolean]
|
666
666
|
# @param enable_file_picker [Boolean] Enable integration's file picker for sources that support it. Supported sources: BOX, DROPBOX, GOOGLE_DRIVE, ONEDRIVE, SHAREPOINT
|
667
667
|
# @param sync_source_items [Boolean] Enabling this flag will fetch all available content from the source to be listed via list items endpoint
|
@@ -711,7 +711,7 @@ module Carbon
|
|
711
711
|
# - A file syncing URL which skips the OAuth flow if the user already has a valid access token and takes them to the
|
712
712
|
# success state.
|
713
713
|
#
|
714
|
-
# @param service [
|
714
|
+
# @param service [ExternalDataSourceType]
|
715
715
|
# @param tags [Object]
|
716
716
|
# @param scope [String]
|
717
717
|
# @param chunk_size [Integer]
|
@@ -731,7 +731,7 @@ module Carbon
|
|
731
731
|
# @param data_source_id [Integer] Used to specify a data source to sync from if you have multiple connected. It can be skipped if you only have one data source of that type connected or are connecting a new account.
|
732
732
|
# @param connecting_new_account [Boolean] Used to connect a new data source. If not specified, we will attempt to create a sync URL for an existing data source based on type and ID.
|
733
733
|
# @param request_id [String] This request id will be added to all files that get synced using the generated OAuth URL
|
734
|
-
# @param use_ocr [Boolean] Enable OCR for files that support it. Supported formats: png, jpg
|
734
|
+
# @param use_ocr [Boolean] Enable OCR for files that support it. Supported formats: pdf, png, jpg
|
735
735
|
# @param parse_pdf_tables_with_ocr [Boolean]
|
736
736
|
# @param enable_file_picker [Boolean] Enable integration's file picker for sources that support it. Supported sources: BOX, DROPBOX, GOOGLE_DRIVE, ONEDRIVE, SHAREPOINT
|
737
737
|
# @param sync_source_items [Boolean] Enabling this flag will fetch all available content from the source to be listed via list items endpoint
|
@@ -0,0 +1,49 @@
|
|
1
|
+
=begin
|
2
|
+
#Carbon
|
3
|
+
|
4
|
+
#Connect external data to LLMs, no matter the source.
|
5
|
+
|
6
|
+
The version of the OpenAPI document: 1.0.0
|
7
|
+
=end
|
8
|
+
|
9
|
+
require 'date'
|
10
|
+
require 'time'
|
11
|
+
|
12
|
+
module Carbon
|
13
|
+
class ExternalDataSourceType
|
14
|
+
BOX = "BOX".freeze
|
15
|
+
CONFLUENCE = "CONFLUENCE".freeze
|
16
|
+
DROPBOX = "DROPBOX".freeze
|
17
|
+
GMAIL = "GMAIL".freeze
|
18
|
+
GOOGLE_DRIVE = "GOOGLE_DRIVE".freeze
|
19
|
+
GOOGLE_CLOUD_STORAGE = "GOOGLE_CLOUD_STORAGE".freeze
|
20
|
+
INTERCOM = "INTERCOM".freeze
|
21
|
+
NOTION = "NOTION".freeze
|
22
|
+
ONEDRIVE = "ONEDRIVE".freeze
|
23
|
+
OUTLOOK = "OUTLOOK".freeze
|
24
|
+
SALESFORCE = "SALESFORCE".freeze
|
25
|
+
SHAREPOINT = "SHAREPOINT".freeze
|
26
|
+
SLACK = "SLACK".freeze
|
27
|
+
ZENDESK = "ZENDESK".freeze
|
28
|
+
ZOTERO = "ZOTERO".freeze
|
29
|
+
|
30
|
+
def self.all_vars
|
31
|
+
@all_vars ||= [BOX, CONFLUENCE, DROPBOX, GMAIL, GOOGLE_DRIVE, GOOGLE_CLOUD_STORAGE, INTERCOM, NOTION, ONEDRIVE, OUTLOOK, SALESFORCE, SHAREPOINT, SLACK, ZENDESK, ZOTERO].freeze
|
32
|
+
end
|
33
|
+
|
34
|
+
# Builds the enum from string
|
35
|
+
# @param [String] The enum value in the form of the string
|
36
|
+
# @return [String] The enum value
|
37
|
+
def self.build_from_hash(value)
|
38
|
+
new.build_from_hash(value)
|
39
|
+
end
|
40
|
+
|
41
|
+
# Builds the enum from string
|
42
|
+
# @param [String] The enum value in the form of the string
|
43
|
+
# @return [String] The enum value
|
44
|
+
def build_from_hash(value)
|
45
|
+
return value if ExternalDataSourceType.all_vars.include?(value)
|
46
|
+
raise "Invalid ENUM value #{value} for class #ExternalDataSourceType"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
@@ -56,7 +56,7 @@ module Carbon
|
|
56
56
|
# This request id will be added to all files that get synced using the generated OAuth URL
|
57
57
|
attr_accessor :request_id
|
58
58
|
|
59
|
-
# Enable OCR for files that support it. Supported formats: png, jpg
|
59
|
+
# Enable OCR for files that support it. Supported formats: pdf, png, jpg
|
60
60
|
attr_accessor :use_ocr
|
61
61
|
|
62
62
|
attr_accessor :parse_pdf_tables_with_ocr
|
@@ -118,7 +118,7 @@ module Carbon
|
|
118
118
|
{
|
119
119
|
:'tags' => :'Object',
|
120
120
|
:'scope' => :'String',
|
121
|
-
:'service' => :'
|
121
|
+
:'service' => :'ExternalDataSourceType',
|
122
122
|
:'chunk_size' => :'Integer',
|
123
123
|
:'chunk_overlap' => :'Integer',
|
124
124
|
:'skip_embedding_generation' => :'Boolean',
|
data/lib/carbon_ruby_sdk.rb
CHANGED
@@ -50,6 +50,7 @@ require 'carbon_ruby_sdk/models/embeddings_and_chunks_order_by_columns'
|
|
50
50
|
require 'carbon_ruby_sdk/models/embeddings_and_chunks_query_input'
|
51
51
|
require 'carbon_ruby_sdk/models/embeddings_and_chunks_query_input_v2'
|
52
52
|
require 'carbon_ruby_sdk/models/embeddings_and_chunks_response'
|
53
|
+
require 'carbon_ruby_sdk/models/external_data_source_type'
|
53
54
|
require 'carbon_ruby_sdk/models/external_file_sync_statuses'
|
54
55
|
require 'carbon_ruby_sdk/models/external_source_item'
|
55
56
|
require 'carbon_ruby_sdk/models/external_source_items_order_by'
|
data/spec/api/files_api_spec.rb
CHANGED
@@ -165,7 +165,7 @@ describe 'FilesApi' do
|
|
165
165
|
|
166
166
|
# unit tests for upload
|
167
167
|
# Create Upload File
|
168
|
-
# This endpoint is used to directly upload local files to Carbon. The `POST` request should be a multipart form request. Note that the `set_page_as_boundary` query parameter is applicable only to PDFs for now. When this value is set, PDF chunks are at most one page long. Additional information can be retrieved for each chunk, however, namely the coordinates of the bounding box around the chunk (this can be used for things like text highlighting). Following is a description of all possible query parameters: - `chunk_size`: the chunk size (in tokens) applied when splitting the document - `chunk_overlap`: the chunk overlap (in tokens) applied when splitting the document - `skip_embedding_generation`: whether or not to skip the generation of chunks and embeddings - `set_page_as_boundary`: described above - `embedding_model`: the model used to generate embeddings for the document chunks - `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks
|
168
|
+
# This endpoint is used to directly upload local files to Carbon. The `POST` request should be a multipart form request. Note that the `set_page_as_boundary` query parameter is applicable only to PDFs for now. When this value is set, PDF chunks are at most one page long. Additional information can be retrieved for each chunk, however, namely the coordinates of the bounding box around the chunk (this can be used for things like text highlighting). Following is a description of all possible query parameters: - `chunk_size`: the chunk size (in tokens) applied when splitting the document - `chunk_overlap`: the chunk overlap (in tokens) applied when splitting the document - `skip_embedding_generation`: whether or not to skip the generation of chunks and embeddings - `set_page_as_boundary`: described above - `embedding_model`: the model used to generate embeddings for the document chunks - `use_ocr`: whether or not to use OCR as a preprocessing step prior to generating chunks. Valid for PDFs, JPEGs, and PNGs - `generate_sparse_vectors`: whether or not to generate sparse vectors for the file. Required for hybrid search. - `prepend_filename_to_chunks`: whether or not to prepend the filename to the chunk text Carbon supports multiple models for use in generating embeddings for files. For images, we support Vertex AI's multimodal model; for text, we support OpenAI's `text-embedding-ada-002` and Cohere's embed-multilingual-v3.0. The model can be specified via the `embedding_model` parameter (in the POST body for `/embeddings`, and a query parameter in `/uploadfile`). If no model is supplied, the `text-embedding-ada-002` is used by default. When performing embedding queries, embeddings from files that used the specified model will be considered in the query. For example, if files A and B have embeddings generated with `OPENAI`, and files C and D have embeddings generated with `COHERE_MULTILINGUAL_V3`, then by default, queries will only consider files A and B. 
If `COHERE_MULTILINGUAL_V3` is specified as the `embedding_model` in `/embeddings`, then only files C and D will be considered. Make sure that the set of all files you want considered for a query have embeddings generated via the same model. For now, **do not** set `VERTEX_MULTIMODAL` as an `embedding_model`. This model is used automatically by Carbon when it detects an image file.
|
169
169
|
# @param file
|
170
170
|
# @param body_create_upload_file_uploadfile_post
|
171
171
|
# @param [Hash] opts the optional parameters
|
@@ -174,7 +174,7 @@ describe 'FilesApi' do
|
|
174
174
|
# @option opts [Boolean] :skip_embedding_generation Flag to control whether or not embeddings should be generated and stored when processing file.
|
175
175
|
# @option opts [Boolean] :set_page_as_boundary Flag to control whether or not to set the a page's worth of content as the maximum amount of content that can appear in a chunk. Only valid for PDFs. See description route description for more information.
|
176
176
|
# @option opts [EmbeddingModel] :embedding_model Embedding model that will be used to embed file chunks.
|
177
|
-
# @option opts [Boolean] :use_ocr Whether or not to use OCR when processing files.
|
177
|
+
# @option opts [Boolean] :use_ocr Whether or not to use OCR when processing files. Valid for PDFs, JPEGs, and PNGs. Useful for documents with tables, images, and/or scanned text.
|
178
178
|
# @option opts [Boolean] :generate_sparse_vectors Whether or not to generate sparse vectors for the file. This is *required* for the file to be a candidate for hybrid search.
|
179
179
|
# @option opts [Boolean] :prepend_filename_to_chunks Whether or not to prepend the file's name to chunks.
|
180
180
|
# @option opts [Integer] :max_items_per_chunk Number of objects per chunk. For csv, tsv, xlsx, and json files only.
|
@@ -0,0 +1,22 @@
|
|
1
|
+
=begin
|
2
|
+
#Carbon
|
3
|
+
|
4
|
+
#Connect external data to LLMs, no matter the source.
|
5
|
+
|
6
|
+
The version of the OpenAPI document: 1.0.0
|
7
|
+
=end
|
8
|
+
|
9
|
+
require 'spec_helper'
|
10
|
+
require 'json'
|
11
|
+
require 'date'
|
12
|
+
|
13
|
+
# Unit tests for Carbon::ExternalDataSourceType
|
14
|
+
describe Carbon::ExternalDataSourceType do
|
15
|
+
let(:instance) { Carbon::ExternalDataSourceType.new }
|
16
|
+
|
17
|
+
describe 'test an instance of ExternalDataSourceType' do
|
18
|
+
it 'should create an instance of ExternalDataSourceType' do
|
19
|
+
expect(instance).to be_instance_of(Carbon::ExternalDataSourceType)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: carbon_ruby_sdk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.24
|
4
|
+
version: 0.2.26
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Konfig
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: faraday
|
@@ -132,6 +132,7 @@ files:
|
|
132
132
|
- lib/carbon_ruby_sdk/models/embeddings_and_chunks_query_input.rb
|
133
133
|
- lib/carbon_ruby_sdk/models/embeddings_and_chunks_query_input_v2.rb
|
134
134
|
- lib/carbon_ruby_sdk/models/embeddings_and_chunks_response.rb
|
135
|
+
- lib/carbon_ruby_sdk/models/external_data_source_type.rb
|
135
136
|
- lib/carbon_ruby_sdk/models/external_file_sync_statuses.rb
|
136
137
|
- lib/carbon_ruby_sdk/models/external_source_item.rb
|
137
138
|
- lib/carbon_ruby_sdk/models/external_source_items_order_by.rb
|
@@ -305,6 +306,7 @@ files:
|
|
305
306
|
- spec/models/embeddings_and_chunks_query_input_spec.rb
|
306
307
|
- spec/models/embeddings_and_chunks_query_input_v2_spec.rb
|
307
308
|
- spec/models/embeddings_and_chunks_response_spec.rb
|
309
|
+
- spec/models/external_data_source_type_spec.rb
|
308
310
|
- spec/models/external_file_sync_statuses_spec.rb
|
309
311
|
- spec/models/external_source_item_spec.rb
|
310
312
|
- spec/models/external_source_items_order_by_spec.rb
|
@@ -600,6 +602,7 @@ test_files:
|
|
600
602
|
- spec/models/list_users_request_spec.rb
|
601
603
|
- spec/models/gitbook_sync_request_spec.rb
|
602
604
|
- spec/models/http_validation_error_spec.rb
|
605
|
+
- spec/models/external_data_source_type_spec.rb
|
603
606
|
- spec/models/embedding_and_chunk_spec.rb
|
604
607
|
- spec/models/embedding_properties_spec.rb
|
605
608
|
- spec/models/file_sync_config_nullable_spec.rb
|