carbon_ruby_sdk 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +20 -7
- data/lib/carbon_ruby_sdk/api/integrations_api.rb +8 -8
- data/lib/carbon_ruby_sdk/api/utilities_api.rb +10 -2
- data/lib/carbon_ruby_sdk/models/o_auth_url_request.rb +2 -2
- data/lib/carbon_ruby_sdk/models/sitemap_scrape_request.rb +60 -4
- data/lib/carbon_ruby_sdk/models/sync_files_request.rb +1 -1
- data/lib/carbon_ruby_sdk/models/sync_options.rb +1 -1
- data/lib/carbon_ruby_sdk/models/webscrape_request.rb +32 -4
- data/lib/carbon_ruby_sdk/version.rb +1 -1
- data/spec/models/sitemap_scrape_request_spec.rb +12 -0
- data/spec/models/webscrape_request_spec.rb +6 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: af28a3b256d49d38a6aca558d12a49a6e3f2888587dedeef54311ad7d0bd0ac9
|
|
4
|
+
data.tar.gz: a685b15e3ad3ab32463c4bd03c92432c21bc3c7cf3b8f29bcf340cde6e468377
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 023c5d51386e0e76ecd6954f25c69a8562302f89de2a1a15cd72fbfc0e3491c6129ad496da1ad9dbfd80a0030c30ba501cb2e8552cbdd92c07513a20f0257211
|
|
7
|
+
data.tar.gz: 465fbc642ce7bf817b5c5d647d34c1c963e254a21facef003efc0a60a5074dad6c1b642a1093e3f0e9228505bde79b9f5c55e263827987395734718b019c7a06
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
Connect external data to LLMs, no matter the source.
|
|
8
8
|
|
|
9
|
-
[](https://rubygems.org/gems/carbon_ruby_sdk/versions/0.2.3)
|
|
10
10
|
|
|
11
11
|
</div>
|
|
12
12
|
|
|
@@ -86,7 +86,7 @@ Connect external data to LLMs, no matter the source.
|
|
|
86
86
|
Add to Gemfile:
|
|
87
87
|
|
|
88
88
|
```ruby
|
|
89
|
-
gem 'carbon_ruby_sdk', '~> 0.2.2'
|
|
89
|
+
gem 'carbon_ruby_sdk', '~> 0.2.3'
|
|
90
90
|
```
|
|
91
91
|
|
|
92
92
|
## Getting Started<a id="getting-started"></a>
|
|
@@ -1240,7 +1240,7 @@ result = carbon.integrations.connect_data_source(
|
|
|
1240
1240
|
"prepend_filename_to_chunks" => false,
|
|
1241
1241
|
"sync_files_on_connection" => true,
|
|
1242
1242
|
"set_page_as_boundary" => false,
|
|
1243
|
-
"request_id" => "
|
|
1243
|
+
"request_id" => "652297b9-0f55-46d8-869d-13a36e89e5da",
|
|
1244
1244
|
"enable_file_picker" => true,
|
|
1245
1245
|
"sync_source_items" => true,
|
|
1246
1246
|
"incremental_sync" => false,
|
|
@@ -1459,7 +1459,7 @@ result = carbon.integrations.get_oauth_url(
|
|
|
1459
1459
|
set_page_as_boundary: false,
|
|
1460
1460
|
data_source_id: 1,
|
|
1461
1461
|
connecting_new_account: false,
|
|
1462
|
-
request_id: "
|
|
1462
|
+
request_id: "71f214fa-2155-41cb-9336-9b3070e86897",
|
|
1463
1463
|
use_ocr: false,
|
|
1464
1464
|
parse_pdf_tables_with_ocr: false,
|
|
1465
1465
|
enable_file_picker: true,
|
|
@@ -1519,7 +1519,7 @@ Enable OCR for files that support it. Supported formats: pdf
|
|
|
1519
1519
|
##### parse_pdf_tables_with_ocr: `Boolean`<a id="parse_pdf_tables_with_ocr-boolean"></a>
|
|
1520
1520
|
##### enable_file_picker: `Boolean`<a id="enable_file_picker-boolean"></a>
|
|
1521
1521
|
Enable integration's file picker for sources that support it. Supported sources:
|
|
1522
|
-
|
|
1522
|
+
DROPBOX, GOOGLE_DRIVE, SHAREPOINT, ONEDRIVE, BOX
|
|
1523
1523
|
|
|
1524
1524
|
##### sync_source_items: `Boolean`<a id="sync_source_items-boolean"></a>
|
|
1525
1525
|
Enabling this flag will fetch all available content from the source to be listed
|
|
@@ -1781,7 +1781,7 @@ result = carbon.integrations.sync_confluence(
|
|
|
1781
1781
|
prepend_filename_to_chunks: false,
|
|
1782
1782
|
max_items_per_chunk: 1,
|
|
1783
1783
|
set_page_as_boundary: false,
|
|
1784
|
-
request_id: "
|
|
1784
|
+
request_id: "6136b467-242e-49df-9478-d3e0cfdde299",
|
|
1785
1785
|
use_ocr: false,
|
|
1786
1786
|
parse_pdf_tables_with_ocr: false,
|
|
1787
1787
|
incremental_sync: false,
|
|
@@ -1884,7 +1884,7 @@ result = carbon.integrations.sync_files(
|
|
|
1884
1884
|
prepend_filename_to_chunks: false,
|
|
1885
1885
|
max_items_per_chunk: 1,
|
|
1886
1886
|
set_page_as_boundary: false,
|
|
1887
|
-
request_id: "
|
|
1887
|
+
request_id: "6136b467-242e-49df-9478-d3e0cfdde299",
|
|
1888
1888
|
use_ocr: false,
|
|
1889
1889
|
parse_pdf_tables_with_ocr: false,
|
|
1890
1890
|
incremental_sync: false,
|
|
@@ -2741,6 +2741,8 @@ result = carbon.utilities.scrape_sitemap(
|
|
|
2741
2741
|
css_classes_to_skip: [],
|
|
2742
2742
|
css_selectors_to_skip: [],
|
|
2743
2743
|
embedding_model: "OPENAI",
|
|
2744
|
+
url_paths_to_include: [],
|
|
2745
|
+
url_paths_to_exclude: [],
|
|
2744
2746
|
)
|
|
2745
2747
|
p result
|
|
2746
2748
|
```
|
|
@@ -2760,6 +2762,16 @@ p result
|
|
|
2760
2762
|
##### css_classes_to_skip: Array<`String`><a id="css_classes_to_skip-array"></a>
|
|
2761
2763
|
##### css_selectors_to_skip: Array<`String`><a id="css_selectors_to_skip-array"></a>
|
|
2762
2764
|
##### embedding_model: [`EmbeddingGenerators`](./lib/carbon_ruby_sdk/models/embedding_generators.rb)<a id="embedding_model-embeddinggeneratorslibcarbon_ruby_sdkmodelsembedding_generatorsrb"></a>
|
|
2765
|
+
##### url_paths_to_include: Array<`String`><a id="url_paths_to_include-array"></a>
|
|
2766
|
+
URL subpaths or directories that you want to include. For example if you want to
|
|
2767
|
+
only include URLs that start with /questions in stackoverflow.com, you will add
|
|
2768
|
+
/questions/ in this input
|
|
2769
|
+
|
|
2770
|
+
##### url_paths_to_exclude: Array<`String`><a id="url_paths_to_exclude-array"></a>
|
|
2771
|
+
URL subpaths or directories that you want to exclude. For example if you want to
|
|
2772
|
+
exclude URLs that start with /questions in stackoverflow.com, you will add
|
|
2773
|
+
/questions/ in this input
|
|
2774
|
+
|
|
2763
2775
|
#### 🌐 Endpoint<a id="🌐-endpoint"></a>
|
|
2764
2776
|
|
|
2765
2777
|
`/scrape_sitemap` `POST`
|
|
@@ -2799,6 +2811,7 @@ result = carbon.utilities.scrape_web(
|
|
|
2799
2811
|
"css_classes_to_skip" => [],
|
|
2800
2812
|
"css_selectors_to_skip" => [],
|
|
2801
2813
|
"embedding_model" => "OPENAI",
|
|
2814
|
+
"url_paths_to_include" => [],
|
|
2802
2815
|
}
|
|
2803
2816
|
],
|
|
2804
2817
|
)
|
|
@@ -653,13 +653,13 @@ module Carbon
|
|
|
653
653
|
# @param request_id [String] This request id will be added to all files that get synced using the generated OAuth URL
|
|
654
654
|
# @param use_ocr [Boolean] Enable OCR for files that support it. Supported formats: pdf
|
|
655
655
|
# @param parse_pdf_tables_with_ocr [Boolean]
|
|
656
|
-
# @param enable_file_picker [Boolean] Enable integration's file picker for sources that support it. Supported sources:
|
|
656
|
+
# @param enable_file_picker [Boolean] Enable integration's file picker for sources that support it. Supported sources: DROPBOX, GOOGLE_DRIVE, SHAREPOINT, ONEDRIVE, BOX
|
|
657
657
|
# @param sync_source_items [Boolean] Enabling this flag will fetch all available content from the source to be listed via list items endpoint
|
|
658
658
|
# @param incremental_sync [Boolean] Only sync files if they have not already been synced or if the embedding properties have changed. This flag is currently supported by ONEDRIVE, GOOGLE_DRIVE, BOX, DROPBOX. It will be ignored for other data sources.
|
|
659
659
|
# @param file_sync_config [FileSyncConfigNullable]
|
|
660
660
|
# @param body [OAuthURLRequest]
|
|
661
661
|
# @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
|
|
662
|
-
def get_oauth_url(service:, tags: SENTINEL, scope: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', zendesk_subdomain: SENTINEL, microsoft_tenant: SENTINEL, sharepoint_site_name: SENTINEL, confluence_subdomain: SENTINEL, generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, salesforce_domain: SENTINEL, sync_files_on_connection: true, set_page_as_boundary: false, data_source_id: SENTINEL, connecting_new_account: false, request_id: '
|
|
662
|
+
def get_oauth_url(service:, tags: SENTINEL, scope: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', zendesk_subdomain: SENTINEL, microsoft_tenant: SENTINEL, sharepoint_site_name: SENTINEL, confluence_subdomain: SENTINEL, generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, salesforce_domain: SENTINEL, sync_files_on_connection: true, set_page_as_boundary: false, data_source_id: SENTINEL, connecting_new_account: false, request_id: '71f214fa-2155-41cb-9336-9b3070e86897', use_ocr: false, parse_pdf_tables_with_ocr: false, enable_file_picker: true, sync_source_items: true, incremental_sync: false, file_sync_config: SENTINEL, extra: {})
|
|
663
663
|
_body = {}
|
|
664
664
|
_body[:tags] = tags if tags != SENTINEL
|
|
665
665
|
_body[:scope] = scope if scope != SENTINEL
|
|
@@ -721,13 +721,13 @@ module Carbon
|
|
|
721
721
|
# @param request_id [String] This request id will be added to all files that get synced using the generated OAuth URL
|
|
722
722
|
# @param use_ocr [Boolean] Enable OCR for files that support it. Supported formats: pdf
|
|
723
723
|
# @param parse_pdf_tables_with_ocr [Boolean]
|
|
724
|
-
# @param enable_file_picker [Boolean] Enable integration's file picker for sources that support it. Supported sources:
|
|
724
|
+
# @param enable_file_picker [Boolean] Enable integration's file picker for sources that support it. Supported sources: DROPBOX, GOOGLE_DRIVE, SHAREPOINT, ONEDRIVE, BOX
|
|
725
725
|
# @param sync_source_items [Boolean] Enabling this flag will fetch all available content from the source to be listed via list items endpoint
|
|
726
726
|
# @param incremental_sync [Boolean] Only sync files if they have not already been synced or if the embedding properties have changed. This flag is currently supported by ONEDRIVE, GOOGLE_DRIVE, BOX, DROPBOX. It will be ignored for other data sources.
|
|
727
727
|
# @param file_sync_config [FileSyncConfigNullable]
|
|
728
728
|
# @param body [OAuthURLRequest]
|
|
729
729
|
# @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
|
|
730
|
-
def get_oauth_url_with_http_info(service:, tags: SENTINEL, scope: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', zendesk_subdomain: SENTINEL, microsoft_tenant: SENTINEL, sharepoint_site_name: SENTINEL, confluence_subdomain: SENTINEL, generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, salesforce_domain: SENTINEL, sync_files_on_connection: true, set_page_as_boundary: false, data_source_id: SENTINEL, connecting_new_account: false, request_id: '
|
|
730
|
+
def get_oauth_url_with_http_info(service:, tags: SENTINEL, scope: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', zendesk_subdomain: SENTINEL, microsoft_tenant: SENTINEL, sharepoint_site_name: SENTINEL, confluence_subdomain: SENTINEL, generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, salesforce_domain: SENTINEL, sync_files_on_connection: true, set_page_as_boundary: false, data_source_id: SENTINEL, connecting_new_account: false, request_id: '71f214fa-2155-41cb-9336-9b3070e86897', use_ocr: false, parse_pdf_tables_with_ocr: false, enable_file_picker: true, sync_source_items: true, incremental_sync: false, file_sync_config: SENTINEL, extra: {})
|
|
731
731
|
_body = {}
|
|
732
732
|
_body[:tags] = tags if tags != SENTINEL
|
|
733
733
|
_body[:scope] = scope if scope != SENTINEL
|
|
@@ -1523,7 +1523,7 @@ module Carbon
|
|
|
1523
1523
|
# @param file_sync_config [FileSyncConfigNullable]
|
|
1524
1524
|
# @param body [SyncFilesRequest]
|
|
1525
1525
|
# @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
|
|
1526
|
-
def sync_confluence(data_source_id:, ids:, tags: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, set_page_as_boundary: false, request_id: '
|
|
1526
|
+
def sync_confluence(data_source_id:, ids:, tags: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, set_page_as_boundary: false, request_id: '6136b467-242e-49df-9478-d3e0cfdde299', use_ocr: false, parse_pdf_tables_with_ocr: false, incremental_sync: false, file_sync_config: SENTINEL, extra: {})
|
|
1527
1527
|
_body = {}
|
|
1528
1528
|
_body[:tags] = tags if tags != SENTINEL
|
|
1529
1529
|
_body[:data_source_id] = data_source_id if data_source_id != SENTINEL
|
|
@@ -1571,7 +1571,7 @@ module Carbon
|
|
|
1571
1571
|
# @param file_sync_config [FileSyncConfigNullable]
|
|
1572
1572
|
# @param body [SyncFilesRequest]
|
|
1573
1573
|
# @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
|
|
1574
|
-
def sync_confluence_with_http_info(data_source_id:, ids:, tags: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, set_page_as_boundary: false, request_id: '
|
|
1574
|
+
def sync_confluence_with_http_info(data_source_id:, ids:, tags: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, set_page_as_boundary: false, request_id: '6136b467-242e-49df-9478-d3e0cfdde299', use_ocr: false, parse_pdf_tables_with_ocr: false, incremental_sync: false, file_sync_config: SENTINEL, extra: {})
|
|
1575
1575
|
_body = {}
|
|
1576
1576
|
_body[:tags] = tags if tags != SENTINEL
|
|
1577
1577
|
_body[:data_source_id] = data_source_id if data_source_id != SENTINEL
|
|
@@ -1779,7 +1779,7 @@ module Carbon
|
|
|
1779
1779
|
# @param file_sync_config [FileSyncConfigNullable]
|
|
1780
1780
|
# @param body [SyncFilesRequest]
|
|
1781
1781
|
# @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
|
|
1782
|
-
def sync_files(data_source_id:, ids:, tags: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, set_page_as_boundary: false, request_id: '
|
|
1782
|
+
def sync_files(data_source_id:, ids:, tags: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, set_page_as_boundary: false, request_id: '6136b467-242e-49df-9478-d3e0cfdde299', use_ocr: false, parse_pdf_tables_with_ocr: false, incremental_sync: false, file_sync_config: SENTINEL, extra: {})
|
|
1783
1783
|
_body = {}
|
|
1784
1784
|
_body[:tags] = tags if tags != SENTINEL
|
|
1785
1785
|
_body[:data_source_id] = data_source_id if data_source_id != SENTINEL
|
|
@@ -1827,7 +1827,7 @@ module Carbon
|
|
|
1827
1827
|
# @param file_sync_config [FileSyncConfigNullable]
|
|
1828
1828
|
# @param body [SyncFilesRequest]
|
|
1829
1829
|
# @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
|
|
1830
|
-
def sync_files_with_http_info(data_source_id:, ids:, tags: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, set_page_as_boundary: false, request_id: '
|
|
1830
|
+
def sync_files_with_http_info(data_source_id:, ids:, tags: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, embedding_model: 'OPENAI', generate_sparse_vectors: false, prepend_filename_to_chunks: false, max_items_per_chunk: SENTINEL, set_page_as_boundary: false, request_id: '6136b467-242e-49df-9478-d3e0cfdde299', use_ocr: false, parse_pdf_tables_with_ocr: false, incremental_sync: false, file_sync_config: SENTINEL, extra: {})
|
|
1831
1831
|
_body = {}
|
|
1832
1832
|
_body[:tags] = tags if tags != SENTINEL
|
|
1833
1833
|
_body[:data_source_id] = data_source_id if data_source_id != SENTINEL
|
|
@@ -342,9 +342,11 @@ module Carbon
|
|
|
342
342
|
# @param css_classes_to_skip [Array<String>]
|
|
343
343
|
# @param css_selectors_to_skip [Array<String>]
|
|
344
344
|
# @param embedding_model [EmbeddingGenerators]
|
|
345
|
+
# @param url_paths_to_include [Array<String>] URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
|
346
|
+
# @param url_paths_to_exclude [Array<String>] URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
|
345
347
|
# @param body [SitemapScrapeRequest]
|
|
346
348
|
# @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
|
|
347
|
-
def scrape_sitemap(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', extra: {})
|
|
349
|
+
def scrape_sitemap(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, extra: {})
|
|
348
350
|
_body = {}
|
|
349
351
|
_body[:tags] = tags if tags != SENTINEL
|
|
350
352
|
_body[:url] = url if url != SENTINEL
|
|
@@ -359,6 +361,8 @@ module Carbon
|
|
|
359
361
|
_body[:css_classes_to_skip] = css_classes_to_skip if css_classes_to_skip != SENTINEL
|
|
360
362
|
_body[:css_selectors_to_skip] = css_selectors_to_skip if css_selectors_to_skip != SENTINEL
|
|
361
363
|
_body[:embedding_model] = embedding_model if embedding_model != SENTINEL
|
|
364
|
+
_body[:url_paths_to_include] = url_paths_to_include if url_paths_to_include != SENTINEL
|
|
365
|
+
_body[:url_paths_to_exclude] = url_paths_to_exclude if url_paths_to_exclude != SENTINEL
|
|
362
366
|
sitemap_scrape_request = _body
|
|
363
367
|
api_response = scrape_sitemap_with_http_info_impl(sitemap_scrape_request, extra)
|
|
364
368
|
api_response.data
|
|
@@ -387,9 +391,11 @@ module Carbon
|
|
|
387
391
|
# @param css_classes_to_skip [Array<String>]
|
|
388
392
|
# @param css_selectors_to_skip [Array<String>]
|
|
389
393
|
# @param embedding_model [EmbeddingGenerators]
|
|
394
|
+
# @param url_paths_to_include [Array<String>] URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
|
395
|
+
# @param url_paths_to_exclude [Array<String>] URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
|
390
396
|
# @param body [SitemapScrapeRequest]
|
|
391
397
|
# @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
|
|
392
|
-
def scrape_sitemap_with_http_info(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', extra: {})
|
|
398
|
+
def scrape_sitemap_with_http_info(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, extra: {})
|
|
393
399
|
_body = {}
|
|
394
400
|
_body[:tags] = tags if tags != SENTINEL
|
|
395
401
|
_body[:url] = url if url != SENTINEL
|
|
@@ -404,6 +410,8 @@ module Carbon
|
|
|
404
410
|
_body[:css_classes_to_skip] = css_classes_to_skip if css_classes_to_skip != SENTINEL
|
|
405
411
|
_body[:css_selectors_to_skip] = css_selectors_to_skip if css_selectors_to_skip != SENTINEL
|
|
406
412
|
_body[:embedding_model] = embedding_model if embedding_model != SENTINEL
|
|
413
|
+
_body[:url_paths_to_include] = url_paths_to_include if url_paths_to_include != SENTINEL
|
|
414
|
+
_body[:url_paths_to_exclude] = url_paths_to_exclude if url_paths_to_exclude != SENTINEL
|
|
407
415
|
sitemap_scrape_request = _body
|
|
408
416
|
scrape_sitemap_with_http_info_impl(sitemap_scrape_request, extra)
|
|
409
417
|
end
|
|
@@ -61,7 +61,7 @@ module Carbon
|
|
|
61
61
|
|
|
62
62
|
attr_accessor :parse_pdf_tables_with_ocr
|
|
63
63
|
|
|
64
|
-
# Enable integration's file picker for sources that support it. Supported sources:
|
|
64
|
+
# Enable integration's file picker for sources that support it. Supported sources: DROPBOX, GOOGLE_DRIVE, SHAREPOINT, ONEDRIVE, BOX
|
|
65
65
|
attr_accessor :enable_file_picker
|
|
66
66
|
|
|
67
67
|
# Enabling this flag will fetch all available content from the source to be listed via list items endpoint
|
|
@@ -279,7 +279,7 @@ module Carbon
|
|
|
279
279
|
if attributes.key?(:'request_id')
|
|
280
280
|
self.request_id = attributes[:'request_id']
|
|
281
281
|
else
|
|
282
|
-
self.request_id = '
|
|
282
|
+
self.request_id = '71f214fa-2155-41cb-9336-9b3070e86897'
|
|
283
283
|
end
|
|
284
284
|
|
|
285
285
|
if attributes.key?(:'use_ocr')
|
|
@@ -37,6 +37,12 @@ module Carbon
|
|
|
37
37
|
|
|
38
38
|
attr_accessor :embedding_model
|
|
39
39
|
|
|
40
|
+
# URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
|
41
|
+
attr_accessor :url_paths_to_include
|
|
42
|
+
|
|
43
|
+
# URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
|
44
|
+
attr_accessor :url_paths_to_exclude
|
|
45
|
+
|
|
40
46
|
# Attribute mapping from ruby-style variable name to JSON key.
|
|
41
47
|
def self.attribute_map
|
|
42
48
|
{
|
|
@@ -52,7 +58,9 @@ module Carbon
|
|
|
52
58
|
:'html_tags_to_skip' => :'html_tags_to_skip',
|
|
53
59
|
:'css_classes_to_skip' => :'css_classes_to_skip',
|
|
54
60
|
:'css_selectors_to_skip' => :'css_selectors_to_skip',
|
|
55
|
-
:'embedding_model' => :'embedding_model'
|
|
61
|
+
:'embedding_model' => :'embedding_model',
|
|
62
|
+
:'url_paths_to_include' => :'url_paths_to_include',
|
|
63
|
+
:'url_paths_to_exclude' => :'url_paths_to_exclude'
|
|
56
64
|
}
|
|
57
65
|
end
|
|
58
66
|
|
|
@@ -76,7 +84,9 @@ module Carbon
|
|
|
76
84
|
:'html_tags_to_skip' => :'Array<String>',
|
|
77
85
|
:'css_classes_to_skip' => :'Array<String>',
|
|
78
86
|
:'css_selectors_to_skip' => :'Array<String>',
|
|
79
|
-
:'embedding_model' => :'EmbeddingGenerators'
|
|
87
|
+
:'embedding_model' => :'EmbeddingGenerators',
|
|
88
|
+
:'url_paths_to_include' => :'Array<String>',
|
|
89
|
+
:'url_paths_to_exclude' => :'Array<String>'
|
|
80
90
|
}
|
|
81
91
|
end
|
|
82
92
|
|
|
@@ -94,6 +104,8 @@ module Carbon
|
|
|
94
104
|
:'html_tags_to_skip',
|
|
95
105
|
:'css_classes_to_skip',
|
|
96
106
|
:'css_selectors_to_skip',
|
|
107
|
+
:'url_paths_to_include',
|
|
108
|
+
:'url_paths_to_exclude'
|
|
97
109
|
])
|
|
98
110
|
end
|
|
99
111
|
|
|
@@ -185,6 +197,18 @@ module Carbon
|
|
|
185
197
|
else
|
|
186
198
|
self.embedding_model = 'OPENAI'
|
|
187
199
|
end
|
|
200
|
+
|
|
201
|
+
if attributes.key?(:'url_paths_to_include')
|
|
202
|
+
if (value = attributes[:'url_paths_to_include']).is_a?(Array)
|
|
203
|
+
self.url_paths_to_include = value
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
if attributes.key?(:'url_paths_to_exclude')
|
|
208
|
+
if (value = attributes[:'url_paths_to_exclude']).is_a?(Array)
|
|
209
|
+
self.url_paths_to_exclude = value
|
|
210
|
+
end
|
|
211
|
+
end
|
|
188
212
|
end
|
|
189
213
|
|
|
190
214
|
# Show invalid properties with the reasons. Usually used together with valid?
|
|
@@ -199,6 +223,14 @@ module Carbon
|
|
|
199
223
|
invalid_properties.push('invalid value for "max_pages_to_scrape", must be greater than or equal to 1.')
|
|
200
224
|
end
|
|
201
225
|
|
|
226
|
+
if !@url_paths_to_include.nil? && @url_paths_to_include.length > 10
|
|
227
|
+
invalid_properties.push('invalid value for "url_paths_to_include", number of items must be less than or equal to 10.')
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
if !@url_paths_to_exclude.nil? && @url_paths_to_exclude.length > 10
|
|
231
|
+
invalid_properties.push('invalid value for "url_paths_to_exclude", number of items must be less than or equal to 10.')
|
|
232
|
+
end
|
|
233
|
+
|
|
202
234
|
invalid_properties
|
|
203
235
|
end
|
|
204
236
|
|
|
@@ -207,6 +239,8 @@ module Carbon
|
|
|
207
239
|
def valid?
|
|
208
240
|
return false if @url.nil?
|
|
209
241
|
return false if !@max_pages_to_scrape.nil? && @max_pages_to_scrape < 1
|
|
242
|
+
return false if !@url_paths_to_include.nil? && @url_paths_to_include.length > 10
|
|
243
|
+
return false if !@url_paths_to_exclude.nil? && @url_paths_to_exclude.length > 10
|
|
210
244
|
true
|
|
211
245
|
end
|
|
212
246
|
|
|
@@ -220,6 +254,26 @@ module Carbon
|
|
|
220
254
|
@max_pages_to_scrape = max_pages_to_scrape
|
|
221
255
|
end
|
|
222
256
|
|
|
257
|
+
# Custom attribute writer method with validation
|
|
258
|
+
# @param [Object] url_paths_to_include Value to be assigned
|
|
259
|
+
def url_paths_to_include=(url_paths_to_include)
|
|
260
|
+
if !url_paths_to_include.nil? && url_paths_to_include.length > 10
|
|
261
|
+
fail ArgumentError, 'invalid value for "url_paths_to_include", number of items must be less than or equal to 10.'
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
@url_paths_to_include = url_paths_to_include
|
|
265
|
+
end
|
|
266
|
+
|
|
267
|
+
# Custom attribute writer method with validation
|
|
268
|
+
# @param [Object] url_paths_to_exclude Value to be assigned
|
|
269
|
+
def url_paths_to_exclude=(url_paths_to_exclude)
|
|
270
|
+
if !url_paths_to_exclude.nil? && url_paths_to_exclude.length > 10
|
|
271
|
+
fail ArgumentError, 'invalid value for "url_paths_to_exclude", number of items must be less than or equal to 10.'
|
|
272
|
+
end
|
|
273
|
+
|
|
274
|
+
@url_paths_to_exclude = url_paths_to_exclude
|
|
275
|
+
end
|
|
276
|
+
|
|
223
277
|
# Checks equality by comparing each attribute.
|
|
224
278
|
# @param [Object] Object to be compared
|
|
225
279
|
def ==(o)
|
|
@@ -237,7 +291,9 @@ module Carbon
|
|
|
237
291
|
html_tags_to_skip == o.html_tags_to_skip &&
|
|
238
292
|
css_classes_to_skip == o.css_classes_to_skip &&
|
|
239
293
|
css_selectors_to_skip == o.css_selectors_to_skip &&
|
|
240
|
-
embedding_model == o.embedding_model
|
|
294
|
+
embedding_model == o.embedding_model &&
|
|
295
|
+
url_paths_to_include == o.url_paths_to_include &&
|
|
296
|
+
url_paths_to_exclude == o.url_paths_to_exclude
|
|
241
297
|
end
|
|
242
298
|
|
|
243
299
|
# @see the `==` method
|
|
@@ -249,7 +305,7 @@ module Carbon
|
|
|
249
305
|
# Calculates hash code according to all attributes.
|
|
250
306
|
# @return [Integer] Hash code
|
|
251
307
|
def hash
|
|
252
|
-
[tags, url, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model].hash
|
|
308
|
+
[tags, url, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include, url_paths_to_exclude].hash
|
|
253
309
|
end
|
|
254
310
|
|
|
255
311
|
# Builds the object from hash
|
|
@@ -187,7 +187,7 @@ module Carbon
|
|
|
187
187
|
if attributes.key?(:'request_id')
|
|
188
188
|
self.request_id = attributes[:'request_id']
|
|
189
189
|
else
|
|
190
|
-
self.request_id = '
|
|
190
|
+
self.request_id = '6136b467-242e-49df-9478-d3e0cfdde299'
|
|
191
191
|
end
|
|
192
192
|
|
|
193
193
|
if attributes.key?(:'use_ocr')
|
|
@@ -182,7 +182,7 @@ module Carbon
|
|
|
182
182
|
if attributes.key?(:'request_id')
|
|
183
183
|
self.request_id = attributes[:'request_id']
|
|
184
184
|
else
|
|
185
|
-
self.request_id = '
|
|
185
|
+
self.request_id = '652297b9-0f55-46d8-869d-13a36e89e5da'
|
|
186
186
|
end
|
|
187
187
|
|
|
188
188
|
if attributes.key?(:'enable_file_picker')
|
|
@@ -39,6 +39,9 @@ module Carbon
|
|
|
39
39
|
|
|
40
40
|
attr_accessor :embedding_model
|
|
41
41
|
|
|
42
|
+
# URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
|
43
|
+
attr_accessor :url_paths_to_include
|
|
44
|
+
|
|
42
45
|
# Attribute mapping from ruby-style variable name to JSON key.
|
|
43
46
|
def self.attribute_map
|
|
44
47
|
{
|
|
@@ -55,7 +58,8 @@ module Carbon
|
|
|
55
58
|
:'html_tags_to_skip' => :'html_tags_to_skip',
|
|
56
59
|
:'css_classes_to_skip' => :'css_classes_to_skip',
|
|
57
60
|
:'css_selectors_to_skip' => :'css_selectors_to_skip',
|
|
58
|
-
:'embedding_model' => :'embedding_model'
|
|
61
|
+
:'embedding_model' => :'embedding_model',
|
|
62
|
+
:'url_paths_to_include' => :'url_paths_to_include'
|
|
59
63
|
}
|
|
60
64
|
end
|
|
61
65
|
|
|
@@ -80,7 +84,8 @@ module Carbon
|
|
|
80
84
|
:'html_tags_to_skip' => :'Array<String>',
|
|
81
85
|
:'css_classes_to_skip' => :'Array<String>',
|
|
82
86
|
:'css_selectors_to_skip' => :'Array<String>',
|
|
83
|
-
:'embedding_model' => :'EmbeddingGenerators'
|
|
87
|
+
:'embedding_model' => :'EmbeddingGenerators',
|
|
88
|
+
:'url_paths_to_include' => :'Array<String>'
|
|
84
89
|
}
|
|
85
90
|
end
|
|
86
91
|
|
|
@@ -99,6 +104,7 @@ module Carbon
|
|
|
99
104
|
:'html_tags_to_skip',
|
|
100
105
|
:'css_classes_to_skip',
|
|
101
106
|
:'css_selectors_to_skip',
|
|
107
|
+
:'url_paths_to_include'
|
|
102
108
|
])
|
|
103
109
|
end
|
|
104
110
|
|
|
@@ -198,6 +204,12 @@ module Carbon
|
|
|
198
204
|
else
|
|
199
205
|
self.embedding_model = 'OPENAI'
|
|
200
206
|
end
|
|
207
|
+
|
|
208
|
+
if attributes.key?(:'url_paths_to_include')
|
|
209
|
+
if (value = attributes[:'url_paths_to_include']).is_a?(Array)
|
|
210
|
+
self.url_paths_to_include = value
|
|
211
|
+
end
|
|
212
|
+
end
|
|
201
213
|
end
|
|
202
214
|
|
|
203
215
|
# Show invalid properties with the reasons. Usually used together with valid?
|
|
@@ -216,6 +228,10 @@ module Carbon
|
|
|
216
228
|
invalid_properties.push('invalid value for "max_pages_to_scrape", must be greater than or equal to 1.')
|
|
217
229
|
end
|
|
218
230
|
|
|
231
|
+
if !@url_paths_to_include.nil? && @url_paths_to_include.length > 10
|
|
232
|
+
invalid_properties.push('invalid value for "url_paths_to_include", number of items must be less than or equal to 10.')
|
|
233
|
+
end
|
|
234
|
+
|
|
219
235
|
invalid_properties
|
|
220
236
|
end
|
|
221
237
|
|
|
@@ -225,6 +241,7 @@ module Carbon
|
|
|
225
241
|
return false if @url.nil?
|
|
226
242
|
return false if !@recursion_depth.nil? && @recursion_depth < 0
|
|
227
243
|
return false if !@max_pages_to_scrape.nil? && @max_pages_to_scrape < 1
|
|
244
|
+
return false if !@url_paths_to_include.nil? && @url_paths_to_include.length > 10
|
|
228
245
|
true
|
|
229
246
|
end
|
|
230
247
|
|
|
@@ -248,6 +265,16 @@ module Carbon
|
|
|
248
265
|
@max_pages_to_scrape = max_pages_to_scrape
|
|
249
266
|
end
|
|
250
267
|
|
|
268
|
+
# Custom attribute writer method with validation
|
|
269
|
+
# @param [Object] url_paths_to_include Value to be assigned
|
|
270
|
+
def url_paths_to_include=(url_paths_to_include)
|
|
271
|
+
if !url_paths_to_include.nil? && url_paths_to_include.length > 10
|
|
272
|
+
fail ArgumentError, 'invalid value for "url_paths_to_include", number of items must be less than or equal to 10.'
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
@url_paths_to_include = url_paths_to_include
|
|
276
|
+
end
|
|
277
|
+
|
|
251
278
|
# Checks equality by comparing each attribute.
|
|
252
279
|
# @param [Object] Object to be compared
|
|
253
280
|
def ==(o)
|
|
@@ -266,7 +293,8 @@ module Carbon
|
|
|
266
293
|
html_tags_to_skip == o.html_tags_to_skip &&
|
|
267
294
|
css_classes_to_skip == o.css_classes_to_skip &&
|
|
268
295
|
css_selectors_to_skip == o.css_selectors_to_skip &&
|
|
269
|
-
embedding_model == o.embedding_model
|
|
296
|
+
embedding_model == o.embedding_model &&
|
|
297
|
+
url_paths_to_include == o.url_paths_to_include
|
|
270
298
|
end
|
|
271
299
|
|
|
272
300
|
# @see the `==` method
|
|
@@ -278,7 +306,7 @@ module Carbon
|
|
|
278
306
|
# Calculates hash code according to all attributes.
|
|
279
307
|
# @return [Integer] Hash code
|
|
280
308
|
def hash
|
|
281
|
-
[tags, url, recursion_depth, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model].hash
|
|
309
|
+
[tags, url, recursion_depth, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include].hash
|
|
282
310
|
end
|
|
283
311
|
|
|
284
312
|
# Builds the object from hash
|
|
@@ -97,4 +97,16 @@ describe Carbon::SitemapScrapeRequest do
|
|
|
97
97
|
end
|
|
98
98
|
end
|
|
99
99
|
|
|
100
|
+
describe 'test attribute "url_paths_to_include"' do
|
|
101
|
+
it 'should work' do
|
|
102
|
+
# assertion here. ref: https://www.relishapp.com/rspec/rspec-expectations/docs/built-in-matchers
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
describe 'test attribute "url_paths_to_exclude"' do
|
|
107
|
+
it 'should work' do
|
|
108
|
+
# assertion here. ref: https://www.relishapp.com/rspec/rspec-expectations/docs/built-in-matchers
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
|
|
100
112
|
end
|
|
@@ -103,4 +103,10 @@ describe Carbon::WebscrapeRequest do
|
|
|
103
103
|
end
|
|
104
104
|
end
|
|
105
105
|
|
|
106
|
+
describe 'test attribute "url_paths_to_include"' do
|
|
107
|
+
it 'should work' do
|
|
108
|
+
# assertion here. ref: https://www.relishapp.com/rspec/rspec-expectations/docs/built-in-matchers
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
|
|
106
112
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: carbon_ruby_sdk
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.2
|
|
4
|
+
version: 0.2.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Konfig
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2024-06-
|
|
11
|
+
date: 2024-06-05 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: faraday
|