carbon_ruby_sdk 0.2.27 → 0.2.28
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +9 -2
- data/lib/carbon_ruby_sdk/api/utilities_api.rb +6 -2
- data/lib/carbon_ruby_sdk/models/sitemap_scrape_request.rb +18 -5
- data/lib/carbon_ruby_sdk/models/webscrape_request.rb +18 -5
- data/lib/carbon_ruby_sdk/version.rb +1 -1
- data/spec/models/sitemap_scrape_request_spec.rb +6 -0
- data/spec/models/webscrape_request_spec.rb +6 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 631a1a24ddb1e9ee7ae6cf1bfe231dabaa4a9236ddb585a1cc234f5ad7a6a738
|
4
|
+
data.tar.gz: 5515b1db4929dc0cfa12e99c1a7a216a969c84d59d70d0ccf410f371f797c0ce
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ec3740b74b7cf918778880dcfdafc8054d8d012c297a18d1c9d127d086b55692d94142bd2a0b20da80616d7b717ae4798dc1b0871a5e9baebbb19bb548a8d7ce
|
7
|
+
data.tar.gz: be240b0e3bc839bf3c741a98a7ced2453601e690c512f7a8ea883f32a797841218e9205498fcb6a0e2dc60a661d6d9eb22401f7792dab5adc82a5fd3c9982874
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,7 @@
|
|
6
6
|
|
7
7
|
Connect external data to LLMs, no matter the source.
|
8
8
|
|
9
|
-
[![npm](https://img.shields.io/badge/gem-v0.2.27-blue)](https://rubygems.org/gems/carbon_ruby_sdk/versions/0.2.27)
|
9
|
+
[![npm](https://img.shields.io/badge/gem-v0.2.28-blue)](https://rubygems.org/gems/carbon_ruby_sdk/versions/0.2.28)
|
10
10
|
|
11
11
|
</div>
|
12
12
|
|
@@ -93,7 +93,7 @@ Connect external data to LLMs, no matter the source.
|
|
93
93
|
Add to Gemfile:
|
94
94
|
|
95
95
|
```ruby
|
96
|
-
gem 'carbon_ruby_sdk', '~> 0.2.27'
|
96
|
+
gem 'carbon_ruby_sdk', '~> 0.2.28'
|
97
97
|
```
|
98
98
|
|
99
99
|
## Getting Started<a id="getting-started"></a>
|
@@ -3116,6 +3116,7 @@ result = carbon.utilities.scrape_sitemap(
|
|
3116
3116
|
url_paths_to_include: [],
|
3117
3117
|
url_paths_to_exclude: [],
|
3118
3118
|
urls_to_scrape: [],
|
3119
|
+
download_css_and_media: false,
|
3119
3120
|
)
|
3120
3121
|
p result
|
3121
3122
|
```
|
@@ -3150,6 +3151,11 @@ You can submit a subset of URLs from the sitemap that should be scraped. To get
|
|
3150
3151
|
the list of URLs, you can check out /process_sitemap endpoint. If left empty,
|
3151
3152
|
all URLs from the sitemap will be scraped.
|
3152
3153
|
|
3154
|
+
##### download_css_and_media: `Boolean`<a id="download_css_and_media-boolean"></a>
|
3155
|
+
Whether the scraper should download css and media from the page (images, fonts,
|
3156
|
+
etc). Scrapes might take longer to finish with this flag enabled, but the
|
3157
|
+
success rate is improved.
|
3158
|
+
|
3153
3159
|
#### 🌐 Endpoint<a id="🌐-endpoint"></a>
|
3154
3160
|
|
3155
3161
|
`/scrape_sitemap` `POST`
|
@@ -3190,6 +3196,7 @@ result = carbon.utilities.scrape_web(
|
|
3190
3196
|
"css_selectors_to_skip" => [],
|
3191
3197
|
"embedding_model" => "OPENAI",
|
3192
3198
|
"url_paths_to_include" => [],
|
3199
|
+
"download_css_and_media" => false,
|
3193
3200
|
}
|
3194
3201
|
],
|
3195
3202
|
)
|
@@ -437,9 +437,10 @@ module Carbon
|
|
437
437
|
# @param url_paths_to_include [Array<String>] URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
438
438
|
# @param url_paths_to_exclude [Array<String>] URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
439
439
|
# @param urls_to_scrape [Array<String>] You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs, you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.
|
440
|
+
# @param download_css_and_media [Boolean] Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
|
440
441
|
# @param body [SitemapScrapeRequest]
|
441
442
|
# @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
|
442
|
-
def scrape_sitemap(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, extra: {})
|
443
|
+
def scrape_sitemap(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, download_css_and_media: false, extra: {})
|
443
444
|
_body = {}
|
444
445
|
_body[:tags] = tags if tags != SENTINEL
|
445
446
|
_body[:url] = url if url != SENTINEL
|
@@ -457,6 +458,7 @@ module Carbon
|
|
457
458
|
_body[:url_paths_to_include] = url_paths_to_include if url_paths_to_include != SENTINEL
|
458
459
|
_body[:url_paths_to_exclude] = url_paths_to_exclude if url_paths_to_exclude != SENTINEL
|
459
460
|
_body[:urls_to_scrape] = urls_to_scrape if urls_to_scrape != SENTINEL
|
461
|
+
_body[:download_css_and_media] = download_css_and_media if download_css_and_media != SENTINEL
|
460
462
|
sitemap_scrape_request = _body
|
461
463
|
api_response = scrape_sitemap_with_http_info_impl(sitemap_scrape_request, extra)
|
462
464
|
api_response.data
|
@@ -488,9 +490,10 @@ module Carbon
|
|
488
490
|
# @param url_paths_to_include [Array<String>] URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
489
491
|
# @param url_paths_to_exclude [Array<String>] URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
490
492
|
# @param urls_to_scrape [Array<String>] You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs, you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.
|
493
|
+
# @param download_css_and_media [Boolean] Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
|
491
494
|
# @param body [SitemapScrapeRequest]
|
492
495
|
# @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
|
493
|
-
def scrape_sitemap_with_http_info(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, extra: {})
|
496
|
+
def scrape_sitemap_with_http_info(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, download_css_and_media: false, extra: {})
|
494
497
|
_body = {}
|
495
498
|
_body[:tags] = tags if tags != SENTINEL
|
496
499
|
_body[:url] = url if url != SENTINEL
|
@@ -508,6 +511,7 @@ module Carbon
|
|
508
511
|
_body[:url_paths_to_include] = url_paths_to_include if url_paths_to_include != SENTINEL
|
509
512
|
_body[:url_paths_to_exclude] = url_paths_to_exclude if url_paths_to_exclude != SENTINEL
|
510
513
|
_body[:urls_to_scrape] = urls_to_scrape if urls_to_scrape != SENTINEL
|
514
|
+
_body[:download_css_and_media] = download_css_and_media if download_css_and_media != SENTINEL
|
511
515
|
sitemap_scrape_request = _body
|
512
516
|
scrape_sitemap_with_http_info_impl(sitemap_scrape_request, extra)
|
513
517
|
end
|
@@ -46,6 +46,9 @@ module Carbon
|
|
46
46
|
# You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs, you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.
|
47
47
|
attr_accessor :urls_to_scrape
|
48
48
|
|
49
|
+
# Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
|
50
|
+
attr_accessor :download_css_and_media
|
51
|
+
|
49
52
|
# Attribute mapping from ruby-style variable name to JSON key.
|
50
53
|
def self.attribute_map
|
51
54
|
{
|
@@ -64,7 +67,8 @@ module Carbon
|
|
64
67
|
:'embedding_model' => :'embedding_model',
|
65
68
|
:'url_paths_to_include' => :'url_paths_to_include',
|
66
69
|
:'url_paths_to_exclude' => :'url_paths_to_exclude',
|
67
|
-
:'urls_to_scrape' => :'urls_to_scrape'
|
70
|
+
:'urls_to_scrape' => :'urls_to_scrape',
|
71
|
+
:'download_css_and_media' => :'download_css_and_media'
|
68
72
|
}
|
69
73
|
end
|
70
74
|
|
@@ -91,7 +95,8 @@ module Carbon
|
|
91
95
|
:'embedding_model' => :'EmbeddingGenerators',
|
92
96
|
:'url_paths_to_include' => :'Array<String>',
|
93
97
|
:'url_paths_to_exclude' => :'Array<String>',
|
94
|
-
:'urls_to_scrape' => :'Array<String>'
|
98
|
+
:'urls_to_scrape' => :'Array<String>',
|
99
|
+
:'download_css_and_media' => :'Boolean'
|
95
100
|
}
|
96
101
|
end
|
97
102
|
|
@@ -111,7 +116,8 @@ module Carbon
|
|
111
116
|
:'css_selectors_to_skip',
|
112
117
|
:'url_paths_to_include',
|
113
118
|
:'url_paths_to_exclude',
|
114
|
-
:'urls_to_scrape'
|
119
|
+
:'urls_to_scrape',
|
120
|
+
:'download_css_and_media'
|
115
121
|
])
|
116
122
|
end
|
117
123
|
|
@@ -221,6 +227,12 @@ module Carbon
|
|
221
227
|
self.urls_to_scrape = value
|
222
228
|
end
|
223
229
|
end
|
230
|
+
|
231
|
+
if attributes.key?(:'download_css_and_media')
|
232
|
+
self.download_css_and_media = attributes[:'download_css_and_media']
|
233
|
+
else
|
234
|
+
self.download_css_and_media = false
|
235
|
+
end
|
224
236
|
end
|
225
237
|
|
226
238
|
# Show invalid properties with the reasons. Usually used together with valid?
|
@@ -306,7 +318,8 @@ module Carbon
|
|
306
318
|
embedding_model == o.embedding_model &&
|
307
319
|
url_paths_to_include == o.url_paths_to_include &&
|
308
320
|
url_paths_to_exclude == o.url_paths_to_exclude &&
|
309
|
-
urls_to_scrape == o.urls_to_scrape
|
321
|
+
urls_to_scrape == o.urls_to_scrape &&
|
322
|
+
download_css_and_media == o.download_css_and_media
|
310
323
|
end
|
311
324
|
|
312
325
|
# @see the `==` method
|
@@ -318,7 +331,7 @@ module Carbon
|
|
318
331
|
# Calculates hash code according to all attributes.
|
319
332
|
# @return [Integer] Hash code
|
320
333
|
def hash
|
321
|
-
[tags, url, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include, url_paths_to_exclude, urls_to_scrape].hash
|
334
|
+
[tags, url, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include, url_paths_to_exclude, urls_to_scrape, download_css_and_media].hash
|
322
335
|
end
|
323
336
|
|
324
337
|
# Builds the object from hash
|
@@ -42,6 +42,9 @@ module Carbon
|
|
42
42
|
# URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
|
43
43
|
attr_accessor :url_paths_to_include
|
44
44
|
|
45
|
+
# Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
|
46
|
+
attr_accessor :download_css_and_media
|
47
|
+
|
45
48
|
# Attribute mapping from ruby-style variable name to JSON key.
|
46
49
|
def self.attribute_map
|
47
50
|
{
|
@@ -59,7 +62,8 @@ module Carbon
|
|
59
62
|
:'css_classes_to_skip' => :'css_classes_to_skip',
|
60
63
|
:'css_selectors_to_skip' => :'css_selectors_to_skip',
|
61
64
|
:'embedding_model' => :'embedding_model',
|
62
|
-
:'url_paths_to_include' => :'url_paths_to_include'
|
65
|
+
:'url_paths_to_include' => :'url_paths_to_include',
|
66
|
+
:'download_css_and_media' => :'download_css_and_media'
|
63
67
|
}
|
64
68
|
end
|
65
69
|
|
@@ -85,7 +89,8 @@ module Carbon
|
|
85
89
|
:'css_classes_to_skip' => :'Array<String>',
|
86
90
|
:'css_selectors_to_skip' => :'Array<String>',
|
87
91
|
:'embedding_model' => :'EmbeddingGenerators',
|
88
|
-
:'url_paths_to_include' => :'Array<String>'
|
92
|
+
:'url_paths_to_include' => :'Array<String>',
|
93
|
+
:'download_css_and_media' => :'Boolean'
|
89
94
|
}
|
90
95
|
end
|
91
96
|
|
@@ -104,7 +109,8 @@ module Carbon
|
|
104
109
|
:'html_tags_to_skip',
|
105
110
|
:'css_classes_to_skip',
|
106
111
|
:'css_selectors_to_skip',
|
107
|
-
:'url_paths_to_include'
|
112
|
+
:'url_paths_to_include',
|
113
|
+
:'download_css_and_media'
|
108
114
|
])
|
109
115
|
end
|
110
116
|
|
@@ -210,6 +216,12 @@ module Carbon
|
|
210
216
|
self.url_paths_to_include = value
|
211
217
|
end
|
212
218
|
end
|
219
|
+
|
220
|
+
if attributes.key?(:'download_css_and_media')
|
221
|
+
self.download_css_and_media = attributes[:'download_css_and_media']
|
222
|
+
else
|
223
|
+
self.download_css_and_media = false
|
224
|
+
end
|
213
225
|
end
|
214
226
|
|
215
227
|
# Show invalid properties with the reasons. Usually used together with valid?
|
@@ -294,7 +306,8 @@ module Carbon
|
|
294
306
|
css_classes_to_skip == o.css_classes_to_skip &&
|
295
307
|
css_selectors_to_skip == o.css_selectors_to_skip &&
|
296
308
|
embedding_model == o.embedding_model &&
|
297
|
-
url_paths_to_include == o.url_paths_to_include
|
309
|
+
url_paths_to_include == o.url_paths_to_include &&
|
310
|
+
download_css_and_media == o.download_css_and_media
|
298
311
|
end
|
299
312
|
|
300
313
|
# @see the `==` method
|
@@ -306,7 +319,7 @@ module Carbon
|
|
306
319
|
# Calculates hash code according to all attributes.
|
307
320
|
# @return [Integer] Hash code
|
308
321
|
def hash
|
309
|
-
[tags, url, recursion_depth, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include].hash
|
322
|
+
[tags, url, recursion_depth, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include, download_css_and_media].hash
|
310
323
|
end
|
311
324
|
|
312
325
|
# Builds the object from hash
|
@@ -115,4 +115,10 @@ describe Carbon::SitemapScrapeRequest do
|
|
115
115
|
end
|
116
116
|
end
|
117
117
|
|
118
|
+
describe 'test attribute "download_css_and_media"' do
|
119
|
+
it 'should work' do
|
120
|
+
# assertion here. ref: https://www.relishapp.com/rspec/rspec-expectations/docs/built-in-matchers
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
118
124
|
end
|
@@ -109,4 +109,10 @@ describe Carbon::WebscrapeRequest do
|
|
109
109
|
end
|
110
110
|
end
|
111
111
|
|
112
|
+
describe 'test attribute "download_css_and_media"' do
|
113
|
+
it 'should work' do
|
114
|
+
# assertion here. ref: https://www.relishapp.com/rspec/rspec-expectations/docs/built-in-matchers
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
112
118
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: carbon_ruby_sdk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.27
|
4
|
+
version: 0.2.28
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Konfig
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: faraday
|