carbon_ruby_sdk 0.2.27 → 0.2.28

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '00969f44903e2698da2cdf106e0051ceeeff09c68652799607ce5e5fae9b7a3a'
4
- data.tar.gz: b59ee278e57db1ce4f5003787f146158865a172b782f6fcf73c5881a1c74d3e4
3
+ metadata.gz: 631a1a24ddb1e9ee7ae6cf1bfe231dabaa4a9236ddb585a1cc234f5ad7a6a738
4
+ data.tar.gz: 5515b1db4929dc0cfa12e99c1a7a216a969c84d59d70d0ccf410f371f797c0ce
5
5
  SHA512:
6
- metadata.gz: d8f090188e03d6a29d1cbd2164799b513fd4498c573ad4ab8b3ea4839e448bec35831e0065e4409b77429dc044d76bddc64b2d187147b8c1590e72b33e703ad8
7
- data.tar.gz: abfdbe67b5f27b6e015980f2fc633195cf3fd7b6126b9cacc2e65144418d923c6348acf7c40dd52d990f8ebe0224043c836918960144e268872f894eca0d97d8
6
+ metadata.gz: ec3740b74b7cf918778880dcfdafc8054d8d012c297a18d1c9d127d086b55692d94142bd2a0b20da80616d7b717ae4798dc1b0871a5e9baebbb19bb548a8d7ce
7
+ data.tar.gz: be240b0e3bc839bf3c741a98a7ced2453601e690c512f7a8ea883f32a797841218e9205498fcb6a0e2dc60a661d6d9eb22401f7792dab5adc82a5fd3c9982874
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- carbon_ruby_sdk (0.2.27)
4
+ carbon_ruby_sdk (0.2.28)
5
5
  faraday (>= 1.0.1, < 3.0)
6
6
  faraday-multipart (~> 1.0, >= 1.0.4)
7
7
 
data/README.md CHANGED
@@ -6,7 +6,7 @@
6
6
 
7
7
  Connect external data to LLMs, no matter the source.
8
8
 
9
- [![npm](https://img.shields.io/badge/gem-v0.2.27-blue)](https://rubygems.org/gems/carbon_ruby_sdk/versions/0.2.27)
9
+ [![npm](https://img.shields.io/badge/gem-v0.2.28-blue)](https://rubygems.org/gems/carbon_ruby_sdk/versions/0.2.28)
10
10
 
11
11
  </div>
12
12
 
@@ -93,7 +93,7 @@ Connect external data to LLMs, no matter the source.
93
93
  Add to Gemfile:
94
94
 
95
95
  ```ruby
96
- gem 'carbon_ruby_sdk', '~> 0.2.27'
96
+ gem 'carbon_ruby_sdk', '~> 0.2.28'
97
97
  ```
98
98
 
99
99
  ## Getting Started<a id="getting-started"></a>
@@ -3116,6 +3116,7 @@ result = carbon.utilities.scrape_sitemap(
3116
3116
  url_paths_to_include: [],
3117
3117
  url_paths_to_exclude: [],
3118
3118
  urls_to_scrape: [],
3119
+ download_css_and_media: false,
3119
3120
  )
3120
3121
  p result
3121
3122
  ```
@@ -3150,6 +3151,11 @@ You can submit a subset of URLs from the sitemap that should be scraped. To get
3150
3151
  the list of URLs, you can check out /process_sitemap endpoint. If left empty,
3151
3152
  all URLs from the sitemap will be scraped.
3152
3153
 
3154
+ ##### download_css_and_media: `Boolean`<a id="download_css_and_media-boolean"></a>
3155
+ Whether the scraper should download css and media from the page (images, fonts,
3156
+ etc). Scrapes might take longer to finish with this flag enabled, but the
3157
+ success rate is improved.
3158
+
3153
3159
  #### 🌐 Endpoint<a id="🌐-endpoint"></a>
3154
3160
 
3155
3161
  `/scrape_sitemap` `POST`
@@ -3190,6 +3196,7 @@ result = carbon.utilities.scrape_web(
3190
3196
  "css_selectors_to_skip" => [],
3191
3197
  "embedding_model" => "OPENAI",
3192
3198
  "url_paths_to_include" => [],
3199
+ "download_css_and_media" => false,
3193
3200
  }
3194
3201
  ],
3195
3202
  )
@@ -437,9 +437,10 @@ module Carbon
437
437
  # @param url_paths_to_include [Array<String>] URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
438
438
  # @param url_paths_to_exclude [Array<String>] URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
439
439
  # @param urls_to_scrape [Array<String>] You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs, you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.
440
+ # @param download_css_and_media [Boolean] Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
440
441
  # @param body [SitemapScrapeRequest]
441
442
  # @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
442
- def scrape_sitemap(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, extra: {})
443
+ def scrape_sitemap(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, download_css_and_media: false, extra: {})
443
444
  _body = {}
444
445
  _body[:tags] = tags if tags != SENTINEL
445
446
  _body[:url] = url if url != SENTINEL
@@ -457,6 +458,7 @@ module Carbon
457
458
  _body[:url_paths_to_include] = url_paths_to_include if url_paths_to_include != SENTINEL
458
459
  _body[:url_paths_to_exclude] = url_paths_to_exclude if url_paths_to_exclude != SENTINEL
459
460
  _body[:urls_to_scrape] = urls_to_scrape if urls_to_scrape != SENTINEL
461
+ _body[:download_css_and_media] = download_css_and_media if download_css_and_media != SENTINEL
460
462
  sitemap_scrape_request = _body
461
463
  api_response = scrape_sitemap_with_http_info_impl(sitemap_scrape_request, extra)
462
464
  api_response.data
@@ -488,9 +490,10 @@ module Carbon
488
490
  # @param url_paths_to_include [Array<String>] URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
489
491
  # @param url_paths_to_exclude [Array<String>] URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
490
492
  # @param urls_to_scrape [Array<String>] You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs, you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.
493
+ # @param download_css_and_media [Boolean] Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
491
494
  # @param body [SitemapScrapeRequest]
492
495
  # @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
493
- def scrape_sitemap_with_http_info(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, extra: {})
496
+ def scrape_sitemap_with_http_info(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, download_css_and_media: false, extra: {})
494
497
  _body = {}
495
498
  _body[:tags] = tags if tags != SENTINEL
496
499
  _body[:url] = url if url != SENTINEL
@@ -508,6 +511,7 @@ module Carbon
508
511
  _body[:url_paths_to_include] = url_paths_to_include if url_paths_to_include != SENTINEL
509
512
  _body[:url_paths_to_exclude] = url_paths_to_exclude if url_paths_to_exclude != SENTINEL
510
513
  _body[:urls_to_scrape] = urls_to_scrape if urls_to_scrape != SENTINEL
514
+ _body[:download_css_and_media] = download_css_and_media if download_css_and_media != SENTINEL
511
515
  sitemap_scrape_request = _body
512
516
  scrape_sitemap_with_http_info_impl(sitemap_scrape_request, extra)
513
517
  end
@@ -46,6 +46,9 @@ module Carbon
46
46
  # You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs, you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.
47
47
  attr_accessor :urls_to_scrape
48
48
 
49
+ # Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
50
+ attr_accessor :download_css_and_media
51
+
49
52
  # Attribute mapping from ruby-style variable name to JSON key.
50
53
  def self.attribute_map
51
54
  {
@@ -64,7 +67,8 @@ module Carbon
64
67
  :'embedding_model' => :'embedding_model',
65
68
  :'url_paths_to_include' => :'url_paths_to_include',
66
69
  :'url_paths_to_exclude' => :'url_paths_to_exclude',
67
- :'urls_to_scrape' => :'urls_to_scrape'
70
+ :'urls_to_scrape' => :'urls_to_scrape',
71
+ :'download_css_and_media' => :'download_css_and_media'
68
72
  }
69
73
  end
70
74
 
@@ -91,7 +95,8 @@ module Carbon
91
95
  :'embedding_model' => :'EmbeddingGenerators',
92
96
  :'url_paths_to_include' => :'Array<String>',
93
97
  :'url_paths_to_exclude' => :'Array<String>',
94
- :'urls_to_scrape' => :'Array<String>'
98
+ :'urls_to_scrape' => :'Array<String>',
99
+ :'download_css_and_media' => :'Boolean'
95
100
  }
96
101
  end
97
102
 
@@ -111,7 +116,8 @@ module Carbon
111
116
  :'css_selectors_to_skip',
112
117
  :'url_paths_to_include',
113
118
  :'url_paths_to_exclude',
114
- :'urls_to_scrape'
119
+ :'urls_to_scrape',
120
+ :'download_css_and_media'
115
121
  ])
116
122
  end
117
123
 
@@ -221,6 +227,12 @@ module Carbon
221
227
  self.urls_to_scrape = value
222
228
  end
223
229
  end
230
+
231
+ if attributes.key?(:'download_css_and_media')
232
+ self.download_css_and_media = attributes[:'download_css_and_media']
233
+ else
234
+ self.download_css_and_media = false
235
+ end
224
236
  end
225
237
 
226
238
  # Show invalid properties with the reasons. Usually used together with valid?
@@ -306,7 +318,8 @@ module Carbon
306
318
  embedding_model == o.embedding_model &&
307
319
  url_paths_to_include == o.url_paths_to_include &&
308
320
  url_paths_to_exclude == o.url_paths_to_exclude &&
309
- urls_to_scrape == o.urls_to_scrape
321
+ urls_to_scrape == o.urls_to_scrape &&
322
+ download_css_and_media == o.download_css_and_media
310
323
  end
311
324
 
312
325
  # @see the `==` method
@@ -318,7 +331,7 @@ module Carbon
318
331
  # Calculates hash code according to all attributes.
319
332
  # @return [Integer] Hash code
320
333
  def hash
321
- [tags, url, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include, url_paths_to_exclude, urls_to_scrape].hash
334
+ [tags, url, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include, url_paths_to_exclude, urls_to_scrape, download_css_and_media].hash
322
335
  end
323
336
 
324
337
  # Builds the object from hash
@@ -42,6 +42,9 @@ module Carbon
42
42
  # URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
43
43
  attr_accessor :url_paths_to_include
44
44
 
45
+ # Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
46
+ attr_accessor :download_css_and_media
47
+
45
48
  # Attribute mapping from ruby-style variable name to JSON key.
46
49
  def self.attribute_map
47
50
  {
@@ -59,7 +62,8 @@ module Carbon
59
62
  :'css_classes_to_skip' => :'css_classes_to_skip',
60
63
  :'css_selectors_to_skip' => :'css_selectors_to_skip',
61
64
  :'embedding_model' => :'embedding_model',
62
- :'url_paths_to_include' => :'url_paths_to_include'
65
+ :'url_paths_to_include' => :'url_paths_to_include',
66
+ :'download_css_and_media' => :'download_css_and_media'
63
67
  }
64
68
  end
65
69
 
@@ -85,7 +89,8 @@ module Carbon
85
89
  :'css_classes_to_skip' => :'Array<String>',
86
90
  :'css_selectors_to_skip' => :'Array<String>',
87
91
  :'embedding_model' => :'EmbeddingGenerators',
88
- :'url_paths_to_include' => :'Array<String>'
92
+ :'url_paths_to_include' => :'Array<String>',
93
+ :'download_css_and_media' => :'Boolean'
89
94
  }
90
95
  end
91
96
 
@@ -104,7 +109,8 @@ module Carbon
104
109
  :'html_tags_to_skip',
105
110
  :'css_classes_to_skip',
106
111
  :'css_selectors_to_skip',
107
- :'url_paths_to_include'
112
+ :'url_paths_to_include',
113
+ :'download_css_and_media'
108
114
  ])
109
115
  end
110
116
 
@@ -210,6 +216,12 @@ module Carbon
210
216
  self.url_paths_to_include = value
211
217
  end
212
218
  end
219
+
220
+ if attributes.key?(:'download_css_and_media')
221
+ self.download_css_and_media = attributes[:'download_css_and_media']
222
+ else
223
+ self.download_css_and_media = false
224
+ end
213
225
  end
214
226
 
215
227
  # Show invalid properties with the reasons. Usually used together with valid?
@@ -294,7 +306,8 @@ module Carbon
294
306
  css_classes_to_skip == o.css_classes_to_skip &&
295
307
  css_selectors_to_skip == o.css_selectors_to_skip &&
296
308
  embedding_model == o.embedding_model &&
297
- url_paths_to_include == o.url_paths_to_include
309
+ url_paths_to_include == o.url_paths_to_include &&
310
+ download_css_and_media == o.download_css_and_media
298
311
  end
299
312
 
300
313
  # @see the `==` method
@@ -306,7 +319,7 @@ module Carbon
306
319
  # Calculates hash code according to all attributes.
307
320
  # @return [Integer] Hash code
308
321
  def hash
309
- [tags, url, recursion_depth, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include].hash
322
+ [tags, url, recursion_depth, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include, download_css_and_media].hash
310
323
  end
311
324
 
312
325
  # Builds the object from hash
@@ -7,5 +7,5 @@ The version of the OpenAPI document: 1.0.0
7
7
  =end
8
8
 
9
9
  module Carbon
10
- VERSION = '0.2.27'
10
+ VERSION = '0.2.28'
11
11
  end
@@ -115,4 +115,10 @@ describe Carbon::SitemapScrapeRequest do
115
115
  end
116
116
  end
117
117
 
118
+ describe 'test attribute "download_css_and_media"' do
119
+ it 'should work' do
120
+ # assertion here. ref: https://www.relishapp.com/rspec/rspec-expectations/docs/built-in-matchers
121
+ end
122
+ end
123
+
118
124
  end
@@ -109,4 +109,10 @@ describe Carbon::WebscrapeRequest do
109
109
  end
110
110
  end
111
111
 
112
+ describe 'test attribute "download_css_and_media"' do
113
+ it 'should work' do
114
+ # assertion here. ref: https://www.relishapp.com/rspec/rspec-expectations/docs/built-in-matchers
115
+ end
116
+ end
117
+
112
118
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: carbon_ruby_sdk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.27
4
+ version: 0.2.28
5
5
  platform: ruby
6
6
  authors:
7
7
  - Konfig
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-24 00:00:00.000000000 Z
11
+ date: 2024-08-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: faraday