RubyGems - carbon_ruby_sdk - Versions diffs - 0.2.27 → 0.2.28 - Mend

carbon_ruby_sdk 0.2.27 → 0.2.28

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/README.md +9 -2
data/lib/carbon_ruby_sdk/api/utilities_api.rb +6 -2
data/lib/carbon_ruby_sdk/models/sitemap_scrape_request.rb +18 -5
data/lib/carbon_ruby_sdk/models/webscrape_request.rb +18 -5
data/lib/carbon_ruby_sdk/version.rb +1 -1
data/spec/models/sitemap_scrape_request_spec.rb +6 -0
data/spec/models/webscrape_request_spec.rb +6 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: '00969f44903e2698da2cdf106e0051ceeeff09c68652799607ce5e5fae9b7a3a'
-  data.tar.gz: b59ee278e57db1ce4f5003787f146158865a172b782f6fcf73c5881a1c74d3e4
+  metadata.gz: 631a1a24ddb1e9ee7ae6cf1bfe231dabaa4a9236ddb585a1cc234f5ad7a6a738
+  data.tar.gz: 5515b1db4929dc0cfa12e99c1a7a216a969c84d59d70d0ccf410f371f797c0ce
 SHA512:
-  metadata.gz: d8f090188e03d6a29d1cbd2164799b513fd4498c573ad4ab8b3ea4839e448bec35831e0065e4409b77429dc044d76bddc64b2d187147b8c1590e72b33e703ad8
-  data.tar.gz: abfdbe67b5f27b6e015980f2fc633195cf3fd7b6126b9cacc2e65144418d923c6348acf7c40dd52d990f8ebe0224043c836918960144e268872f894eca0d97d8
+  metadata.gz: ec3740b74b7cf918778880dcfdafc8054d8d012c297a18d1c9d127d086b55692d94142bd2a0b20da80616d7b717ae4798dc1b0871a5e9baebbb19bb548a8d7ce
+  data.tar.gz: be240b0e3bc839bf3c741a98a7ced2453601e690c512f7a8ea883f32a797841218e9205498fcb6a0e2dc60a661d6d9eb22401f7792dab5adc82a5fd3c9982874

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    carbon_ruby_sdk (0.2.27)
+    carbon_ruby_sdk (0.2.28)
       faraday (>= 1.0.1, < 3.0)
       faraday-multipart (~> 1.0, >= 1.0.4)

data/README.md CHANGED Viewed

@@ -6,7 +6,7 @@
 Connect external data to LLMs, no matter the source.
-[![npm](https://img.shields.io/badge/gem-v0.2.27-blue)](https://rubygems.org/gems/carbon_ruby_sdk/versions/0.2.27)
+[![npm](https://img.shields.io/badge/gem-v0.2.28-blue)](https://rubygems.org/gems/carbon_ruby_sdk/versions/0.2.28)
 </div>
@@ -93,7 +93,7 @@ Connect external data to LLMs, no matter the source.
 Add to Gemfile:
 ```ruby
-gem 'carbon_ruby_sdk', '~> 0.2.27'
+gem 'carbon_ruby_sdk', '~> 0.2.28'
 ```
 ## Getting Started<a id="getting-started"></a>
@@ -3116,6 +3116,7 @@ result = carbon.utilities.scrape_sitemap(
   url_paths_to_include: [],
   url_paths_to_exclude: [],
   urls_to_scrape: [],
+  download_css_and_media: false,
 )
 p result
 ```
@@ -3150,6 +3151,11 @@ You can submit a subset of URLs from the sitemap that should be scraped. To get
 the list of URLs, you can check out /process_sitemap endpoint. If left empty,
 all URLs from the sitemap will be scraped.
+##### download_css_and_media: `Boolean`<a id="download_css_and_media-boolean"></a>
+Whether the scraper should download css and media from the page (images, fonts,
+etc). Scrapes might take longer to finish with this flag enabled, but the
+success rate is improved.
 #### 🌐 Endpoint<a id="🌐-endpoint"></a>
 `/scrape_sitemap` `POST`
@@ -3190,6 +3196,7 @@ result = carbon.utilities.scrape_web(
             "css_selectors_to_skip" => [],
             "embedding_model" => "OPENAI",
             "url_paths_to_include" => [],
+            "download_css_and_media" => false,
         }
     ],
 )

data/lib/carbon_ruby_sdk/api/utilities_api.rb CHANGED Viewed

@@ -437,9 +437,10 @@ module Carbon
     # @param url_paths_to_include [Array<String>] URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
     # @param url_paths_to_exclude [Array<String>] URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
     # @param urls_to_scrape [Array<String>] You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs, you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.
+    # @param download_css_and_media [Boolean] Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
     # @param body [SitemapScrapeRequest]
     # @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
-    def scrape_sitemap(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, extra: {})
+    def scrape_sitemap(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, download_css_and_media: false, extra: {})
       _body = {}
       _body[:tags] = tags if tags != SENTINEL
       _body[:url] = url if url != SENTINEL
@@ -457,6 +458,7 @@ module Carbon
       _body[:url_paths_to_include] = url_paths_to_include if url_paths_to_include != SENTINEL
       _body[:url_paths_to_exclude] = url_paths_to_exclude if url_paths_to_exclude != SENTINEL
       _body[:urls_to_scrape] = urls_to_scrape if urls_to_scrape != SENTINEL
+      _body[:download_css_and_media] = download_css_and_media if download_css_and_media != SENTINEL
       sitemap_scrape_request = _body
       api_response = scrape_sitemap_with_http_info_impl(sitemap_scrape_request, extra)
       api_response.data
@@ -488,9 +490,10 @@ module Carbon
     # @param url_paths_to_include [Array<String>] URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
     # @param url_paths_to_exclude [Array<String>] URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
     # @param urls_to_scrape [Array<String>] You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs, you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.
+    # @param download_css_and_media [Boolean] Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.
     # @param body [SitemapScrapeRequest]
     # @param [Hash] extra additional parameters to pass along through :header_params, :query_params, or parameter name
-    def scrape_sitemap_with_http_info(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, extra: {})
+    def scrape_sitemap_with_http_info(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: 'OPENAI', url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, download_css_and_media: false, extra: {})
       _body = {}
       _body[:tags] = tags if tags != SENTINEL
       _body[:url] = url if url != SENTINEL
@@ -508,6 +511,7 @@ module Carbon
       _body[:url_paths_to_include] = url_paths_to_include if url_paths_to_include != SENTINEL
       _body[:url_paths_to_exclude] = url_paths_to_exclude if url_paths_to_exclude != SENTINEL
       _body[:urls_to_scrape] = urls_to_scrape if urls_to_scrape != SENTINEL
+      _body[:download_css_and_media] = download_css_and_media if download_css_and_media != SENTINEL
       sitemap_scrape_request = _body
       scrape_sitemap_with_http_info_impl(sitemap_scrape_request, extra)
     end

data/lib/carbon_ruby_sdk/models/sitemap_scrape_request.rb CHANGED Viewed

@@ -46,6 +46,9 @@ module Carbon
     # You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs,           you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.
     attr_accessor :urls_to_scrape
+    # Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes          might take longer to finish with this flag enabled, but the success rate is improved.
+    attr_accessor :download_css_and_media
     # Attribute mapping from ruby-style variable name to JSON key.
     def self.attribute_map
       {
@@ -64,7 +67,8 @@ module Carbon
         :'embedding_model' => :'embedding_model',
         :'url_paths_to_include' => :'url_paths_to_include',
         :'url_paths_to_exclude' => :'url_paths_to_exclude',
-        :'urls_to_scrape' => :'urls_to_scrape'
+        :'urls_to_scrape' => :'urls_to_scrape',
+        :'download_css_and_media' => :'download_css_and_media'
       }
     end
@@ -91,7 +95,8 @@ module Carbon
         :'embedding_model' => :'EmbeddingGenerators',
         :'url_paths_to_include' => :'Array<String>',
         :'url_paths_to_exclude' => :'Array<String>',
-        :'urls_to_scrape' => :'Array<String>'
+        :'urls_to_scrape' => :'Array<String>',
+        :'download_css_and_media' => :'Boolean'
       }
     end
@@ -111,7 +116,8 @@ module Carbon
         :'css_selectors_to_skip',
         :'url_paths_to_include',
         :'url_paths_to_exclude',
-        :'urls_to_scrape'
+        :'urls_to_scrape',
+        :'download_css_and_media'
       ])
     end
@@ -221,6 +227,12 @@ module Carbon
           self.urls_to_scrape = value
         end
       end
+      if attributes.key?(:'download_css_and_media')
+        self.download_css_and_media = attributes[:'download_css_and_media']
+      else
+        self.download_css_and_media = false
+      end
     end
     # Show invalid properties with the reasons. Usually used together with valid?
@@ -306,7 +318,8 @@ module Carbon
           embedding_model == o.embedding_model &&
           url_paths_to_include == o.url_paths_to_include &&
           url_paths_to_exclude == o.url_paths_to_exclude &&
-          urls_to_scrape == o.urls_to_scrape
+          urls_to_scrape == o.urls_to_scrape &&
+          download_css_and_media == o.download_css_and_media
     end
     # @see the `==` method
@@ -318,7 +331,7 @@ module Carbon
     # Calculates hash code according to all attributes.
     # @return [Integer] Hash code
     def hash
-      [tags, url, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include, url_paths_to_exclude, urls_to_scrape].hash
+      [tags, url, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include, url_paths_to_exclude, urls_to_scrape, download_css_and_media].hash
     end
     # Builds the object from hash

data/lib/carbon_ruby_sdk/models/webscrape_request.rb CHANGED Viewed

@@ -42,6 +42,9 @@ module Carbon
     # URL subpaths or directories that you want to include. For example if you want to only include         URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input
     attr_accessor :url_paths_to_include
+    # Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes          might take longer to finish with this flag enabled, but the success rate is improved.
+    attr_accessor :download_css_and_media
     # Attribute mapping from ruby-style variable name to JSON key.
     def self.attribute_map
       {
@@ -59,7 +62,8 @@ module Carbon
         :'css_classes_to_skip' => :'css_classes_to_skip',
         :'css_selectors_to_skip' => :'css_selectors_to_skip',
         :'embedding_model' => :'embedding_model',
-        :'url_paths_to_include' => :'url_paths_to_include'
+        :'url_paths_to_include' => :'url_paths_to_include',
+        :'download_css_and_media' => :'download_css_and_media'
       }
     end
@@ -85,7 +89,8 @@ module Carbon
         :'css_classes_to_skip' => :'Array<String>',
         :'css_selectors_to_skip' => :'Array<String>',
         :'embedding_model' => :'EmbeddingGenerators',
-        :'url_paths_to_include' => :'Array<String>'
+        :'url_paths_to_include' => :'Array<String>',
+        :'download_css_and_media' => :'Boolean'
       }
     end
@@ -104,7 +109,8 @@ module Carbon
         :'html_tags_to_skip',
         :'css_classes_to_skip',
         :'css_selectors_to_skip',
-        :'url_paths_to_include'
+        :'url_paths_to_include',
+        :'download_css_and_media'
       ])
     end
@@ -210,6 +216,12 @@ module Carbon
           self.url_paths_to_include = value
         end
       end
+      if attributes.key?(:'download_css_and_media')
+        self.download_css_and_media = attributes[:'download_css_and_media']
+      else
+        self.download_css_and_media = false
+      end
     end
     # Show invalid properties with the reasons. Usually used together with valid?
@@ -294,7 +306,8 @@ module Carbon
           css_classes_to_skip == o.css_classes_to_skip &&
           css_selectors_to_skip == o.css_selectors_to_skip &&
           embedding_model == o.embedding_model &&
-          url_paths_to_include == o.url_paths_to_include
+          url_paths_to_include == o.url_paths_to_include &&
+          download_css_and_media == o.download_css_and_media
     end
     # @see the `==` method
@@ -306,7 +319,7 @@ module Carbon
     # Calculates hash code according to all attributes.
     # @return [Integer] Hash code
     def hash
-      [tags, url, recursion_depth, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include].hash
+      [tags, url, recursion_depth, max_pages_to_scrape, chunk_size, chunk_overlap, skip_embedding_generation, enable_auto_sync, generate_sparse_vectors, prepend_filename_to_chunks, html_tags_to_skip, css_classes_to_skip, css_selectors_to_skip, embedding_model, url_paths_to_include, download_css_and_media].hash
     end
     # Builds the object from hash

data/lib/carbon_ruby_sdk/version.rb CHANGED Viewed

@@ -7,5 +7,5 @@ The version of the OpenAPI document: 1.0.0
 =end
 module Carbon
-  VERSION = '0.2.27'
+  VERSION = '0.2.28'
 end

data/spec/models/sitemap_scrape_request_spec.rb CHANGED Viewed

@@ -115,4 +115,10 @@ describe Carbon::SitemapScrapeRequest do
     end
   end
+  describe 'test attribute "download_css_and_media"' do
+    it 'should work' do
+      # assertion here. ref: https://www.relishapp.com/rspec/rspec-expectations/docs/built-in-matchers
+    end
+  end
 end

data/spec/models/webscrape_request_spec.rb CHANGED Viewed

@@ -109,4 +109,10 @@ describe Carbon::WebscrapeRequest do
     end
   end
+  describe 'test attribute "download_css_and_media"' do
+    it 'should work' do
+      # assertion here. ref: https://www.relishapp.com/rspec/rspec-expectations/docs/built-in-matchers
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: carbon_ruby_sdk
 version: !ruby/object:Gem::Version
-  version: 0.2.27
+  version: 0.2.28
 platform: ruby
 authors:
 - Konfig
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-08-24 00:00:00.000000000 Z
+date: 2024-08-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: faraday