crawlbase 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 50c42144b472e240907828a2656215a7f1e5004f07a3288d3eafca184b4dc16c
+   data.tar.gz: fdbbd3ebe2a64ecde61e34b94bd5716574d04d5f1bffb5e1a9225f6500b0a793
+ SHA512:
+   metadata.gz: c29927980f6cf82b431c7385e78429802916ebe188a40e789ca17e1c5a63d7183f4f8b5b30d8b9b7bd42c84fa6f1223eda48703cf317395e359a422a88cddfee
+   data.tar.gz: db7cf49a5dc174920a76bb35fecb92097b61025e3b2f7fb372a7143282fba93edd748763020001b0cdd928da4500cfe1a948e8123a2232a614cbfb399f3a4843
data/.gitignore ADDED
@@ -0,0 +1,9 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
data/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,74 @@
+ # Contributor Covenant Code of Conduct
+
+ ## Our Pledge
+
+ In the interest of fostering an open and welcoming environment, we as
+ contributors and maintainers pledge to making participation in our project and
+ our community a harassment-free experience for everyone, regardless of age, body
+ size, disability, ethnicity, gender identity and expression, level of experience,
+ nationality, personal appearance, race, religion, or sexual identity and
+ orientation.
+
+ ## Our Standards
+
+ Examples of behavior that contributes to creating a positive environment
+ include:
+
+ * Using welcoming and inclusive language
+ * Being respectful of differing viewpoints and experiences
+ * Gracefully accepting constructive criticism
+ * Focusing on what is best for the community
+ * Showing empathy towards other community members
+
+ Examples of unacceptable behavior by participants include:
+
+ * The use of sexualized language or imagery and unwelcome sexual attention or
+   advances
+ * Trolling, insulting/derogatory comments, and personal or political attacks
+ * Public or private harassment
+ * Publishing others' private information, such as a physical or electronic
+   address, without explicit permission
+ * Other conduct which could reasonably be considered inappropriate in a
+   professional setting
+
+ ## Our Responsibilities
+
+ Project maintainers are responsible for clarifying the standards of acceptable
+ behavior and are expected to take appropriate and fair corrective action in
+ response to any instances of unacceptable behavior.
+
+ Project maintainers have the right and responsibility to remove, edit, or
+ reject comments, commits, code, wiki edits, issues, and other contributions
+ that are not aligned to this Code of Conduct, or to ban temporarily or
+ permanently any contributor for other behaviors that they deem inappropriate,
+ threatening, offensive, or harmful.
+
+ ## Scope
+
+ This Code of Conduct applies both within project spaces and in public spaces
+ when an individual is representing the project or its community. Examples of
+ representing a project or community include using an official project e-mail
+ address, posting via an official social media account, or acting as an appointed
+ representative at an online or offline event. Representation of a project may be
+ further defined and clarified by project maintainers.
+
+ ## Enforcement
+
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
+ reported by contacting the project team at info@crawlbase.com. All
+ complaints will be reviewed and investigated and will result in a response that
+ is deemed necessary and appropriate to the circumstances. The project team is
+ obligated to maintain confidentiality with regard to the reporter of an incident.
+ Further details of specific enforcement policies may be posted separately.
+
+ Project maintainers who do not follow or enforce the Code of Conduct in good
+ faith may face temporary or permanent repercussions as determined by other
+ members of the project's leadership.
+
+ ## Attribution
+
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+ available at [http://contributor-covenant.org/version/1/4][version]
+
+ [homepage]: http://contributor-covenant.org
+ [version]: http://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,3 @@
+ source "https://rubygems.org"
+
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2020 Crawlbase
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,364 @@
+ # Crawlbase
+
+ A dependency-free gem for scraping and crawling websites using the Crawlbase API.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'crawlbase'
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install crawlbase
+
+ ## Crawling API Usage
+
+ Require the gem in your project:
+
+ ```ruby
+ require 'crawlbase'
+ ```
+
+ Initialize the API with one of your account tokens, either the normal or the JavaScript token, then make GET or POST requests accordingly.
+
+ You can get a token for free by [creating a Crawlbase account](https://crawlbase.com/signup), which includes 1,000 free testing requests. You can use them for TCP calls, JavaScript calls, or both.
+
+ ```ruby
+ api = Crawlbase::API.new(token: 'YOUR_TOKEN')
+ ```
+
+ ### GET requests
+
+ Pass the URL that you want to scrape plus any options from the ones available in the [API documentation](https://crawlbase.com/dashboard/docs).
+
+ ```ruby
+ api.get(url, options)
+ ```
+
+ Example:
+
+ ```ruby
+
+ begin
+   response = api.get('https://www.facebook.com/britneyspears')
+   puts response.status_code
+   puts response.original_status
+   puts response.pc_status
+   puts response.body
+ rescue => exception
+   puts exception.backtrace
+ end
+
+ ```
+
+ You can pass any of the options the Crawlbase API supports, in the exact parameter format the API expects.
+
+ Example:
+
+ ```ruby
+ options = {
+   user_agent: 'Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/30.0',
+   format: 'json'
+ }
+
+ response = api.get('https://www.reddit.com/r/pics/comments/5bx4bx/thanks_obama/', options)
+
+ puts response.status_code
+ puts response.body # read the API JSON response
+ ```
+
+ ### POST requests
+
+ Pass the URL that you want to scrape and the data that you want to send, which can be either JSON or a string, plus any options from the ones available in the [API documentation](https://crawlbase.com/dashboard/docs).
+
+ ```ruby
+ api.post(url, data, options)
+ ```
+
+ Example:
+
+ ```ruby
+ api.post('https://producthunt.com/search', { text: 'example search' })
+ ```
+
+ You can send the data as application/json instead of x-www-form-urlencoded by setting the `post_content_type` option to `json`.
+
+ ```ruby
+ response = api.post('https://httpbin.org/post', { some_json: 'with some value' }, { post_content_type: 'json' })
+
+ puts response.status_code
+ puts response.body
+
+ ```
+
+ ### JavaScript requests
+
+ If you need to scrape a website built with JavaScript (React, Angular, Vue, etc.), just pass your JavaScript token and use the same calls. Note that only `.get` is available for JavaScript, not `.post`.
+
+ ```ruby
+ api = Crawlbase::API.new(token: 'YOUR_JAVASCRIPT_TOKEN')
+ ```
+
+ ```ruby
+ response = api.get('https://www.nfl.com')
+ puts response.status_code
+ puts response.body
+ ```
+
+ In the same way, you can pass additional JavaScript options.
+
+ ```ruby
+ response = api.get('https://www.freelancer.com', page_wait: 5000)
+ puts response.status_code
+ ```
+
+ ## Original status
+
+ You can always get the original status and the Crawlbase status from the response. Read the [Crawlbase documentation](https://crawlbase.com/dashboard/docs) to learn more about those statuses.
+
+ ```ruby
+ response = api.get('https://sfbay.craigslist.org/')
+
+ puts response.original_status
+ puts response.pc_status
+ ```
+
+ ## Scraper API usage
+
+ Initialize the Scraper API using your normal token and call the `get` method.
+
+ ```ruby
+ scraper_api = Crawlbase::ScraperAPI.new(token: 'YOUR_TOKEN')
+ ```
+
+ Pass the URL that you want to scrape plus any options from the ones available in the [Scraper API documentation](https://crawlbase.com/docs/scraper-api/parameters).
+
+ ```ruby
+ scraper_api.get(url, options)
+ ```
+
+ Example:
+
+ ```ruby
+ begin
+   response = scraper_api.get('https://www.amazon.com/Halo-SleepSack-Swaddle-Triangle-Neutral/dp/B01LAG1TOS')
+   puts response.remaining_requests
+   puts response.status_code
+   puts response.body
+ rescue => exception
+   puts exception.backtrace
+ end
+ ```
+
+ ## Leads API usage
+
+ Initialize with your Leads API token and call the `get` method.
+
+ For more details on the implementation, please visit the [Leads API documentation](https://crawlbase.com/docs/leads-api).
+
+ ```ruby
+ leads_api = Crawlbase::LeadsAPI.new(token: 'YOUR_TOKEN')
+
+ begin
+   response = leads_api.get('stripe.com')
+   puts response.success
+   puts response.remaining_requests
+   puts response.status_code
+   puts response.body
+ rescue => exception
+   puts exception.backtrace
+ end
+ ```
+
+ If you have questions or need help using the library, please open an issue or [contact us](https://crawlbase.com/contact).
+
+
+ ## Screenshots API usage
+
+ Initialize with your Screenshots API token and call the `get` method.
+
+ ```ruby
+ screenshots_api = Crawlbase::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
+
+ begin
+   response = screenshots_api.get('https://www.apple.com')
+   puts response.success
+   puts response.remaining_requests
+   puts response.status_code
+   puts response.screenshot_path # do something with screenshot_path here
+ rescue => exception
+   puts exception.backtrace
+ end
+ ```
+
+ or using a block:
+
+ ```ruby
+ screenshots_api = Crawlbase::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
+
+ begin
+   response = screenshots_api.get('https://www.apple.com') do |file|
+     # do something (reading/writing) with the image file here
+   end
+   puts response.success
+   puts response.remaining_requests
+   puts response.status_code
+ rescue => exception
+   puts exception.backtrace
+ end
+ ```
+
+ or specifying a file path:
+
+ ```ruby
+ screenshots_api = Crawlbase::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
+
+ begin
+   response = screenshots_api.get('https://www.apple.com', save_to_path: '~/screenshot.jpg') do |file|
+     # do something (reading/writing) with the image file here
+   end
+   puts response.success
+   puts response.remaining_requests
+   puts response.status_code
+ rescue => exception
+   puts exception.backtrace
+ end
+ ```
+
+ Note that the `screenshots_api.get(url, options)` method accepts [options](https://crawlbase.com/docs/screenshots-api/parameters).
+
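+ For example, to pass one of those options (a hedged sketch: `device` is an illustrative option name, not verified against the parameters page):
+
+ ```ruby
+ # Hypothetical option name; check the Screenshots API parameters docs for the real ones.
+ response = screenshots_api.get('https://www.apple.com', device: 'mobile')
+ puts response.screenshot_url
+ ```
+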
+ ## Storage API usage
+
+ Initialize the Storage API using your private token.
+
+ ```ruby
+ storage_api = Crawlbase::StorageAPI.new(token: 'YOUR_TOKEN')
+ ```
+
+ Pass the [url](https://crawlbase.com/docs/storage-api/parameters/#url) that you want to get from [Crawlbase Storage](https://crawlbase.com/dashboard/storage).
+
+ ```ruby
+ begin
+   response = storage_api.get('https://www.apple.com')
+   puts response.original_status
+   puts response.pc_status
+   puts response.url
+   puts response.status_code
+   puts response.rid
+   puts response.body
+   puts response.stored_at
+ rescue => exception
+   puts exception.backtrace
+ end
+ ```
+
+ or you can use the [RID](https://crawlbase.com/docs/storage-api/parameters/#rid):
+
+ ```ruby
+ begin
+   response = storage_api.get(RID)
+   puts response.original_status
+   puts response.pc_status
+   puts response.url
+   puts response.status_code
+   puts response.rid
+   puts response.body
+   puts response.stored_at
+ rescue => exception
+   puts exception.backtrace
+ end
+ ```
+
+ Note: you must send either the URL or the RID; each is optional on its own, but one of the two is required.
+
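+ The `get` method also accepts a second `format` argument, `'html'` by default (see `lib/crawlbase/storage_api.rb` in this release). A minimal sketch:
+
+ ```ruby
+ # Ask the Storage API for the JSON representation instead of the default HTML.
+ response = storage_api.get('https://www.apple.com', 'json')
+ puts response.status_code
+ ```
+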
+ ### [Delete](https://crawlbase.com/docs/storage-api/delete/) request
+
+ To delete a storage item from your storage area, use the correct RID:
+
+ ```ruby
+ if storage_api.delete(RID)
+   puts 'delete success'
+ else
+   puts "Unable to delete: #{storage_api.body['error']}"
+ end
+ ```
+
+ ### [Bulk](https://crawlbase.com/docs/storage-api/bulk/) request
+
+ To do a bulk request with a list of RIDs, send the list of RIDs as an array:
+
+ ```ruby
+ begin
+   response = storage_api.bulk([RID1, RID2, RID3, ...])
+   puts response.original_status
+   puts response.pc_status
+   puts response.url
+   puts response.status_code
+   puts response.rid
+   puts response.body
+   puts response.stored_at
+ rescue => exception
+   puts exception.backtrace
+ end
+ ```
+
+ ### [RIDs](https://crawlbase.com/docs/storage-api/rids) request
+
+ To request a bulk list of RIDs from your storage area:
+
+ ```ruby
+ begin
+   response = storage_api.rids
+   puts response.status_code
+   puts response.rid
+   puts response.body
+ rescue => exception
+   puts exception.backtrace
+ end
+ ```
+
+ You can also specify a limit as a parameter:
+
+ ```ruby
+ storage_api.rids(100)
+ ```
+
+ ### [Total Count](https://crawlbase.com/docs/storage-api/total_count)
+
+ To get the total number of documents in your storage area:
+
+ ```ruby
+ total_count = storage_api.total_count
+ puts "total_count: #{total_count}"
+ ```
+
+ If you have questions or need help using the library, please open an issue or [contact us](https://crawlbase.com/contact).
+
+ ## Development
+
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/crawlbase-source/crawlbase-ruby. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
+
+ ## License
+
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
+
+ ## Code of Conduct
+
+ Everyone interacting in the Crawlbase project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/crawlbase-source/crawlbase-ruby/blob/master/CODE_OF_CONDUCT.md).
+
+ ---
+
+ Copyright 2023 Crawlbase
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "crawlbase"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/crawlbase.gemspec ADDED
@@ -0,0 +1,31 @@
+ # coding: utf-8
+ lib = File.expand_path("../lib", __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require "crawlbase/version"
+
+ Gem::Specification.new do |spec|
+   spec.name          = "crawlbase"
+   spec.version       = Crawlbase::VERSION
+   spec.platform      = Gem::Platform::RUBY
+   spec.authors       = ["crawlbase"]
+   spec.email         = ["info@crawlbase.com"]
+   spec.summary       = %q{Crawlbase API client for web scraping and crawling}
+   spec.description   = %q{Ruby based client for the Crawlbase API that helps developers crawl or scrape thousands of web pages anonymously}
+   spec.homepage      = "https://github.com/crawlbase-source/crawlbase-ruby"
+   spec.license       = "MIT"
+
+   spec.files = `git ls-files -z`.split("\x0").reject do |f|
+     f.match(%r{^(test|spec|features)/})
+   end
+
+   spec.required_ruby_version = '>= 2.0'
+
+   spec.bindir        = "exe"
+   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "rspec", "~> 3.2"
+   spec.add_development_dependency "webmock", "~> 3.4"
+   spec.add_development_dependency "bundler", "~> 2.0"
+   spec.add_development_dependency "rake", "~> 12.3.3"
+ end
data/lib/crawlbase/api.rb ADDED
@@ -0,0 +1,82 @@
+ # frozen_string_literal: true
+
+ require 'net/http'
+ require 'json'
+ require 'uri'
+
+ module Crawlbase
+   class API
+     attr_reader :token, :body, :status_code, :original_status, :pc_status, :url, :storage_url
+
+     INVALID_TOKEN = 'Token is required'
+     INVALID_URL = 'URL is required'
+
+     def initialize(options = {})
+       raise INVALID_TOKEN if options[:token].nil?
+
+       @token = options[:token]
+     end
+
+     def get(url, options = {})
+       raise INVALID_URL if url.empty?
+
+       uri = prepare_uri(url, options)
+
+       response = Net::HTTP.get_response(uri)
+
+       prepare_response(response, options[:format])
+
+       self
+     end
+
+     def post(url, data, options = {})
+       raise INVALID_URL if url.empty?
+
+       uri = prepare_uri(url, options)
+
+       http = Net::HTTP.new(uri.host, uri.port)
+
+       http.use_ssl = true
+
+       content_type = options[:post_content_type].to_s.include?('json') ? { 'Content-Type': 'application/json' } : nil
+
+       request = Net::HTTP::Post.new(uri.request_uri, content_type)
+
+       if options[:post_content_type].to_s.include?('json')
+         request.body = data.to_json
+       else
+         request.set_form_data(data)
+       end
+
+       response = http.request(request)
+
+       prepare_response(response, options[:format])
+
+       self
+     end
+
+     private
+
+     def base_url
+       'https://api.crawlbase.com'
+     end
+
+     def prepare_uri(url, options)
+       uri = URI(base_url)
+       uri.query = URI.encode_www_form({ token: @token, url: url }.merge(options))
+
+       uri
+     end
+
+     def prepare_response(response, format)
+       res = format == 'json' || base_url.include?('/scraper') ? JSON.parse(response.body) : response
+
+       @original_status = res['original_status'].to_i
+       @pc_status = res['pc_status'].to_i
+       @url = res['url']
+       @storage_url = res['storage_url']
+       @status_code = response.code.to_i
+       @body = response.body
+     end
+   end
+ end
data/lib/crawlbase/leads_api.rb ADDED
@@ -0,0 +1,41 @@
+ # frozen_string_literal: true
+
+ require 'net/http'
+ require 'json'
+ require 'uri'
+
+ module Crawlbase
+   class LeadsAPI
+     attr_reader :token, :body, :status_code, :success, :remaining_requests
+
+     INVALID_TOKEN = 'Token is required'
+     INVALID_DOMAIN = 'Domain is required'
+
+     def initialize(options = {})
+       raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
+
+       @token = options[:token]
+     end
+
+     def get(domain)
+       raise INVALID_DOMAIN if domain.empty?
+
+       uri = URI('https://api.crawlbase.com/leads')
+       uri.query = URI.encode_www_form({ token: token, domain: domain })
+
+       response = Net::HTTP.get_response(uri)
+       @status_code = response.code.to_i
+       @body = response.body
+
+       json_body = JSON.parse(response.body)
+       @success = json_body['success']
+       @remaining_requests = json_body['remaining_requests'].to_i
+
+       self
+     end
+
+     def post
+       raise 'Only GET is allowed for the LeadsAPI'
+     end
+   end
+ end
data/lib/crawlbase/scraper_api.rb ADDED
@@ -0,0 +1,23 @@
+ # frozen_string_literal: true
+
+ module Crawlbase
+   class ScraperAPI < Crawlbase::API
+     attr_reader :remaining_requests
+
+     def post
+       raise 'Only GET is allowed for the ScraperAPI'
+     end
+
+     private
+
+     def prepare_response(response, format)
+       super(response, format)
+       json_body = JSON.parse(response.body)
+       @remaining_requests = json_body['remaining_requests'].to_i
+     end
+
+     def base_url
+       'https://api.crawlbase.com/scraper'
+     end
+   end
+ end
data/lib/crawlbase/screenshots_api.rb ADDED
@@ -0,0 +1,52 @@
+ # frozen_string_literal: true
+
+ require 'securerandom'
+ require 'tmpdir'
+
+ module Crawlbase
+   class ScreenshotsAPI < Crawlbase::API
+     attr_reader :screenshot_path, :success, :remaining_requests, :screenshot_url
+
+     INVALID_SAVE_TO_PATH_FILENAME = 'Filename must end with .jpg or .jpeg'
+     SAVE_TO_PATH_FILENAME_PATTERN = /.+\.(jpg|JPG|jpeg|JPEG)$/.freeze
+
+     def post
+       raise 'Only GET is allowed for the ScreenshotsAPI'
+     end
+
+     def get(url, options = {})
+       screenshot_path = options.delete(:save_to_path) || generate_file_path
+       raise INVALID_SAVE_TO_PATH_FILENAME unless SAVE_TO_PATH_FILENAME_PATTERN =~ screenshot_path
+
+       response = super(url, options)
+       file = File.open(screenshot_path, 'w+')
+       file.write(response.body&.force_encoding('UTF-8'))
+       @screenshot_path = screenshot_path
+       yield(file) if block_given?
+       response
+     ensure
+       file&.close
+     end
+
+     private
+
+     def prepare_response(response, format)
+       super(response, format)
+       @remaining_requests = response['remaining_requests'].to_i
+       @success = response['success'] == 'true'
+       @screenshot_url = response['screenshot_url']
+     end
+
+     def base_url
+       'https://api.crawlbase.com/screenshots'
+     end
+
+     def generate_file_name
+       "#{SecureRandom.urlsafe_base64}.jpg"
+     end
+
+     def generate_file_path
+       File.join(Dir.tmpdir, generate_file_name)
+     end
+   end
+ end
data/lib/crawlbase/storage_api.rb ADDED
@@ -0,0 +1,116 @@
+ # frozen_string_literal: true
+
+ require 'net/http'
+ require 'json'
+ require 'uri'
+
+ module Crawlbase
+   class StorageAPI
+     attr_reader :token, :original_status, :pc_status, :url, :status_code, :rid, :body, :stored_at
+
+     INVALID_TOKEN = 'Token is required'
+     INVALID_RID = 'RID is required'
+     INVALID_RID_ARRAY = 'One or more RIDs are required'
+     INVALID_URL_OR_RID = 'Either URL or RID is required'
+     BASE_URL = 'https://api.crawlbase.com/storage'
+
+     def initialize(options = {})
+       raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
+
+       @token = options[:token]
+     end
+
+     def get(url_or_rid, format = 'html')
+       raise INVALID_URL_OR_RID if url_or_rid.nil? || url_or_rid.empty?
+
+       uri = URI(BASE_URL)
+       uri.query = URI.encode_www_form({ token: token, format: format }.merge(decide_url_or_rid(url_or_rid)))
+       response = Net::HTTP.get_response(uri)
+
+       res = format == 'json' ? JSON.parse(response.body) : response
+
+       @original_status = res['original_status'].to_i
+       @pc_status = res['pc_status'].to_i
+       @url = res['url']
+       @rid = res['rid']
+       @stored_at = res['stored_at']
+
+       @status_code = response.code.to_i
+       @body = response.body
+
+       self
+     end
+
+     def delete(rid)
+       raise INVALID_RID if rid.nil? || rid.empty?
+
+       uri = URI(BASE_URL)
+       uri.query = URI.encode_www_form(token: token, rid: rid)
+       http = Net::HTTP.start(uri.host, uri.port, use_ssl: true) # the API endpoint is https
+       request = Net::HTTP::Delete.new(uri.request_uri)
+       response = http.request(request)
+
+       @url, @original_status, @pc_status, @stored_at = nil
+       @status_code = response.code.to_i
+       @rid = rid
+       @body = JSON.parse(response.body)
+
+       @body.key?('success')
+     end
+
+     def bulk(rids_array = [])
+       raise INVALID_RID_ARRAY if rids_array.empty?
+
+       uri = URI("#{BASE_URL}/bulk")
+       uri.query = URI.encode_www_form(token: token)
+       http = Net::HTTP.start(uri.host, uri.port, use_ssl: true) # the API endpoint is https
+       request = Net::HTTP::Post.new(uri.request_uri, { 'Content-Type': 'application/json' })
+       request.body = { rids: rids_array }.to_json
+       response = http.request(request)
+
+       @body = JSON.parse(response.body)
+       @original_status = @body.map { |item| item['original_status'].to_i }
+       @status_code = response.code.to_i
+       @pc_status = @body.map { |item| item['pc_status'].to_i }
+       @url = @body.map { |item| item['url'] }
+       @rid = @body.map { |item| item['rid'] }
+       @stored_at = @body.map { |item| item['stored_at'] }
+
+       self
+     end
+
+     def rids(limit = -1)
+       uri = URI("#{BASE_URL}/rids")
+       query_hash = { token: token }
+       query_hash.merge!({ limit: limit }) if limit >= 0
+       uri.query = URI.encode_www_form(query_hash)
+
+       response = Net::HTTP.get_response(uri)
+       @url, @original_status, @pc_status, @stored_at = nil
+       @status_code = response.code.to_i
+       @body = JSON.parse(response.body)
+       @rid = @body
+
+       @body
+     end
+
+     def total_count
+       uri = URI("#{BASE_URL}/total_count")
+       uri.query = URI.encode_www_form(token: token)
+
+       response = Net::HTTP.get_response(uri)
+       @url, @original_status, @pc_status, @stored_at = nil
+       @status_code = response.code.to_i
+       @rid = rid
+       @body = JSON.parse(response.body)
+
+       body['totalCount']
+     end
+
+     private
+
+     def decide_url_or_rid(url_or_rid)
+       %r{^https?://} =~ url_or_rid ? { url: url_or_rid } : { rid: url_or_rid }
+     end
+   end
+ end
data/lib/crawlbase/version.rb ADDED
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module Crawlbase
+   VERSION = '1.0.0'
+ end
data/lib/crawlbase.rb ADDED
@@ -0,0 +1,11 @@
+ # frozen_string_literal: true
+
+ require 'crawlbase/version'
+ require 'crawlbase/api'
+ require 'crawlbase/scraper_api'
+ require 'crawlbase/leads_api'
+ require 'crawlbase/screenshots_api'
+ require 'crawlbase/storage_api'
+
+ module Crawlbase
+ end
metadata ADDED
@@ -0,0 +1,116 @@
+ --- !ruby/object:Gem::Specification
+ name: crawlbase
+ version: !ruby/object:Gem::Version
+   version: 1.0.0
+ platform: ruby
+ authors:
+ - crawlbase
+ autorequire:
+ bindir: exe
+ cert_chain: []
+ date: 2023-06-29 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.2'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.2'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.4'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.4'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 12.3.3
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 12.3.3
+ description: Ruby based client for the Crawlbase API that helps developers crawl or
+   scrape thousands of web pages anonymously
+ email:
+ - info@crawlbase.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - CODE_OF_CONDUCT.md
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/console
+ - bin/setup
+ - crawlbase.gemspec
+ - lib/crawlbase.rb
+ - lib/crawlbase/api.rb
+ - lib/crawlbase/leads_api.rb
+ - lib/crawlbase/scraper_api.rb
+ - lib/crawlbase/screenshots_api.rb
+ - lib/crawlbase/storage_api.rb
+ - lib/crawlbase/version.rb
+ homepage: https://github.com/crawlbase-source/crawlbase-ruby
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '2.0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.1.2
+ signing_key:
+ specification_version: 4
+ summary: Crawlbase API client for web scraping and crawling
+ test_files: []