spidercloud 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
1
+ module SpiderCloud
2
+
3
# field layout of a single entry in a links response
LinksResultItemSchema = DynamicSchema::Struct.define do
  url                 String
  status              Integer
  duration_elapsed_ms Integer, as: :duration_elapsed_ms
  error               String
end

# a single entry of a links response; adds a success predicate on top of the schema fields
class LinksResultItem < LinksResultItemSchema
  # true when no error is recorded and the status is either absent or in the 200...300 range
  def success?
    return false unless error.nil?
    return true if status.nil?

    status >= 200 && status < 300
  end
end
15
+
16
# container layout: an array of LinksResultItem entries
LinksResultSchema = DynamicSchema::Struct.define do
  items LinksResultItem, array: true
end

# the entries returned by a links request; array style access is forwarded to +items+
class LinksResult < LinksResultSchema
  extend Forwardable
  include Enumerable

  def_delegators :items, *%i( each [] count size length first last empty? )

  class << self
    # wrap a raw array of entries in a result
    def from_array( array )
      new( items: array )
    end
  end

  # true only when every entry succeeded; false when there are no items at all ( nil )
  def success?
    return false if items.nil?

    items.all?( &:success? )
  end

  # get all discovered URLs
  def urls
    return [] if items.nil?

    items.map( &:url )
  end

  # get the entries that did not succeed
  def failed
    return [] if items.nil?

    items.reject( &:success? )
  end

  # get the entries that succeeded
  def succeeded
    return [] if items.nil?

    items.select( &:success? )
  end

  # get the entries whose status equals the given code
  def with_status( code )
    return [] if items.nil?

    items.select { | entry | entry.status == code }
  end
end
54
+
55
+ end
@@ -0,0 +1,31 @@
1
+ module SpiderCloud
2
# module level configuration ( connection, api key ) and one shortcut per request type
module ModuleMethods

  # set ( when an argument is given ) and return the shared connection; when none was
  # ever configured a Faraday connection using the default adapter is created and kept
  def connection( connection = nil )
    return @connection = connection if connection

    @connection ||= Faraday.new do | builder |
      builder.adapter Faraday.default_adapter
    end
  end

  # set ( when an argument is given ) and return the configured api key
  def api_key( api_key = nil )
    @api_key = api_key if api_key
    @api_key
  end

  # submit a scrape request for the url; options and block are passed through unchanged
  def scrape( url, options = nil, &block )
    request = SpiderCloud::ScrapeRequest.new
    request.submit( url, options, &block )
  end

  # submit a crawl request for the url; options and block are passed through unchanged
  def crawl( url, options = nil, &block )
    request = SpiderCloud::CrawlRequest.new
    request.submit( url, options, &block )
  end

  # submit a screenshot request for the url; options and block are passed through unchanged
  def screenshot( url, options = nil, &block )
    request = SpiderCloud::ScreenshotRequest.new
    request.submit( url, options, &block )
  end

  # submit a links request for the url; options and block are passed through unchanged
  def links( url, options = nil, &block )
    request = SpiderCloud::LinksRequest.new
    request.submit( url, options, &block )
  end

end
31
+ end
@@ -0,0 +1,41 @@
1
+ module SpiderCloud
2
# base class for api requests: keeps the connection and api key and offers
# authenticated +post+ and +get+ helpers to its subclasses
class Request

  BASE_URI = 'https://api.spider.cloud'

  # connection and api_key fall back to the values configured on SpiderCloud;
  # raises ArgumentError when no api key is available from either place
  def initialize( connection: nil, api_key: nil )
    @connection = connection || SpiderCloud.connection
    @api_key = api_key || SpiderCloud.api_key
    return if @api_key

    raise ArgumentError, "An 'api_key' is required unless configured using 'SpiderCloud.api_key'."
  end

  protected

  # POST to uri; a String body is sent as given, anything else is serialized with
  # JSON.generate. the optional block receives the request after headers and body are set
  def post( uri, body, &block )
    @connection.post( uri ) do | request |
      apply_headers( request )
      request.body = body.is_a?( String ) ? body : JSON.generate( body )
      block&.call( request )
    end
  end

  # GET uri; the optional block receives the request after the headers are set
  def get( uri, &block )
    @connection.get( uri ) do | request |
      apply_headers( request )
      block&.call( request )
    end
  end

  private

  # copy the authorization and content type headers onto the outgoing request
  def apply_headers( request )
    request.headers[ 'Authorization' ] = "Bearer #{ @api_key }"
    request.headers[ 'Content-Type' ] = 'application/json'
  end

end
41
+ end
@@ -0,0 +1,15 @@
1
+ module SpiderCloud
2
#
# ResponseMethods is mixed into a single response object ( the header comment of this file
# names Faraday responses ) so that it answers +result+.
#
module ResponseMethods
  # extend +response+ with this module, remember +result+ on it and return the response
  def self.install( response, result )
    response.extend( self )
    response.instance_variable_set( :@_spider_cloud_result, result )
    response
  end

  # the value stored by +install+
  def result
    @_spider_cloud_result
  end
end
15
+ end
@@ -0,0 +1,164 @@
1
+ module SpiderCloud
2
# options accepted by a scrape request, declared through the DynamicSchema dsl
class ScrapeOptions
  include DynamicSchema::Definable
  include Helpers

  schema do
    # request and response format
    request                   Symbol, in: REQUEST_TYPES
    return_format             Symbol, in: RETURN_FORMATS
    request_timeout           Integer, in: 5..255
    lite_mode                 [ TrueClass, FalseClass ]

    # network
    network_blacklist         String, array: true
    network_whitelist         String, array: true
    disable_hints             [ TrueClass, FalseClass ]
    disable_intercept         [ TrueClass, FalseClass ]
    preserve_host             [ TrueClass, FalseClass ]
    anti_bot                  [ TrueClass, FalseClass ]

    # session, cookies and headers
    session                   [ TrueClass, FalseClass ]
    cookies                   String
    headers                   Hash
    user_agent                String

    # proxy and locale
    proxy                     Symbol, in: PROXY_TYPES
    proxy_enabled             [ TrueClass, FalseClass ]
    remote_proxy              String
    country_code              String
    locale                    String

    # content selection and filtering
    readability               [ TrueClass, FalseClass ]
    root_selector             String
    exclude_selector          String
    css_extraction_map        Hash
    clean_html                [ TrueClass, FalseClass ]
    filter_svg                [ TrueClass, FalseClass ]
    filter_images             [ TrueClass, FalseClass ]
    filter_main_only          [ TrueClass, FalseClass ]
    filter_output_svg         [ TrueClass, FalseClass ]
    filter_output_images      [ TrueClass, FalseClass ]
    filter_output_main_only   [ TrueClass, FalseClass ]

    # additional data in the response
    return_json_data          [ TrueClass, FalseClass ]
    return_headers            [ TrueClass, FalseClass ]
    return_cookies            [ TrueClass, FalseClass ]
    return_page_links         [ TrueClass, FalseClass ]
    return_embeddings         [ TrueClass, FalseClass ]
    metadata                  [ TrueClass, FalseClass ]
    encoding                  String

    # gpt configuration and prompts
    gpt_config do
      prompt                  String
      model                   String
      max_tokens              Integer
      temperature             Float
      top_p                   Float
      api_key                 String
      extra_ai_data           [ TrueClass, FalseClass ]
      screenshot              [ TrueClass, FalseClass ]
    end
    custom_prompt             String
    custom_function           String
    model                     String

    # chunking ( declared with as: :chunking_alg )
    chunking_algorithm as: :chunking_alg do
      type                    Symbol, in: CHUNKING_TYPES
      value                   Integer
    end

    # wait conditions; every timeout is declared as seconds ( :secs ) and nanoseconds ( :nanos )
    wait_for do
      idle_network do
        timeout do
          seconds             Integer, as: :secs
          nanoseconds         Integer, as: :nanos
        end
      end
      selector                String
      dom do
        timeout do
          seconds             Integer, as: :secs
          nanoseconds         Integer, as: :nanos
        end
      end
      delay do
        timeout do
          seconds             Integer, as: :secs
          nanoseconds         Integer, as: :nanos
        end
      end
      page_navigations do
        timeout do
          seconds             Integer, as: :secs
          nanoseconds         Integer, as: :nanos
        end
      end
    end

    # browser
    fingerprint               [ TrueClass, FalseClass ]
    stealth                   [ TrueClass, FalseClass ]
    viewport do
      width                   Integer
      height                  Integer
    end
    device                    Symbol, in: DEVICE_TYPES
    scroll                    Integer
    block_ads                 [ TrueClass, FalseClass ]
    block_analytics           [ TrueClass, FalseClass ]
    block_stylesheets         [ TrueClass, FalseClass ]
    block_images              [ TrueClass, FalseClass ]
    omit_background           [ TrueClass, FalseClass ]
    service_worker_enabled    [ TrueClass, FalseClass ]
    virtual_display           [ TrueClass, FalseClass ]

    # scripts
    automation_scripts        Hash
    execution_scripts         Hash
    evaluate_on_new_document  String

    # link handling
    blacklist                 String, array: true
    whitelist                 String, array: true
    external_domains          String, array: true
    redirect_policy           Symbol, in: REDIRECT_POLICIES
    link_rewrite              Hash

    # sitemap and domain scope
    sitemap                   [ TrueClass, FalseClass ]
    sitemap_only              [ TrueClass, FalseClass ]
    sitemap_path              String
    subdomains                [ TrueClass, FalseClass ]
    tld                       [ TrueClass, FalseClass ]

    # caching, storage and limits
    cache                     [ TrueClass, FalseClass ]
    skip_config_checks        [ TrueClass, FalseClass ]
    storageless               [ TrueClass, FalseClass ]
    store_data                [ TrueClass, FalseClass ]
    respect_robots            [ TrueClass, FalseClass ]
    concurrency_limit         Integer
    delay                     Integer

    # credits and budget
    max_credits_per_page      Integer
    max_credits_allowed       Integer
    budget                    Hash

    # webhooks and background execution
    webhooks                  Hash
    run_in_background         [ TrueClass, FalseClass ]
    event_tracker             Hash
  end

  class << self
    # build options from a hash and / or a dsl block using the schema builder's +build+
    def build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    # same as +build+ but through the schema builder's +build!+
    def build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end
  end

  # +options+ is always run through the schema builder; when +api_options+ is given the
  # built options are merged over it
  def initialize( options = {}, api_options: nil )
    built = self.class.builder.build( options || {} )
    @options = api_options ? api_options.merge( built ) : built
  end

  # the options as a hash
  def to_h
    @options.to_h
  end

end
164
+ end
@@ -0,0 +1,29 @@
1
+ module SpiderCloud
2
+ class ScrapeRequest < Request
3
+
4
# Posts a scrape request for +url+ and returns the response with a parsed +result+ attached.
#
# url     - the address to scrape; passed through Helpers.normalize_url
# options - nil, a ScrapeOptions instance, or anything responding to +to_h+ ( which is
#           validated through ScrapeOptions.build! )
# block   - optional; forwarded to +post+
#
# The +result+ is a ScrapeResult when the response is successful and its body parses to
# an Array; in every other case it is an ErrorResult carrying the status and whatever
# could be parsed from the body ( nil when the body is not JSON ).
def submit( url, options = nil, &block )
  payload = if options
    options = ScrapeOptions.build!( options.to_h ) unless options.is_a?( ScrapeOptions )
    options.to_h
  else
    {}
  end

  # build a new hash instead of writing :url into the one returned by +to_h+: that hash
  # may be the options object's own storage, and a reused options object must not
  # carry the url of an earlier request
  payload = payload.merge( url: Helpers.normalize_url( url ) )

  response = post( "#{ BASE_URI }/scrape", payload, &block )

  # a body that is missing or is not JSON is expected for some failures and leaves
  # attributes nil; unrelated errors are no longer silently swallowed
  attributes = begin
    JSON.parse( response.body, symbolize_names: true )
  rescue JSON::JSONError, TypeError, EncodingError
    nil
  end

  result = if response.success? && attributes.is_a?( Array )
    ScrapeResult.from_array( attributes )
  else
    ErrorResult.new( response.status, attributes )
  end

  ResponseMethods.install( response, result )
end
27
+
28
+ end
29
+ end
@@ -0,0 +1,62 @@
1
+ module SpiderCloud
2
+
3
# field layout of a single entry in a scrape response
ScrapeResultItemSchema = DynamicSchema::Struct.define do
  content             String
  error               String
  status              Integer
  duration_elapsed_ms Integer, as: :duration_elapsed_ms
  costs               Costs
  url                 String
end

# a single scraped page; adds a success predicate on top of the schema fields
class ScrapeResultItem < ScrapeResultItemSchema
  # true when no error is recorded and the status is either absent or in the 200...300 range
  def success?
    return false unless error.nil?
    return true if status.nil?

    status >= 200 && status < 300
  end
end
17
+
18
# container layout: an array of ScrapeResultItem entries
ScrapeResultSchema = DynamicSchema::Struct.define do
  items ScrapeResultItem, array: true
end

# the entries returned by a scrape request; array style access is forwarded to +items+
class ScrapeResult < ScrapeResultSchema
  extend Forwardable
  include Enumerable

  def_delegators :items, *%i( each [] count size length first last empty? )

  class << self
    # wrap a raw array of entries in a result
    def from_array( array )
      new( items: array )
    end
  end

  # true only when every entry succeeded; false when there are no items at all ( nil )
  def success?
    return false if items.nil?

    items.all?( &:success? )
  end

  # convenience readers for single URL scrapes: each one answers with the value of the
  # first entry, or nil when there is no first entry
  %i[ content error status url costs duration_elapsed_ms ].each do | attribute |
    define_method( attribute ) { first&.public_send( attribute ) }
  end
end
61
+
62
+ end
@@ -0,0 +1,84 @@
1
+ module SpiderCloud
2
# options accepted by a screenshot request, declared through the DynamicSchema dsl
class ScreenshotOptions
  include DynamicSchema::Definable
  include Helpers

  schema do
    # capture
    full_page                 [ TrueClass, FalseClass ]
    binary                    [ TrueClass, FalseClass ]
    omit_background           [ TrueClass, FalseClass ]
    block_images              [ TrueClass, FalseClass ]

    # cdp parameters
    cdp_params do
      format                  Symbol, in: IMAGE_FORMATS
      quality                 Integer, in: 0..100
      clip do
        x                     Integer
        y                     Integer
        width                 Integer
        height                Integer
        scale                 [ Integer, Float ]
      end
      from_surface            [ TrueClass, FalseClass ]
      capture_beyond_viewport [ TrueClass, FalseClass ]
    end

    # request
    limit                     Integer
    request                   Symbol, in: REQUEST_TYPES

    # viewport and device
    viewport do
      width                   Integer
      height                  Integer
    end
    device                    Symbol, in: DEVICE_TYPES

    # wait conditions; every timeout is declared as seconds ( :secs ) and nanoseconds ( :nanos )
    wait_for do
      idle_network do
        timeout do
          seconds             Integer, as: :secs
          nanoseconds         Integer, as: :nanos
        end
      end
      selector                String
      delay do
        timeout do
          seconds             Integer, as: :secs
          nanoseconds         Integer, as: :nanos
        end
      end
    end

    # proxy
    proxy                     Symbol, in: PROXY_TYPES
    proxy_enabled             [ TrueClass, FalseClass ]
    country_code              String

    # browser
    stealth                   [ TrueClass, FalseClass ]
    fingerprint               [ TrueClass, FalseClass ]
    scroll                    Integer
    block_ads                 [ TrueClass, FalseClass ]
    virtual_display           [ TrueClass, FalseClass ]

    # cookies, headers and scripts
    cookies                   String
    headers                   Hash
    automation_scripts        Hash
  end

  class << self
    # build options from a hash and / or a dsl block using the schema builder's +build+
    def build( options = nil, &block )
      new( api_options: builder.build( options, &block ) )
    end

    # same as +build+ but through the schema builder's +build!+
    def build!( options = nil, &block )
      new( api_options: builder.build!( options, &block ) )
    end
  end

  # +options+ is always run through the schema builder; when +api_options+ is given the
  # built options are merged over it
  def initialize( options = {}, api_options: nil )
    built = self.class.builder.build( options || {} )
    @options = api_options ? api_options.merge( built ) : built
  end

  # the options as a hash
  def to_h
    @options.to_h
  end

end
84
+ end
@@ -0,0 +1,29 @@
1
+ module SpiderCloud
2
+ class ScreenshotRequest < Request
3
+
4
# Posts a screenshot request for +url+ and returns the response with a parsed +result+ attached.
#
# url     - the address to capture; passed through Helpers.normalize_url
# options - nil, a ScreenshotOptions instance, or anything responding to +to_h+ ( which
#           is validated through ScreenshotOptions.build! )
# block   - optional; forwarded to +post+
#
# The +result+ is a ScreenshotResult when the response is successful and its body parses
# to an Array; in every other case it is an ErrorResult carrying the status and whatever
# could be parsed from the body ( nil when the body is not JSON ).
def submit( url, options = nil, &block )
  payload = if options
    options = ScreenshotOptions.build!( options.to_h ) unless options.is_a?( ScreenshotOptions )
    options.to_h
  else
    {}
  end

  # build a new hash instead of writing :url into the one returned by +to_h+: that hash
  # may be the options object's own storage, and a reused options object must not
  # carry the url of an earlier request
  payload = payload.merge( url: Helpers.normalize_url( url ) )

  response = post( "#{ BASE_URI }/screenshot", payload, &block )

  # a body that is missing or is not JSON is expected for some failures and leaves
  # attributes nil; unrelated errors are no longer silently swallowed
  attributes = begin
    JSON.parse( response.body, symbolize_names: true )
  rescue JSON::JSONError, TypeError, EncodingError
    nil
  end

  result = if response.success? && attributes.is_a?( Array )
    ScreenshotResult.from_array( attributes )
  else
    ErrorResult.new( response.status, attributes )
  end

  ResponseMethods.install( response, result )
end
27
+
28
+ end
29
+ end
@@ -0,0 +1,69 @@
1
+ module SpiderCloud
2
+
3
# field layout of a single entry in a screenshot response
ScreenshotResultItemSchema = DynamicSchema::Struct.define do
  content String
  error   String
  status  Integer
  url     String
end

# a single screenshot; adds a success predicate and base64 decoding on top of the schema fields
class ScreenshotResultItem < ScreenshotResultItemSchema
  # true when no error is recorded and the status is either absent or in the 200...300 range
  def success?
    return false unless error.nil?
    return true if status.nil?

    status >= 200 && status < 300
  end

  # the content decoded from base64, or nil when there is no content
  def image_data
    Base64.decode64( content ) if content
  end
end
21
+
22
# container layout: an array of ScreenshotResultItem entries
ScreenshotResultSchema = DynamicSchema::Struct.define do
  items ScreenshotResultItem, array: true
end

# the entries returned by a screenshot request; array style access is forwarded to +items+
class ScreenshotResult < ScreenshotResultSchema
  extend Forwardable
  include Enumerable

  def_delegators :items, *%i( each [] count size length first last empty? )

  class << self
    # wrap a raw array of entries in a result
    def from_array( array )
      new( items: array )
    end
  end

  # true only when every entry succeeded; false when there are no items at all ( nil )
  def success?
    return false if items.nil?

    items.all?( &:success? )
  end

  # convenience readers for single URL screenshots: each one answers with the value of
  # the first entry, or nil when there is no first entry
  %i[ content image_data error status url ].each do | attribute |
    define_method( attribute ) { first&.public_send( attribute ) }
  end

  # write the decoded image of the first entry to path in binary mode; answers false
  # ( and writes nothing ) when there is no image data, true otherwise
  def save_to( path )
    data = image_data
    return false unless data

    File.binwrite( path, data )
    true
  end
end
68
+
69
+ end
@@ -0,0 +1,80 @@
1
+ module SpiderCloud
2
+
3
# symbols accepted by the enumerated options ( used with in: by the options schemas ).
# the arrays are frozen so that the shared lists cannot be changed by accident at runtime.
RETURN_FORMATS    = %i[ markdown commonmark raw text html2text xml bytes empty ].freeze
REQUEST_TYPES     = %i[ http chrome smart ].freeze
PROXY_TYPES       = %i[ residential mobile isp ].freeze
DEVICE_TYPES      = %i[ mobile tablet desktop ].freeze
REDIRECT_POLICIES = %i[ loose strict none ].freeze
CHUNKING_TYPES    = %i[ no by_words by_lines by_character_length by_sentence ].freeze
IMAGE_FORMATS     = %i[ png jpeg ].freeze
12
+
13
# reusable schema fragments for dynamic_schema definable classes. every method answers
# with a proc; wait_for_schema shows the intended use, evaluating a fragment with
# instance_eval inside another schema block
module SharedSchemas

  # timeout fragment: +timeout+ holding seconds ( as :secs ) and nanoseconds ( as :nanos )
  def timeout_schema
    Proc.new do
      timeout do
        seconds Integer, as: :secs
        nanoseconds Integer, as: :nanos
      end
    end
  end

  # viewport fragment: +viewport+ holding width and height
  def viewport_schema
    Proc.new do
      viewport do
        width Integer
        height Integer
      end
    end
  end

  # wait_for fragment: idle_network, selector and delay are always declared; dom and
  # page_navigations only when the matching keyword is truthy. every condition except
  # selector embeds the timeout fragment
  def wait_for_schema( include_dom: true, include_page_navigations: true )
    timeout_fragment = timeout_schema

    Proc.new do
      wait_for do
        idle_network { instance_eval( &timeout_fragment ) }
        selector String
        dom { instance_eval( &timeout_fragment ) } if include_dom
        delay { instance_eval( &timeout_fragment ) }
        page_navigations { instance_eval( &timeout_fragment ) } if include_page_navigations
      end
    end
  end

  # gpt_config fragment for ai integration
  def gpt_config_schema
    Proc.new do
      gpt_config do
        prompt String
        model String
        max_tokens Integer
        temperature Float
        top_p Float
        api_key String
        extra_ai_data [ TrueClass, FalseClass ]
        screenshot [ TrueClass, FalseClass ]
      end
    end
  end

end
79
+
80
+ end
@@ -0,0 +1,3 @@
1
module SpiderCloud
  # version of the gem; frozen so the shared string cannot be modified in place
  VERSION = '1.0.0'.freeze
end