crawlberg 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/crawlberg_rb/native/Cargo.lock +66 -30
- data/ext/crawlberg_rb/native/Cargo.toml +5 -2
- data/ext/crawlberg_rb/src/lib.rs +7 -84
- data/lib/crawlberg/native.rb +81 -39
- data/lib/crawlberg/version.rb +2 -2
- data/lib/crawlberg.rb +10 -1
- data/lib/crawlberg_rb.so +0 -0
- data/sig/types.rbs +372 -372
- metadata +2 -2
data/sig/types.rbs
CHANGED
|
@@ -1,530 +1,530 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:23d662f17ccee663375ea978facec5b4b691adf30860c73224d58efb602c12d2
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
|
|
6
6
|
module Crawlberg
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
VERSION: String
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
type json_value = Hash[String, untyped] | Array[untyped] | String | Integer | Float | bool | nil
|
|
11
11
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
12
|
+
class ExtractionMeta
|
|
13
|
+
attr_accessor cost: Float?
|
|
14
|
+
attr_accessor prompt_tokens: Integer?
|
|
15
|
+
attr_accessor completion_tokens: Integer?
|
|
16
|
+
attr_accessor model: String?
|
|
17
|
+
attr_accessor chunks_processed: Integer?
|
|
18
18
|
|
|
19
19
|
def initialize: (?cost: Float, ?prompt_tokens: Integer, ?completion_tokens: Integer, ?model: String, ?chunks_processed: Integer) -> void
|
|
20
|
-
|
|
20
|
+
end
|
|
21
21
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
22
|
+
class ProxyConfig
|
|
23
|
+
attr_accessor url: String?
|
|
24
|
+
attr_accessor username: String?
|
|
25
|
+
attr_accessor password: String?
|
|
26
26
|
|
|
27
27
|
def initialize: (?url: String, ?username: String, ?password: String) -> void
|
|
28
|
-
|
|
28
|
+
end
|
|
29
29
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
30
|
+
class ContentConfig
|
|
31
|
+
attr_accessor output_format: String?
|
|
32
|
+
attr_accessor preprocessing_preset: String?
|
|
33
|
+
attr_accessor remove_navigation: bool?
|
|
34
|
+
attr_accessor remove_forms: bool?
|
|
35
35
|
attr_accessor strip_tags: Array[String]?
|
|
36
36
|
attr_accessor preserve_tags: Array[String]?
|
|
37
37
|
attr_accessor exclude_selectors: Array[String]?
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
38
|
+
attr_accessor skip_images: bool?
|
|
39
|
+
attr_accessor max_depth: Integer?
|
|
40
|
+
attr_accessor wrap: bool?
|
|
41
|
+
attr_accessor wrap_width: Integer?
|
|
42
|
+
attr_accessor include_document_structure: bool?
|
|
43
43
|
|
|
44
|
-
|
|
44
|
+
def initialize: (?output_format: String, ?preprocessing_preset: String, ?remove_navigation: bool, ?remove_forms: bool, ?strip_tags: Array[String], ?preserve_tags: Array[String], ?exclude_selectors: Array[String], ?skip_images: bool, ?max_depth: Integer, ?wrap: bool, ?wrap_width: Integer, ?include_document_structure: bool) -> void
|
|
45
45
|
def self.default: () -> ContentConfig
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
class BrowserConfig
|
|
49
|
+
attr_accessor mode: BrowserMode?
|
|
50
|
+
attr_accessor backend: BrowserBackend?
|
|
51
|
+
attr_accessor endpoint: String?
|
|
52
|
+
attr_accessor timeout: Integer?
|
|
53
|
+
attr_accessor wait: BrowserWait?
|
|
54
|
+
attr_accessor wait_selector: String?
|
|
55
|
+
attr_accessor extra_wait: Integer?
|
|
56
|
+
attr_accessor proxy: ProxyConfig?
|
|
57
57
|
attr_accessor block_url_patterns: Array[String]?
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
58
|
+
attr_accessor eval_script: String?
|
|
59
|
+
attr_accessor robots_user_agent: String?
|
|
60
|
+
attr_accessor capture_network_events: bool?
|
|
61
|
+
attr_accessor session_affinity: bool?
|
|
62
62
|
|
|
63
|
-
|
|
63
|
+
def initialize: (?mode: BrowserMode, ?backend: BrowserBackend, ?endpoint: String, ?timeout: Integer, ?wait: BrowserWait, ?wait_selector: String, ?extra_wait: Integer, ?proxy: ProxyConfig, ?block_url_patterns: Array[String], ?eval_script: String, ?robots_user_agent: String, ?capture_network_events: bool, ?session_affinity: bool) -> void
|
|
64
64
|
def self.default: () -> BrowserConfig
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
class CrawlConfig
|
|
68
|
+
attr_accessor max_depth: Integer?
|
|
69
|
+
attr_accessor max_pages: Integer?
|
|
70
|
+
attr_accessor max_concurrent: Integer?
|
|
71
|
+
attr_accessor respect_robots_txt: bool?
|
|
72
|
+
attr_accessor soft_http_errors: bool?
|
|
73
|
+
attr_accessor user_agent: String?
|
|
74
|
+
attr_accessor stay_on_domain: bool?
|
|
75
|
+
attr_accessor allow_subdomains: bool?
|
|
76
76
|
attr_accessor include_paths: Array[String]?
|
|
77
77
|
attr_accessor exclude_paths: Array[String]?
|
|
78
78
|
attr_accessor custom_headers: Hash[String, String]?
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
79
|
+
attr_accessor request_timeout: Integer?
|
|
80
|
+
attr_accessor rate_limit_ms: Integer?
|
|
81
|
+
attr_accessor max_redirects: Integer?
|
|
82
|
+
attr_accessor retry_count: Integer?
|
|
83
83
|
attr_accessor retry_codes: Array[Integer]?
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
84
|
+
attr_accessor cookies_enabled: bool?
|
|
85
|
+
attr_accessor auth: AuthConfig?
|
|
86
|
+
attr_accessor max_body_size: Integer?
|
|
87
87
|
attr_accessor remove_tags: Array[String]?
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
88
|
+
attr_accessor content: ContentConfig?
|
|
89
|
+
attr_accessor map_limit: Integer?
|
|
90
|
+
attr_accessor map_search: String?
|
|
91
|
+
attr_accessor download_assets: bool?
|
|
92
92
|
attr_accessor asset_types: Array[AssetCategory]?
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
93
|
+
attr_accessor max_asset_size: Integer?
|
|
94
|
+
attr_accessor browser: BrowserConfig?
|
|
95
|
+
attr_accessor proxy: ProxyConfig?
|
|
96
96
|
attr_accessor user_agents: Array[String]?
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
97
|
+
attr_accessor capture_screenshot: bool?
|
|
98
|
+
attr_accessor follow_document_urls: bool?
|
|
99
|
+
attr_accessor document_url_depth: Integer?
|
|
100
|
+
attr_accessor download_documents: bool?
|
|
101
|
+
attr_accessor document_max_size: Integer?
|
|
102
102
|
attr_accessor document_mime_types: Array[String]?
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
103
|
+
attr_accessor warc_output: String?
|
|
104
|
+
attr_accessor browser_profile: String?
|
|
105
|
+
attr_accessor save_browser_profile: bool?
|
|
106
|
+
attr_accessor ssrf: SsrfPolicy?
|
|
107
107
|
|
|
108
|
-
|
|
108
|
+
def initialize: (?max_depth: Integer, ?max_pages: Integer, ?max_concurrent: Integer, ?respect_robots_txt: bool, ?soft_http_errors: bool, ?user_agent: String, ?stay_on_domain: bool, ?allow_subdomains: bool, ?include_paths: Array[String], ?exclude_paths: Array[String], ?custom_headers: Hash[String, String], ?request_timeout: Integer, ?rate_limit_ms: Integer, ?max_redirects: Integer, ?retry_count: Integer, ?retry_codes: Array[Integer], ?cookies_enabled: bool, ?auth: AuthConfig, ?max_body_size: Integer, ?remove_tags: Array[String], ?content: ContentConfig, ?map_limit: Integer, ?map_search: String, ?download_assets: bool, ?asset_types: Array[AssetCategory], ?max_asset_size: Integer, ?browser: BrowserConfig, ?proxy: ProxyConfig, ?user_agents: Array[String], ?capture_screenshot: bool, ?follow_document_urls: bool, ?document_url_depth: Integer, ?download_documents: bool, ?document_max_size: Integer, ?document_mime_types: Array[String], ?warc_output: String, ?browser_profile: String, ?save_browser_profile: bool, ?ssrf: SsrfPolicy) -> void
|
|
109
109
|
def validate: () -> void
|
|
110
110
|
def self.default: () -> CrawlConfig
|
|
111
|
-
|
|
111
|
+
end
|
|
112
112
|
|
|
113
|
-
|
|
114
|
-
|
|
113
|
+
class BrowserExtras
|
|
114
|
+
attr_accessor eval_result: json_value?
|
|
115
115
|
attr_accessor network_events: Array[ResponseMeta]?
|
|
116
116
|
attr_accessor cookies: Array[CookieInfo]?
|
|
117
117
|
|
|
118
|
-
|
|
119
|
-
|
|
118
|
+
def initialize: (?eval_result: json_value, ?network_events: Array[ResponseMeta], ?cookies: Array[CookieInfo]) -> void
|
|
119
|
+
end
|
|
120
120
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
121
|
+
class DownloadedDocument
|
|
122
|
+
attr_accessor url: String?
|
|
123
|
+
attr_accessor mime_type: String?
|
|
124
|
+
attr_accessor size: Integer?
|
|
125
|
+
attr_accessor filename: String?
|
|
126
|
+
attr_accessor content_hash: String?
|
|
127
127
|
attr_accessor headers: Hash[String, String]?
|
|
128
128
|
|
|
129
|
-
|
|
130
|
-
|
|
129
|
+
def initialize: (?url: String, ?mime_type: String, ?size: Integer, ?filename: String, ?content_hash: String, ?headers: Hash[String, String]) -> void
|
|
130
|
+
end
|
|
131
131
|
|
|
132
|
-
|
|
132
|
+
class InteractionResult
|
|
133
133
|
attr_accessor action_results: Array[ActionResult]?
|
|
134
|
-
|
|
135
|
-
|
|
134
|
+
attr_accessor final_html: String?
|
|
135
|
+
attr_accessor final_url: String?
|
|
136
136
|
|
|
137
|
-
|
|
138
|
-
|
|
137
|
+
def initialize: (?action_results: Array[ActionResult], ?final_html: String, ?final_url: String) -> void
|
|
138
|
+
end
|
|
139
139
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
140
|
+
class ActionResult
|
|
141
|
+
attr_accessor action_index: Integer?
|
|
142
|
+
attr_accessor action_type: String?
|
|
143
|
+
attr_accessor success: bool?
|
|
144
|
+
attr_accessor data: json_value?
|
|
145
|
+
attr_accessor error: String?
|
|
146
146
|
|
|
147
147
|
def initialize: (?action_index: Integer, ?action_type: String, ?success: bool, ?data: json_value, ?error: String) -> void
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
class ScrapeResult
|
|
151
|
+
attr_accessor status_code: Integer?
|
|
152
|
+
attr_accessor final_url: String?
|
|
153
|
+
attr_accessor content_type: String?
|
|
154
|
+
attr_accessor html: String?
|
|
155
|
+
attr_accessor body_size: Integer?
|
|
156
|
+
attr_accessor metadata: PageMetadata?
|
|
157
157
|
attr_accessor links: Array[LinkInfo]?
|
|
158
158
|
attr_accessor images: Array[ImageInfo]?
|
|
159
159
|
attr_accessor feeds: Array[FeedInfo]?
|
|
160
160
|
attr_accessor json_ld: Array[JsonLdEntry]?
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
161
|
+
attr_accessor is_allowed: bool?
|
|
162
|
+
attr_accessor crawl_delay: Integer?
|
|
163
|
+
attr_accessor noindex_detected: bool?
|
|
164
|
+
attr_accessor nofollow_detected: bool?
|
|
165
|
+
attr_accessor x_robots_tag: String?
|
|
166
|
+
attr_accessor is_pdf: bool?
|
|
167
|
+
attr_accessor was_skipped: bool?
|
|
168
|
+
attr_accessor detected_charset: String?
|
|
169
|
+
attr_accessor auth_header_sent: bool?
|
|
170
|
+
attr_accessor response_meta: ResponseMeta?
|
|
171
171
|
attr_accessor assets: Array[DownloadedAsset]?
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
172
|
+
attr_accessor js_render_hint: bool?
|
|
173
|
+
attr_accessor browser_used: bool?
|
|
174
|
+
attr_accessor markdown: MarkdownResult?
|
|
175
|
+
attr_accessor extracted_data: json_value?
|
|
176
|
+
attr_accessor extraction_meta: ExtractionMeta?
|
|
177
|
+
attr_accessor downloaded_document: DownloadedDocument?
|
|
178
|
+
attr_accessor browser: BrowserExtras?
|
|
179
|
+
|
|
180
|
+
def initialize: (?status_code: Integer, ?final_url: String, ?content_type: String, ?html: String, ?body_size: Integer, ?metadata: PageMetadata, ?links: Array[LinkInfo], ?images: Array[ImageInfo], ?feeds: Array[FeedInfo], ?json_ld: Array[JsonLdEntry], ?is_allowed: bool, ?crawl_delay: Integer, ?noindex_detected: bool, ?nofollow_detected: bool, ?x_robots_tag: String, ?is_pdf: bool, ?was_skipped: bool, ?detected_charset: String, ?auth_header_sent: bool, ?response_meta: ResponseMeta, ?assets: Array[DownloadedAsset], ?js_render_hint: bool, ?browser_used: bool, ?markdown: MarkdownResult, ?extracted_data: json_value, ?extraction_meta: ExtractionMeta, ?downloaded_document: DownloadedDocument, ?browser: BrowserExtras) -> void
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
class CrawlPageResult
|
|
184
|
+
attr_accessor url: String?
|
|
185
|
+
attr_accessor normalized_url: String?
|
|
186
|
+
attr_accessor status_code: Integer?
|
|
187
|
+
attr_accessor content_type: String?
|
|
188
|
+
attr_accessor html: String?
|
|
189
|
+
attr_accessor body_size: Integer?
|
|
190
|
+
attr_accessor metadata: PageMetadata?
|
|
191
191
|
attr_accessor links: Array[LinkInfo]?
|
|
192
192
|
attr_accessor images: Array[ImageInfo]?
|
|
193
193
|
attr_accessor feeds: Array[FeedInfo]?
|
|
194
194
|
attr_accessor json_ld: Array[JsonLdEntry]?
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
195
|
+
attr_accessor depth: Integer?
|
|
196
|
+
attr_accessor stayed_on_domain: bool?
|
|
197
|
+
attr_accessor was_skipped: bool?
|
|
198
|
+
attr_accessor is_pdf: bool?
|
|
199
|
+
attr_accessor detected_charset: String?
|
|
200
|
+
attr_accessor markdown: MarkdownResult?
|
|
201
|
+
attr_accessor extracted_data: json_value?
|
|
202
|
+
attr_accessor extraction_meta: ExtractionMeta?
|
|
203
|
+
attr_accessor downloaded_document: DownloadedDocument?
|
|
204
|
+
attr_accessor browser_used: bool?
|
|
205
|
+
|
|
206
|
+
def initialize: (?url: String, ?normalized_url: String, ?status_code: Integer, ?content_type: String, ?html: String, ?body_size: Integer, ?metadata: PageMetadata, ?links: Array[LinkInfo], ?images: Array[ImageInfo], ?feeds: Array[FeedInfo], ?json_ld: Array[JsonLdEntry], ?depth: Integer, ?stayed_on_domain: bool, ?was_skipped: bool, ?is_pdf: bool, ?detected_charset: String, ?markdown: MarkdownResult, ?extracted_data: json_value, ?extraction_meta: ExtractionMeta, ?downloaded_document: DownloadedDocument, ?browser_used: bool) -> void
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
class CrawlResult
|
|
210
210
|
attr_accessor pages: Array[CrawlPageResult]?
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
211
|
+
attr_accessor final_url: String?
|
|
212
|
+
attr_accessor redirect_count: Integer?
|
|
213
|
+
attr_accessor was_skipped: bool?
|
|
214
|
+
attr_accessor error: String?
|
|
215
215
|
attr_accessor cookies: Array[CookieInfo]?
|
|
216
|
-
|
|
217
|
-
|
|
216
|
+
attr_accessor stayed_on_domain: bool?
|
|
217
|
+
attr_accessor browser_used: bool?
|
|
218
218
|
|
|
219
|
-
|
|
219
|
+
def initialize: (?pages: Array[CrawlPageResult], ?final_url: String, ?redirect_count: Integer, ?was_skipped: bool, ?error: String, ?cookies: Array[CookieInfo], ?stayed_on_domain: bool, ?browser_used: bool) -> void
|
|
220
220
|
def unique_normalized_urls: () -> Integer
|
|
221
|
-
|
|
221
|
+
end
|
|
222
222
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
223
|
+
class SitemapUrl
|
|
224
|
+
attr_accessor url: String?
|
|
225
|
+
attr_accessor lastmod: String?
|
|
226
|
+
attr_accessor changefreq: String?
|
|
227
|
+
attr_accessor priority: String?
|
|
228
228
|
|
|
229
229
|
def initialize: (?url: String, ?lastmod: String, ?changefreq: String, ?priority: String) -> void
|
|
230
|
-
|
|
230
|
+
end
|
|
231
231
|
|
|
232
|
-
|
|
232
|
+
class MapResult
|
|
233
233
|
attr_accessor urls: Array[SitemapUrl]?
|
|
234
234
|
|
|
235
|
-
|
|
236
|
-
|
|
235
|
+
def initialize: (?urls: Array[SitemapUrl]) -> void
|
|
236
|
+
end
|
|
237
237
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
238
|
+
class MarkdownResult
|
|
239
|
+
attr_accessor content: String?
|
|
240
|
+
attr_accessor document_structure: json_value?
|
|
241
241
|
attr_accessor tables: Array[json_value]?
|
|
242
242
|
attr_accessor warnings: Array[String]?
|
|
243
|
-
|
|
244
|
-
|
|
243
|
+
attr_accessor citations: bool?
|
|
244
|
+
attr_accessor fit_content: String?
|
|
245
245
|
|
|
246
|
-
|
|
247
|
-
|
|
246
|
+
def initialize: (?content: String, ?document_structure: json_value, ?tables: Array[json_value], ?warnings: Array[String], ?citations: bool, ?fit_content: String) -> void
|
|
247
|
+
end
|
|
248
248
|
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
249
|
+
class LinkInfo
|
|
250
|
+
attr_accessor url: String?
|
|
251
|
+
attr_accessor text: String?
|
|
252
|
+
attr_accessor link_type: LinkType?
|
|
253
|
+
attr_accessor rel: String?
|
|
254
|
+
attr_accessor nofollow: bool?
|
|
255
255
|
|
|
256
256
|
def initialize: (?url: String, ?text: String, ?link_type: LinkType, ?rel: String, ?nofollow: bool) -> void
|
|
257
|
-
|
|
257
|
+
end
|
|
258
258
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
259
|
+
class ImageInfo
|
|
260
|
+
attr_accessor url: String?
|
|
261
|
+
attr_accessor alt: String?
|
|
262
|
+
attr_accessor width: Integer?
|
|
263
|
+
attr_accessor height: Integer?
|
|
264
|
+
attr_accessor source: ImageSource?
|
|
265
265
|
|
|
266
266
|
def initialize: (?url: String, ?alt: String, ?width: Integer, ?height: Integer, ?source: ImageSource) -> void
|
|
267
|
-
|
|
267
|
+
end
|
|
268
268
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
269
|
+
class FeedInfo
|
|
270
|
+
attr_accessor url: String?
|
|
271
|
+
attr_accessor title: String?
|
|
272
|
+
attr_accessor feed_type: FeedType?
|
|
273
273
|
|
|
274
274
|
def initialize: (?url: String, ?title: String, ?feed_type: FeedType) -> void
|
|
275
|
-
|
|
275
|
+
end
|
|
276
276
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
277
|
+
class JsonLdEntry
|
|
278
|
+
attr_accessor schema_type: String?
|
|
279
|
+
attr_accessor name: String?
|
|
280
|
+
attr_accessor raw: String?
|
|
281
281
|
|
|
282
282
|
def initialize: (?schema_type: String, ?name: String, ?raw: String) -> void
|
|
283
|
-
|
|
283
|
+
end
|
|
284
284
|
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
285
|
+
class CookieInfo
|
|
286
|
+
attr_accessor name: String?
|
|
287
|
+
attr_accessor value: String?
|
|
288
|
+
attr_accessor domain: String?
|
|
289
|
+
attr_accessor path: String?
|
|
290
290
|
|
|
291
291
|
def initialize: (?name: String, ?value: String, ?domain: String, ?path: String) -> void
|
|
292
|
-
|
|
292
|
+
end
|
|
293
293
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
294
|
+
class DownloadedAsset
|
|
295
|
+
attr_accessor url: String?
|
|
296
|
+
attr_accessor content_hash: String?
|
|
297
|
+
attr_accessor mime_type: String?
|
|
298
|
+
attr_accessor size: Integer?
|
|
299
|
+
attr_accessor asset_category: AssetCategory?
|
|
300
|
+
attr_accessor html_tag: String?
|
|
301
301
|
|
|
302
302
|
def initialize: (?url: String, ?content_hash: String, ?mime_type: String, ?size: Integer, ?asset_category: AssetCategory, ?html_tag: String) -> void
|
|
303
|
-
|
|
303
|
+
end
|
|
304
304
|
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
305
|
+
class ArticleMetadata
|
|
306
|
+
attr_accessor published_time: String?
|
|
307
|
+
attr_accessor modified_time: String?
|
|
308
|
+
attr_accessor author: String?
|
|
309
|
+
attr_accessor section: String?
|
|
310
310
|
attr_accessor tags: Array[String]?
|
|
311
311
|
|
|
312
|
-
|
|
313
|
-
|
|
312
|
+
def initialize: (?published_time: String, ?modified_time: String, ?author: String, ?section: String, ?tags: Array[String]) -> void
|
|
313
|
+
end
|
|
314
314
|
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
315
|
+
class HreflangEntry
|
|
316
|
+
attr_accessor lang: String?
|
|
317
|
+
attr_accessor url: String?
|
|
318
318
|
|
|
319
319
|
def initialize: (?lang: String, ?url: String) -> void
|
|
320
|
-
|
|
320
|
+
end
|
|
321
321
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
322
|
+
class FaviconInfo
|
|
323
|
+
attr_accessor url: String?
|
|
324
|
+
attr_accessor rel: String?
|
|
325
|
+
attr_accessor sizes: String?
|
|
326
|
+
attr_accessor mime_type: String?
|
|
327
327
|
|
|
328
328
|
def initialize: (?url: String, ?rel: String, ?sizes: String, ?mime_type: String) -> void
|
|
329
|
-
|
|
329
|
+
end
|
|
330
330
|
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
331
|
+
class HeadingInfo
|
|
332
|
+
attr_accessor level: Integer?
|
|
333
|
+
attr_accessor text: String?
|
|
334
334
|
|
|
335
335
|
def initialize: (?level: Integer, ?text: String) -> void
|
|
336
|
-
|
|
336
|
+
end
|
|
337
337
|
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
338
|
+
class ResponseMeta
|
|
339
|
+
attr_accessor etag: String?
|
|
340
|
+
attr_accessor last_modified: String?
|
|
341
|
+
attr_accessor cache_control: String?
|
|
342
|
+
attr_accessor server: String?
|
|
343
|
+
attr_accessor x_powered_by: String?
|
|
344
|
+
attr_accessor content_language: String?
|
|
345
|
+
attr_accessor content_encoding: String?
|
|
346
346
|
|
|
347
347
|
def initialize: (?etag: String, ?last_modified: String, ?cache_control: String, ?server: String, ?x_powered_by: String, ?content_language: String, ?content_encoding: String) -> void
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
class PageMetadata
|
|
351
|
+
attr_accessor title: String?
|
|
352
|
+
attr_accessor description: String?
|
|
353
|
+
attr_accessor canonical_url: String?
|
|
354
|
+
attr_accessor keywords: String?
|
|
355
|
+
attr_accessor author: String?
|
|
356
|
+
attr_accessor viewport: String?
|
|
357
|
+
attr_accessor theme_color: String?
|
|
358
|
+
attr_accessor generator: String?
|
|
359
|
+
attr_accessor robots: String?
|
|
360
|
+
attr_accessor html_lang: String?
|
|
361
|
+
attr_accessor html_dir: String?
|
|
362
|
+
attr_accessor og_title: String?
|
|
363
|
+
attr_accessor og_type: String?
|
|
364
|
+
attr_accessor og_image: String?
|
|
365
|
+
attr_accessor og_description: String?
|
|
366
|
+
attr_accessor og_url: String?
|
|
367
|
+
attr_accessor og_site_name: String?
|
|
368
|
+
attr_accessor og_locale: String?
|
|
369
|
+
attr_accessor og_video: String?
|
|
370
|
+
attr_accessor og_audio: String?
|
|
371
371
|
attr_accessor og_locale_alternates: Array[String]?
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
372
|
+
attr_accessor twitter_card: String?
|
|
373
|
+
attr_accessor twitter_title: String?
|
|
374
|
+
attr_accessor twitter_description: String?
|
|
375
|
+
attr_accessor twitter_image: String?
|
|
376
|
+
attr_accessor twitter_site: String?
|
|
377
|
+
attr_accessor twitter_creator: String?
|
|
378
|
+
attr_accessor dc_title: String?
|
|
379
|
+
attr_accessor dc_creator: String?
|
|
380
|
+
attr_accessor dc_subject: String?
|
|
381
|
+
attr_accessor dc_description: String?
|
|
382
|
+
attr_accessor dc_publisher: String?
|
|
383
|
+
attr_accessor dc_date: String?
|
|
384
|
+
attr_accessor dc_type: String?
|
|
385
|
+
attr_accessor dc_format: String?
|
|
386
|
+
attr_accessor dc_identifier: String?
|
|
387
|
+
attr_accessor dc_language: String?
|
|
388
|
+
attr_accessor dc_rights: String?
|
|
389
|
+
attr_accessor article: ArticleMetadata?
|
|
390
390
|
attr_accessor hreflangs: Array[HreflangEntry]?
|
|
391
391
|
attr_accessor favicons: Array[FaviconInfo]?
|
|
392
392
|
attr_accessor headings: Array[HeadingInfo]?
|
|
393
|
-
|
|
393
|
+
attr_accessor word_count: Integer?
|
|
394
394
|
|
|
395
|
-
|
|
396
|
-
|
|
395
|
+
def initialize: (?title: String, ?description: String, ?canonical_url: String, ?keywords: String, ?author: String, ?viewport: String, ?theme_color: String, ?generator: String, ?robots: String, ?html_lang: String, ?html_dir: String, ?og_title: String, ?og_type: String, ?og_image: String, ?og_description: String, ?og_url: String, ?og_site_name: String, ?og_locale: String, ?og_video: String, ?og_audio: String, ?og_locale_alternates: Array[String], ?twitter_card: String, ?twitter_title: String, ?twitter_description: String, ?twitter_image: String, ?twitter_site: String, ?twitter_creator: String, ?dc_title: String, ?dc_creator: String, ?dc_subject: String, ?dc_description: String, ?dc_publisher: String, ?dc_date: String, ?dc_type: String, ?dc_format: String, ?dc_identifier: String, ?dc_language: String, ?dc_rights: String, ?article: ArticleMetadata, ?hreflangs: Array[HreflangEntry], ?favicons: Array[FaviconInfo], ?headings: Array[HeadingInfo], ?word_count: Integer) -> void
|
|
396
|
+
end
|
|
397
397
|
|
|
398
|
-
|
|
399
|
-
|
|
398
|
+
class CrawlStreamRequest
|
|
399
|
+
attr_accessor url: String?
|
|
400
400
|
|
|
401
401
|
def initialize: (?url: String) -> void
|
|
402
|
-
|
|
402
|
+
end
|
|
403
403
|
|
|
404
|
-
|
|
404
|
+
class BatchCrawlStreamRequest
|
|
405
405
|
attr_accessor urls: Array[String]?
|
|
406
406
|
|
|
407
|
-
|
|
408
|
-
|
|
407
|
+
def initialize: (?urls: Array[String]) -> void
|
|
408
|
+
end
|
|
409
409
|
|
|
410
|
-
|
|
411
|
-
|
|
410
|
+
class CitationResult
|
|
411
|
+
attr_accessor content: String?
|
|
412
412
|
attr_accessor references: Array[CitationReference]?
|
|
413
413
|
|
|
414
|
-
|
|
415
|
-
|
|
414
|
+
def initialize: (?content: String, ?references: Array[CitationReference]) -> void
|
|
415
|
+
end
|
|
416
416
|
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
417
|
+
class CitationReference
|
|
418
|
+
attr_accessor index: Integer?
|
|
419
|
+
attr_accessor url: String?
|
|
420
|
+
attr_accessor text: String?
|
|
421
421
|
|
|
422
422
|
def initialize: (?index: Integer, ?url: String, ?text: String) -> void
|
|
423
|
-
|
|
423
|
+
end
|
|
424
424
|
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
425
|
+
class CrawlEngineHandle
|
|
426
|
+
def crawl_stream: (CrawlStreamRequest req) -> Enumerator[CrawlStreamIterator]
|
|
427
|
+
def batch_crawl_stream: (BatchCrawlStreamRequest req) -> Enumerator[BatchCrawlStreamIterator]
|
|
428
|
+
end
|
|
429
429
|
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
430
|
+
class BatchScrapeResult
|
|
431
|
+
attr_accessor url: String?
|
|
432
|
+
attr_accessor result: ScrapeResult?
|
|
433
|
+
attr_accessor error: String?
|
|
434
434
|
|
|
435
435
|
def initialize: (?url: String, ?result: ScrapeResult, ?error: String) -> void
|
|
436
|
-
|
|
436
|
+
end
|
|
437
437
|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
438
|
+
class BatchCrawlResult
|
|
439
|
+
attr_accessor url: String?
|
|
440
|
+
attr_accessor result: CrawlResult?
|
|
441
|
+
attr_accessor error: String?
|
|
442
442
|
|
|
443
443
|
def initialize: (?url: String, ?result: CrawlResult, ?error: String) -> void
|
|
444
|
-
|
|
444
|
+
end
|
|
445
445
|
|
|
446
|
-
|
|
446
|
+
class BatchScrapeResults
|
|
447
447
|
attr_accessor results: Array[BatchScrapeResult]?
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
448
|
+
attr_accessor total_count: Integer?
|
|
449
|
+
attr_accessor completed_count: Integer?
|
|
450
|
+
attr_accessor failed_count: Integer?
|
|
451
451
|
|
|
452
|
-
|
|
453
|
-
|
|
452
|
+
def initialize: (?results: Array[BatchScrapeResult], ?total_count: Integer, ?completed_count: Integer, ?failed_count: Integer) -> void
|
|
453
|
+
end
|
|
454
454
|
|
|
455
|
-
|
|
455
|
+
class BatchCrawlResults
|
|
456
456
|
attr_accessor results: Array[BatchCrawlResult]?
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
457
|
+
attr_accessor total_count: Integer?
|
|
458
|
+
attr_accessor completed_count: Integer?
|
|
459
|
+
attr_accessor failed_count: Integer?
|
|
460
460
|
|
|
461
|
-
|
|
462
|
-
|
|
461
|
+
def initialize: (?results: Array[BatchCrawlResult], ?total_count: Integer, ?completed_count: Integer, ?failed_count: Integer) -> void
|
|
462
|
+
end
|
|
463
463
|
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
464
|
+
class SsrfPolicy
|
|
465
|
+
attr_accessor deny_private: bool?
|
|
466
|
+
attr_accessor max_redirects: Integer?
|
|
467
467
|
|
|
468
468
|
def initialize: (?deny_private: bool, ?max_redirects: Integer) -> void
|
|
469
469
|
def self.default: () -> SsrfPolicy
|
|
470
470
|
def self.from_env: () -> SsrfPolicy
|
|
471
|
-
|
|
471
|
+
end
|
|
472
472
|
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
473
|
+
class BrowserMode
|
|
474
|
+
type value = :auto | :always | :never | :stealth
|
|
475
|
+
end
|
|
476
476
|
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
477
|
+
class BrowserWait
|
|
478
|
+
type value = :network_idle | :selector | :fixed
|
|
479
|
+
end
|
|
480
480
|
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
481
|
+
class BrowserBackend
|
|
482
|
+
type value = :chromiumoxide | :native
|
|
483
|
+
end
|
|
484
484
|
|
|
485
|
-
|
|
486
|
-
|
|
485
|
+
class AuthConfig
|
|
486
|
+
end
|
|
487
487
|
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
488
|
+
class LinkType
|
|
489
|
+
type value = :internal | :external | :anchor | :document
|
|
490
|
+
end
|
|
491
491
|
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
492
|
+
class ImageSource
|
|
493
|
+
type value = :img | :picture_source | :og_image | :twitter_image
|
|
494
|
+
end
|
|
495
495
|
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
496
|
+
class FeedType
|
|
497
|
+
type value = :rss | :atom | :json_feed
|
|
498
|
+
end
|
|
499
499
|
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
500
|
+
class AssetCategory
|
|
501
|
+
type value = :document | :image | :audio | :video | :font | :stylesheet | :script | :archive | :data | :other
|
|
502
|
+
end
|
|
503
503
|
|
|
504
|
-
|
|
505
|
-
|
|
504
|
+
class CrawlEvent
|
|
505
|
+
end
|
|
506
506
|
|
|
507
|
-
|
|
508
|
-
|
|
507
|
+
class PageAction
|
|
508
|
+
end
|
|
509
509
|
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
510
|
+
class ScrollDirection
|
|
511
|
+
type value = :up | :down
|
|
512
|
+
end
|
|
513
513
|
|
|
514
|
-
|
|
514
|
+
def self.generate_citations: (String markdown) -> CitationResult
|
|
515
515
|
|
|
516
|
-
|
|
516
|
+
def self.create_engine: (?CrawlConfig config) -> CrawlEngineHandle
|
|
517
517
|
|
|
518
|
-
|
|
518
|
+
def self.scrape: (CrawlEngineHandle engine, String url) -> ScrapeResult
|
|
519
519
|
|
|
520
|
-
|
|
520
|
+
def self.crawl: (CrawlEngineHandle engine, String url) -> CrawlResult
|
|
521
521
|
|
|
522
|
-
|
|
522
|
+
def self.map_urls: (CrawlEngineHandle engine, String url) -> MapResult
|
|
523
523
|
|
|
524
|
-
|
|
524
|
+
def self.interact: (CrawlEngineHandle engine, String url, Array[PageAction] actions) -> InteractionResult
|
|
525
525
|
|
|
526
|
-
|
|
526
|
+
def self.batch_scrape: (CrawlEngineHandle engine, Array[String] urls) -> BatchScrapeResults
|
|
527
527
|
|
|
528
|
-
|
|
528
|
+
def self.batch_crawl: (CrawlEngineHandle engine, Array[String] urls) -> BatchCrawlResults
|
|
529
529
|
|
|
530
530
|
end
|