curlyq 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/CHANGELOG.md +8 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +41 -0
- data/LICENSE.txt +19 -0
- data/README.md +233 -0
- data/README.rdoc +6 -0
- data/Rakefile +77 -0
- data/bin/curlyq +477 -0
- data/curlyq.gemspec +27 -0
- data/curlyq.rdoc +355 -0
- data/lib/curly/array.rb +134 -0
- data/lib/curly/curl/html.rb +720 -0
- data/lib/curly/curl/json.rb +108 -0
- data/lib/curly/curl.rb +7 -0
- data/lib/curly/hash.rb +200 -0
- data/lib/curly/string.rb +91 -0
- data/lib/curly/version.rb +3 -0
- data/lib/curly.rb +12 -0
- data/src/_README.md +101 -0
- data/test/default_test.rb +14 -0
- data/test/test_helper.rb +4 -0
- metadata +191 -0
@@ -0,0 +1,720 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Curl
|
4
|
+
# String helpers
class ::String
  ##
  ## Replace non-breaking-space entities with plain spaces.
  ##
  ## Fix: the entity pattern had been corrupted into a literal
  ## non-breaking-space character; this matches the "&nbsp;" entity
  ## and, for robustness, the raw U+00A0 character as well.
  ##
  ## @return [String] new string with entities replaced
  ##
  def remove_entities
    gsub(/&nbsp;|\u00A0/, ' ')
  end
end
|
10
|
+
|
11
|
+
# Class for CURLing an HTML page
|
12
|
+
class Html
|
13
|
+
attr_reader :url, :code, :headers, :meta, :links, :head, :body,
|
14
|
+
:source, :title, :description, :body_links, :body_images, :clean
|
15
|
+
|
16
|
+
##
## Hash representation of the page.
##
## @param url [String] fallback URL used when @url is unset
##
## @return [Hash] url, code, headers, meta, meta_links, head,
##         body, source, title, description, links, and images
##
def to_data(url: nil)
  data = {}
  data[:url] = @url || url
  data[:code] = @code
  data[:headers] = @headers
  data[:meta] = @meta
  data[:meta_links] = @links
  data[:head] = @head
  data[:body] = @body
  data[:source] = @source
  data[:title] = @title
  data[:description] = @description
  data[:links] = @body_links
  data[:images] = @body_images
  data
end
|
32
|
+
|
33
|
+
##
## Create a new page object from a URL
##
## @param url     [String] The url
## @param browser [Symbol, String] Browser to render dynamic (JS) pages; :none/nil skips rendering
## @param source  [String] Pre-fetched page source, parsed instead of curling (when url is nil)
## @param headers [Hash] The headers to use in the curl call
## @param headers_only [Boolean] Return headers only
## @param compressed   [Boolean] Expect compressed result
## @param clean        [Boolean] Clean extracted text content
## @param fallback     [Boolean, Symbol] Browser to fall back to when curl returns nothing
## @param ignore_local_links    [Boolean] Skip links without a scheme/host
## @param ignore_fragment_links [Boolean] Skip #fragment links
## @param external_links_only   [Boolean] Keep only links to other hosts
##
## @return [HTMLCurl] new page object
##
def initialize(url, browser: nil, source: nil, headers: nil,
               headers_only: false, compressed: false, clean: false, fallback: false,
               ignore_local_links: false, ignore_fragment_links: false, external_links_only: false)
  @clean = clean
  @ignore_local_links = ignore_local_links
  @ignore_fragment_links = ignore_fragment_links
  @external_links_only = external_links_only
  # Locate the curl binary once; used by curl_html
  @curl = TTY::Which.which('curl')
  @url = url
  res = if url && browser && browser != :none
          # Render the page with Selenium first, then parse the captured source
          source = curl_dynamic_html(url, browser, headers)
          curl_html(nil, source: source, headers: headers)
        elsif url.nil? && !source.nil?
          # Parse caller-provided source without fetching anything
          curl_html(nil, source: source, headers: headers, headers_only: headers_only, compressed: compressed, fallback: false)
        else
          curl_html(url, headers: headers, headers_only: headers_only, compressed: compressed, fallback: fallback)
        end
  @url = res[:url]
  @code = res[:code]
  @headers = res[:headers]
  @meta = res[:meta]
  @links = res[:links]
  @head = res[:head] unless res[:head].nil?
  # Normalize body text to UTF-8
  @body = reencode(res[:body])
  @source = res[:source]
  # Prefer OpenGraph values over plain meta when both exist
  @title = @meta['og:title'] || @meta['title'] unless @meta.nil?
  @description = @meta['og:description'] || @meta['description'] unless @meta.nil?
  @body_links = content_links
  @body_images = content_images
end
|
73
|
+
|
74
|
+
##
## Save a screenshot of the url
##
## @param destination [String] The file destination
## @param browser     [Symbol] The browser (:firefox, :chrome)
## @param type        [Symbol] The type of screenshot to save
##                    (:full_page, :print_page, :visible)
##
## Fix: the previous version computed `full_page`/`print_page` locals
## that were never used; the type dispatch happens in #save_screenshot,
## so this now simply delegates.
##
def screenshot(destination = nil, browser: :chrome, type: :full_page)
  save_screenshot(destination, browser: browser, type: type)
end
|
90
|
+
|
91
|
+
##
## Extract text between two regular expressions
##
## @param before [String, Regexp] pattern marking the start of a span
## @param after  [String, Regexp] pattern marking the end of a span
##
## @return [Array] array of matched spans
##
def extract(before, after)
  lead = before.is_a?(Regexp) ? before : /#{Regexp.escape(before)}/
  tail = after.is_a?(Regexp) ? after : /#{Regexp.escape(after)}/
  pattern = /(?<=#{lead.source})(.*?)(?=#{tail.source})/m
  @body.scan(pattern).map { |match| @clean ? match[0].clean : match[0] }
end
|
105
|
+
|
106
|
+
##
## Extract an array of tags or tag attributes
##
## @param tag       [String]  The tag
## @param attribute [String]  The attribute
## @param source    [Boolean] Return full tag source (negates attribute if true)
## @param content   [Boolean] Return only tag contents
##
## @return [Hash, Array] if source, return array of full tags; if
##         content, return array of tag contents; if attribute is
##         given, array of that attribute's values; otherwise array
##         of hashes with tag, source, attrs, and content
##
## @example page.extract_tag('h1') => [Array of h1 tag contents]
## @example page.extract_tag('img', 'src') => [Array of img src attributes]
##
def extract_tag(tag, attribute = nil, source: false, content: false)
  matches = extract_tag_contents(tag, source: true)

  return matches if source

  parsed = matches.map do |html|
    pairs = html.to_enum(:scan, /(\S+)=(['"])(.*?)\2/).map { Regexp.last_match }
    attrs = pairs.each_with_object({}) { |pair, acc| acc[pair[1]] = pair[3] }
    inner = html.match(/<.*?>(?<content>.*?)</)
    text = inner.nil? ? nil : inner['content']
    {
      tag: tag,
      source: html,
      attrs: attrs,
      content: @clean ? text&.clean : text
    }
  end

  return parsed.map { |item| item[:content] } if content
  return parsed if attribute.nil?

  parsed.map { |item| item[:attrs][attribute] }
end
|
152
|
+
|
153
|
+
##
## Extract tag contents or full tag source
##
## @param tag    The tag
## @param source [Boolean] Return full tag instead of contents
##
## @return [Array] array of tag matches/contents
def extract_tag_contents(tag, source: false)
  if source
    @body.scan(%r{<#{tag}.*?>(?:.*?</#{tag}>)?})
  else
    @body.scan(/<#{tag}.*?>(.*?)</).map(&:first)
  end
end
|
165
|
+
|
166
|
+
##
## Return all tags in body, or a specific tag
##
## @param tag [String, Array] The tag to return, can be an array
##
## @return [Array] Array of tags. If no tag is specified, a
##         hierarchical array of all tags in the document is
##         returned. If one or more tags are specified, return a
##         flattened list in document order.
##
## Fix: no longer mutates the caller's argument (the previous
## `tag.map!(&:downcase)` downcased the passed-in array in place).
##
def tags(tag = nil)
  all_tags = content_tags(@body)
  return all_tags if tag.nil?

  wanted = Array(tag).map(&:downcase)
  flatten_tags(all_tags).dup.delete_if { |t| !wanted.include?(t[:tag].downcase) }
end
|
185
|
+
|
186
|
+
##
## Get all images from the page
##
## @param types [Symbol, Array] which image sources to include
##              (:all, :opengraph, :srcset, :img)
##
## @return [Array] Array of images, both from picture sources and img tags
##
## Fixes over the previous version: guards against a nil @meta hash,
## nil attribute lists, and img tags missing width/height/alt/title/src
## attributes (previously `.select {...}.first[:value]` raised
## NoMethodError whenever the attribute was absent).
##
def images(types: :all)
  output = []
  types = [types] unless types.is_a?(Array)
  types.each do |type|
    if %i[all opengraph].include?(type)
      %w[og:image twitter:image].each do |src|
        next unless @meta&.key?(src)

        output << {
          type: 'opengraph',
          attrs: nil,
          src: @meta[src]
        }
      end
    end
    images = tags(%w[img source])
    images.each do |img|
      attrs = img[:attrs] || []
      case img[:tag].downcase
      when /source/
        next unless %i[all srcset].include?(type)

        srcsets = attrs.filter { |k| k[:key] =~ /srcset/i }
        if srcsets.count.positive?
          srcset = []
          srcsets.each do |src|
            src[:value].split(/ *, */).each do |s|
              image, media = s.split(/ /)
              srcset << {
                src: image,
                media: media
              }
            end
          end
          output << {
            type: 'srcset',
            attrs: img[:attrs],
            images: srcset
          }
        end
      when /img/
        next unless %i[all img].include?(type)

        output << {
          type: 'img',
          src: attr_value(attrs, /src/i),
          width: attr_value(attrs, /^width$/) || 'unknown',
          height: attr_value(attrs, /^height$/) || 'unknown',
          alt: attr_value(attrs, /^alt$/),
          title: attr_value(attrs, /^title$/),
          attrs: img[:attrs]
        }
      end
    end
  end
  output
end

##
## Value of the first attribute whose key matches pattern
##
## @param attrs   [Array] array of { key:, value: } hashes
## @param pattern [Regexp] pattern matched against each key
## @return [String, Array, nil] the attribute value, or nil when absent
##
def attr_value(attrs, pattern)
  found = attrs.find { |a| a[:key] =~ pattern }
  found && found[:value]
end
|
253
|
+
|
254
|
+
def to_s
|
255
|
+
headers = @headers.nil? ? 0 : @headers.count
|
256
|
+
meta = @meta.nil? ? 0 : @meta.count
|
257
|
+
links = @links.nil? ? 0 : @links.count
|
258
|
+
[
|
259
|
+
%(<HTMLCurl: @code="#{@code}" @url="#{@url}" @title="#{@title}"),
|
260
|
+
%(@description=#{@description} @headers:#{headers} @meta:#{meta} @links:#{links}>)
|
261
|
+
].join(' ')
|
262
|
+
end
|
263
|
+
|
264
|
+
##
## Return all headers of given level
##
## @param level [Number] The level (1-6); matches any level by default
##
## @return [Array] array of headers with text and all tag attributes as symbols
##
def h(level = '\d')
  matches = @body.to_enum(:scan, %r{<h(?<level>#{level})(?<tag> .*?)?>(?<text>.*?)</h#{level}>}i).map { Regexp.last_match }
  matches.map do |m|
    entry = { level: m['level'] }
    unless m['tag'].nil?
      pairs = m['tag'].to_enum(:scan, /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/).map { Regexp.last_match }
      pairs.each { |pair| entry[pair['attr'].to_sym] = pair['content'] }
    end
    entry[:text] = m['text'].remove_entities
    entry
  end
end
|
287
|
+
|
288
|
+
##
## Convert a Nokogiri element to Curl::Html tag format
##
## @param el [Nokogiri] element to convert
##
## @return [Hash] tag name, source, attrs, content, and child tags
##
def nokogiri_to_tag(el)
  attrs = el.attribute_nodes.map do |node|
    # class and rel attributes are split into word arrays
    value = node.name =~ /^(class|rel)$/ ? node.value.split(/ /) : node.value
    { key: node.name, value: value }
  end

  text = @clean ? el.text&.strip&.clean : el.text.strip
  {
    tag: el.name,
    source: el.to_html,
    attrs: attrs,
    content: text,
    tags: recurse_children(el)
  }
end
|
306
|
+
|
307
|
+
##
## Recursively convert an element's child elements to tag hashes
##
## @param element [Nokogiri] parent element
## @return [Array] converted children, text nodes skipped
##
def recurse_children(element)
  element.children.reject { |child| child.name == 'text' }.map { |child| nokogiri_to_tag(child) }
end
|
316
|
+
|
317
|
+
#-------------------------------------------------------
## Perform a CSS query using Nokogiri
##
## @param path   [String] The CSS path
## @param source [String] HTML to search, defaults to the page source
##
## @return [Array] array of matched elements
##
def search(path, source: @source)
  Nokogiri::HTML(source).search(path).map { |el| nokogiri_to_tag(el) }
end
|
333
|
+
|
334
|
+
private
|
335
|
+
|
336
|
+
##
## Flatten a hierarchical tag array into a single-level list
##
## @param tags [Array] Document tags
## @return [Array] flattened tags (tag, attrs, stripped content),
##         each parent followed by its descendants
##
def flatten_tags(tags)
  tags.flat_map do |t|
    text = @clean ? t[:content]&.strip&.clean : t[:content]&.strip
    entry = { tag: t[:tag], attrs: t[:attrs], content: text }
    t[:tags].nil? ? [entry] : [entry] + flatten_tags(t[:tags])
  end
end
|
352
|
+
|
353
|
+
##
## Return an array of all tags in the content
##
## Builds a recursive structure: each entry carries the tag name, raw
## source, parsed attributes, inner content, and the tags nested inside
## that content. NOTE(review): regex-based parsing — works for the
## well-formed cases the patterns cover; malformed HTML may yield
## incomplete results.
##
## @param content [String] The content to parse
## @return [Array, nil] array of tag hashes, or nil for nil content
##
def content_tags(content)
  return nil if content.nil?

  # Match either a self-closing tag or an open/close pair, capturing
  # the inner content for the latter (extended mode: whitespace in the
  # pattern is insignificant)
  res = content.to_enum(:scan, %r{(?mix)
    <(?<tag>(?!</)[a-z0-9]+)(?<attrs>\s[^>]+)?
    (?:\s*/>|>(?<content>.*?)</\k<tag>>)}).map { Regexp.last_match }
  res.map do |tag|
    if tag['attrs'].nil?
      attrs = nil
    else
      # Parse key="value" pairs; bare (valueless) attributes get a nil value
      attrs = tag['attrs'].strip.to_enum(:scan, /(?ix)
        (?<key>[@a-z0-9-]+)(?:=(?<quot>["'])
        (?<value>[^"']+)\k<quot>|[ >])?/i).map { Regexp.last_match }
      # class and rel values become arrays of words
      attrs.map! { |a| { key: a['key'], value: a['key'] =~ /^(class|rel)$/ ? a['value'].split(/ /) : a['value'] } }
    end
    {
      tag: tag['tag'],
      source: tag.to_s,
      attrs: attrs,
      content: @clean ? tag['content']&.clean : tag['content'],
      # Recurse into the inner content for nested tags
      tags: content_tags(tag['content'])
    }
  end
end
|
382
|
+
|
383
|
+
##
## Extract all meta tags from the document head
##
## @param head [String] The head content
##
## @return [Hash] hash of meta tags and values. Includes the <title>
##         contents under 'title' and any meta-refresh target URL
##         under 'refresh_url' (nil when absent). Returns {} on error.
##
## Fix: 'refresh_url' previously stored a raw MatchData object; it now
## stores the captured URL string (or nil).
##
def meta_tags(head)
  meta = {}
  title = head.match(%r{(?<=<title>)(.*?)(?=</title>)})
  meta['title'] = title.nil? ? nil : title[1]
  refresh = head.match(/http-equiv=(['"])refresh\1(.*?)>/)
  url = refresh.nil? ? nil : refresh[2].match(/url=(.*?)['"]/)
  meta['refresh_url'] = url.nil? ? nil : url[1]
  meta_tags = head.scan(/<meta.*?>/)
  meta_tags.each do |tag|
    meta_name = tag.match(/(?:name|property|http-equiv)=(["'])(.*?)\1/)
    next if meta_name.nil?

    meta_value = tag.match(/(?:content)=(['"])(.*?)\1/)
    next if meta_value.nil?

    meta[meta_name[2].downcase] = meta_value[2]
  end
  meta
rescue StandardError => e
  warn e
  {}
end
|
412
|
+
|
413
|
+
##
## Extract all <link> tags from head
##
## @param head [String] The head content
##
## @return [Array] Array of links (rel, href, type, title), filtered
##         per @ignore_fragment_links / @ignore_local_links /
##         @external_links_only; preload links are always skipped
##
## Fix: the same-origin test is now only evaluated when
## @external_links_only is set, avoiding needless URI parsing for
## every link when the option is off.
##
def link_tags(head)
  links = []
  link_tags = head.scan(/<link.*?>/)
  link_tags.each do |tag|
    link_rel = tag.match(/rel=(['"])(.*?)\1/)
    link_rel = link_rel.nil? ? nil : link_rel[2]

    next if link_rel =~ /preload/

    link_href = tag.match(/href=(["'])(.*?)\1/)
    next if link_href.nil?

    link_href = link_href[2]

    next if link_href =~ /^#/ && (@ignore_fragment_links || @external_links_only)

    next if link_href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)

    next if @external_links_only && same_origin?(link_href)

    link_title = tag.match(/title=(['"])(.*?)\1/)
    link_title = link_title.nil? ? nil : link_title[2]

    link_type = tag.match(/type=(['"])(.*?)\1/)
    link_type = link_type.nil? ? nil : link_type[2]

    links << { rel: link_rel, href: link_href, type: link_type, title: link_title }
  end
  links
end
|
450
|
+
|
451
|
+
##
## Get all links in the body of the page
##
## rel and class are returned as arrays
##
## @return [Array] array of links with href, title, rel, content
##         and class, filtered per @ignore_fragment_links /
##         @ignore_local_links / @external_links_only
##
## Fix: the same-origin test is now only evaluated when
## @external_links_only is set, avoiding needless URI parsing for
## every link when the option is off.
##
def content_links
  links = []
  link_tags = @body.to_enum(:scan, %r{<a ?(?<tag>.*?)>(?<text>.*?)</a>}).map { Regexp.last_match }
  link_tags.each do |m|
    href = m['tag'].match(/href=(["'])(.*?)\1/)
    href = href[2] unless href.nil?
    next if href =~ /^#/ && (@ignore_fragment_links || @external_links_only)

    next if href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)

    next if @external_links_only && same_origin?(href)

    title = m['tag'].match(/title=(["'])(.*?)\1/)
    title = title[2] unless title.nil?
    rel = m['tag'].match(/rel=(["'])(.*?)\1/)
    rel = rel[2].split(/ +/) unless rel.nil?
    link_class = m['tag'].match(/class=(["'])(.*?)\1/)
    link_class = link_class[2].split(/ +/) unless link_class.nil?
    text = m['text'].remove_entities
    links << {
      href: href,
      title: title,
      rel: rel,
      content: text,
      class: link_class
    }
  end
  links
end
|
489
|
+
|
490
|
+
##
## Get all img tags in the body of the page
##
## @return [Array] array of images with src and all attributes
##         (attribute names as symbols)
##
def content_images
  @body.to_enum(:scan, %r{<img (?<tag>.*?)/?>}).map { Regexp.last_match }.map do |m|
    attrs = m['tag'].to_enum(:scan, /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/).map { Regexp.last_match }
    attrs.each_with_object({}) { |a, image| image[a['attr'].to_sym] = a['content'] }
  end
end
|
506
|
+
|
507
|
+
##
## Uses Selenium to load a page, allowing capture of dynamic (JS) pages
##
## @param url     The url
## @param browser [Symbol, String] Selenium browser to drive
## @param headers Unused here; kept for signature compatibility
##
## @return [String] page source
##
def curl_dynamic_html(url, browser, headers)
  browser = browser.normalize_browser_type if browser.is_a?(String)
  source = nil

  driver = Selenium::WebDriver.for browser
  driver.manage.timeouts.implicit_wait = 4
  begin
    driver.get url
    source = driver.page_source
  ensure
    # Always shut the browser down, even if loading raised
    driver.quit
  end

  source
end
|
529
|
+
|
530
|
+
##
## Save a screenshot of a url
##
## @param destination [String] File path destination
## @param browser     [Symbol] The browser (:chrome or :firefox)
## @param type        [Symbol] The type of screenshot (:full_page, :print_page, or :visible)
##
## @raise [RuntimeError] when no URL or destination is given, the target
##        directory doesn't exist, or :full_page is requested on Chrome
##
def save_screenshot(destination = nil, browser: :chrome, type: :full_page)
  raise 'No URL provided' if url.nil?

  raise 'No file destination provided' if destination.nil?

  destination = File.expand_path(destination)

  raise 'Path doesn\'t exist' unless File.directory?(File.dirname(destination))

  # Accept string forms of browser/type
  browser = browser.normalize_browser_type if browser.is_a?(String)
  type = type.normalize_screenshot_type if type.is_a?(String)
  raise 'Can not save full screen with Chrome, use Firefox' if type == :full_page && browser == :chrome

  # Force the extension to match the output format (.pdf for print, .png otherwise)
  destination = case type
                when :print_page
                  "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.pdf"
                else
                  "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.png"
                end

  driver = Selenium::WebDriver.for browser
  driver.manage.timeouts.implicit_wait = 4
  begin
    driver.get @url
    # Dispatch to the capture method matching the requested type
    case type
    when :print_page
      driver.save_print_page(destination)
    when :full_page
      driver.save_full_page_screenshot(destination)
    else
      driver.save_screenshot(destination)
    end
  ensure
    # Always shut the browser down, even if the capture raised
    driver.quit
  end

  $stderr.puts "Screenshot saved to #{destination}"
end
|
575
|
+
|
576
|
+
##
## Curls the html for the page
##
## @param url          [String] The url
## @param source       [String] Pre-fetched source to parse instead of curling
## @param headers      [Hash] The headers
## @param headers_only [Boolean] Return headers only
## @param compressed   [Boolean] expect compressed results
## @param fallback     [Boolean, Symbol] browser to fall back to when curl fails
##
## @return [Hash, false] hash of url, code, headers, meta, links, head,
##         body, and source; false when no source could be retrieved
##
## Fix: the user-agent retry loop never advanced its index, so it
## retried forever with the first agent and its exit condition could
## never fire; it now steps through each agent once before giving up.
##
def curl_html(url = nil, source: nil, headers: nil,
              headers_only: false, compressed: false, fallback: false)
  unless url.nil?
    flags = 'SsL'
    # -I fetches headers only; -i includes headers with the body
    flags += headers_only ? 'I' : 'i'
    agents = [
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.1',
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.',
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.3',
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.'
    ]
    headers = headers.nil? ? '' : headers.map { |h, v| %(-H "#{h}: #{v}") }.join(' ')
    compress = compressed ? '--compressed' : ''
    source = `#{@curl} -#{flags} #{compress} #{headers} '#{url}' 2>/dev/null`
    # Retry with each user agent in turn until one returns content
    agent = 0
    while source.nil? || source.empty?
      source = `#{@curl} -#{flags} #{compress} -A "#{agents[agent]}" #{headers} '#{url}' 2>/dev/null`
      break if agent >= agents.count - 1

      agent += 1
    end

    unless $?.success? || fallback
      warn "Error curling #{url}"
      Process.exit 1
    end

    # Last resort: render with a browser when curl produced nothing
    source = curl_dynamic_html(url, fallback, headers) if fallback && (source.nil? || source.empty?)
  end

  return false if source.nil? || source.empty?

  source.strip!

  # Split the curl -i output into status line, headers, and body
  headers = { 'location' => url }
  lines = source.split(/\r\n/)
  code = lines[0].match(/(\d\d\d)/)[1]
  lines.shift
  lines.each_with_index do |line, idx|
    if line =~ /^([\w-]+): (.*?)$/
      m = Regexp.last_match
      headers[m[1]] = m[2]
    else
      # First non-header line starts the body
      source = lines[idx..].join("\n")
      break
    end
  end

  if headers['content-encoding'] =~ /gzip/i && !compressed
    warn 'Response is gzipped, you may need to try again with --compressed'
  end

  # JSON responses are returned raw without HTML parsing
  if headers['content-type'] =~ /json/
    return { url: url, code: code, headers: headers, meta: nil, links: nil,
             head: nil, body: source.strip, source: source.strip, body_links: nil, body_images: nil }
  end

  head = source.match(%r{(?<=<head>)(.*?)(?=</head>)}mi)

  if head.nil?
    { url: url, code: code, headers: headers, meta: nil, links: nil, head: nil, body: source.strip,
      source: source.strip, body_links: nil, body_images: nil }
  else
    meta = meta_tags(head[1])
    links = link_tags(head[1])
    body = source.match(%r{<body.*?>(.*?)</body>}mi)[1]
    { url: url, code: code, headers: headers, meta: meta, links: links, head: head[1], body: body,
      source: source.strip, body_links: body_links, body_images: body_images }
  end
end
|
656
|
+
|
657
|
+
##
## Reencode the content (borrowed from Nokogiri)
##
## Detection order for binary (ASCII-8BIT) input: byte-order mark,
## charset in the content-type header, charset in a <meta> tag within
## the first 1024 bytes, then the HTML default of ISO-8859-1.
##
## @param body         [String] The body
## @param content_type [String] Force content type
##
## @return [String] body re-encoded as UTF-8
##
def reencode(body, content_type = nil)
  if body.encoding == Encoding::ASCII_8BIT
    encoding = nil

    # look for a Byte Order Mark (BOM)
    initial_bytes = body[0..2].bytes
    if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
      encoding = Encoding::UTF_8
    elsif initial_bytes[0..1] == [0xFE, 0xFF]
      encoding = Encoding::UTF_16BE
    elsif initial_bytes[0..1] == [0xFF, 0xFE]
      encoding = Encoding::UTF_16LE
    end

    # look for a charset in a content-encoding header
    encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1] if content_type

    # look for a charset in a meta tag in the first 1024 bytes
    unless encoding
      # strip comments so a commented-out meta tag can't win
      data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
      data.scan(/<meta.*?>/im).each do |meta|
        encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
      end
    end

    # if all else fails, default to the official default encoding for HTML
    encoding ||= Encoding::ISO_8859_1

    # change the encoding to match the detected or inferred encoding
    body = body.dup
    begin
      body.force_encoding(encoding)
    rescue ArgumentError
      # unknown charset name — fall back to the HTML default
      body.force_encoding(Encoding::ISO_8859_1)
    end
  end

  body.encode(Encoding::UTF_8)
end
|
702
|
+
|
703
|
+
##
## Test if a given url has the same hostname as @url
##
## @param href [String] The url to test
##
## @return [Boolean] true if hostnames match; false when either
##         value fails to parse as a URI
##
def same_origin?(href)
  URI(href).host == URI(@url).host
rescue StandardError
  false
end
|
719
|
+
end
|
720
|
+
end
|