curlyq 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/CHANGELOG.md +8 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +41 -0
- data/LICENSE.txt +19 -0
- data/README.md +233 -0
- data/README.rdoc +6 -0
- data/Rakefile +77 -0
- data/bin/curlyq +477 -0
- data/curlyq.gemspec +27 -0
- data/curlyq.rdoc +355 -0
- data/lib/curly/array.rb +134 -0
- data/lib/curly/curl/html.rb +720 -0
- data/lib/curly/curl/json.rb +108 -0
- data/lib/curly/curl.rb +7 -0
- data/lib/curly/hash.rb +200 -0
- data/lib/curly/string.rb +91 -0
- data/lib/curly/version.rb +3 -0
- data/lib/curly.rb +12 -0
- data/src/_README.md +101 -0
- data/test/default_test.rb +14 -0
- data/test/test_helper.rb +4 -0
- metadata +191 -0
@@ -0,0 +1,720 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Curl
|
4
|
+
# String helpers
class ::String
  ##
  ## Replace HTML space entities with plain spaces.
  ##
  ## Handles both the literal no-break-space character (U+00A0) and the
  ## `&nbsp;` entity form — the previous pattern only covered one of the two.
  ##
  ## @return [String] new string with non-breaking spaces replaced
  ##
  def remove_entities
    gsub(/&nbsp;|\u00A0/, ' ')
  end
end
|
10
|
+
|
11
|
+
# Class for CURLing an HTML page
|
12
|
+
class Html
|
13
|
+
attr_reader :url, :code, :headers, :meta, :links, :head, :body,
|
14
|
+
:source, :title, :description, :body_links, :body_images, :clean
|
15
|
+
|
16
|
+
def to_data(url: nil)
|
17
|
+
{
|
18
|
+
url: @url || url,
|
19
|
+
code: @code,
|
20
|
+
headers: @headers,
|
21
|
+
meta: @meta,
|
22
|
+
meta_links: @links,
|
23
|
+
head: @head,
|
24
|
+
body: @body,
|
25
|
+
source: @source,
|
26
|
+
title: @title,
|
27
|
+
description: @description,
|
28
|
+
links: @body_links,
|
29
|
+
images: @body_images
|
30
|
+
}
|
31
|
+
end
|
32
|
+
|
33
|
+
##
## Create a new page object from a URL
##
## @param url [String] The url to curl (may be nil when +source+ is given)
## @param browser [Symbol, String] Browser to render dynamic pages with
##        (:chrome, :firefox, or :none to skip Selenium)
## @param source [String] Pre-fetched page source to parse instead of curling
## @param headers [Hash] The headers to use in the curl call
## @param headers_only [Boolean] Return headers only
## @param compressed [Boolean] Expect compressed result
## @param clean [Boolean] Clean up whitespace/entities in extracted content
## @param fallback [Symbol, Boolean] Browser to fall back to if curl fails
## @param ignore_local_links [Boolean] Skip relative/local links
## @param ignore_fragment_links [Boolean] Skip #fragment links
## @param external_links_only [Boolean] Keep only off-site links
##
## @return [HTMLCurl] new page object
##
def initialize(url, browser: nil, source: nil, headers: nil,
               headers_only: false, compressed: false, clean: false, fallback: false,
               ignore_local_links: false, ignore_fragment_links: false, external_links_only: false)
  @clean = clean
  @ignore_local_links = ignore_local_links
  @ignore_fragment_links = ignore_fragment_links
  @external_links_only = external_links_only
  # Locate the curl binary once; used by #curl_html
  @curl = TTY::Which.which('curl')
  @url = url
  # Three acquisition paths: render with Selenium then parse, parse a
  # supplied source string, or plain curl of the URL.
  res = if url && browser && browser != :none
          source = curl_dynamic_html(url, browser, headers)
          curl_html(nil, source: source, headers: headers)
        elsif url.nil? && !source.nil?
          curl_html(nil, source: source, headers: headers, headers_only: headers_only, compressed: compressed, fallback: false)
        else
          curl_html(url, headers: headers, headers_only: headers_only, compressed: compressed, fallback: fallback)
        end
  # NOTE(review): curl_html returns false when no source could be fetched;
  # in that case res[:url] below raises NoMethodError — confirm intended.
  @url = res[:url]
  @code = res[:code]
  @headers = res[:headers]
  @meta = res[:meta]
  @links = res[:links]
  @head = res[:head] unless res[:head].nil?
  # Normalize body encoding to UTF-8 (see #reencode)
  @body = reencode(res[:body])
  @source = res[:source]
  # Prefer OpenGraph metadata when present
  @title = @meta['og:title'] || @meta['title'] unless @meta.nil?
  @description = @meta['og:description'] || @meta['description'] unless @meta.nil?
  @body_links = content_links
  @body_images = content_images
end
|
73
|
+
|
74
|
+
##
## Save a screenshot of the url
##
## @param destination [String] The file destination
## @param browser [Symbol] The browser (:firefox, :chrome)
## @param type [Symbol] The type of screenshot to save
##        (:full_page, :print_page, :visible)
##
def screenshot(destination = nil, browser: :chrome, type: :full_page)
  # Pure delegation: the previous version also computed full_page/print_page
  # locals that were never read (dead code, now removed).
  save_screenshot(destination, browser: browser, type: type)
end
|
90
|
+
|
91
|
+
##
## Extract text between two regular expressions
##
## @param before [String, Regexp] pattern marking the start of each match
## @param after [String, Regexp] pattern marking the end of each match
##
## @return [Array] array of strings found between the two patterns
##
def extract(before, after)
  # Strings are treated as literal text; Regexps are used as-is
  before_rx = before.instance_of?(Regexp) ? before : /#{Regexp.escape(before)}/
  after_rx = after.instance_of?(Regexp) ? after : /#{Regexp.escape(after)}/
  pattern = /(?<=#{before_rx.source})(.*?)(?=#{after_rx.source})/m
  @body.scan(pattern).map { |match| @clean ? match[0].clean : match[0] }
end
|
105
|
+
|
106
|
+
##
## Extract an array of tags or tag attributes
##
## @param tag [String] The tag
## @param attribute [String] The attribute
## @param source [Boolean] Return full tag source
##        (negates attribute if true)
## @param content [Boolean] Return only tag contents
##
## @return [Hash, Array] if source, return array of full
##         tags, if content, return array of tag contents,
##         otherwise, return a hash of tags including
##         attributes and content
##
## If attribute is not given, tag contents will be returned
##
## @example page.extract_tag('h1') => [Array of h1 tag contents]
## @example page.extract_tag('img', 'src') => [Array of img src attributes]
##
def extract_tag(tag, attribute = nil, source: false, content: false)
  # Always fetch full tag sources first; attributes/contents are parsed out below
  res = extract_tag_contents(tag, source: true)

  return res if source

  res.map! do |tag_source|
    # Collect every attr="value" / attr='value' pair from the tag source
    m = tag_source.to_enum(:scan, /(\S+)=(['"])(.*?)\2/).map { Regexp.last_match }
    attrs = m.each_with_object({}) { |at, a| a[at[1]] = at[3] }
    # Inner text: everything between the opening tag and the next '<'
    tags = tag_source.match(/<.*?>(?<content>.*?)</)
    contents = tags.nil? ? nil : tags['content']
    {
      tag: tag,
      source: tag_source,
      attrs: attrs,
      content: @clean ? contents&.clean : contents
    }
  end

  return res.map { |r| r[:content] } if content

  return res if attribute.nil?

  # Project the requested attribute (nil for tags that lack it)
  res.map { |r| r[:attrs][attribute] }
end
|
152
|
+
|
153
|
+
##
## Extract tag contents or full tag source
##
## @param tag The tag
## @param source [Boolean] Return full tag instead of contents
##
## @return [Array] array of tag matches/contents
def extract_tag_contents(tag, source: false)
  if source
    # Whole tags, optionally including their closing tag
    @body.scan(%r{<#{tag}.*?>(?:.*?</#{tag}>)?})
  else
    # Just the text between the opening tag and the next '<'
    @body.scan(/<#{tag}.*?>(.*?)</).map(&:first)
  end
end
|
165
|
+
|
166
|
+
##
## Return all tags in body, or a specific tag
##
## @param tag [String, Array] The tag to return,
##        can be an array
##
## @return [Array] Array of tags. If no tag is specified, a
##         hierarchical array of all tags in the document
##         is returned. If one or more tags are specified,
##         return a flattened list in document order.
##
def tags(tag = nil)
  tags = content_tags(@body)
  return tags if tag.nil?

  # Build a lowercased filter list without mutating the caller's argument
  # (the previous version called map! on the array passed in).
  wanted = Array(tag).map(&:downcase)
  flatten_tags(tags).dup.delete_if { |t| !wanted.include?(t[:tag].downcase) }
end
|
185
|
+
|
186
|
+
##
## Get all images from the page
##
## @param types [Symbol, Array] image types to include
##        (:all, :opengraph, :srcset, :img)
##
## @return [Array] Array of images, both from picture sources and img tags
##
def images(types: :all)
  output = []
  types = [types] unless types.is_a?(Array)
  types.each do |type|
    if %i[all opengraph].include?(type)
      # OpenGraph/Twitter card images come from meta tags, not the body
      %w[og:image twitter:image].each do |src|
        next unless @meta.key?(src)

        output << {
          type: 'opengraph',
          attrs: nil,
          src: @meta[src]
        }
      end
    end
    images = tags(%w[img source])
    images.each do |img|
      case img[:tag].downcase
      when /source/
        next unless %i[all srcset].include?(type)

        # Array() guards against tags with no attributes (attrs == nil)
        srcsets = Array(img[:attrs]).filter { |k| k[:key] =~ /srcset/i }
        if srcsets.count.positive?
          srcset = []
          srcsets.each do |src|
            # A bare `srcset` attribute with no value would crash split below
            next if src[:value].nil?

            src[:value].split(/ *, */).each do |s|
              image, media = s.split(/ /)
              srcset << {
                src: image,
                media: media
              }
            end
          end
          output << {
            type: 'srcset',
            attrs: img[:attrs],
            images: srcset
          }
        end
      when /img/
        next unless %i[all img].include?(type)

        # The previous version called `.first[:value]` directly, raising
        # NoMethodError for any <img> missing width/height/alt/title.
        output << {
          type: 'img',
          src: first_attr_value(img[:attrs], /src/i),
          width: first_attr_value(img[:attrs], 'width') || 'unknown',
          height: first_attr_value(img[:attrs], 'height') || 'unknown',
          alt: first_attr_value(img[:attrs], 'alt'),
          title: first_attr_value(img[:attrs], 'title'),
          attrs: img[:attrs]
        }
      end
    end
  end
  output
end

##
## Value of the first attribute whose key matches +key+.
##
## @param attrs [Array, nil] attribute hashes ({ key:, value: })
## @param key [String, Regexp] exact key (String) or pattern (Regexp)
##
## @return [String, Array, nil] the attribute value, or nil when absent
##
def first_attr_value(attrs, key)
  found = Array(attrs).find { |a| key.is_a?(Regexp) ? a[:key] =~ key : a[:key] == key }
  found.nil? ? nil : found[:value]
end
|
253
|
+
|
254
|
+
def to_s
|
255
|
+
headers = @headers.nil? ? 0 : @headers.count
|
256
|
+
meta = @meta.nil? ? 0 : @meta.count
|
257
|
+
links = @links.nil? ? 0 : @links.count
|
258
|
+
[
|
259
|
+
%(<HTMLCurl: @code="#{@code}" @url="#{@url}" @title="#{@title}"),
|
260
|
+
%(@description=#{@description} @headers:#{headers} @meta:#{meta} @links:#{links}>)
|
261
|
+
].join(' ')
|
262
|
+
end
|
263
|
+
|
264
|
+
##
## Return all headers of given level
##
## @param level [Number] The level (1-6)
##
## @return [Array] array of headers with text and all tag attributes as symbols
##
def h(level = '\d')
  res = []
  # NOTE(review): the closing tag re-interpolates #{level}; with the '\d'
  # default a mismatched pair like <h1>…</h2> can also match — confirm intended.
  headlines = @body.to_enum(:scan, %r{<h(?<level>#{level})(?<tag> .*?)?>(?<text>.*?)</h#{level}>}i).map { Regexp.last_match }
  headlines.each do |m|
    # :level is the captured digit as a String
    headline = { level: m['level'] }
    if m['tag'].nil?
      attrs = nil
    else
      # Each attr="value" pair becomes a symbol key on the headline hash
      attrs = m['tag'].to_enum(:scan, /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/).map { Regexp.last_match }
      attrs.each { |a| headline[a['attr'].to_sym] = a['content'] }
    end
    headline[:text] = m['text'].remove_entities
    res << headline
  end
  res
end
|
287
|
+
|
288
|
+
##
## Convert a Nokogiri element to Curl::Html tag format
##
## @param el [Nokogiri] element to convert
##
## @return [Hash] tag hash with :tag, :source, :attrs, :content and :tags
##
def nokogiri_to_tag(el)
  attributes = el.attribute_nodes.map do |node|
    # class/rel are multi-valued attributes, split them into arrays
    value = node.name =~ /^(class|rel)$/ ? node.value.split(/ /) : node.value
    { key: node.name, value: value }
  end

  text = el.text
  {
    tag: el.name,
    source: el.to_html,
    attrs: attributes,
    content: @clean ? text&.strip&.clean : text.strip,
    tags: recurse_children(el)
  }
end
|
306
|
+
|
307
|
+
##
## Recursively convert an element's non-text children to tag hashes.
##
## @param element [Nokogiri] parent element
##
## @return [Array] converted child tags
##
def recurse_children(element)
  element.children.each_with_object([]) do |child, acc|
    acc << nokogiri_to_tag(child) unless child.name == 'text'
  end
end
|
316
|
+
|
317
|
+
#-------------------------------------------------------
## Perform a CSS query using Nokogiri
##
## @param path [String] The CSS path
## @param source [String] HTML to search (defaults to the page source)
##
## @return [Array] array of matched elements
##
def search(path, source: @source)
  document = Nokogiri::HTML(source)
  document.search(path).map { |el| nokogiri_to_tag(el) }
end
|
333
|
+
|
334
|
+
private
|
335
|
+
|
336
|
+
##
## Flatten a hierarchical tag array into a single document-order list.
##
## @param tags [Array] Document tags
##
## @return [Array] flat list of { tag:, attrs:, content: } hashes
##
def flatten_tags(tags)
  tags.each_with_object([]) do |t, flat|
    content = @clean ? t[:content]&.strip&.clean : t[:content]&.strip
    flat << { tag: t[:tag], attrs: t[:attrs], content: content }
    # Depth-first: children follow their parent
    flat.concat(flatten_tags(t[:tags])) unless t[:tags].nil?
  end
end
|
352
|
+
|
353
|
+
##
## Return an array of all tags in the content
##
## @param content [String] The content to parse
##
## @return [Array, nil] hierarchical array of tag hashes, nil for nil input
##
def content_tags(content)
  return nil if content.nil?

  # Match either self-closing tags or open/close pairs (\k<tag> backreference).
  # (?!</) keeps closing tags from being matched as openings.
  res = content.to_enum(:scan, %r{(?mix)
        <(?<tag>(?!</)[a-z0-9]+)(?<attrs>\s[^>]+)?
        (?:\s*/>|>(?<content>.*?)</\k<tag>>)}).map { Regexp.last_match }
  res.map do |tag|
    if tag['attrs'].nil?
      attrs = nil
    else
      # Parse attributes; value is optional (bare boolean attributes)
      attrs = tag['attrs'].strip.to_enum(:scan, /(?ix)
              (?<key>[@a-z0-9-]+)(?:=(?<quot>["'])
              (?<value>[^"']+)\k<quot>|[ >])?/i).map { Regexp.last_match }
      # class/rel are multi-valued; split them into arrays
      attrs.map! { |a| { key: a['key'], value: a['key'] =~ /^(class|rel)$/ ? a['value'].split(/ /) : a['value'] } }
    end
    {
      tag: tag['tag'],
      source: tag.to_s,
      attrs: attrs,
      content: @clean ? tag['content']&.clean : tag['content'],
      # Recurse into the tag body to build the hierarchy
      tags: content_tags(tag['content'])
    }
  end
end
|
382
|
+
|
383
|
+
##
## Extract all meta tags from the document head
##
## @param head [String] The head content
##
## @return [Hash] hash of meta tags and values
##
def meta_tags(head)
  meta = {}
  title = head.match(%r{(?<=<title>)(.*?)(?=</title>)})
  meta['title'] = title.nil? ? nil : title[1]
  # A meta refresh directive points to a redirect target
  refresh = head.match(/http-equiv=(['"])refresh\1(.*?)>/)
  url = refresh.nil? ? nil : refresh[2].match(/url=(.*?)['"]/)
  # Store the captured URL string (previously the raw MatchData was stored,
  # inconsistent with 'title' above)
  meta['refresh_url'] = url.nil? ? nil : url[1]
  meta_tags = head.scan(/<meta.*?>/)
  meta_tags.each do |tag|
    meta_name = tag.match(/(?:name|property|http-equiv)=(["'])(.*?)\1/)
    next if meta_name.nil?

    meta_value = tag.match(/(?:content)=(['"])(.*?)\1/)
    next if meta_value.nil?

    meta[meta_name[2].downcase] = meta_value[2]
  end
  meta
rescue StandardError => e
  # Best-effort parse: report and return an empty hash rather than raising
  warn e
  {}
end
|
412
|
+
|
413
|
+
##
## Extract all <link> tags from head
##
## @param head [String] The head content
##
## @return [Array] Array of links ({ rel:, href:, type:, title: })
##
def link_tags(head)
  links = []
  link_tags = head.scan(/<link.*?>/)
  link_tags.each do |tag|
    link_rel = tag.match(/rel=(['"])(.*?)\1/)
    link_rel = link_rel.nil? ? nil : link_rel[2]

    # Preload hints are resource loading instructions, not navigational links
    next if link_rel =~ /preload/

    link_href = tag.match(/href=(["'])(.*?)\1/)
    next if link_href.nil?

    link_href = link_href[2]

    # Honor the instance-level filters configured in #initialize
    next if link_href =~ /^#/ && (@ignore_fragment_links || @external_links_only)

    # Anything without a scheme/protocol-relative prefix counts as local
    next if link_href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)

    next if same_origin?(link_href) && @external_links_only

    link_title = tag.match(/title=(['"])(.*?)\1/)
    link_title = link_title.nil? ? nil : link_title[2]

    link_type = tag.match(/type=(['"])(.*?)\1/)
    link_type = link_type.nil? ? nil : link_type[2]

    links << { rel: link_rel, href: link_href, type: link_type, title: link_title }
  end
  links
end
|
450
|
+
|
451
|
+
##
## Get all links in the body of the page
##
## rel and class are returned as arrays
##
## @return [Array] array of links with href, title,
##         rel, content and class
##
def content_links
  links = []
  link_tags = @body.to_enum(:scan, %r{<a ?(?<tag>.*?)>(?<text>.*?)</a>}).map { Regexp.last_match }
  link_tags.each do |m|
    href = m['tag'].match(/href=(["'])(.*?)\1/)
    href = href[2] unless href.nil?
    # Honor the instance-level filters configured in #initialize
    next if href =~ /^#/ && (@ignore_fragment_links || @external_links_only)

    # Anything without a scheme/protocol-relative prefix counts as local
    next if href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)

    next if same_origin?(href) && @external_links_only

    title = m['tag'].match(/title=(["'])(.*?)\1/)
    title = title[2] unless title.nil?
    # Multi-valued attributes are split on whitespace
    rel = m['tag'].match(/rel=(["'])(.*?)\1/)
    rel = rel[2].split(/ +/) unless rel.nil?
    link_class = m['tag'].match(/class=(["'])(.*?)\1/)
    link_class = link_class[2].split(/ +/) unless link_class.nil?
    text = m['text'].remove_entities
    link = {
      href: href,
      title: title,
      rel: rel,
      content: text,
      class: link_class
    }
    links << link
  end
  links
end
|
489
|
+
|
490
|
+
##
## Get all img tags in the body of the page
##
## @return [Array] array of images with src and all attributes
##
def content_images
  matches = @body.to_enum(:scan, %r{<img (?<tag>.*?)/?>}).map { Regexp.last_match }
  matches.map do |m|
    attr_matches = m['tag'].to_enum(:scan, /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/).map { Regexp.last_match }
    # Each attribute becomes a symbol key on the image hash
    attr_matches.each_with_object({}) { |a, img| img[a['attr'].to_sym] = a['content'] }
  end
end
|
506
|
+
|
507
|
+
##
## Uses Selenium to load a page, allowing capture of dynamic (JS) pages
##
## @param url The url
## @param browser [Symbol, String] the browser to drive
## @param headers unused here; kept for signature parity with callers
##
## @return [String] page source
##
def curl_dynamic_html(url, browser, headers)
  browser = browser.normalize_browser_type if browser.is_a?(String)
  source = nil

  driver = Selenium::WebDriver.for browser
  driver.manage.timeouts.implicit_wait = 4
  begin
    driver.get url
    source = driver.page_source
  ensure
    # Always tear the browser down, even if navigation raised
    driver.quit
  end

  source
end
|
529
|
+
|
530
|
+
##
## Save a screenshot of a url
##
## @param destination [String] File path destination
## @param browser [Symbol] The browser (:chrome or :firefox)
## @param type [Symbol] The type of screenshot (:full_page, :print_page, or :visible)
##
def save_screenshot(destination = nil, browser: :chrome, type: :full_page)
  raise 'No URL provided' if url.nil?

  raise 'No file destination provided' if destination.nil?

  destination = File.expand_path(destination)

  raise 'Path doesn\'t exist' unless File.directory?(File.dirname(destination))

  browser = browser.normalize_browser_type if browser.is_a?(String)
  type = type.normalize_screenshot_type if type.is_a?(String)
  # Chrome's driver lacks save_full_page_screenshot support
  raise 'Can not save full screen with Chrome, use Firefox' if type == :full_page && browser == :chrome

  # Normalize the extension: print pages are PDFs, everything else is PNG
  destination = case type
                when :print_page
                  "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.pdf"
                else
                  "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.png"
                end

  driver = Selenium::WebDriver.for browser
  driver.manage.timeouts.implicit_wait = 4
  begin
    driver.get @url
    case type
    when :print_page
      driver.save_print_page(destination)
    when :full_page
      driver.save_full_page_screenshot(destination)
    else
      driver.save_screenshot(destination)
    end
  ensure
    # Always tear the browser down, even if capture raised
    driver.quit
  end

  $stderr.puts "Screenshot saved to #{destination}"
end
|
575
|
+
|
576
|
+
##
## Curls the html for the page
##
## @param url [String] The url
## @param source [String] Optional pre-fetched source to parse instead of curling
## @param headers [Hash] The headers
## @param headers_only [Boolean] Return headers only
## @param compressed [Boolean] expect compressed results
## @param fallback [Symbol, Boolean] browser to fall back to when curl fails
##
## @return [Hash, false] hash of url, code, headers, meta, links, head, body,
##         and source, or false if nothing could be fetched
##
def curl_html(url = nil, source: nil, headers: nil,
              headers_only: false, compressed: false, fallback: false)
  unless url.nil?
    flags = 'SsL'
    # -I fetches headers only, -i includes headers with the body
    flags += headers_only ? 'I' : 'i'
    agents = [
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.1',
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.',
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.3',
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.'
    ]
    headers = headers.nil? ? '' : headers.map { |h, v| %(-H "#{h}: #{v}") }.join(' ')
    compress = compressed ? '--compressed' : ''
    # NOTE(review): url/headers are interpolated into a shell command;
    # callers must not pass untrusted input (Shellwords escaping would be safer)
    source = `#{@curl} -#{flags} #{compress} #{headers} '#{url}' 2>/dev/null`
    # Retry with each user agent in turn until a non-empty response comes back.
    # (Fixes an infinite loop: the previous version never advanced the agent
    # index, retrying agents[0] forever when curl kept returning empty output.)
    agents.each do |agent|
      break unless source.nil? || source.empty?

      source = `#{@curl} -#{flags} #{compress} -A "#{agent}" #{headers} '#{url}' 2>/dev/null`
    end

    unless $?.success? || fallback
      warn "Error curling #{url}"
      Process.exit 1
    end

    # Last resort: render the page with a browser
    source = curl_dynamic_html(url, fallback, headers) if fallback && (source.nil? || source.empty?)
  end

  return false if source.nil? || source.empty?

  source.strip!

  # Parse the status line and response headers that -i/-I prepend
  headers = { 'location' => url }
  lines = source.split(/\r\n/)
  # NOTE(review): raises NoMethodError if the first line carries no 3-digit
  # status code — confirm all curl outputs start with an HTTP status line
  code = lines[0].match(/(\d\d\d)/)[1]
  lines.shift
  lines.each_with_index do |line, idx|
    if line =~ /^([\w-]+): (.*?)$/
      m = Regexp.last_match
      headers[m[1]] = m[2]
    else
      # First non-header line starts the body
      source = lines[idx..].join("\n")
      break
    end
  end

  if headers['content-encoding'] =~ /gzip/i && !compressed
    warn 'Response is gzipped, you may need to try again with --compressed'
  end

  # JSON responses get no HTML parsing
  if headers['content-type'] =~ /json/
    return { url: url, code: code, headers: headers, meta: nil, links: nil,
             head: nil, body: source.strip, source: source.strip, body_links: nil, body_images: nil }
  end

  head = source.match(%r{(?<=<head>)(.*?)(?=</head>)}mi)

  if head.nil?
    { url: url, code: code, headers: headers, meta: nil, links: nil, head: nil, body: source.strip,
      source: source.strip, body_links: nil, body_images: nil }
  else
    meta = meta_tags(head[1])
    links = link_tags(head[1])
    body = source.match(%r{<body.*?>(.*?)</body>}mi)[1]
    { url: url, code: code, headers: headers, meta: meta, links: links, head: head[1], body: body,
      source: source.strip, body_links: body_links, body_images: body_images }
  end
end
|
656
|
+
|
657
|
+
##
## Reencode the content (borrowed from Nokogiri)
##
## @param body [String] The body
## @param content_type [String] Force content type
##
## @return [String] body re-encoded as UTF-8
##
def reencode(body, content_type = nil)
  # Only sniff when the string arrived as raw bytes; otherwise trust its encoding
  if body.encoding == Encoding::ASCII_8BIT
    encoding = nil

    # look for a Byte Order Mark (BOM)
    initial_bytes = body[0..2].bytes
    if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
      encoding = Encoding::UTF_8
    elsif initial_bytes[0..1] == [0xFE, 0xFF]
      encoding = Encoding::UTF_16BE
    elsif initial_bytes[0..1] == [0xFF, 0xFE]
      encoding = Encoding::UTF_16LE
    end

    # look for a charset in a content-encoding header
    encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1] if content_type

    # look for a charset in a meta tag in the first 1024 bytes
    unless encoding
      # Strip comments so a commented-out meta tag can't win
      data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
      data.scan(/<meta.*?>/im).each do |meta|
        encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
      end
    end

    # if all else fails, default to the official default encoding for HTML
    encoding ||= Encoding::ISO_8859_1

    # change the encoding to match the detected or inferred encoding
    body = body.dup
    begin
      body.force_encoding(encoding)
    rescue ArgumentError
      # Unknown charset name from the document; fall back to Latin-1
      body.force_encoding(Encoding::ISO_8859_1)
    end
  end

  body.encode(Encoding::UTF_8)
end
|
702
|
+
|
703
|
+
##
## Test if a given url has the same hostname as @url
##
## @param href [String] The url to test
##
## @return [Boolean] true if hostnames match
##
def same_origin?(href)
  URI(href).host == URI(@url).host
rescue StandardError
  # Unparseable URLs are never considered same-origin
  false
end
|
719
|
+
end
|
720
|
+
end
|