curlyq 0.0.2

@@ -0,0 +1,720 @@
# frozen_string_literal: true

module Curl
  # String helpers
  class ::String
    def remove_entities
      gsub(/&nbsp;/, ' ')
    end
  end

  # Class for CURLing an HTML page
  class Html
    attr_reader :url, :code, :headers, :meta, :links, :head, :body,
                :source, :title, :description, :body_links, :body_images, :clean

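    ##
    ## Convert the page object to a hash of its parsed parts
    ##
    ## @example Serialize a page to a hash (illustrative output)
    ##   page.to_data[:title] #=> "Example Domain"
    ##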
    def to_data(url: nil)
      {
        url: @url || url,
        code: @code,
        headers: @headers,
        meta: @meta,
        meta_links: @links,
        head: @head,
        body: @body,
        source: @source,
        title: @title,
        description: @description,
        links: @body_links,
        images: @body_images
      }
    end

    ##
    ## Create a new page object from a URL
    ##
    ## @param url [String] The url
    ## @param browser [Symbol] Browser to use for dynamic (JS) pages (:chrome, :firefox, or :none)
    ## @param source [String] Page source to parse instead of curling, if provided
    ## @param headers [Hash] The headers to use in the curl call
    ## @param headers_only [Boolean] Return headers only
    ## @param compressed [Boolean] Expect compressed result
    ## @param clean [Boolean] Clean extracted content of entities and whitespace
    ## @param fallback [Symbol] Browser to fall back to if curl returns nothing
    ## @param ignore_local_links [Boolean] Ignore links without a host
    ## @param ignore_fragment_links [Boolean] Ignore links to fragments (#)
    ## @param external_links_only [Boolean] Only return links to external hosts
    ##
    ## @return [Curl::Html] new page object
    ##
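    ## @example Curl a page, falling back to Selenium if curl comes up empty
    ##   # hypothetical URL; assumes curl and a Chrome driver are installed
    ##   page = Curl::Html.new('https://example.com', clean: true, fallback: :chrome)
    ##   page.title #=> "Example Domain"
    ##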
    def initialize(url, browser: nil, source: nil, headers: nil,
                   headers_only: false, compressed: false, clean: false, fallback: false,
                   ignore_local_links: false, ignore_fragment_links: false, external_links_only: false)
      @clean = clean
      @ignore_local_links = ignore_local_links
      @ignore_fragment_links = ignore_fragment_links
      @external_links_only = external_links_only
      @curl = TTY::Which.which('curl')
      @url = url
      res = if url && browser && browser != :none
              source = curl_dynamic_html(url, browser, headers)
              curl_html(nil, source: source, headers: headers)
            elsif url.nil? && !source.nil?
              curl_html(nil, source: source, headers: headers, headers_only: headers_only, compressed: compressed, fallback: false)
            else
              curl_html(url, headers: headers, headers_only: headers_only, compressed: compressed, fallback: fallback)
            end
      @url = res[:url]
      @code = res[:code]
      @headers = res[:headers]
      @meta = res[:meta]
      @links = res[:links]
      @head = res[:head] unless res[:head].nil?
      @body = reencode(res[:body])
      @source = res[:source]
      @title = @meta['og:title'] || @meta['title'] unless @meta.nil?
      @description = @meta['og:description'] || @meta['description'] unless @meta.nil?
      @body_links = content_links
      @body_images = content_images
    end

    ##
    ## Save a screenshot of the url
    ##
    ## @param destination [String] The file destination
    ## @param browser [Symbol] The browser (:firefox or :chrome)
    ## @param type [Symbol] The type of screenshot to save
    ##                      (:full_page, :print_page, or :visible)
    ##
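    ## @example Save a full-page PNG (full_page requires Firefox; path is illustrative)
    ##   page.screenshot('~/Desktop/capture', browser: :firefox, type: :full_page)
    ##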
    def screenshot(destination = nil, browser: :chrome, type: :full_page)
      save_screenshot(destination, browser: browser, type: type)
    end

    ##
    ## Extract text between two regular expressions
    ##
    ## @param before [String, Regexp] The before
    ## @param after [String, Regexp] The after
    ##
    ## @return [Array] array of matches
    ##
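    ## @example Extract text between two markers (assumes the body contains "<h1>Page Title</h1>")
    ##   page.extract('<h1>', '</h1>') #=> ["Page Title"]
    ##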
    def extract(before, after)
      before = /#{Regexp.escape(before)}/ unless before.instance_of?(Regexp)
      after = /#{Regexp.escape(after)}/ unless after.instance_of?(Regexp)
      rx = /(?<=#{before.source})(.*?)(?=#{after.source})/m
      @body.scan(rx).map { |r| @clean ? r[0].clean : r[0] }
    end

    ##
    ## Extract an array of tags or tag attributes
    ##
    ## @param tag [String] The tag
    ## @param attribute [String] The attribute
    ## @param source [Boolean] Return full tag source
    ##                         (negates attribute if true)
    ## @param content [Boolean] Return only tag contents
    ##
    ## @return [Hash, Array] if source, return array of full
    ##         tags, if content, return array of tag contents,
    ##         otherwise, return a hash of tags including
    ##         attributes and content
    ##
    ## If attribute is not given, tag contents will be returned
    ##
    ## @example page.extract_tag('h1') => [Array of h1 tag contents]
    ## @example page.extract_tag('img', 'src') => [Array of img src attributes]
    ##
    def extract_tag(tag, attribute = nil, source: false, content: false)
      res = extract_tag_contents(tag, source: true)

      return res if source

      res.map! do |tag_source|
        m = tag_source.to_enum(:scan, /(\S+)=(['"])(.*?)\2/).map { Regexp.last_match }
        attrs = m.each_with_object({}) { |at, a| a[at[1]] = at[3] }
        tags = tag_source.match(/<.*?>(?<content>.*?)</)
        contents = tags.nil? ? nil : tags['content']
        {
          tag: tag,
          source: tag_source,
          attrs: attrs,
          content: @clean ? contents&.clean : contents
        }
      end

      return res.map { |r| r[:content] } if content

      return res if attribute.nil?

      res.map { |r| r[:attrs][attribute] }
    end

    ##
    ## Extract tag contents or full tag source
    ##
    ## @param tag The tag
    ## @param source [Boolean] Return full tag instead of contents
    ##
    ## @return [Array] array of tag matches/contents
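    ##
    ## @example Get the text of every h2 on the page (illustrative output)
    ##   page.extract_tag_contents('h2') #=> ["First Section", "Second Section"]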
    def extract_tag_contents(tag, source: false)
      return @body.scan(%r{<#{tag}.*?>(?:.*?</#{tag}>)?}) if source

      @body.scan(/<#{tag}.*?>(.*?)</).map { |t| t[0] }
    end

    ##
    ## Return all tags in body, or a specific tag
    ##
    ## @param tag [String, Array] The tag to return,
    ##                            can be an array
    ##
    ## @return [Array] Array of tags. If no tag is specified, a
    ##         hierarchical array of all tags in the document
    ##         is returned. If one or more tags are specified,
    ##         return a flattened list in document order.
    ##
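    ## @example List all img and source tags in document order
    ##   page.tags(%w[img source]).map { |t| t[:tag] }
    ##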
    def tags(tag = nil)
      tags = content_tags(@body)
      return tags if tag.nil?

      tag = [tag] unless tag.is_a?(Array)
      tag.map!(&:downcase)
      flatten_tags(tags).dup.delete_if { |t| !tag.include?(t[:tag].downcase) }
    end

    ##
    ## Get all images from the page
    ##
    ## @return [Array] Array of images, both from picture sources and img tags
    ##
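    ## @example Collect only OpenGraph/Twitter card images (illustrative)
    ##   page.images(types: :opengraph).map { |img| img[:src] }
    ##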
    def images(types: :all)
      output = []
      types = [types] unless types.is_a?(Array)
      # types.map!(&:normalize_image_type)
      types.each do |type|
        if %i[all opengraph].include?(type)
          %w[og:image twitter:image].each do |src|
            next unless @meta&.key?(src)

            output << {
              type: 'opengraph',
              attrs: nil,
              src: @meta[src]
            }
          end
        end
        images = tags(%w[img source])
        images.each do |img|
          case img[:tag].downcase
          when /source/
            next unless %i[all srcset].include?(type)

            srcsets = (img[:attrs] || []).filter { |a| a[:key] =~ /srcset/i }
            if srcsets.count.positive?
              srcset = []
              srcsets.each do |src|
                src[:value].split(/ *, */).each do |s|
                  image, media = s.split(/ /)
                  srcset << {
                    src: image,
                    media: media
                  }
                end
              end
              output << {
                type: 'srcset',
                attrs: img[:attrs],
                images: srcset
              }
            end
          when /img/
            next unless %i[all img].include?(type)

            # guard against missing attributes so a tag without width/height/alt
            # doesn't raise NoMethodError on nil
            attrs = img[:attrs] || []
            width = attrs.find { |a| a[:key] == 'width' }&.dig(:value)
            height = attrs.find { |a| a[:key] == 'height' }&.dig(:value)
            alt = attrs.find { |a| a[:key] == 'alt' }&.dig(:value)
            title = attrs.find { |a| a[:key] == 'title' }&.dig(:value)

            output << {
              type: 'img',
              src: attrs.find { |a| a[:key] =~ /src/i }&.dig(:value),
              width: width || 'unknown',
              height: height || 'unknown',
              alt: alt,
              title: title,
              attrs: img[:attrs]
            }
          end
        end
      end
      output
    end

    def to_s
      headers = @headers.nil? ? 0 : @headers.count
      meta = @meta.nil? ? 0 : @meta.count
      links = @links.nil? ? 0 : @links.count
      [
        %(<HTMLCurl: @code="#{@code}" @url="#{@url}" @title="#{@title}"),
        %(@description=#{@description} @headers:#{headers} @meta:#{meta} @links:#{links}>)
      ].join(' ')
    end

    ##
    ## Return all headers of given level
    ##
    ## @param level [Number] The level (1-6)
    ##
    ## @return [Array] array of headers with text and all tag attributes as symbols
    ##
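    ## @example Get the text of every h2 headline (illustrative output)
    ##   page.h(2).map { |headline| headline[:text] }
    ##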
    def h(level = '\d')
      res = []
      headlines = @body.to_enum(:scan, %r{<h(?<level>#{level})(?<tag> .*?)?>(?<text>.*?)</h#{level}>}i).map { Regexp.last_match }
      headlines.each do |m|
        headline = { level: m['level'] }
        if m['tag'].nil?
          attrs = nil
        else
          attrs = m['tag'].to_enum(:scan, /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/).map { Regexp.last_match }
          attrs.each { |a| headline[a['attr'].to_sym] = a['content'] }
        end
        headline[:text] = m['text'].remove_entities
        res << headline
      end
      res
    end

    ##
    ## Convert a Nokogiri element to Curl::Html format
    ##
    ## @param el [Nokogiri::XML::Element] element to convert
    ##
    def nokogiri_to_tag(el)
      attributes = el.attribute_nodes.map do |a|
        { key: a.name, value: a.name =~ /^(class|rel)$/ ? a.value.split(/ /) : a.value }
      end

      {
        tag: el.name,
        source: el.to_html,
        attrs: attributes,
        content: @clean ? el.text&.strip&.clean : el.text.strip,
        tags: recurse_children(el)
      }
    end

    def recurse_children(element)
      children = []
      element.children.each do |child|
        next if child.name == 'text'

        children.push(nokogiri_to_tag(child))
      end
      children
    end

    #-------------------------------------------------------
    ## Perform a CSS query using Nokogiri
    ##
    ## @param path [String] The CSS path
    ##
    ## @return [Array] array of matched elements
    ##
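    ## @example Find headline links via a CSS selector (selector is illustrative)
    ##   page.search('article h2 a').map { |el| el[:content] }
    ##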
    def search(path, source: @source)
      doc = Nokogiri::HTML(source)
      output = []
      doc.search(path).each do |el|
        out = nokogiri_to_tag(el)
        output.push(out)
      end
      output
    end

    private

    ##
    ## Flatten the array of tags
    ##
    ## @param tags [Array] Document tags
    ##
    def flatten_tags(tags)
      flattened = []

      tags.each do |t|
        flattened << { tag: t[:tag], attrs: t[:attrs],
                       content: @clean ? t[:content]&.strip&.clean : t[:content]&.strip }
        flattened.concat(flatten_tags(t[:tags])) unless t[:tags].nil?
      end

      flattened
    end

    ##
    ## Return an array of all tags in the content
    ##
    ## @param content [String] The content to parse
    ##
    def content_tags(content)
      return nil if content.nil?

      res = content.to_enum(:scan, %r{(?mix)
            <(?<tag>(?!</)[a-z0-9]+)(?<attrs>\s[^>]+)?
            (?:\s*/>|>(?<content>.*?)</\k<tag>>)}).map { Regexp.last_match }
      res.map do |tag|
        if tag['attrs'].nil?
          attrs = nil
        else
          attrs = tag['attrs'].strip.to_enum(:scan, /(?ix)
                  (?<key>[@a-z0-9-]+)(?:=(?<quot>["'])
                  (?<value>[^"']+)\k<quot>|[ >])?/i).map { Regexp.last_match }
          attrs.map! { |a| { key: a['key'], value: a['key'] =~ /^(class|rel)$/ ? a['value'].split(/ /) : a['value'] } }
        end
        {
          tag: tag['tag'],
          source: tag.to_s,
          attrs: attrs,
          content: @clean ? tag['content']&.clean : tag['content'],
          tags: content_tags(tag['content'])
        }
      end
    end

    ##
    ## Extract all meta tags from the document head
    ##
    ## @param head [String] The head content
    ##
    ## @return [Hash] hash of meta tags and values
    ##
    def meta_tags(head)
      meta = {}
      title = head.match(%r{(?<=<title>)(.*?)(?=</title>)})
      meta['title'] = title.nil? ? nil : title[1]
      refresh = head.match(/http-equiv=(['"])refresh\1(.*?)>/)
      url = refresh.nil? ? nil : refresh[2].match(/url=(.*?)['"]/)
      meta['refresh_url'] = url.nil? ? nil : url[1]
      meta_tags = head.scan(/<meta.*?>/)
      meta_tags.each do |tag|
        meta_name = tag.match(/(?:name|property|http-equiv)=(["'])(.*?)\1/)
        next if meta_name.nil?

        meta_value = tag.match(/(?:content)=(['"])(.*?)\1/)
        next if meta_value.nil?

        meta[meta_name[2].downcase] = meta_value[2]
      end
      meta
    rescue StandardError => e
      warn e
      {}
    end

    ##
    ## Extract all <link> tags from head
    ##
    ## @param head [String] The head content
    ##
    ## @return [Array] Array of links
    ##
    def link_tags(head)
      links = []
      link_tags = head.scan(/<link.*?>/)
      link_tags.each do |tag|
        link_rel = tag.match(/rel=(['"])(.*?)\1/)
        link_rel = link_rel.nil? ? nil : link_rel[2]

        next if link_rel =~ /preload/

        link_href = tag.match(/href=(["'])(.*?)\1/)
        next if link_href.nil?

        link_href = link_href[2]

        next if link_href =~ /^#/ && (@ignore_fragment_links || @external_links_only)

        next if link_href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)

        next if same_origin?(link_href) && @external_links_only

        link_title = tag.match(/title=(['"])(.*?)\1/)
        link_title = link_title.nil? ? nil : link_title[2]

        link_type = tag.match(/type=(['"])(.*?)\1/)
        link_type = link_type.nil? ? nil : link_type[2]

        links << { rel: link_rel, href: link_href, type: link_type, title: link_title }
      end
      links
    end

    ##
    ## Get all links in the body of the page
    ##
    ## rel and class are returned as arrays
    ##
    ## @return [Array] array of links with href, title,
    ##         rel, content and class
    ##
    def content_links
      links = []
      link_tags = @body.to_enum(:scan, %r{<a ?(?<tag>.*?)>(?<text>.*?)</a>}).map { Regexp.last_match }
      link_tags.each do |m|
        href = m['tag'].match(/href=(["'])(.*?)\1/)
        href = href[2] unless href.nil?
        next if href =~ /^#/ && (@ignore_fragment_links || @external_links_only)

        next if href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)

        next if same_origin?(href) && @external_links_only

        title = m['tag'].match(/title=(["'])(.*?)\1/)
        title = title[2] unless title.nil?
        rel = m['tag'].match(/rel=(["'])(.*?)\1/)
        rel = rel[2].split(/ +/) unless rel.nil?
        link_class = m['tag'].match(/class=(["'])(.*?)\1/)
        link_class = link_class[2].split(/ +/) unless link_class.nil?
        text = m['text'].remove_entities
        link = {
          href: href,
          title: title,
          rel: rel,
          content: text,
          class: link_class
        }
        links << link
      end
      links
    end

    ##
    ## Get all img tags in the body of the page
    ##
    ## @return [Array] array of images with src and all attributes
    ##
    def content_images
      images = []
      image_tags = @body.to_enum(:scan, %r{<img (?<tag>.*?)/?>}).map { Regexp.last_match }
      image_tags.each do |m|
        attrs = m['tag'].to_enum(:scan, /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/).map { Regexp.last_match }
        image = {}
        attrs.each { |a| image[a['attr'].to_sym] = a['content'] }
        images << image
      end
      images
    end

    ##
    ## Uses Selenium to load a page, allowing capture of dynamic (JS) pages
    ##
    ## @param url The url
    ##
    ## @return [String] page source
    ##
    def curl_dynamic_html(url, browser, headers)
      browser = browser.normalize_browser_type if browser.is_a?(String)
      res = nil

      driver = Selenium::WebDriver.for browser
      driver.manage.timeouts.implicit_wait = 4
      begin
        driver.get url
        res = driver.page_source
      ensure
        driver.quit
      end

      res
    end

    ##
    ## Save a screenshot of a url
    ##
    ## @param destination [String] File path destination
    ## @param browser [Symbol] The browser (:chrome or :firefox)
    ## @param type [Symbol] The type of screenshot (:full_page, :print_page, or :visible)
    ##
    def save_screenshot(destination = nil, browser: :chrome, type: :full_page)
      raise 'No URL provided' if url.nil?

      raise 'No file destination provided' if destination.nil?

      destination = File.expand_path(destination)

      raise 'Path doesn\'t exist' unless File.directory?(File.dirname(destination))

      browser = browser.normalize_browser_type if browser.is_a?(String)
      type = type.normalize_screenshot_type if type.is_a?(String)
      raise 'Can not save full page with Chrome, use Firefox' if type == :full_page && browser == :chrome

      destination = case type
                    when :print_page
                      "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.pdf"
                    else
                      "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.png"
                    end

      driver = Selenium::WebDriver.for browser
      driver.manage.timeouts.implicit_wait = 4
      begin
        driver.get @url
        case type
        when :print_page
          driver.save_print_page(destination)
        when :full_page
          driver.save_full_page_screenshot(destination)
        else
          driver.save_screenshot(destination)
        end
      ensure
        driver.quit
      end

      $stderr.puts "Screenshot saved to #{destination}"
    end

    ##
    ## Curls the html for the page
    ##
    ## @param url [String] The url
    ## @param headers [Hash] The headers
    ## @param headers_only [Boolean] Return headers only
    ## @param compressed [Boolean] Expect compressed results
    ##
    ## @return [Hash] hash of url, code, headers, meta, links, head, body, and source
    ##
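    ## @example Shape of the returned hash (illustrative values)
    ##   res = curl_html('https://example.com')
    ##   res[:code]    #=> "200"
    ##   res[:headers] #=> { 'location' => 'https://example.com', 'content-type' => 'text/html' }
    ##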
    def curl_html(url = nil, source: nil, headers: nil,
                  headers_only: false, compressed: false, fallback: false)
      unless url.nil?
        flags = 'SsL'
        flags += headers_only ? 'I' : 'i'
        agents = [
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.15',
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
        ]
        headers = headers.nil? ? '' : headers.map { |h, v| %(-H "#{h}: #{v}") }.join(' ')
        compress = compressed ? '--compressed' : ''
        source = `#{@curl} -#{flags} #{compress} #{headers} '#{url}' 2>/dev/null`
        agent = 0
        # retry with each user agent in turn until one returns a result
        while source.nil? || source.empty?
          source = `#{@curl} -#{flags} #{compress} -A "#{agents[agent]}" #{headers} '#{url}' 2>/dev/null`
          break if agent >= agents.count - 1

          agent += 1
        end

        unless $?.success? || fallback
          warn "Error curling #{url}"
          Process.exit 1
        end

        if fallback && (source.nil? || source.empty?)
          source = curl_dynamic_html(url, fallback, headers)
        end
      end

      return false if source.nil? || source.empty?

      source.strip!

      headers = { 'location' => url }
      lines = source.split(/\r\n/)
      code = lines[0].match(/(\d\d\d)/)
      code = code.nil? ? nil : code[1]
      lines.shift
      lines.each_with_index do |line, idx|
        if line =~ /^([\w-]+): (.*?)$/
          m = Regexp.last_match
          headers[m[1]] = m[2]
        else
          source = lines[idx..].join("\n")
          break
        end
      end

      if headers['content-encoding'] =~ /gzip/i && !compressed
        warn 'Response is gzipped, you may need to try again with --compressed'
      end

      if headers['content-type'] =~ /json/
        return { url: url, code: code, headers: headers, meta: nil, links: nil,
                 head: nil, body: source.strip, source: source.strip, body_links: nil, body_images: nil }
      end

      head = source.match(%r{(?<=<head>)(.*?)(?=</head>)}mi)

      if head.nil?
        { url: url, code: code, headers: headers, meta: nil, links: nil, head: nil, body: source.strip,
          source: source.strip, body_links: nil, body_images: nil }
      else
        meta = meta_tags(head[1])
        links = link_tags(head[1])
        body = source.match(%r{<body.*?>(.*?)</body>}mi)
        body = body.nil? ? source : body[1]
        { url: url, code: code, headers: headers, meta: meta, links: links, head: head[1], body: body,
          source: source.strip, body_links: body_links, body_images: body_images }
      end
    end

    ##
    ## Reencode the content (borrowed from Nokogiri)
    ##
    ## @param body [String] The body
    ## @param content_type [String] Force content type
    ##
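    ## @example A UTF-8 BOM forces UTF-8 before re-encoding (illustrative)
    ##   reencode("\xEF\xBB\xBF<html></html>".b).encoding #=> Encoding::UTF_8
    ##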
    def reencode(body, content_type = nil)
      if body.encoding == Encoding::ASCII_8BIT
        encoding = nil

        # look for a Byte Order Mark (BOM)
        initial_bytes = body[0..2].bytes
        if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
          encoding = Encoding::UTF_8
        elsif initial_bytes[0..1] == [0xFE, 0xFF]
          encoding = Encoding::UTF_16BE
        elsif initial_bytes[0..1] == [0xFF, 0xFE]
          encoding = Encoding::UTF_16LE
        end

        # look for a charset in a content-encoding header
        encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1] if content_type

        # look for a charset in a meta tag in the first 1024 bytes
        unless encoding
          data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
          data.scan(/<meta.*?>/im).each do |meta|
            encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
          end
        end

        # if all else fails, default to the official default encoding for HTML
        encoding ||= Encoding::ISO_8859_1

        # change the encoding to match the detected or inferred encoding
        body = body.dup
        begin
          body.force_encoding(encoding)
        rescue ArgumentError
          body.force_encoding(Encoding::ISO_8859_1)
        end
      end

      body.encode(Encoding::UTF_8)
    end

    ##
    ## Test if a given url has the same hostname as @url
    ##
    ## @param href [String] The url to test
    ##
    ## @return [Boolean] true if hostnames match
    ##
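    ## @example Assuming @url = 'https://example.com/page'
    ##   same_origin?('https://example.com/other') #=> true
    ##   same_origin?('https://other.org/')        #=> false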
    def same_origin?(href)
      uri = URI(href)
      origin = URI(@url)
      uri.host == origin.host
    rescue StandardError
      false
    end
  end
end