curlyq 0.0.2

@@ -0,0 +1,720 @@
+ # frozen_string_literal: true
+
+ module Curl
+   # String helpers
+   class ::String
+     def remove_entities
+       gsub(/&nbsp;/, ' ')
+     end
+   end
+
+   # Class for CURLing an HTML page
+   class Html
+     attr_reader :url, :code, :headers, :meta, :links, :head, :body,
+                 :source, :title, :description, :body_links, :body_images, :clean
+
+     ##
+     ## Return the page data as a hash
+     ##
+     ## @param url [String] fallback url if none was recorded
+     ##
+     def to_data(url: nil)
+       {
+         url: @url || url,
+         code: @code,
+         headers: @headers,
+         meta: @meta,
+         meta_links: @links,
+         head: @head,
+         body: @body,
+         source: @source,
+         title: @title,
+         description: @description,
+         links: @body_links,
+         images: @body_images
+       }
+     end
+
+     ##
+     ## Create a new page object from a URL
+     ##
+     ## @param url [String] The url
+     ## @param browser [Symbol] Browser to use for dynamic (JS) pages (:chrome, :firefox, or :none)
+     ## @param source [String] Optional page source to parse instead of curling
+     ## @param headers [Hash] The headers to use in the curl call
+     ## @param headers_only [Boolean] Return headers only
+     ## @param compressed [Boolean] Expect compressed result
+     ## @param clean [Boolean] Clean whitespace and entities in output
+     ## @param fallback [Symbol] Browser to fall back to if curl returns nothing
+     ## @param ignore_local_links [Boolean] Ignore links without a hostname (relative links)
+     ## @param ignore_fragment_links [Boolean] Ignore links to fragments (#)
+     ## @param external_links_only [Boolean] Return only external links
+     ##
+     ## @return [Curl::Html] new page object
+     ##
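+     ## @example A minimal usage sketch (the URL here is illustrative)
+     ##   page = Curl::Html.new('https://example.com')
+     ##   page.code        # => "200" on success
+     ##   page.title       # => og:title or <title> contents
+     ##   page.body_links  # => array of hashes for links found in the body
+     ##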
+     def initialize(url, browser: nil, source: nil, headers: nil,
+                    headers_only: false, compressed: false, clean: false, fallback: false,
+                    ignore_local_links: false, ignore_fragment_links: false, external_links_only: false)
+       @clean = clean
+       @ignore_local_links = ignore_local_links
+       @ignore_fragment_links = ignore_fragment_links
+       @external_links_only = external_links_only
+       @curl = TTY::Which.which('curl')
+       @url = url
+       res = if url && browser && browser != :none
+               source = curl_dynamic_html(url, browser, headers)
+               curl_html(nil, source: source, headers: headers)
+             elsif url.nil? && !source.nil?
+               curl_html(nil, source: source, headers: headers, headers_only: headers_only, compressed: compressed, fallback: false)
+             else
+               curl_html(url, headers: headers, headers_only: headers_only, compressed: compressed, fallback: fallback)
+             end
+       @url = res[:url]
+       @code = res[:code]
+       @headers = res[:headers]
+       @meta = res[:meta]
+       @links = res[:links]
+       @head = res[:head] unless res[:head].nil?
+       @body = reencode(res[:body])
+       @source = res[:source]
+       @title = @meta['og:title'] || @meta['title'] unless @meta.nil?
+       @description = @meta['og:description'] || @meta['description'] unless @meta.nil?
+       @body_links = content_links
+       @body_images = content_images
+     end
+
+     ##
+     ## Save a screenshot of the url
+     ##
+     ## @param destination [String] The file destination
+     ## @param browser [Symbol] The browser (:firefox, :chrome)
+     ## @param type [Symbol] The type of screenshot to save
+     ##                      (:full_page, :print_page, :visible)
+     ##
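+     ## @example Save a full-page PNG with Firefox (paths are illustrative)
+     ##   page = Curl::Html.new('https://example.com')
+     ##   page.screenshot('~/Desktop/example', browser: :firefox, type: :full_page)
+     ##   # => saves ~/Desktop/example.png and reports the path on STDERR
+     ##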
+     def screenshot(destination = nil, browser: :chrome, type: :full_page)
+       save_screenshot(destination, browser: browser, type: type)
+     end
+
+     ##
+     ## Extract text between two regular expressions
+     ##
+     ## @param before [String, Regexp] The before
+     ## @param after [String, Regexp] The after
+     ##
+     ## @return [Array] array of matches
+     ##
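+     ## @example Pull the text of every h1 (a sketch; string arguments
+     ##   are escaped, so pass Regexps for pattern matching)
+     ##   page.extract('<h1>', '</h1>')
+     ##   # => ['Page Heading', ...]
+     ##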
+     def extract(before, after)
+       before = /#{Regexp.escape(before)}/ unless before.instance_of?(Regexp)
+       after = /#{Regexp.escape(after)}/ unless after.instance_of?(Regexp)
+       rx = /(?<=#{before.source})(.*?)(?=#{after.source})/m
+       @body.scan(rx).map { |r| @clean ? r[0].clean : r[0] }
+     end
+
+     ##
+     ## Extract an array of tags or tag attributes
+     ##
+     ## @param tag [String] The tag
+     ## @param attribute [String] The attribute
+     ## @param source [Boolean] Return full tag source
+     ##                         (negates attribute if true)
+     ## @param content [Boolean] Return only tag contents
+     ##
+     ## @return [Hash, Array] if source, return array of full
+     ##         tags, if content, return array of tag contents,
+     ##         otherwise, return a hash of tags including
+     ##         attributes and content
+     ##
+     ## If attribute is not given, tag contents will be returned
+     ##
+     ## @example page.extract_tag('h1') => [Array of h1 tag contents]
+     ## @example page.extract_tag('img', 'src') => [Array of img src attributes]
+     ##
+     def extract_tag(tag, attribute = nil, source: false, content: false)
+       res = extract_tag_contents(tag, source: true)
+
+       return res if source
+
+       res.map! do |tag_source|
+         m = tag_source.to_enum(:scan, /(\S+)=(['"])(.*?)\2/).map { Regexp.last_match }
+         attrs = m.each_with_object({}) { |at, a| a[at[1]] = at[3] }
+         tags = tag_source.match(/<.*?>(?<content>.*?)</)
+         contents = tags.nil? ? nil : tags['content']
+         {
+           tag: tag,
+           source: tag_source,
+           attrs: attrs,
+           content: @clean ? contents&.clean : contents
+         }
+       end
+
+       return res.map { |r| r[:content] } if content
+
+       return res if attribute.nil?
+
+       res.map { |r| r[:attrs][attribute] }
+     end
+
+     ##
+     ## Extract tag contents or full tag source
+     ##
+     ## @param tag The tag
+     ## @param source [Boolean] Return full tag instead of contents
+     ##
+     ## @return [Array] array of tag matches/contents
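+     ##
+     ## @example Full tag source vs. inner text (output is illustrative)
+     ##   page.extract_tag_contents('li', source: true)
+     ##   # => ['<li class="item">One</li>', ...]
+     ##   page.extract_tag_contents('li')
+     ##   # => ['One', ...]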
+     def extract_tag_contents(tag, source: false)
+       return @body.scan(%r{<#{tag}.*?>(?:.*?</#{tag}>)?}) if source
+
+       @body.scan(/<#{tag}.*?>(.*?)</).map { |t| t[0] }
+     end
+
+     ##
+     ## Return all tags in body, or a specific tag
+     ##
+     ## @param tag [String, Array] The tag to return, can be an array
+     ##
+     ## @return [Array] Array of tags. If no tag is specified, a
+     ##         hierarchical array of all tags in the document
+     ##         is returned. If one or more tags are specified,
+     ##         return a flattened list in document order.
+     ##
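+     ## @example Collect headline tags in document order (illustrative)
+     ##   page.tags(%w[h1 h2]).map { |t| t[:tag] }
+     ##   # => ['h1', 'h2', 'h2', ...]
+     ##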
+     def tags(tag = nil)
+       tags = content_tags(@body)
+       return tags if tag.nil?
+
+       tag = [tag] unless tag.is_a?(Array)
+       tag.map!(&:downcase)
+       flatten_tags(tags).dup.delete_if { |t| !tag.include?(t[:tag].downcase) }
+     end
+
+     ##
+     ## Get all images from the page
+     ##
+     ## @param types [Symbol, Array] Image types to return (:all, :opengraph, :img, :srcset)
+     ##
+     ## @return [Array] Array of images, both from picture sources and img tags
+     ##
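+     ## @example Fetch only OpenGraph/Twitter images (output is illustrative)
+     ##   page.images(types: :opengraph)
+     ##   # => [{ type: 'opengraph', attrs: nil, src: 'https://example.com/og.png' }]
+     ##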
+     def images(types: :all)
+       output = []
+       types = [types] unless types.is_a?(Array)
+       # types.map!(&:normalize_image_type)
+       types.each do |type|
+         if %i[all opengraph].include?(type)
+           %w[og:image twitter:image].each do |src|
+             next unless @meta&.key?(src)
+
+             output << {
+               type: 'opengraph',
+               attrs: nil,
+               src: @meta[src]
+             }
+           end
+         end
+         images = tags(%w[img source])
+         images.each do |img|
+           case img[:tag].downcase
+           when /source/
+             next unless %i[all srcset].include?(type)
+
+             srcsets = img[:attrs].filter { |a| a[:key] =~ /srcset/i }
+             if srcsets.count.positive?
+               srcset = []
+               srcsets.each do |src|
+                 src[:value].split(/ *, */).each do |s|
+                   image, media = s.split(/ /)
+                   srcset << {
+                     src: image,
+                     media: media
+                   }
+                 end
+               end
+               output << {
+                 type: 'srcset',
+                 attrs: img[:attrs],
+                 images: srcset
+               }
+             end
+           when /img/
+             next unless %i[all img].include?(type)
+
+             attrs = img[:attrs]
+             # Attribute lookups are nil-safe; not every img tag carries
+             # width/height/alt/title
+             width = attrs.find { |a| a[:key] == 'width' }&.fetch(:value, nil)
+             height = attrs.find { |a| a[:key] == 'height' }&.fetch(:value, nil)
+             alt = attrs.find { |a| a[:key] == 'alt' }&.fetch(:value, nil)
+             title = attrs.find { |a| a[:key] == 'title' }&.fetch(:value, nil)
+
+             output << {
+               type: 'img',
+               # anchor the match so srcset isn't picked up as src
+               src: attrs.find { |a| a[:key] =~ /^src$/i }&.fetch(:value, nil),
+               width: width || 'unknown',
+               height: height || 'unknown',
+               alt: alt,
+               title: title,
+               attrs: attrs
+             }
+           end
+         end
+       end
+       output
+     end
+
+     def to_s
+       headers = @headers.nil? ? 0 : @headers.count
+       meta = @meta.nil? ? 0 : @meta.count
+       links = @links.nil? ? 0 : @links.count
+       [
+         %(<HTMLCurl: @code="#{@code}" @url="#{@url}" @title="#{@title}"),
+         %(@description=#{@description} @headers:#{headers} @meta:#{meta} @links:#{links}>)
+       ].join(' ')
+     end
+
+     ##
+     ## Return all headlines (h1-h6) of the given level
+     ##
+     ## @param level [Number] The level (1-6), any level if unspecified
+     ##
+     ## @return [Array] array of headlines with text and all tag attributes as symbols
+     ##
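+     ## @example List second-level headlines (attribute values are illustrative)
+     ##   page.h(2)
+     ##   # => [{ level: '2', class: 'section-title', text: 'Installation' }, ...]
+     ##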
+     def h(level = '\d')
+       res = []
+       headlines = @body.to_enum(:scan, %r{<h(?<level>#{level})(?<tag> .*?)?>(?<text>.*?)</h#{level}>}i).map { Regexp.last_match }
+       headlines.each do |m|
+         headline = { level: m['level'] }
+         unless m['tag'].nil?
+           attrs = m['tag'].to_enum(:scan, /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/).map { Regexp.last_match }
+           attrs.each { |a| headline[a['attr'].to_sym] = a['content'] }
+         end
+         headline[:text] = m['text'].remove_entities
+         res << headline
+       end
+       res
+     end
+
+     ##
+     ## Convert a Nokogiri element to Curl::Html format
+     ##
+     ## @param el [Nokogiri::XML::Element] element to convert
+     ##
+     ## @return [Hash] tag hash with :tag, :source, :attrs, :content, and nested :tags
+     ##
+     def nokogiri_to_tag(el)
+       attributes = el.attribute_nodes.map do |a|
+         { key: a.name, value: a.name =~ /^(class|rel)$/ ? a.value.split(/ /) : a.value }
+       end
+
+       {
+         tag: el.name,
+         source: el.to_html,
+         attrs: attributes,
+         content: @clean ? el.text&.strip&.clean : el.text.strip,
+         tags: recurse_children(el)
+       }
+     end
+
+     ##
+     ## Recursively convert an element's children to Curl::Html tag hashes
+     ##
+     ## @param element [Nokogiri::XML::Element] the parent element
+     ##
+     def recurse_children(element)
+       children = []
+       element.children.each do |child|
+         next if child.name == 'text'
+
+         children.push(nokogiri_to_tag(child))
+       end
+       children
+     end
+
+     ##
+     ## Perform a CSS query using Nokogiri
+     ##
+     ## @param path [String] The CSS path
+     ##
+     ## @return [Array] array of matched elements
+     ##
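+     ## @example Query with a CSS selector (selector and output are illustrative)
+     ##   page.search('article h2').map { |el| el[:content] }
+     ##   # => ['First Section', 'Second Section']
+     ##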
+     def search(path, source: @source)
+       doc = Nokogiri::HTML(source)
+       doc.search(path).map { |el| nokogiri_to_tag(el) }
+     end
+
+     private
+
+     ##
+     ## Flatten the array of tags
+     ##
+     ## @param tags [Array] Document tags
+     ##
+     def flatten_tags(tags)
+       flattened = []
+
+       tags.each do |t|
+         flattened << { tag: t[:tag], attrs: t[:attrs],
+                        content: @clean ? t[:content]&.strip&.clean : t[:content]&.strip }
+         flattened.concat(flatten_tags(t[:tags])) unless t[:tags].nil?
+       end
+
+       flattened
+     end
+
+     ##
+     ## Return an array of all tags in the content
+     ##
+     ## @param content [String] The content to parse
+     ##
+     def content_tags(content)
+       return nil if content.nil?
+
+       res = content.to_enum(:scan, %r{(?mix)
+             <(?<tag>(?!</)[a-z0-9]+)(?<attrs>\s[^>]+)?
+             (?:\s*/>|>(?<content>.*?)</\k<tag>>)}).map { Regexp.last_match }
+       res.map do |tag|
+         if tag['attrs'].nil?
+           attrs = nil
+         else
+           attrs = tag['attrs'].strip.to_enum(:scan, /(?ix)
+                   (?<key>[@a-z0-9-]+)(?:=(?<quot>["'])
+                   (?<value>[^"']+)\k<quot>|[ >])?/i).map { Regexp.last_match }
+           attrs.map! { |a| { key: a['key'], value: a['key'] =~ /^(class|rel)$/ ? a['value'].split(/ /) : a['value'] } }
+         end
+         {
+           tag: tag['tag'],
+           source: tag.to_s,
+           attrs: attrs,
+           content: @clean ? tag['content']&.clean : tag['content'],
+           tags: content_tags(tag['content'])
+         }
+       end
+     end
+
+     ##
+     ## Extract all meta tags from the document head
+     ##
+     ## @param head [String] The head content
+     ##
+     ## @return [Hash] hash of meta tags and values
+     ##
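+     ## @example Resulting shape (a sketch with illustrative markup)
+     ##   meta_tags('<title>Example</title><meta name="description" content="A page">')
+     ##   # => { 'title' => 'Example', 'refresh_url' => nil, 'description' => 'A page' }
+     ##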
+     def meta_tags(head)
+       meta = {}
+       title = head.match(%r{(?<=<title>)(.*?)(?=</title>)})
+       meta['title'] = title.nil? ? nil : title[1]
+       refresh = head.match(/http-equiv=(['"])refresh\1(.*?)>/)
+       url = refresh.nil? ? nil : refresh[2].match(/url=(.*?)['"]/)
+       # store the captured URL string, not the MatchData object
+       meta['refresh_url'] = url.nil? ? nil : url[1]
+       meta_tags = head.scan(/<meta.*?>/)
+       meta_tags.each do |tag|
+         meta_name = tag.match(/(?:name|property|http-equiv)=(["'])(.*?)\1/)
+         next if meta_name.nil?
+
+         meta_value = tag.match(/(?:content)=(['"])(.*?)\1/)
+         next if meta_value.nil?
+
+         meta[meta_name[2].downcase] = meta_value[2]
+       end
+       meta
+     rescue StandardError => e
+       warn e
+       {}
+     end
+
+     ##
+     ## Extract all <link> tags from head
+     ##
+     ## @param head [String] The head content
+     ##
+     ## @return [Array] Array of links
+     ##
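+     ## @example Each link is returned as a hash (illustrative, with the
+     ##   ignore_* flags at their defaults)
+     ##   link_tags('<link rel="stylesheet" href="/main.css" type="text/css">')
+     ##   # => [{ rel: 'stylesheet', href: '/main.css', type: 'text/css', title: nil }]
+     ##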
+     def link_tags(head)
+       links = []
+       link_tags = head.scan(/<link.*?>/)
+       link_tags.each do |tag|
+         link_rel = tag.match(/rel=(['"])(.*?)\1/)
+         link_rel = link_rel.nil? ? nil : link_rel[2]
+
+         next if link_rel =~ /preload/
+
+         link_href = tag.match(/href=(["'])(.*?)\1/)
+         next if link_href.nil?
+
+         link_href = link_href[2]
+
+         next if link_href =~ /^#/ && (@ignore_fragment_links || @external_links_only)
+
+         next if link_href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)
+
+         next if same_origin?(link_href) && @external_links_only
+
+         link_title = tag.match(/title=(['"])(.*?)\1/)
+         link_title = link_title.nil? ? nil : link_title[2]
+
+         link_type = tag.match(/type=(['"])(.*?)\1/)
+         link_type = link_type.nil? ? nil : link_type[2]
+
+         links << { rel: link_rel, href: link_href, type: link_type, title: link_title }
+       end
+       links
+     end
+
+     ##
+     ## Get all links in the body of the page
+     ##
+     ## rel and class are returned as arrays
+     ##
+     ## @return [Array] array of links with href, title,
+     ##         rel, content and class
+     ##
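+     ## @example Shape of a returned link (illustrative markup)
+     ##   # <a href="https://example.com" class="btn primary">Visit</a> becomes:
+     ##   # { href: 'https://example.com', title: nil, rel: nil,
+     ##   #   content: 'Visit', class: ['btn', 'primary'] }
+     ##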
+     def content_links
+       links = []
+       link_tags = @body.to_enum(:scan, %r{<a ?(?<tag>.*?)>(?<text>.*?)</a>}).map { Regexp.last_match }
+       link_tags.each do |m|
+         href = m['tag'].match(/href=(["'])(.*?)\1/)
+         href = href[2] unless href.nil?
+         next if href =~ /^#/ && (@ignore_fragment_links || @external_links_only)
+
+         next if href !~ %r{^(\w+:)?//} && (@ignore_local_links || @external_links_only)
+
+         next if same_origin?(href) && @external_links_only
+
+         title = m['tag'].match(/title=(["'])(.*?)\1/)
+         title = title[2] unless title.nil?
+         rel = m['tag'].match(/rel=(["'])(.*?)\1/)
+         rel = rel[2].split(/ +/) unless rel.nil?
+         link_class = m['tag'].match(/class=(["'])(.*?)\1/)
+         link_class = link_class[2].split(/ +/) unless link_class.nil?
+         text = m['text'].remove_entities
+         links << {
+           href: href,
+           title: title,
+           rel: rel,
+           content: text,
+           class: link_class
+         }
+       end
+       links
+     end
+
+     ##
+     ## Get all img tags in the body of the page
+     ##
+     ## @return [Array] array of images with src and all attributes
+     ##
+     def content_images
+       images = []
+       image_tags = @body.to_enum(:scan, %r{<img (?<tag>.*?)/?>}).map { Regexp.last_match }
+       image_tags.each do |m|
+         attrs = m['tag'].to_enum(:scan, /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/).map { Regexp.last_match }
+         image = {}
+         attrs.each { |a| image[a['attr'].to_sym] = a['content'] }
+         images << image
+       end
+       images
+     end
+
+     ##
+     ## Uses Selenium to load a page, allowing capture of dynamic (JS) pages
+     ##
+     ## @param url The url
+     ## @param browser [Symbol] The browser to drive (:chrome or :firefox)
+     ## @param headers [Hash] Headers (currently unused by the Selenium driver)
+     ##
+     ## @return [String] page source
+     ##
+     def curl_dynamic_html(url, browser, headers)
+       browser = browser.normalize_browser_type if browser.is_a?(String)
+       res = nil
+
+       driver = Selenium::WebDriver.for browser
+       driver.manage.timeouts.implicit_wait = 4
+       begin
+         driver.get url
+         res = driver.page_source
+       ensure
+         driver.quit
+       end
+
+       res
+     end
+
+     ##
+     ## Save a screenshot of a url
+     ##
+     ## @param destination [String] File path destination
+     ## @param browser [Symbol] The browser (:chrome or :firefox)
+     ## @param type [Symbol] The type of screenshot (:full_page, :print_page, or :visible)
+     ##
+     def save_screenshot(destination = nil, browser: :chrome, type: :full_page)
+       raise 'No URL provided' if url.nil?
+
+       raise 'No file destination provided' if destination.nil?
+
+       destination = File.expand_path(destination)
+
+       raise 'Path doesn\'t exist' unless File.directory?(File.dirname(destination))
+
+       browser = browser.normalize_browser_type if browser.is_a?(String)
+       type = type.normalize_screenshot_type if type.is_a?(String)
+       raise 'Cannot save full-page screenshot with Chrome, use Firefox' if type == :full_page && browser == :chrome
+
+       # normalize the extension: PDF for print captures, PNG otherwise
+       destination = case type
+                     when :print_page
+                       "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.pdf"
+                     else
+                       "#{destination.sub(/\.(pdf|jpe?g|png)$/, '')}.png"
+                     end
+
+       driver = Selenium::WebDriver.for browser
+       driver.manage.timeouts.implicit_wait = 4
+       begin
+         driver.get @url
+         case type
+         when :print_page
+           driver.save_print_page(destination)
+         when :full_page
+           driver.save_full_page_screenshot(destination)
+         else
+           driver.save_screenshot(destination)
+         end
+       ensure
+         driver.quit
+       end
+
+       $stderr.puts "Screenshot saved to #{destination}"
+     end
+
+     ##
+     ## Curls the html for the page
+     ##
+     ## @param url [String] The url
+     ## @param source [String] Optional source to parse instead of curling
+     ## @param headers [Hash] The headers
+     ## @param headers_only [Boolean] Return headers only
+     ## @param compressed [Boolean] Expect compressed results
+     ## @param fallback [Symbol] Browser to fall back to if curl fails
+     ##
+     ## @return [Hash] hash of url, code, headers, meta, links, head, body, and source
+     ##
+     def curl_html(url = nil, source: nil, headers: nil,
+                   headers_only: false, compressed: false, fallback: false)
+       unless url.nil?
+         flags = 'SsL'
+         flags += headers_only ? 'I' : 'i'
+         agents = [
+           'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.1',
+           'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.',
+           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.3',
+           'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.'
+         ]
+         headers = headers.nil? ? '' : headers.map { |h, v| %(-H "#{h}: #{v}") }.join(' ')
+         compress = compressed ? '--compressed' : ''
+         source = `#{@curl} -#{flags} #{compress} #{headers} '#{url}' 2>/dev/null`
+         # retry with each user agent in turn until a response comes back
+         agent = 0
+         while source.nil? || source.empty?
+           source = `#{@curl} -#{flags} #{compress} -A "#{agents[agent]}" #{headers} '#{url}' 2>/dev/null`
+           break if agent >= agents.count - 1
+
+           agent += 1
+         end
+
+         unless $?.success? || fallback
+           warn "Error curling #{url}"
+           Process.exit 1
+         end
+
+         if fallback && (source.nil? || source.empty?)
+           source = curl_dynamic_html(url, fallback, headers)
+         end
+       end
+
+       return false if source.nil? || source.empty?
+
+       source.strip!
+
+       headers = { 'location' => url }
+       lines = source.split(/\r\n/)
+       code = lines[0].match(/(\d\d\d)/)[1]
+       lines.shift
+       lines.each_with_index do |line, idx|
+         if line =~ /^([\w-]+): (.*?)$/
+           m = Regexp.last_match
+           headers[m[1]] = m[2]
+         else
+           source = lines[idx..].join("\n")
+           break
+         end
+       end
+
+       if headers['content-encoding'] =~ /gzip/i && !compressed
+         warn 'Response is gzipped, you may need to try again with --compressed'
+       end
+
+       if headers['content-type'] =~ /json/
+         return { url: url, code: code, headers: headers, meta: nil, links: nil,
+                  head: nil, body: source.strip, source: source.strip, body_links: nil, body_images: nil }
+       end
+
+       head = source.match(%r{(?<=<head>)(.*?)(?=</head>)}mi)
+
+       if head.nil?
+         { url: url, code: code, headers: headers, meta: nil, links: nil, head: nil, body: source.strip,
+           source: source.strip, body_links: nil, body_images: nil }
+       else
+         meta = meta_tags(head[1])
+         links = link_tags(head[1])
+         body = source.match(%r{<body.*?>(.*?)</body>}mi)[1]
+         { url: url, code: code, headers: headers, meta: meta, links: links, head: head[1], body: body,
+           source: source.strip, body_links: body_links, body_images: body_images }
+       end
+     end
+
+     ##
+     ## Reencode the content (borrowed from Nokogiri)
+     ##
+     ## @param body [String] The body
+     ## @param content_type [String] Force content type
+     ##
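+     ## @example Detection order is BOM, then content type, then meta tags (sketch)
+     ##   reencode("\xEF\xBB\xBFh\xC3\xA9llo".b).encoding
+     ##   # => #<Encoding:UTF-8> (the BOM marks the body as UTF-8)
+     ##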
+     def reencode(body, content_type = nil)
+       if body.encoding == Encoding::ASCII_8BIT
+         encoding = nil
+
+         # look for a Byte Order Mark (BOM)
+         initial_bytes = body[0..2].bytes
+         if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
+           encoding = Encoding::UTF_8
+         elsif initial_bytes[0..1] == [0xFE, 0xFF]
+           encoding = Encoding::UTF_16BE
+         elsif initial_bytes[0..1] == [0xFF, 0xFE]
+           encoding = Encoding::UTF_16LE
+         end
+
+         # look for a charset in the content-type header
+         encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1] if content_type
+
+         # look for a charset in a meta tag in the first 1024 bytes
+         unless encoding
+           data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
+           data.scan(/<meta.*?>/im).each do |meta|
+             encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
+           end
+         end
+
+         # if all else fails, default to the official default encoding for HTML
+         encoding ||= Encoding::ISO_8859_1
+
+         # change the encoding to match the detected or inferred encoding
+         body = body.dup
+         begin
+           body.force_encoding(encoding)
+         rescue ArgumentError
+           body.force_encoding(Encoding::ISO_8859_1)
+         end
+       end
+
+       body.encode(Encoding::UTF_8)
+     end
+
+     ##
+     ## Test if a given url has the same hostname as @url
+     ##
+     ## @param href [String] The url to test
+     ##
+     ## @return [Boolean] true if hostnames match
+     ##
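+     ## @example With @url set to 'https://example.com/page' (illustrative)
+     ##   same_origin?('https://example.com/about') # => true
+     ##   same_origin?('https://other.org/')        # => false
+     ##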
+     def same_origin?(href)
+       uri = URI(href)
+       origin = URI(@url)
+       uri.host == origin.host
+     rescue StandardError
+       false
+     end
+   end
+ end