trawler 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/trawler/parser.rb +20 -4
- data/lib/trawler/version.rb +1 -1
- data/spec/fixtures/sample_pages/buzzfeed.html +10260 -0
- data/spec/fixtures/vcr_cassettes/buzzfeed_page.yml +4542 -0
- data/spec/trawler_spec.rb +0 -1
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 933f90b39be83ae7cf959157b7a0ee98d69f3b86
|
4
|
+
data.tar.gz: ebb64ed51ebbd8618b714db0e60d833a10be0756
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e5a0d1b0bc1cae4f46614c288e608534d4e18cd4052bcebe05541e5562aeedabff01033055a5038bb7f0a332639ecc0957120f638d1a5c7d6592a4b3f3e7a745
|
7
|
+
data.tar.gz: 0018ce6c9fb18886bc5f3038657b58473634e0aa1c2742471bd1522f3e7664eec2dff8b127da04c3a185cbcdce81a49905cb783677f2545a8f91fc4fe48fa8bb
|
data/lib/trawler/parser.rb
CHANGED
@@ -22,10 +22,22 @@ module Trawler
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def images
|
25
|
-
images = []
|
26
|
-
images << meta_image
|
25
|
+
images = [meta_image]
|
27
26
|
images << find_images
|
28
|
-
images.flatten
|
27
|
+
images.flatten!
|
28
|
+
images = images.select { |img| !img.nil? }.select { |s| !s.empty? }
|
29
|
+
images.flatten.map! { |img| img.strip }.uniq
|
30
|
+
images.map { |img| normalize_url(img) }
|
31
|
+
end
|
32
|
+
|
33
|
+
def normalize_url(uri)
|
34
|
+
if uri =~ /^\w*\:/i
|
35
|
+
return uri
|
36
|
+
else
|
37
|
+
Addressable::URI.join(url, uri).normalize.to_s
|
38
|
+
end
|
39
|
+
rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
|
40
|
+
add_fatal_error "Link parsing exception: #{e.message}" and nil
|
29
41
|
end
|
30
42
|
|
31
43
|
def video
|
@@ -53,7 +65,11 @@ module Trawler
|
|
53
65
|
images.reject! { |img| img[:alt] =~ /(loading|spinner)/i }
|
54
66
|
images.reject! { |img| img[:class] =~ /(loading|spinner|icon)/i }
|
55
67
|
images.reject! { |img| img[:width] && img[:width] < @min_image_size }
|
56
|
-
images.map { |
|
68
|
+
buzzfeed_images = images.map { |i| i["rel:bf_image_src"].to_s }
|
69
|
+
images.reject! { |img| img[:src] =~ /(background|icons|icon)/i }
|
70
|
+
images.map! { |img| img[:src] }
|
71
|
+
images << buzzfeed_images
|
72
|
+
# images
|
57
73
|
end
|
58
74
|
|
59
75
|
def meta_title
|
data/lib/trawler/version.rb
CHANGED