scrapifier 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/README.md +2 -2
- data/lib/scrapifier/methods.rb +1 -1
- data/lib/scrapifier/support.rb +13 -39
- data/lib/scrapifier/version.rb +1 -1
- data/lib/scrapifier/xpath.rb +66 -0
- data/spec/factories/samples.rb +1 -0
- data/spec/scrapifier_spec.rb +30 -10
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e52f7f47695c7b16fed80a51f99a119d3f98a3b7
|
4
|
+
data.tar.gz: 24db438acd2fb2df421ab7b6c148e7cbed3331ee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41e5f58a1760d61196ee3b4f429644025789d4432e7a1cef1a70d31473a26c12a8e38091527adacc78e7650759fc55cbdb80e365ce3f86954fcf493304f08d86
|
7
|
+
data.tar.gz: 248f23ec549f331a942d1847b9fc4d92935dea34993128fe5bd63fca9a4a4b5814ee531ef3c390e38502cd7f802c55a12dfeef6aa7eb434782d6f735a4d29e28
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -7,6 +7,8 @@
|
|
7
7
|
|
8
8
|
It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
|
9
9
|
|
10
|
+
Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
|
11
|
+
|
10
12
|
## Installation
|
11
13
|
|
12
14
|
Compatible with Ruby 1.9.3+
|
@@ -31,8 +33,6 @@ And then require the gem:
|
|
31
33
|
|
32
34
|
The String#scrapify method finds URIs in a string and then gets their metadata, e.g., the page's title, description, images and URI. All the data is returned in a well-formatted hash.
|
33
35
|
|
34
|
-
Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
|
35
|
-
|
36
36
|
#### Default usage.
|
37
37
|
|
38
38
|
``` ruby
|
data/lib/scrapifier/methods.rb
CHANGED
data/lib/scrapifier/support.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
+
require 'scrapifier/xpath'
|
2
|
+
|
1
3
|
module Scrapifier
|
2
4
|
# Support methods to get, check and organize data.
|
3
5
|
module Support
|
6
|
+
include XPath
|
4
7
|
module_function
|
5
8
|
|
6
9
|
# Evaluate the URI's HTML document and get its metadata.
|
@@ -25,7 +28,7 @@ module Scrapifier
|
|
25
28
|
doc = Nokogiri::HTML(open(uri).read)
|
26
29
|
doc.encoding, meta = 'utf-8', { uri: uri }
|
27
30
|
|
28
|
-
[:title, :description].each do |k|
|
31
|
+
[:title, :description, :keywords, :lang, :encode, :reply_to, :author].each do |k|
|
29
32
|
node = doc.xpath(sf_xpaths[k])[0]
|
30
33
|
meta[k] = node.nil? ? '-' : node.text
|
31
34
|
end
|
@@ -111,45 +114,16 @@ module Scrapifier
|
|
111
114
|
%r{(^http{1}[s]?://([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)}i
|
112
115
|
end
|
113
116
|
|
114
|
-
#
|
115
|
-
# from the parsed HTML.
|
117
|
+
# Organize XPaths.
|
116
118
|
def sf_xpaths
|
117
|
-
{
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
<<-END.gsub(/^\s+\|/, '')
|
126
|
-
|//meta[@property = "og:title"]/@content|
|
127
|
-
|//meta[@name = "title"]/@content|
|
128
|
-
|//meta[@name = "Title"]/@content|
|
129
|
-
|//title|//h1
|
130
|
-
END
|
131
|
-
end
|
132
|
-
|
133
|
-
def sf_desc_xpath
|
134
|
-
<<-END.gsub(/^\s+\|/, '')
|
135
|
-
|//meta[@property = "og:description"]/@content|
|
136
|
-
|//meta[@name = "description"]/@content|
|
137
|
-
|//meta[@name = "Description"]/@content|
|
138
|
-
|//h1|//h3|//p|//span|//font
|
139
|
-
END
|
140
|
-
end
|
141
|
-
|
142
|
-
def sf_img_xpath
|
143
|
-
<<-END.gsub(/^\s+\|/, '')
|
144
|
-
|//meta[@property = "og:image"]/@content|
|
145
|
-
|//link[@rel = "image_src"]/@href|
|
146
|
-
|//meta[@itemprop = "image"]/@content|
|
147
|
-
|//div[@id = "logo"]/img/@src|//a[@id = "logo"]/img/@src|
|
148
|
-
|//div[@class = "logo"]/img/@src|//a[@class = "logo"]/img/@src|
|
149
|
-
|//a//img[@width]/@src|//img[@width]/@src|
|
150
|
-
|//a//img[@height]/@src|//img[@height]/@src|
|
151
|
-
|//a//img/@src|//span//img/@src|//img/@src
|
152
|
-
END
|
119
|
+
{ title: XPath::TITLE,
|
120
|
+
description: XPath::DESC,
|
121
|
+
keywords: XPath::KEYWORDS,
|
122
|
+
lang: XPath::LANG,
|
123
|
+
encode: XPath::ENCODE,
|
124
|
+
reply_to: XPath::REPLY_TO,
|
125
|
+
author: XPath::AUTHOR,
|
126
|
+
image: XPath::IMG }
|
153
127
|
end
|
154
128
|
|
155
129
|
# Check and return only the valid image URIs.
|
data/lib/scrapifier/version.rb
CHANGED
data/lib/scrapifier/xpath.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
module Scrapifier
|
3
|
+
# Collection of all XPaths which are used to find
|
4
|
+
# the nodes within the parsed HTML doc.
|
5
|
+
module XPath
|
6
|
+
TITLE =
|
7
|
+
<<-END.gsub(/^\s+\|/, '')
|
8
|
+
|//meta[@property="og:title"]/@content|
|
9
|
+
|//meta[@name="title"]/@content|
|
10
|
+
|//meta[@name="Title"]/@content|
|
11
|
+
|//title|//h1
|
12
|
+
END
|
13
|
+
|
14
|
+
DESC =
|
15
|
+
<<-END.gsub(/^\s+\|/, '')
|
16
|
+
|//meta[@property="og:description"]/@content|
|
17
|
+
|//meta[@name="description"]/@content|
|
18
|
+
|//meta[@name="Description"]/@content|
|
19
|
+
|//h1|//h3|//p|//span|//font
|
20
|
+
END
|
21
|
+
|
22
|
+
KEYWORDS =
|
23
|
+
<<-END.gsub(/^\s+\|/, '')
|
24
|
+
|//meta[@name="keywords"]/@content|
|
25
|
+
|//meta[@name="Keywords"]/@content|
|
26
|
+
|//meta[@property="og:type"]/@content
|
27
|
+
END
|
28
|
+
|
29
|
+
LANG =
|
30
|
+
<<-END.gsub(/^\s+\|/, '')
|
31
|
+
|//html/@lang|
|
32
|
+
|//meta[@property="og:locale"]/@content|
|
33
|
+
|//meta[@http-equiv="content-language"]/@content
|
34
|
+
END
|
35
|
+
|
36
|
+
ENCODE =
|
37
|
+
<<-END.gsub(/^\s+\|/, '')
|
38
|
+
|//meta/@charset|
|
39
|
+
|//meta[@http-equiv="content-type"]/@content
|
40
|
+
END
|
41
|
+
|
42
|
+
REPLY_TO =
|
43
|
+
<<-END.gsub(/^\s+\|/, '')
|
44
|
+
|//meta[@name="reply_to"]/@content
|
45
|
+
END
|
46
|
+
|
47
|
+
AUTHOR =
|
48
|
+
<<-END.gsub(/^\s+\|/, '')
|
49
|
+
|//meta[@name="author"]/@content|
|
50
|
+
|//meta[@name="Author"]/@content|
|
51
|
+
|//meta[@name="reply_to"]/@content
|
52
|
+
END
|
53
|
+
|
54
|
+
IMG =
|
55
|
+
<<-END.gsub(/^\s+\|/, '')
|
56
|
+
|//meta[@property="og:image"]/@content|
|
57
|
+
|//link[@rel="image_src"]/@href|
|
58
|
+
|//meta[@itemprop="image"]/@content|
|
59
|
+
|//div[@id="logo"]/img/@src|//a[@id="logo"]/img/@src|
|
60
|
+
|//div[@class="logo"]/img/@src|//a[@class="logo"]/img/@src|
|
61
|
+
|//a//img[@width]/@src|//img[@width]/@src|
|
62
|
+
|//a//img[@height]/@src|//img[@height]/@src|
|
63
|
+
|//a//img/@src|//span//img/@src|//img/@src
|
64
|
+
END
|
65
|
+
end
|
66
|
+
end
|
data/spec/factories/samples.rb
CHANGED
data/spec/scrapifier_spec.rb
CHANGED
@@ -46,24 +46,44 @@ describe String do
|
|
46
46
|
subject(:hash) { "Look this awesome site #{misc[:http]}".scrapify }
|
47
47
|
|
48
48
|
it "includes a field with the site's title" do
|
49
|
-
hash[:title].is_a?(String).should
|
50
|
-
hash[:title].empty?.should
|
49
|
+
hash[:title].is_a?(String).should be true
|
50
|
+
hash[:title].empty?.should be false
|
51
51
|
end
|
52
52
|
|
53
53
|
it "includes a field with the site's description" do
|
54
|
-
hash[:description].is_a?(String).should
|
55
|
-
hash[:description].empty?.should
|
54
|
+
hash[:description].is_a?(String).should be true
|
55
|
+
hash[:description].empty?.should be false
|
56
56
|
end
|
57
57
|
|
58
58
|
it 'includes a field with the page URI' do
|
59
|
-
hash[:uri].is_a?(String).should
|
60
|
-
hash[:uri].empty?.should
|
59
|
+
hash[:uri].is_a?(String).should be true
|
60
|
+
hash[:uri].empty?.should be false
|
61
61
|
hash[:uri].should eq(misc[:http])
|
62
62
|
end
|
63
63
|
|
64
|
+
it "includes a field with the site's keywords" do
|
65
|
+
hash[:keywords].is_a?(String).should be true
|
66
|
+
end
|
67
|
+
|
68
|
+
it "includes a field with the site's language" do
|
69
|
+
hash[:lang].is_a?(String).should be true
|
70
|
+
end
|
71
|
+
|
72
|
+
it "includes a field with the site's encode" do
|
73
|
+
hash[:encode].is_a?(String).should be true
|
74
|
+
end
|
75
|
+
|
76
|
+
it "includes a field with the site's reply email address" do
|
77
|
+
hash[:reply_to].is_a?(String).should be true
|
78
|
+
end
|
79
|
+
|
80
|
+
it "includes a field with the site's author name or email" do
|
81
|
+
hash[:author].is_a?(String).should be true
|
82
|
+
end
|
83
|
+
|
64
84
|
it "includes a field with image URIs from the site's head/body" do
|
65
85
|
unless hash[:images].empty?
|
66
|
-
hash[:images].is_a?(Array).should
|
86
|
+
hash[:images].is_a?(Array).should be true
|
67
87
|
hash[:images].sample.should match(regexes[:image][:all])
|
68
88
|
end
|
69
89
|
end
|
@@ -77,8 +97,8 @@ describe String do
|
|
77
97
|
it "can choose the URI in the String to be scrapified" do
|
78
98
|
hash = "Check out these awesome sites: #{misc[:http]} and #{misc[:www]}".scrapify(which: 1, images: :png)
|
79
99
|
[:title, :description, :uri].each do |key|
|
80
|
-
hash[key].is_a?(String).should
|
81
|
-
hash[key].empty?.should
|
100
|
+
hash[key].is_a?(String).should be true
|
101
|
+
hash[key].empty?.should be false
|
82
102
|
end
|
83
103
|
hash[:uri].should eq("http://#{misc[:www]}")
|
84
104
|
hash[:images].sample.should match(regexes[:image][:png])
|
@@ -175,7 +195,7 @@ describe String do
|
|
175
195
|
end
|
176
196
|
|
177
197
|
it 'always returns an Array' do
|
178
|
-
checked.each { |c| c[1].is_a?(Array).should
|
198
|
+
checked.each { |c| c[1].is_a?(Array).should be true }
|
179
199
|
end
|
180
200
|
end
|
181
201
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tiago Guedes
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-06-
|
11
|
+
date: 2014-06-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -84,6 +84,7 @@ files:
|
|
84
84
|
- lib/scrapifier/methods.rb
|
85
85
|
- lib/scrapifier/support.rb
|
86
86
|
- lib/scrapifier/version.rb
|
87
|
+
- lib/scrapifier/xpath.rb
|
87
88
|
- scrapifier.gemspec
|
88
89
|
- spec/factories/samples.rb
|
89
90
|
- spec/scrapifier_spec.rb
|