scrapifier 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/README.md +2 -2
- data/lib/scrapifier/methods.rb +1 -1
- data/lib/scrapifier/support.rb +13 -39
- data/lib/scrapifier/version.rb +1 -1
- data/lib/scrapifier/xpath.rb +66 -0
- data/spec/factories/samples.rb +1 -0
- data/spec/scrapifier_spec.rb +30 -10
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e52f7f47695c7b16fed80a51f99a119d3f98a3b7
|
4
|
+
data.tar.gz: 24db438acd2fb2df421ab7b6c148e7cbed3331ee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41e5f58a1760d61196ee3b4f429644025789d4432e7a1cef1a70d31473a26c12a8e38091527adacc78e7650759fc55cbdb80e365ce3f86954fcf493304f08d86
|
7
|
+
data.tar.gz: 248f23ec549f331a942d1847b9fc4d92935dea34993128fe5bd63fca9a4a4b5814ee531ef3c390e38502cd7f802c55a12dfeef6aa7eb434782d6f735a4d29e28
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -7,6 +7,8 @@
|
|
7
7
|
|
8
8
|
It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
|
9
9
|
|
10
|
+
Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
|
11
|
+
|
10
12
|
## Installation
|
11
13
|
|
12
14
|
Compatible with Ruby 1.9.3+
|
@@ -31,8 +33,6 @@ An then require the gem:
|
|
31
33
|
|
32
34
|
The String#scrapify method finds URIs in a string and then gets their metadata, e.g., the page's title, description, images and URI. All the data is returned in a well-formatted hash.
|
33
35
|
|
34
|
-
Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
|
35
|
-
|
36
36
|
#### Default usage.
|
37
37
|
|
38
38
|
``` ruby
|
data/lib/scrapifier/methods.rb
CHANGED
data/lib/scrapifier/support.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
+
require 'scrapifier/xpath'
|
2
|
+
|
1
3
|
module Scrapifier
|
2
4
|
# Support methods to get, check and organize data.
|
3
5
|
module Support
|
6
|
+
include XPath
|
4
7
|
module_function
|
5
8
|
|
6
9
|
# Evaluate the URI's HTML document and get its metadata.
|
@@ -25,7 +28,7 @@ module Scrapifier
|
|
25
28
|
doc = Nokogiri::HTML(open(uri).read)
|
26
29
|
doc.encoding, meta = 'utf-8', { uri: uri }
|
27
30
|
|
28
|
-
[:title, :description].each do |k|
|
31
|
+
[:title, :description, :keywords, :lang, :encode, :reply_to, :author].each do |k|
|
29
32
|
node = doc.xpath(sf_xpaths[k])[0]
|
30
33
|
meta[k] = node.nil? ? '-' : node.text
|
31
34
|
end
|
@@ -111,45 +114,16 @@ module Scrapifier
|
|
111
114
|
%r{(^http{1}[s]?://([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)}i
|
112
115
|
end
|
113
116
|
|
114
|
-
#
|
115
|
-
# from the parsed HTML.
|
117
|
+
# Organize XPaths.
|
116
118
|
def sf_xpaths
|
117
|
-
{
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
<<-END.gsub(/^\s+\|/, '')
|
126
|
-
|//meta[@property = "og:title"]/@content|
|
127
|
-
|//meta[@name = "title"]/@content|
|
128
|
-
|//meta[@name = "Title"]/@content|
|
129
|
-
|//title|//h1
|
130
|
-
END
|
131
|
-
end
|
132
|
-
|
133
|
-
def sf_desc_xpath
|
134
|
-
<<-END.gsub(/^\s+\|/, '')
|
135
|
-
|//meta[@property = "og:description"]/@content|
|
136
|
-
|//meta[@name = "description"]/@content|
|
137
|
-
|//meta[@name = "Description"]/@content|
|
138
|
-
|//h1|//h3|//p|//span|//font
|
139
|
-
END
|
140
|
-
end
|
141
|
-
|
142
|
-
def sf_img_xpath
|
143
|
-
<<-END.gsub(/^\s+\|/, '')
|
144
|
-
|//meta[@property = "og:image"]/@content|
|
145
|
-
|//link[@rel = "image_src"]/@href|
|
146
|
-
|//meta[@itemprop = "image"]/@content|
|
147
|
-
|//div[@id = "logo"]/img/@src|//a[@id = "logo"]/img/@src|
|
148
|
-
|//div[@class = "logo"]/img/@src|//a[@class = "logo"]/img/@src|
|
149
|
-
|//a//img[@width]/@src|//img[@width]/@src|
|
150
|
-
|//a//img[@height]/@src|//img[@height]/@src|
|
151
|
-
|//a//img/@src|//span//img/@src|//img/@src
|
152
|
-
END
|
119
|
+
{ title: XPath::TITLE,
|
120
|
+
description: XPath::DESC,
|
121
|
+
keywords: XPath::KEYWORDS,
|
122
|
+
lang: XPath::LANG,
|
123
|
+
encode: XPath::ENCODE,
|
124
|
+
reply_to: XPath::REPLY_TO,
|
125
|
+
author: XPath::AUTHOR,
|
126
|
+
image: XPath::IMG }
|
153
127
|
end
|
154
128
|
|
155
129
|
# Check and return only the valid image URIs.
|
data/lib/scrapifier/version.rb
CHANGED
data/lib/scrapifier/xpath.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
module Scrapifier
|
3
|
+
# Collection of all XPaths which are used to find
|
4
|
+
# the nodes within the parsed HTML doc.
|
5
|
+
module XPath
|
6
|
+
TITLE =
|
7
|
+
<<-END.gsub(/^\s+\|/, '')
|
8
|
+
|//meta[@property="og:title"]/@content|
|
9
|
+
|//meta[@name="title"]/@content|
|
10
|
+
|//meta[@name="Title"]/@content|
|
11
|
+
|//title|//h1
|
12
|
+
END
|
13
|
+
|
14
|
+
DESC =
|
15
|
+
<<-END.gsub(/^\s+\|/, '')
|
16
|
+
|//meta[@property="og:description"]/@content|
|
17
|
+
|//meta[@name="description"]/@content|
|
18
|
+
|//meta[@name="Description"]/@content|
|
19
|
+
|//h1|//h3|//p|//span|//font
|
20
|
+
END
|
21
|
+
|
22
|
+
KEYWORDS =
|
23
|
+
<<-END.gsub(/^\s+\|/, '')
|
24
|
+
|//meta[@name="keywords"]/@content|
|
25
|
+
|//meta[@name="Keywords"]/@content|
|
26
|
+
|//meta[@property="og:type"]/@content
|
27
|
+
END
|
28
|
+
|
29
|
+
LANG =
|
30
|
+
<<-END.gsub(/^\s+\|/, '')
|
31
|
+
|//html/@lang|
|
32
|
+
|//meta[@property="og:locale"]/@content|
|
33
|
+
|//meta[@http-equiv="content-language"]/@content
|
34
|
+
END
|
35
|
+
|
36
|
+
ENCODE =
|
37
|
+
<<-END.gsub(/^\s+\|/, '')
|
38
|
+
|//meta/@charset|
|
39
|
+
|//meta[@http-equiv="content-type"]/@content
|
40
|
+
END
|
41
|
+
|
42
|
+
REPLY_TO =
|
43
|
+
<<-END.gsub(/^\s+\|/, '')
|
44
|
+
|//meta[@name="reply_to"]/@content
|
45
|
+
END
|
46
|
+
|
47
|
+
AUTHOR =
|
48
|
+
<<-END.gsub(/^\s+\|/, '')
|
49
|
+
|//meta[@name="author"]/@content|
|
50
|
+
|//meta[@name="Author"]/@content|
|
51
|
+
|//meta[@name="reply_to"]/@content
|
52
|
+
END
|
53
|
+
|
54
|
+
IMG =
|
55
|
+
<<-END.gsub(/^\s+\|/, '')
|
56
|
+
|//meta[@property="og:image"]/@content|
|
57
|
+
|//link[@rel="image_src"]/@href|
|
58
|
+
|//meta[@itemprop="image"]/@content|
|
59
|
+
|//div[@id="logo"]/img/@src|//a[@id="logo"]/img/@src|
|
60
|
+
|//div[@class="logo"]/img/@src|//a[@class="logo"]/img/@src|
|
61
|
+
|//a//img[@width]/@src|//img[@width]/@src|
|
62
|
+
|//a//img[@height]/@src|//img[@height]/@src|
|
63
|
+
|//a//img/@src|//span//img/@src|//img/@src
|
64
|
+
END
|
65
|
+
end
|
66
|
+
end
|
data/spec/factories/samples.rb
CHANGED
data/spec/scrapifier_spec.rb
CHANGED
@@ -46,24 +46,44 @@ describe String do
|
|
46
46
|
subject(:hash) { "Look this awesome site #{misc[:http]}".scrapify }
|
47
47
|
|
48
48
|
it "includes a field with the site's title" do
|
49
|
-
hash[:title].is_a?(String).should
|
50
|
-
hash[:title].empty?.should
|
49
|
+
hash[:title].is_a?(String).should be true
|
50
|
+
hash[:title].empty?.should be false
|
51
51
|
end
|
52
52
|
|
53
53
|
it "includes a field with the site's description" do
|
54
|
-
hash[:description].is_a?(String).should
|
55
|
-
hash[:description].empty?.should
|
54
|
+
hash[:description].is_a?(String).should be true
|
55
|
+
hash[:description].empty?.should be false
|
56
56
|
end
|
57
57
|
|
58
58
|
it 'includes a field with the page URI' do
|
59
|
-
hash[:uri].is_a?(String).should
|
60
|
-
hash[:uri].empty?.should
|
59
|
+
hash[:uri].is_a?(String).should be true
|
60
|
+
hash[:uri].empty?.should be false
|
61
61
|
hash[:uri].should eq(misc[:http])
|
62
62
|
end
|
63
63
|
|
64
|
+
it "includes a field with the site's keywords" do
|
65
|
+
hash[:keywords].is_a?(String).should be true
|
66
|
+
end
|
67
|
+
|
68
|
+
it "includes a field with the site's language" do
|
69
|
+
hash[:lang].is_a?(String).should be true
|
70
|
+
end
|
71
|
+
|
72
|
+
it "includes a field with the site's encode" do
|
73
|
+
hash[:encode].is_a?(String).should be true
|
74
|
+
end
|
75
|
+
|
76
|
+
it "includes a field with the site's reply email address" do
|
77
|
+
hash[:reply_to].is_a?(String).should be true
|
78
|
+
end
|
79
|
+
|
80
|
+
it "includes a field with the site's author name or email" do
|
81
|
+
hash[:author].is_a?(String).should be true
|
82
|
+
end
|
83
|
+
|
64
84
|
it "includes a field with image URIs from the site's head/body" do
|
65
85
|
unless hash[:images].empty?
|
66
|
-
hash[:images].is_a?(Array).should
|
86
|
+
hash[:images].is_a?(Array).should be true
|
67
87
|
hash[:images].sample.should match(regexes[:image][:all])
|
68
88
|
end
|
69
89
|
end
|
@@ -77,8 +97,8 @@ describe String do
|
|
77
97
|
it "can choose the URI in the String to be scrapified" do
|
78
98
|
hash = "Check out these awesome sites: #{misc[:http]} and #{misc[:www]}".scrapify(which: 1, images: :png)
|
79
99
|
[:title, :description, :uri].each do |key|
|
80
|
-
hash[key].is_a?(String).should
|
81
|
-
hash[key].empty?.should
|
100
|
+
hash[key].is_a?(String).should be true
|
101
|
+
hash[key].empty?.should be false
|
82
102
|
end
|
83
103
|
hash[:uri].should eq("http://#{misc[:www]}")
|
84
104
|
hash[:images].sample.should match(regexes[:image][:png])
|
@@ -175,7 +195,7 @@ describe String do
|
|
175
195
|
end
|
176
196
|
|
177
197
|
it 'always returns an Array' do
|
178
|
-
checked.each { |c| c[1].is_a?(Array).should
|
198
|
+
checked.each { |c| c[1].is_a?(Array).should be true }
|
179
199
|
end
|
180
200
|
end
|
181
201
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tiago Guedes
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-06-
|
11
|
+
date: 2014-06-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -84,6 +84,7 @@ files:
|
|
84
84
|
- lib/scrapifier/methods.rb
|
85
85
|
- lib/scrapifier/support.rb
|
86
86
|
- lib/scrapifier/version.rb
|
87
|
+
- lib/scrapifier/xpath.rb
|
87
88
|
- scrapifier.gemspec
|
88
89
|
- spec/factories/samples.rb
|
89
90
|
- spec/scrapifier_spec.rb
|