scrapifier 0.0.4 → 0.0.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 67a72b01b9c64d37f15c459e11c536f29bd0abc2
- data.tar.gz: d83f2f8c7ff3203ee2a5472e2e2099a4389af7aa
+ metadata.gz: e52f7f47695c7b16fed80a51f99a119d3f98a3b7
+ data.tar.gz: 24db438acd2fb2df421ab7b6c148e7cbed3331ee
  SHA512:
- metadata.gz: 2a4abdb171a16b46b89bfb4c43a3262169c08c70125b3ac88e9c4f13209acd5085368b3a4b9399691b9fb9dc89fc4cacfe34b781a26df3c5646bfd4f20f44435
- data.tar.gz: c246dd2f9770e3835f4423b497b0bb86d38e51ff9030b59aa516be89f11262db159d2938f94e43f60b91fb14c2c56015965fb8556eec963a13e3c568a4c68d9d
+ metadata.gz: 41e5f58a1760d61196ee3b4f429644025789d4432e7a1cef1a70d31473a26c12a8e38091527adacc78e7650759fc55cbdb80e365ce3f86954fcf493304f08d86
+ data.tar.gz: 248f23ec549f331a942d1847b9fc4d92935dea34993128fe5bd63fca9a4a4b5814ee531ef3c390e38502cd7f802c55a12dfeef6aa7eb434782d6f735a4d29e28
data/.travis.yml CHANGED
@@ -1,4 +1,5 @@
  language: ruby
  rvm:
  - 2.0.0
- - 1.9.3
+ - 2.1.0
+ - 2.1.1
data/README.md CHANGED
@@ -7,6 +7,8 @@
 
  It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
 
+ Note: This gem is mainly focused on screen scraping URLs that include a protocol (such as "http", "https" and "ftp"), but it also works with URIs that start with "www" and have no protocol, like "www.google.com".
+
  ## Installation
 
  Compatible with Ruby 1.9.3+
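To make the note above concrete, here is a hedged usage sketch (`example.com` is a placeholder and the returned values are illustrative, not real scrape results):

``` ruby
require 'scrapifier'

# URL with an explicit protocol.
"Check out http://example.com".scrapify
# A "www" URI with no protocol defined also works.
"Check out www.example.com".scrapify
# Both return a Hash along the lines of:
# { title: "...", description: "...", uri: "http://example.com", images: [...] }
```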
@@ -31,8 +33,6 @@ And then require the gem:
 
  The String#scrapify method finds URIs in a string and then gets their metadata, e.g., the page's title, description, images and URI. All the data is returned in a well-formatted hash.
 
- Note: This gem is mainly focused on screen scraping URLs that include a protocol (such as "http", "https" and "ftp"), but it also works with URIs that start with "www" and have no protocol, like "www.google.com".
-
  #### Default usage.
 
  ``` ruby
data/lib/scrapifier/methods.rb CHANGED
@@ -6,7 +6,7 @@ require 'scrapifier/support'
  module Scrapifier
  # Methods which will be included into the String class.
  module Methods
- include Scrapifier::Support
+ include Support
 
  # Get metadata from a URI using the screen scraping technique.
  #
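The shorter `include Support` works because Ruby resolves the constant relative to the enclosing `Scrapifier` namespace. A minimal sketch of the same pattern, with illustrative module and method names that are not part of the gem:

``` ruby
module Scrapifier
  module Support
    def shout(text)
      text.upcase
    end
  end

  module Methods
    # Resolves to Scrapifier::Support via the enclosing namespace.
    include Support
  end
end

class String
  include Scrapifier::Methods
end

"hello".shout # => "HELLO"
```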
data/lib/scrapifier/support.rb CHANGED
@@ -1,6 +1,9 @@
+ require 'scrapifier/xpath'
+
  module Scrapifier
  # Support methods to get, check and organize data.
  module Support
+ include XPath
  module_function
 
  # Evaluate the URI's HTML document and get its metadata.
@@ -25,7 +28,7 @@ module Scrapifier
  doc = Nokogiri::HTML(open(uri).read)
  doc.encoding, meta = 'utf-8', { uri: uri }
 
- [:title, :description].each do |k|
+ [:title, :description, :keywords, :lang, :encode, :reply_to, :author].each do |k|
  node = doc.xpath(sf_xpaths[k])[0]
  meta[k] = node.nil? ? '-' : node.text
  end
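Each new key is looked up with its XPath and falls back to "-" when the page has no matching node. A rough, self-contained equivalent of that loop (the HTML snippet and the XPath hash are made up for illustration):

``` ruby
require 'nokogiri'

# Hypothetical page with a title and an author meta tag, but no keywords.
doc = Nokogiri::HTML('<html><head><title>Example</title>' \
                     '<meta name="author" content="Jane Doe"></head></html>')

xpaths = { title:    '//title',
           author:   '//meta[@name="author"]/@content',
           keywords: '//meta[@name="keywords"]/@content' }

meta = {}
xpaths.each do |key, xpath|
  node = doc.xpath(xpath)[0]
  meta[key] = node.nil? ? '-' : node.text
end

meta # => { title: "Example", author: "Jane Doe", keywords: "-" }
```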
@@ -111,45 +114,16 @@ module Scrapifier
  %r{(^http{1}[s]?://([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)}i
  end
 
- # Collection of xpath that are used to get nodes
- # from the parsed HTML.
+ # Organize XPaths.
  def sf_xpaths
- {
- title: sf_title_xpath,
- description: sf_desc_xpath,
- image: sf_img_xpath
- }
- end
-
- def sf_title_xpath
- <<-END.gsub(/^\s+\|/, '')
- |//meta[@property = "og:title"]/@content|
- |//meta[@name = "title"]/@content|
- |//meta[@name = "Title"]/@content|
- |//title|//h1
- END
- end
-
- def sf_desc_xpath
- <<-END.gsub(/^\s+\|/, '')
- |//meta[@property = "og:description"]/@content|
- |//meta[@name = "description"]/@content|
- |//meta[@name = "Description"]/@content|
- |//h1|//h3|//p|//span|//font
- END
- end
-
- def sf_img_xpath
- <<-END.gsub(/^\s+\|/, '')
- |//meta[@property = "og:image"]/@content|
- |//link[@rel = "image_src"]/@href|
- |//meta[@itemprop = "image"]/@content|
- |//div[@id = "logo"]/img/@src|//a[@id = "logo"]/img/@src|
- |//div[@class = "logo"]/img/@src|//a[@class = "logo"]/img/@src|
- |//a//img[@width]/@src|//img[@width]/@src|
- |//a//img[@height]/@src|//img[@height]/@src|
- |//a//img/@src|//span//img/@src|//img/@src
- END
+ { title: XPath::TITLE,
+ description: XPath::DESC,
+ keywords: XPath::KEYWORDS,
+ lang: XPath::LANG,
+ encode: XPath::ENCODE,
+ reply_to: XPath::REPLY_TO,
+ author: XPath::AUTHOR,
+ image: XPath::IMG }
  end
 
  # Check and return only the valid image URIs.
data/lib/scrapifier/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Scrapifier
- VERSION = '0.0.4'
+ VERSION = '0.0.5'
  end
data/lib/scrapifier/xpath.rb ADDED
@@ -0,0 +1,66 @@
+ # coding: utf-8
+ module Scrapifier
+ # Collection of all XPaths which are used to find
+ # the nodes within the parsed HTML doc.
+ module XPath
+ TITLE =
+ <<-END.gsub(/^\s+\|/, '')
+ |//meta[@property="og:title"]/@content|
+ |//meta[@name="title"]/@content|
+ |//meta[@name="Title"]/@content|
+ |//title|//h1
+ END
+
+ DESC =
+ <<-END.gsub(/^\s+\|/, '')
+ |//meta[@property="og:description"]/@content|
+ |//meta[@name="description"]/@content|
+ |//meta[@name="Description"]/@content|
+ |//h1|//h3|//p|//span|//font
+ END
+
+ KEYWORDS =
+ <<-END.gsub(/^\s+\|/, '')
+ |//meta[@name="keywords"]/@content|
+ |//meta[@name="Keywords"]/@content|
+ |//meta[@property="og:type"]/@content
+ END
+
+ LANG =
+ <<-END.gsub(/^\s+\|/, '')
+ |//html/@lang|
+ |//meta[@property="og:locale"]/@content|
+ |//meta[@http-equiv="content-language"]/@content
+ END
+
+ ENCODE =
+ <<-END.gsub(/^\s+\|/, '')
+ |//meta/@charset|
+ |//meta[@http-equiv="content-type"]/@content
+ END
+
+ REPLY_TO =
+ <<-END.gsub(/^\s+\|/, '')
+ |//meta[@name="reply_to"]/@content
+ END
+
+ AUTHOR =
+ <<-END.gsub(/^\s+\|/, '')
+ |//meta[@name="author"]/@content|
+ |//meta[@name="Author"]/@content|
+ |//meta[@name="reply_to"]/@content
+ END
+
+ IMG =
+ <<-END.gsub(/^\s+\|/, '')
+ |//meta[@property="og:image"]/@content|
+ |//link[@rel="image_src"]/@href|
+ |//meta[@itemprop="image"]/@content|
+ |//div[@id="logo"]/img/@src|//a[@id="logo"]/img/@src|
+ |//div[@class="logo"]/img/@src|//a[@class="logo"]/img/@src|
+ |//a//img[@width]/@src|//img[@width]/@src|
+ |//a//img[@height]/@src|//img[@height]/@src|
+ |//a//img/@src|//span//img/@src|//img/@src
+ END
+ end
+ end
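Each constant uses the same heredoc trick: the leading `|` on every line is a margin marker, and `gsub(/^\s+\|/, '')` strips the indentation plus the marker, leaving one XPath union string whose alternatives are separated by `|`. A small standalone illustration (the HTML document is made up):

``` ruby
require 'nokogiri'

TITLE =
  <<-END.gsub(/^\s+\|/, '')
    |//meta[@property="og:title"]/@content|
    |//title
  END
# TITLE == "//meta[@property=\"og:title\"]/@content|\n//title\n"

doc = Nokogiri::HTML('<html><head><title>Hello</title></head></html>')
doc.xpath(TITLE)[0].text # => "Hello"
```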
data/spec/factories/samples.rb CHANGED
@@ -1,3 +1,4 @@
+ # coding: utf-8
  module Factories
  private
 
data/spec/scrapifier_spec.rb CHANGED
@@ -46,24 +46,44 @@ describe String do
  subject(:hash) { "Look this awesome site #{misc[:http]}".scrapify }
 
  it "includes a field with the site's title" do
- hash[:title].is_a?(String).should be_true
- hash[:title].empty?.should be_false
+ hash[:title].is_a?(String).should be true
+ hash[:title].empty?.should be false
  end
 
  it "includes a field with the site's description" do
- hash[:description].is_a?(String).should be_true
- hash[:description].empty?.should be_false
+ hash[:description].is_a?(String).should be true
+ hash[:description].empty?.should be false
  end
 
  it 'includes a field with the page URI' do
- hash[:uri].is_a?(String).should be_true
- hash[:uri].empty?.should be_false
+ hash[:uri].is_a?(String).should be true
+ hash[:uri].empty?.should be false
  hash[:uri].should eq(misc[:http])
  end
 
+ it "includes a field with the site's keywords" do
+ hash[:keywords].is_a?(String).should be true
+ end
+
+ it "includes a field with the site's language" do
+ hash[:lang].is_a?(String).should be true
+ end
+
+ it "includes a field with the site's encode" do
+ hash[:encode].is_a?(String).should be true
+ end
+
+ it "includes a field with the site's reply email address" do
+ hash[:reply_to].is_a?(String).should be true
+ end
+
+ it "includes a field with the site's author name or email" do
+ hash[:author].is_a?(String).should be true
+ end
+
  it "includes a field with image URIs from the site's head/body" do
  unless hash[:images].empty?
- hash[:images].is_a?(Array).should be_true
+ hash[:images].is_a?(Array).should be true
  hash[:images].sample.should match(regexes[:image][:all])
  end
  end
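Taken together, the new expectations describe a noticeably richer result hash; a hedged illustration of its shape (the URL and all values are invented for the example):

``` ruby
"Look at http://example.com".scrapify
# => { uri:         "http://example.com",
#      title:       "Example Domain",
#      description: "An illustrative description",
#      keywords:    "example, domain",
#      lang:        "en",
#      encode:      "utf-8",
#      reply_to:    "-",   # "-" whenever the page lacks the matching tag
#      author:      "-",
#      images:      ["http://example.com/logo.png"] }
```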
@@ -77,8 +97,8 @@ describe String do
  it "can choose the URI in the String to be scrapified" do
  hash = "Check out these awesome sites: #{misc[:http]} and #{misc[:www]}".scrapify(which: 1, images: :png)
  [:title, :description, :uri].each do |key|
- hash[key].is_a?(String).should be_true
- hash[key].empty?.should be_false
+ hash[key].is_a?(String).should be true
+ hash[key].empty?.should be false
  end
  hash[:uri].should eq("http://#{misc[:www]}")
  hash[:images].sample.should match(regexes[:image][:png])
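This spec also documents the two main options: `which` picks which URI in the string gets scraped (zero-based index) and `images` filters image URIs by extension. A short usage sketch with placeholder URLs:

``` ruby
text = "Two sites: http://example.com and www.example.org"

# Scrape the second URI in the string and keep only PNG images.
text.scrapify(which: 1, images: :png)
# => { uri: "http://www.example.org", title: "...",
#      images: ["http://www.example.org/logo.png"], ... }
```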
@@ -175,7 +195,7 @@ describe String do
  end
 
  it 'always returns an Array' do
- checked.each { |c| c[1].is_a?(Array).should be_true }
+ checked.each { |c| c[1].is_a?(Array).should be true }
  end
  end
 
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: scrapifier
  version: !ruby/object:Gem::Version
- version: 0.0.4
+ version: 0.0.5
  platform: ruby
  authors:
  - Tiago Guedes
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-06-06 00:00:00.000000000 Z
+ date: 2014-06-28 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
@@ -84,6 +84,7 @@ files:
  - lib/scrapifier/methods.rb
  - lib/scrapifier/support.rb
  - lib/scrapifier/version.rb
+ - lib/scrapifier/xpath.rb
  - scrapifier.gemspec
  - spec/factories/samples.rb
  - spec/scrapifier_spec.rb