scrapifier 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 67a72b01b9c64d37f15c459e11c536f29bd0abc2
-  data.tar.gz: d83f2f8c7ff3203ee2a5472e2e2099a4389af7aa
+  metadata.gz: e52f7f47695c7b16fed80a51f99a119d3f98a3b7
+  data.tar.gz: 24db438acd2fb2df421ab7b6c148e7cbed3331ee
 SHA512:
-  metadata.gz: 2a4abdb171a16b46b89bfb4c43a3262169c08c70125b3ac88e9c4f13209acd5085368b3a4b9399691b9fb9dc89fc4cacfe34b781a26df3c5646bfd4f20f44435
-  data.tar.gz: c246dd2f9770e3835f4423b497b0bb86d38e51ff9030b59aa516be89f11262db159d2938f94e43f60b91fb14c2c56015965fb8556eec963a13e3c568a4c68d9d
+  metadata.gz: 41e5f58a1760d61196ee3b4f429644025789d4432e7a1cef1a70d31473a26c12a8e38091527adacc78e7650759fc55cbdb80e365ce3f86954fcf493304f08d86
+  data.tar.gz: 248f23ec549f331a942d1847b9fc4d92935dea34993128fe5bd63fca9a4a4b5814ee531ef3c390e38502cd7f802c55a12dfeef6aa7eb434782d6f735a4d29e28
data/.travis.yml CHANGED
@@ -1,4 +1,5 @@
 language: ruby
 rvm:
   - 2.0.0
-  - 1.9.3
+  - 2.1.0
+  - 2.1.1
data/README.md CHANGED
@@ -7,6 +7,8 @@
 
 It's a Ruby gem that brings a very simple way to extract meta information from URIs using the screen scraping technique.
 
+Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
+
 ## Installation
 
 Compatible with Ruby 1.9.3+
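A minimal usage sketch of the behaviour described in the note above (the URL and the returned values are illustrative; the actual hash depends on the page being scraped):

``` ruby
require 'scrapifier'

# A URL with an explicit protocol is picked up...
"Look at this: http://example.com".scrapify
# ...and so is a bare "www" host, which gets "http://" prefixed in the result.
"Look at this: www.example.com".scrapify
# => { uri: "http://www.example.com", title: "...", description: "...", images: [...], ... }
```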
@@ -31,8 +33,6 @@ An then require the gem:
 
 The String#scrapify method finds URIs in a string and then gets their metadata, e.g., the page's title, description, images and URI. All the data is returned in a well-formatted hash.
 
-Note: This gem is mainly focused on screen scraping URLs (presence of protocol, such as: "http", "https" and "ftp"), but it also works with URIs which have the "www" without any protocol defined, like: "www.google.com".
-
 #### Default usage.
 
 ``` ruby
data/lib/scrapifier/methods.rb CHANGED
@@ -6,7 +6,7 @@ require 'scrapifier/support'
 module Scrapifier
   # Methods which will be included into the String class.
   module Methods
-    include Scrapifier::Support
+    include Support
 
     # Get metadata from an URI using the screen scraping technique.
     #
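The shortened `include Support` resolves correctly because Ruby constant lookup starts in the lexically enclosing `Scrapifier` namespace. A self-contained sketch of the same pattern (the names below are illustrative, not taken from the gem):

``` ruby
module Outer
  module Helper
    def helper_value
      42
    end
  end

  module Consumer
    # "Helper" resolves to Outer::Helper because constant lookup walks the
    # lexical scope (Consumer -> Outer) before falling back to the top level.
    include Helper
  end
end

class Thing; include Outer::Consumer; end
puts Thing.new.helper_value # => 42
```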
data/lib/scrapifier/support.rb CHANGED
@@ -1,6 +1,9 @@
+require 'scrapifier/xpath'
+
 module Scrapifier
   # Support methods to get, check and organize data.
   module Support
+    include XPath
     module_function
 
     # Evaluate the URI's HTML document and get its metadata.
@@ -25,7 +28,7 @@ module Scrapifier
       doc = Nokogiri::HTML(open(uri).read)
       doc.encoding, meta = 'utf-8', { uri: uri }
 
-      [:title, :description].each do |k|
+      [:title, :description, :keywords, :lang, :encode, :reply_to, :author].each do |k|
        node = doc.xpath(sf_xpaths[k])[0]
        meta[k] = node.nil? ? '-' : node.text
      end
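With the expanded key list, the metadata hash built in this loop gains the new fields, each falling back to '-' when no matching node is found. A rough sketch of the resulting shape with illustrative values (the image list is collected in a separate step):

``` ruby
{
  uri:         'http://example.com',
  title:       'Example Domain',
  description: 'An example page.',
  keywords:    '-',   # '-' whenever the page has no matching node
  lang:        'en',
  encode:      'utf-8',
  reply_to:    '-',
  author:      '-',
  images:      ['http://example.com/logo.png']
}
```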
@@ -111,45 +114,16 @@ module Scrapifier
       %r{(^http{1}[s]?://([w]{3}\.)?.+\.(#{exts.join('|')})(\?.+)?$)}i
     end
 
-    # Collection of xpath that are used to get nodes
-    # from the parsed HTML.
+    # Organize XPaths.
     def sf_xpaths
-      {
-        title: sf_title_xpath,
-        description: sf_desc_xpath,
-        image: sf_img_xpath
-      }
-    end
-
-    def sf_title_xpath
-      <<-END.gsub(/^\s+\|/, '')
-        |//meta[@property = "og:title"]/@content|
-        |//meta[@name = "title"]/@content|
-        |//meta[@name = "Title"]/@content|
-        |//title|//h1
-      END
-    end
-
-    def sf_desc_xpath
-      <<-END.gsub(/^\s+\|/, '')
-        |//meta[@property = "og:description"]/@content|
-        |//meta[@name = "description"]/@content|
-        |//meta[@name = "Description"]/@content|
-        |//h1|//h3|//p|//span|//font
-      END
-    end
-
-    def sf_img_xpath
-      <<-END.gsub(/^\s+\|/, '')
-        |//meta[@property = "og:image"]/@content|
-        |//link[@rel = "image_src"]/@href|
-        |//meta[@itemprop = "image"]/@content|
-        |//div[@id = "logo"]/img/@src|//a[@id = "logo"]/img/@src|
-        |//div[@class = "logo"]/img/@src|//a[@class = "logo"]/img/@src|
-        |//a//img[@width]/@src|//img[@width]/@src|
-        |//a//img[@height]/@src|//img[@height]/@src|
-        |//a//img/@src|//span//img/@src|//img/@src
-      END
+      { title: XPath::TITLE,
+        description: XPath::DESC,
+        keywords: XPath::KEYWORDS,
+        lang: XPath::LANG,
+        encode: XPath::ENCODE,
+        reply_to: XPath::REPLY_TO,
+        author: XPath::AUTHOR,
+        image: XPath::IMG }
     end
 
     # Check and return only the valid image URIs.
data/lib/scrapifier/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Scrapifier
-  VERSION = '0.0.4'
+  VERSION = '0.0.5'
 end
data/lib/scrapifier/xpath.rb ADDED
@@ -0,0 +1,66 @@
+# coding: utf-8
+module Scrapifier
+  # Collection of all XPaths which are used to find
+  # the nodes within the parsed HTML doc.
+  module XPath
+    TITLE =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property="og:title"]/@content|
+        |//meta[@name="title"]/@content|
+        |//meta[@name="Title"]/@content|
+        |//title|//h1
+      END
+
+    DESC =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property="og:description"]/@content|
+        |//meta[@name="description"]/@content|
+        |//meta[@name="Description"]/@content|
+        |//h1|//h3|//p|//span|//font
+      END
+
+    KEYWORDS =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@name="keywords"]/@content|
+        |//meta[@name="Keywords"]/@content|
+        |//meta[@property="og:type"]/@content
+      END
+
+    LANG =
+      <<-END.gsub(/^\s+\|/, '')
+        |//html/@lang|
+        |//meta[@property="og:locale"]/@content|
+        |//meta[@http-equiv="content-language"]/@content
+      END
+
+    ENCODE =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta/@charset|
+        |//meta[@http-equiv="content-type"]/@content
+      END
+
+    REPLY_TO =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@name="reply_to"]/@content
+      END
+
+    AUTHOR =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@name="author"]/@content|
+        |//meta[@name="Author"]/@content|
+        |//meta[@name="reply_to"]/@content
+      END
+
+    IMG =
+      <<-END.gsub(/^\s+\|/, '')
+        |//meta[@property="og:image"]/@content|
+        |//link[@rel="image_src"]/@href|
+        |//meta[@itemprop="image"]/@content|
+        |//div[@id="logo"]/img/@src|//a[@id="logo"]/img/@src|
+        |//div[@class="logo"]/img/@src|//a[@class="logo"]/img/@src|
+        |//a//img[@width]/@src|//img[@width]/@src|
+        |//a//img[@height]/@src|//img[@height]/@src|
+        |//a//img/@src|//span//img/@src|//img/@src
+      END
+  end
+end
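The heredoc-plus-gsub trick used throughout this new module strips the indentation and the leading "|" from each line, while the trailing "|" chains the lines into a single XPath union expression. A standalone sketch of the same technique (the sample markup is illustrative):

``` ruby
require 'nokogiri'

# Leading whitespace and the first "|" of every line are removed; the
# trailing "|" joins the alternatives into one XPath union.
TITLE_XPATH =
  <<-END.gsub(/^\s+\|/, '')
    |//meta[@property="og:title"]/@content|
    |//title|//h1
  END

doc  = Nokogiri::HTML('<html><head><title>Example</title></head><body><h1>Hi</h1></body></html>')
node = doc.xpath(TITLE_XPATH)[0]
puts node.nil? ? '-' : node.text # => "Example"
```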
data/spec/factories/samples.rb CHANGED
@@ -1,3 +1,4 @@
+# coding: utf-8
 module Factories
   private
 
data/spec/scrapifier_spec.rb CHANGED
@@ -46,24 +46,44 @@ describe String do
     subject(:hash) { "Look this awesome site #{misc[:http]}".scrapify }
 
     it "includes a field with the site's title" do
-      hash[:title].is_a?(String).should be_true
-      hash[:title].empty?.should be_false
+      hash[:title].is_a?(String).should be true
+      hash[:title].empty?.should be false
     end
 
     it "includes a field with the site's description" do
-      hash[:description].is_a?(String).should be_true
-      hash[:description].empty?.should be_false
+      hash[:description].is_a?(String).should be true
+      hash[:description].empty?.should be false
     end
 
     it 'includes a field with the page URI' do
-      hash[:uri].is_a?(String).should be_true
-      hash[:uri].empty?.should be_false
+      hash[:uri].is_a?(String).should be true
+      hash[:uri].empty?.should be false
       hash[:uri].should eq(misc[:http])
     end
 
+    it "includes a field with the site's keywords" do
+      hash[:keywords].is_a?(String).should be true
+    end
+
+    it "includes a field with the site's language" do
+      hash[:lang].is_a?(String).should be true
+    end
+
+    it "includes a field with the site's encode" do
+      hash[:encode].is_a?(String).should be true
+    end
+
+    it "includes a field with the site's reply email address" do
+      hash[:reply_to].is_a?(String).should be true
+    end
+
+    it "includes a field with the site's author name or email" do
+      hash[:author].is_a?(String).should be true
+    end
+
     it "includes a field with image URIs from the site's head/body" do
       unless hash[:images].empty?
-        hash[:images].is_a?(Array).should be_true
+        hash[:images].is_a?(Array).should be true
         hash[:images].sample.should match(regexes[:image][:all])
       end
     end
@@ -77,8 +97,8 @@ describe String do
     it "can choose the URI in the String to be scrapified" do
       hash = "Check out these awesome sites: #{misc[:http]} and #{misc[:www]}".scrapify(which: 1, images: :png)
       [:title, :description, :uri].each do |key|
-        hash[key].is_a?(String).should be_true
-        hash[key].empty?.should be_false
+        hash[key].is_a?(String).should be true
+        hash[key].empty?.should be false
       end
       hash[:uri].should eq("http://#{misc[:www]}")
       hash[:images].sample.should match(regexes[:image][:png])
@@ -175,7 +195,7 @@ describe String do
     end
 
     it 'always returns an Array' do
-      checked.each { |c| c[1].is_a?(Array).should be_true }
+      checked.each { |c| c[1].is_a?(Array).should be true }
     end
   end
 
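For context on the be_true → be true swap running through these specs: RSpec 3 removed the old be_true/be_false matchers, and `be true` passes only for the exact object `true`, whereas `be_truthy` accepts any non-nil, non-false value. A standalone sketch of the distinction (not part of the gem's suite, and using the expect syntax rather than the should syntax seen above):

``` ruby
require 'rspec/autorun'

RSpec.describe 'boolean matchers' do
  it 'distinguishes exact true from merely truthy values' do
    expect('text'.is_a?(String)).to be true # Object#is_a? returns exactly true/false
    expect('text').to be_truthy             # any non-nil, non-false value passes
  end
end
```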
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scrapifier
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Tiago Guedes
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-06 00:00:00.000000000 Z
+date: 2014-06-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -84,6 +84,7 @@ files:
 - lib/scrapifier/methods.rb
 - lib/scrapifier/support.rb
 - lib/scrapifier/version.rb
+- lib/scrapifier/xpath.rb
 - scrapifier.gemspec
 - spec/factories/samples.rb
 - spec/scrapifier_spec.rb