web_scraper 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/web_scraper.rb +129 -5
  3. metadata +6 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a3e01f01d20813809f915bc0b97280f3d35c2153
4
- data.tar.gz: e73cf357e0151f499414ba21ce178a8724271e9c
3
+ metadata.gz: 33e7eb69bd6cea28712f7f778792cb45539ad6d4
4
+ data.tar.gz: 46874fa0d02cf37e67bbf22ce28d457c51126535
5
5
  SHA512:
6
- metadata.gz: d38b93e448b86791f93918226d9dbadd6304667b4b7a2c1ef794802eb47cbc21d98331d2af86ccebb0d5011c36d0578bb861a7e8663c91e832c77a2356d0ba8d
7
- data.tar.gz: 4c5213c3d410bb89ad093da6fc9f578c637e6bade5dbf94b5d4e93a891c1b94ef9ce6dbb9d35197bd0b728bb11e20076d3a21837e18120e6660a60d157cc150b
6
+ metadata.gz: bee62dc14b4540bbeb5f975a0a88eea729c1f4cb815954b61be533d04c4a94046ce54bf0bdba94d728e2281b26e6ff01683d627be5a9f63e3b5cbf5351dbf617
7
+ data.tar.gz: db2358697c31df901f36a8e6dedd0a140bc3c1af65d83dab07ccafbb3f670b70c8dc8c278ae718359e98bd1e1e6d52c7e030624deeb754742becea7047ac34c4
@@ -1,25 +1,74 @@
1
1
  require 'open-uri'
2
2
  require 'nokogiri'
3
3
 
4
+ ##
5
+ # WebScraper allows you to describe html structure declaratively,
6
+ # get appropriate blocks, and work with them as with ruby objects.
7
+ # @example
8
+ # class Article < WebScraper
9
+ # resource 'http://hbswk.hbs.edu/topics/it.html'
10
+ #
11
+ # base css: '.tile-medium'
12
+ #
13
+ # property :title, xpath: './/h4/a/text()'
14
+ # property :date, xpath: './/li[1]/text()'
15
+ # property :category, xpath: './/li[2]/a/text()'
16
+ # property :description, xpath: './/p/text()'
17
+ #
18
+ # key :title
19
+ # end
20
+ #
21
+ # puts "#{Article.count} articles were found"
22
+ # puts
23
+ #
24
+ # articles = Article.all
25
+ #
26
+ # articles.each do |article|
27
+ # header = article.title
28
+ # puts header
29
+ # puts '=' * header.length
30
+ # puts
31
+ #
32
+ # subheader = "#{article.date} #{article.category}"
33
+ # puts subheader
34
+ # puts '-' * subheader.length
35
+ # puts
36
+ #
37
+ # puts article.description
38
+ # puts
39
+ # end
40
+ #
41
+ # article = Article.find('Tech Investment the Wise Way')
42
+ #
43
+ # puts article.description
4
44
  class WebScraper
45
+ ##
46
+ # The error is raised when a user tries to call a class method
47
+ # when not all required attributes were defined.
5
48
  class ConfigurationError < RuntimeError
6
49
  def message
7
50
  'resource, base, properties and key should be defined'
8
51
  end
9
52
  end
10
53
 
54
+ ##
55
+ # The error is raised when a user tries to define the resource improperly.
11
56
  class ResourceDefentitionError < RuntimeError
12
57
  def message
13
58
  'resource should be a string'
14
59
  end
15
60
  end
16
61
 
62
+ ##
63
+ # The error is raised when a user tries to define the base improperly.
17
64
  class BaseDefentitionError < RuntimeError
18
65
  def message
19
66
  'base should be a selector (:css|:xpath => String)'
20
67
  end
21
68
  end
22
69
 
70
+ ##
71
+ # The error is raised when a user tries to define a property improperly.
23
72
  class PropertyDefentitionError < RuntimeError
24
73
  def message
25
74
  'property is a name (with type optionally) ' +
@@ -27,6 +76,8 @@ class WebScraper
27
76
  end
28
77
  end
29
78
 
79
+ ##
80
+ # The error is raised when a user tries to define the key improperly.
30
81
  class KeyDefentitionError < RuntimeError
31
82
  def message
32
83
  'key should be a name of a defined property'
@@ -34,6 +85,12 @@ class WebScraper
34
85
  end
35
86
 
36
87
  class << self
88
+ ##
89
+ # Loads html page, detects appropriate blocks,
90
+ # wraps them in objects.
91
+ # The result will be cached.
92
+ # @example
93
+ # articles = Article.all
37
94
  def all
38
95
  raise ConfigurationError unless valid?
39
96
 
@@ -41,18 +98,38 @@ class WebScraper
41
98
  .send(*_base).map { |node| new(node) }
42
99
  end
43
100
 
101
+ ##
102
+ # Returns number of objects found.
103
+ # @example
104
+ # puts "#{Article.count} articles were found"
44
105
  def count
45
106
  all.size
46
107
  end
47
108
 
48
- def expire
109
+ ##
110
+ # Resets cache of the html data.
111
+ # @example
112
+ # Article.reset
113
+ def reset
49
114
  @all = nil
50
115
  end
51
116
 
117
+ ##
118
+ # Finds first object with required key.
119
+ # @example
120
+ # article = Article.find('Tech Investment the Wise Way')
52
121
  def find(key)
53
122
  all.find { |e| e.send(_key) == key }
54
123
  end
55
124
 
125
+ ##
126
+ # Defines resource -- url of the html page.
127
+ # @example
128
+ # class Article < WebScraper
129
+ # ...
130
+ # resource 'http://hbswk.hbs.edu/topics/it.html'
131
+ # ...
132
+ # end
56
133
  def resource(_resource)
57
134
  raise ResourceDefentitionError unless _resource.is_a? String
58
135
 
@@ -61,6 +138,15 @@ class WebScraper
61
138
 
62
139
  attr_reader :_resource
63
140
 
141
+ ##
142
+ # Defines base -- selector which determines blocks of content.
143
+ # You can use css or xpath selectors.
144
+ # @example
145
+ # class Article < WebScraper
146
+ # ...
147
+ # base css: '.tile-medium'
148
+ # ...
149
+ # end
64
150
  def base(_base)
65
151
  raise BaseDefentitionError unless valid_selector? _base
66
152
 
@@ -69,6 +155,19 @@ class WebScraper
69
155
 
70
156
  attr_reader :_base
71
157
 
158
+ ##
159
+ # Defines property -- name (and type optionally) and selector.
160
+ # You can use css or xpath selectors.
161
+ # Types determine returning values.
162
+ # Available types (default is string): string, integer, float, node.
163
+ # The node option means nokogiri node.
164
+ # @example
165
+ # class Article < WebScraper
166
+ # ...
167
+ # property :title, xpath: './/h4/a/text()'
168
+ # property views: :integer, xpath: './/h4/span/text()'
169
+ # ...
170
+ # end
72
171
  def property(*args)
73
172
  @properties ||= {}
74
173
 
@@ -101,6 +200,14 @@ class WebScraper
101
200
 
102
201
  attr_reader :properties
103
202
 
203
+ ##
204
+ # Defines key -- property which will be used in find method.
205
+ # @example
206
+ # class Article < WebScraper
207
+ # ...
208
+ # key :title
209
+ # ...
210
+ # end
104
211
  def key(_key)
105
212
  raise KeyDefentitionError unless properties.keys.include? _key
106
213
 
@@ -109,10 +216,14 @@ class WebScraper
109
216
 
110
217
  attr_reader :_key
111
218
 
219
+ ##
220
+ # Checks if all attributes were set.
112
221
  def valid?
113
222
  _resource && _base && _key
114
223
  end
115
224
 
225
+ ##
226
+ # Checks if selector was defined correctly.
116
227
  def valid_selector?(selector)
117
228
  (selector.is_a? Hash) &&
118
229
  (selector.size == 1) &&
@@ -120,6 +231,8 @@ class WebScraper
120
231
  (selector.values.first.is_a? String)
121
232
  end
122
233
 
234
+ ##
235
+ # Checks if property information (i.e. name and type) was defined correctly.
123
236
  def valid_info?(info)
124
237
  (info.is_a? Hash) &&
125
238
  (info.size == 1) &&
@@ -130,20 +243,31 @@ class WebScraper
130
243
  private :new
131
244
  end
132
245
 
246
+ ##
247
+ # Sets the nokogiri node. It's a private method.
133
248
  def initialize(node)
134
249
  @node = node
135
250
  end
136
251
 
137
- attr_reader :node
138
-
252
+ ##
253
+ # Allows you to use nokogiri css method directly on your object.
254
+ # It proxies it to nokogiri node.
139
255
  def css(*args)
140
- node.css(*args)
256
+ @node.css(*args)
141
257
  end
142
258
 
259
+ ##
260
+ # Allows you to use nokogiri xpath method directly on your object.
261
+ # It proxies it to nokogiri node.
143
262
  def xpath(*args)
144
- node.xpath(*args)
263
+ @node.xpath(*args)
145
264
  end
146
265
 
266
+ ##
267
+ # Returns appropriate value for property if found.
268
+ # Converts it to the defined type.
269
+ # @example
270
+ # puts article.description
147
271
  def method_missing(name, *args, &block)
148
272
  if self.class.properties.key? name
149
273
  property = self.class.properties[name]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Speransky Danil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-16 00:00:00.000000000 Z
11
+ date: 2014-03-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,7 +24,8 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- description: ''
27
+ description: Web Scraper is a library to build APIs by scraping static sites and use
28
+ data as models.
28
29
  email: speranskydanil@gmail.com
29
30
  executables: []
30
31
  extensions: []
@@ -54,6 +55,7 @@ rubyforge_project:
54
55
  rubygems_version: 2.2.2
55
56
  signing_key:
56
57
  specification_version: 4
57
- summary: ''
58
+ summary: Web Scraper is a library to build APIs by scraping static sites and use data
59
+ as models.
58
60
  test_files: []
59
61
  has_rdoc: