web_scraper 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/web_scraper.rb +129 -5
  3. metadata +6 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a3e01f01d20813809f915bc0b97280f3d35c2153
4
- data.tar.gz: e73cf357e0151f499414ba21ce178a8724271e9c
3
+ metadata.gz: 33e7eb69bd6cea28712f7f778792cb45539ad6d4
4
+ data.tar.gz: 46874fa0d02cf37e67bbf22ce28d457c51126535
5
5
  SHA512:
6
- metadata.gz: d38b93e448b86791f93918226d9dbadd6304667b4b7a2c1ef794802eb47cbc21d98331d2af86ccebb0d5011c36d0578bb861a7e8663c91e832c77a2356d0ba8d
7
- data.tar.gz: 4c5213c3d410bb89ad093da6fc9f578c637e6bade5dbf94b5d4e93a891c1b94ef9ce6dbb9d35197bd0b728bb11e20076d3a21837e18120e6660a60d157cc150b
6
+ metadata.gz: bee62dc14b4540bbeb5f975a0a88eea729c1f4cb815954b61be533d04c4a94046ce54bf0bdba94d728e2281b26e6ff01683d627be5a9f63e3b5cbf5351dbf617
7
+ data.tar.gz: db2358697c31df901f36a8e6dedd0a140bc3c1af65d83dab07ccafbb3f670b70c8dc8c278ae718359e98bd1e1e6d52c7e030624deeb754742becea7047ac34c4
@@ -1,25 +1,74 @@
1
1
  require 'open-uri'
2
2
  require 'nokogiri'
3
3
 
4
+ ##
5
+ # WebScraper allows you to describe html structure declaratively,
6
+ # get appropriate blocks, and work with them as with ruby objects.
7
+ # @example
8
+ # class Article < WebScraper
9
+ # resource 'http://hbswk.hbs.edu/topics/it.html'
10
+ #
11
+ # base css: '.tile-medium'
12
+ #
13
+ # property :title, xpath: './/h4/a/text()'
14
+ # property :date, xpath: './/li[1]/text()'
15
+ # property :category, xpath: './/li[2]/a/text()'
16
+ # property :description, xpath: './/p/text()'
17
+ #
18
+ # key :title
19
+ # end
20
+ #
21
+ # puts "#{Article.count} articles were found"
22
+ # puts
23
+ #
24
+ # articles = Article.all
25
+ #
26
+ # articles.each do |article|
27
+ # header = article.title
28
+ # puts header
29
+ # puts '=' * header.length
30
+ # puts
31
+ #
32
+ # subheader = "#{article.date} #{article.category}"
33
+ # puts subheader
34
+ # puts '-' * subheader.length
35
+ # puts
36
+ #
37
+ # puts article.description
38
+ # puts
39
+ # end
40
+ #
41
+ # article = Article.find('Tech Investment the Wise Way')
42
+ #
43
+ # puts article.description
4
44
  class WebScraper
45
+ ##
46
+ # The error is raised when a user tries to call a class method
47
+ # when not all required attributes were defined.
5
48
  class ConfigurationError < RuntimeError
6
49
  def message
7
50
  'resource, base, properties and key should be defined'
8
51
  end
9
52
  end
10
53
 
54
+ ##
55
+ # The error is raised when a user tries to define resource improperly.
11
56
  class ResourceDefentitionError < RuntimeError
12
57
  def message
13
58
  'resource should be a string'
14
59
  end
15
60
  end
16
61
 
62
+ ##
63
+ # The error is raised when a user tries to define base improperly.
17
64
  class BaseDefentitionError < RuntimeError
18
65
  def message
19
66
  'base should be a selector (:css|:xpath => String)'
20
67
  end
21
68
  end
22
69
 
70
+ ##
71
+ # The error is raised when a user tries to define property improperly.
23
72
  class PropertyDefentitionError < RuntimeError
24
73
  def message
25
74
  'property is a name (with type optionally) ' +
@@ -27,6 +76,8 @@ class WebScraper
27
76
  end
28
77
  end
29
78
 
79
+ ##
80
+ # The error is raised when a user tries to define key improperly.
30
81
  class KeyDefentitionError < RuntimeError
31
82
  def message
32
83
  'key should be a name of a defined property'
@@ -34,6 +85,12 @@ class WebScraper
34
85
  end
35
86
 
36
87
  class << self
88
+ ##
89
+ # Loads html page, detects appropriate blocks,
90
+ # wraps them in objects.
91
+ # The result will be cached.
92
+ # @example
93
+ # articles = Article.all
37
94
  def all
38
95
  raise ConfigurationError unless valid?
39
96
 
@@ -41,18 +98,38 @@ class WebScraper
41
98
  .send(*_base).map { |node| new(node) }
42
99
  end
43
100
 
101
+ ##
102
+ # Returns number of objects found.
103
+ # @example
104
+ # puts "#{Article.count} articles were found"
44
105
  def count
45
106
  all.size
46
107
  end
47
108
 
48
- def expire
109
+ ##
110
+ # Resets cache of the html data.
111
+ # @example
112
+ # Article.reset
113
+ def reset
49
114
  @all = nil
50
115
  end
51
116
 
117
+ ##
118
+ # Finds first object with required key.
119
+ # @example
120
+ # article = Article.find('Tech Investment the Wise Way')
52
121
  def find(key)
53
122
  all.find { |e| e.send(_key) == key }
54
123
  end
55
124
 
125
+ ##
126
+ # Defines resource -- url of the html page.
127
+ # @example
128
+ # class Article < WebScraper
129
+ # ...
130
+ # resource 'http://hbswk.hbs.edu/topics/it.html'
131
+ # ...
132
+ # end
56
133
  def resource(_resource)
57
134
  raise ResourceDefentitionError unless _resource.is_a? String
58
135
 
@@ -61,6 +138,15 @@ class WebScraper
61
138
 
62
139
  attr_reader :_resource
63
140
 
141
+ ##
142
+ # Defines base -- selector which determines blocks of content.
143
+ # You can use css or xpath selectors.
144
+ # @example
145
+ # class Article < WebScraper
146
+ # ...
147
+ # base css: '.tile-medium'
148
+ # ...
149
+ # end
64
150
  def base(_base)
65
151
  raise BaseDefentitionError unless valid_selector? _base
66
152
 
@@ -69,6 +155,19 @@ class WebScraper
69
155
 
70
156
  attr_reader :_base
71
157
 
158
+ ##
159
+ # Defines property -- name (and type optionally) and selector.
160
+ # You can use css or xpath selectors.
161
+ # Types determine returning values.
162
+ # Available types (default is string): string, integer, float, node.
163
+ # The node option means nokogiri node.
164
+ # @example
165
+ # class Article < WebScraper
166
+ # ...
167
+ # property :title, xpath: './/h4/a/text()'
168
+ # property views: :integer, xpath: './/h4/span/text()'
169
+ # ...
170
+ # end
72
171
  def property(*args)
73
172
  @properties ||= {}
74
173
 
@@ -101,6 +200,14 @@ class WebScraper
101
200
 
102
201
  attr_reader :properties
103
202
 
203
+ ##
204
+ # Defines key -- property which will be used in find method.
205
+ # @example
206
+ # class Article < WebScraper
207
+ # ...
208
+ # key :title
209
+ # ...
210
+ # end
104
211
  def key(_key)
105
212
  raise KeyDefentitionError unless properties.keys.include? _key
106
213
 
@@ -109,10 +216,14 @@ class WebScraper
109
216
 
110
217
  attr_reader :_key
111
218
 
219
+ ##
220
+ # Checks if all attributes were set.
112
221
  def valid?
113
222
  _resource && _base && _key
114
223
  end
115
224
 
225
+ ##
226
+ # Checks if selector was defined correctly.
116
227
  def valid_selector?(selector)
117
228
  (selector.is_a? Hash) &&
118
229
  (selector.size == 1) &&
@@ -120,6 +231,8 @@ class WebScraper
120
231
  (selector.values.first.is_a? String)
121
232
  end
122
233
 
234
+ ##
235
+ # Checks if property information (i.e. name and type) was defined correctly.
123
236
  def valid_info?(info)
124
237
  (info.is_a? Hash) &&
125
238
  (info.size == 1) &&
@@ -130,20 +243,31 @@ class WebScraper
130
243
  private :new
131
244
  end
132
245
 
246
+ ##
247
+ # Sets nokogiri node. It's a private method.
133
248
  def initialize(node)
134
249
  @node = node
135
250
  end
136
251
 
137
- attr_reader :node
138
-
252
+ ##
253
+ # Allows you to use nokogiri css method directly on your object.
254
+ # It proxies it to nokogiri node.
139
255
  def css(*args)
140
- node.css(*args)
256
+ @node.css(*args)
141
257
  end
142
258
 
259
+ ##
260
+ # Allows you to use nokogiri xpath method directly on your object.
261
+ # It proxies it to nokogiri node.
143
262
  def xpath(*args)
144
- node.xpath(*args)
263
+ @node.xpath(*args)
145
264
  end
146
265
 
266
+ ##
267
+ # Returns appropriate value for property if found.
268
+ # Converts it to the defined type.
269
+ # @example
270
+ # puts article.description
147
271
  def method_missing(name, *args, &block)
148
272
  if self.class.properties.key? name
149
273
  property = self.class.properties[name]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Speransky Danil
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-03-16 00:00:00.000000000 Z
11
+ date: 2014-03-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,7 +24,8 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
- description: ''
27
+ description: Web Scraper is a library to build APIs by scraping static sites and use
28
+ data as models.
28
29
  email: speranskydanil@gmail.com
29
30
  executables: []
30
31
  extensions: []
@@ -54,6 +55,7 @@ rubyforge_project:
54
55
  rubygems_version: 2.2.2
55
56
  signing_key:
56
57
  specification_version: 4
57
- summary: ''
58
+ summary: Web Scraper is a library to build APIs by scraping static sites and use data
59
+ as models.
58
60
  test_files: []
59
61
  has_rdoc: