web_scraper 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/web_scraper.rb +129 -5
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 33e7eb69bd6cea28712f7f778792cb45539ad6d4
|
4
|
+
data.tar.gz: 46874fa0d02cf37e67bbf22ce28d457c51126535
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bee62dc14b4540bbeb5f975a0a88eea729c1f4cb815954b61be533d04c4a94046ce54bf0bdba94d728e2281b26e6ff01683d627be5a9f63e3b5cbf5351dbf617
|
7
|
+
data.tar.gz: db2358697c31df901f36a8e6dedd0a140bc3c1af65d83dab07ccafbb3f670b70c8dc8c278ae718359e98bd1e1e6d52c7e030624deeb754742becea7047ac34c4
|
data/lib/web_scraper.rb
CHANGED
@@ -1,25 +1,74 @@
|
|
1
1
|
require 'open-uri'
|
2
2
|
require 'nokogiri'
|
3
3
|
|
4
|
+
##
|
5
|
+
# WebScraper allows you to describe html structure declaratively,
|
6
|
+
# get appropriate blocks, and work with them as with ruby objects.
|
7
|
+
# @example
|
8
|
+
# class Article < WebScraper
|
9
|
+
# resource 'http://hbswk.hbs.edu/topics/it.html'
|
10
|
+
#
|
11
|
+
# base css: '.tile-medium'
|
12
|
+
#
|
13
|
+
# property :title, xpath: './/h4/a/text()'
|
14
|
+
# property :date, xpath: './/li[1]/text()'
|
15
|
+
# property :category, xpath: './/li[2]/a/text()'
|
16
|
+
# property :description, xpath: './/p/text()'
|
17
|
+
#
|
18
|
+
# key :title
|
19
|
+
# end
|
20
|
+
#
|
21
|
+
# puts "#{Article.count} articles were found"
|
22
|
+
# puts
|
23
|
+
#
|
24
|
+
# articles = Article.all
|
25
|
+
#
|
26
|
+
# articles.each do |article|
|
27
|
+
# header = article.title
|
28
|
+
# puts header
|
29
|
+
# puts '=' * header.length
|
30
|
+
# puts
|
31
|
+
#
|
32
|
+
# subheader = "#{article.date} #{article.category}"
|
33
|
+
# puts subheader
|
34
|
+
# puts '-' * subheader.length
|
35
|
+
# puts
|
36
|
+
#
|
37
|
+
# puts article.description
|
38
|
+
# puts
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
# article = Article.find('Tech Investment the Wise Way')
|
42
|
+
#
|
43
|
+
# puts article.description
|
4
44
|
class WebScraper
|
45
|
+
##
|
46
|
+
# This error is raised when a user tries to call a class method
|
47
|
+
# when not all required attributes were defined.
|
5
48
|
class ConfigurationError < RuntimeError
|
6
49
|
def message
|
7
50
|
'resource, base, properties and key should be defined'
|
8
51
|
end
|
9
52
|
end
|
10
53
|
|
54
|
+
##
|
55
|
+
# This error is raised when a user tries to define the resource improperly.
|
11
56
|
class ResourceDefentitionError < RuntimeError
|
12
57
|
def message
|
13
58
|
'resource should be a string'
|
14
59
|
end
|
15
60
|
end
|
16
61
|
|
62
|
+
##
|
63
|
+
# This error is raised when a user tries to define the base improperly.
|
17
64
|
class BaseDefentitionError < RuntimeError
|
18
65
|
def message
|
19
66
|
'base should be a selector (:css|:xpath => String)'
|
20
67
|
end
|
21
68
|
end
|
22
69
|
|
70
|
+
##
|
71
|
+
# This error is raised when a user tries to define a property improperly.
|
23
72
|
class PropertyDefentitionError < RuntimeError
|
24
73
|
def message
|
25
74
|
'property is a name (with type optionally) ' +
|
@@ -27,6 +76,8 @@ class WebScraper
|
|
27
76
|
end
|
28
77
|
end
|
29
78
|
|
79
|
+
##
|
80
|
+
# This error is raised when a user tries to define the key improperly.
|
30
81
|
class KeyDefentitionError < RuntimeError
|
31
82
|
def message
|
32
83
|
'key should be a name of a defined property'
|
@@ -34,6 +85,12 @@ class WebScraper
|
|
34
85
|
end
|
35
86
|
|
36
87
|
class << self
|
88
|
+
##
|
89
|
+
# Loads html page, detects appropriate blocks,
|
90
|
+
# wraps them in objects.
|
91
|
+
# The result will be cached.
|
92
|
+
# @example
|
93
|
+
# articles = Article.all
|
37
94
|
def all
|
38
95
|
raise ConfigurationError unless valid?
|
39
96
|
|
@@ -41,18 +98,38 @@ class WebScraper
|
|
41
98
|
.send(*_base).map { |node| new(node) }
|
42
99
|
end
|
43
100
|
|
101
|
+
##
|
102
|
+
# Returns number of objects found.
|
103
|
+
# @example
|
104
|
+
# puts "#{Article.count} articles were found"
|
44
105
|
def count
|
45
106
|
all.size
|
46
107
|
end
|
47
108
|
|
48
|
-
|
109
|
+
##
|
110
|
+
# Resets cache of the html data.
|
111
|
+
# @example
|
112
|
+
# Article.reset
|
113
|
+
def reset
|
49
114
|
@all = nil
|
50
115
|
end
|
51
116
|
|
117
|
+
##
|
118
|
+
# Finds first object with required key.
|
119
|
+
# @example
|
120
|
+
# article = Article.find('Tech Investment the Wise Way')
|
52
121
|
def find(key)
|
53
122
|
all.find { |e| e.send(_key) == key }
|
54
123
|
end
|
55
124
|
|
125
|
+
##
|
126
|
+
# Defines resource -- url of the html page.
|
127
|
+
# @example
|
128
|
+
# class Article < WebScraper
|
129
|
+
# ...
|
130
|
+
# resource 'http://hbswk.hbs.edu/topics/it.html'
|
131
|
+
# ...
|
132
|
+
# end
|
56
133
|
def resource(_resource)
|
57
134
|
raise ResourceDefentitionError unless _resource.is_a? String
|
58
135
|
|
@@ -61,6 +138,15 @@ class WebScraper
|
|
61
138
|
|
62
139
|
attr_reader :_resource
|
63
140
|
|
141
|
+
##
|
142
|
+
# Defines base -- selector which determines blocks of content.
|
143
|
+
# You can use css or xpath selectors.
|
144
|
+
# @example
|
145
|
+
# class Article < WebScraper
|
146
|
+
# ...
|
147
|
+
# base css: '.tile-medium'
|
148
|
+
# ...
|
149
|
+
# end
|
64
150
|
def base(_base)
|
65
151
|
raise BaseDefentitionError unless valid_selector? _base
|
66
152
|
|
@@ -69,6 +155,19 @@ class WebScraper
|
|
69
155
|
|
70
156
|
attr_reader :_base
|
71
157
|
|
158
|
+
##
|
159
|
+
# Defines property -- name (and type optionally) and selector.
|
160
|
+
# You can use css or xpath selectors.
|
161
|
+
# Types determine returning values.
|
162
|
+
# Available types (default is string): string, integer, float, node.
|
163
|
+
# The node option means nokogiri node.
|
164
|
+
# @example
|
165
|
+
# class Article < WebScraper
|
166
|
+
# ...
|
167
|
+
# property :title, xpath: './/h4/a/text()'
|
168
|
+
# property views: :integer, xpath: './/h4/span/text()'
|
169
|
+
# ...
|
170
|
+
# end
|
72
171
|
def property(*args)
|
73
172
|
@properties ||= {}
|
74
173
|
|
@@ -101,6 +200,14 @@ class WebScraper
|
|
101
200
|
|
102
201
|
attr_reader :properties
|
103
202
|
|
203
|
+
##
|
204
|
+
# Defines key -- property which will be used in find method.
|
205
|
+
# @example
|
206
|
+
# class Article < WebScraper
|
207
|
+
# ...
|
208
|
+
# key :title
|
209
|
+
# ...
|
210
|
+
# end
|
104
211
|
def key(_key)
|
105
212
|
raise KeyDefentitionError unless properties.keys.include? _key
|
106
213
|
|
@@ -109,10 +216,14 @@ class WebScraper
|
|
109
216
|
|
110
217
|
attr_reader :_key
|
111
218
|
|
219
|
+
##
|
220
|
+
# Checks if all attributes were set.
|
112
221
|
def valid?
|
113
222
|
_resource && _base && _key
|
114
223
|
end
|
115
224
|
|
225
|
+
##
|
226
|
+
# Checks if selector was defined correctly.
|
116
227
|
def valid_selector?(selector)
|
117
228
|
(selector.is_a? Hash) &&
|
118
229
|
(selector.size == 1) &&
|
@@ -120,6 +231,8 @@ class WebScraper
|
|
120
231
|
(selector.values.first.is_a? String)
|
121
232
|
end
|
122
233
|
|
234
|
+
##
|
235
|
+
# Checks if property information (i.e. name and type) was defined correctly.
|
123
236
|
def valid_info?(info)
|
124
237
|
(info.is_a? Hash) &&
|
125
238
|
(info.size == 1) &&
|
@@ -130,20 +243,31 @@ class WebScraper
|
|
130
243
|
private :new
|
131
244
|
end
|
132
245
|
|
246
|
+
##
|
247
|
+
# Sets the nokogiri node. It's a private method.
|
133
248
|
def initialize(node)
|
134
249
|
@node = node
|
135
250
|
end
|
136
251
|
|
137
|
-
|
138
|
-
|
252
|
+
##
|
253
|
+
# Allows you to use nokogiri css method directly on your object.
|
254
|
+
# It proxies it to nokogiri node.
|
139
255
|
def css(*args)
|
140
|
-
node.css(*args)
|
256
|
+
@node.css(*args)
|
141
257
|
end
|
142
258
|
|
259
|
+
##
|
260
|
+
# Allows you to use nokogiri xpath method directly on your object.
|
261
|
+
# It proxies it to nokogiri node.
|
143
262
|
def xpath(*args)
|
144
|
-
node.xpath(*args)
|
263
|
+
@node.xpath(*args)
|
145
264
|
end
|
146
265
|
|
266
|
+
##
|
267
|
+
# Returns appropriate value for property if found.
|
268
|
+
# Converts it to the defined type.
|
269
|
+
# @example
|
270
|
+
# puts article.description
|
147
271
|
def method_missing(name, *args, &block)
|
148
272
|
if self.class.properties.key? name
|
149
273
|
property = self.class.properties[name]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Speransky Danil
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-03-
|
11
|
+
date: 2014-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,7 +24,8 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
-
description:
|
27
|
+
description: Web Scraper is a library to build APIs by scraping static sites and use
|
28
|
+
data as models.
|
28
29
|
email: speranskydanil@gmail.com
|
29
30
|
executables: []
|
30
31
|
extensions: []
|
@@ -54,6 +55,7 @@ rubyforge_project:
|
|
54
55
|
rubygems_version: 2.2.2
|
55
56
|
signing_key:
|
56
57
|
specification_version: 4
|
57
|
-
summary:
|
58
|
+
summary: Web Scraper is a library to build APIs by scraping static sites and use data
|
59
|
+
as models.
|
58
60
|
test_files: []
|
59
61
|
has_rdoc:
|