web_scraper 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/web_scraper.rb +129 -5
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 33e7eb69bd6cea28712f7f778792cb45539ad6d4
|
4
|
+
data.tar.gz: 46874fa0d02cf37e67bbf22ce28d457c51126535
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bee62dc14b4540bbeb5f975a0a88eea729c1f4cb815954b61be533d04c4a94046ce54bf0bdba94d728e2281b26e6ff01683d627be5a9f63e3b5cbf5351dbf617
|
7
|
+
data.tar.gz: db2358697c31df901f36a8e6dedd0a140bc3c1af65d83dab07ccafbb3f670b70c8dc8c278ae718359e98bd1e1e6d52c7e030624deeb754742becea7047ac34c4
|
data/lib/web_scraper.rb
CHANGED
@@ -1,25 +1,74 @@
|
|
1
1
|
require 'open-uri'
|
2
2
|
require 'nokogiri'
|
3
3
|
|
4
|
+
##
|
5
|
+
# WebScraper allows you to describe html structure declaratively,
|
6
|
+
# get appropriate blocks, and work with them as with ruby objects.
|
7
|
+
# @example
|
8
|
+
# class Article < WebScraper
|
9
|
+
# resource 'http://hbswk.hbs.edu/topics/it.html'
|
10
|
+
#
|
11
|
+
# base css: '.tile-medium'
|
12
|
+
#
|
13
|
+
# property :title, xpath: './/h4/a/text()'
|
14
|
+
# property :date, xpath: './/li[1]/text()'
|
15
|
+
# property :category, xpath: './/li[2]/a/text()'
|
16
|
+
# property :description, xpath: './/p/text()'
|
17
|
+
#
|
18
|
+
# key :title
|
19
|
+
# end
|
20
|
+
#
|
21
|
+
# puts "#{Article.count} articles were found"
|
22
|
+
# puts
|
23
|
+
#
|
24
|
+
# articles = Article.all
|
25
|
+
#
|
26
|
+
# articles.each do |article|
|
27
|
+
# header = article.title
|
28
|
+
# puts header
|
29
|
+
# puts '=' * header.length
|
30
|
+
# puts
|
31
|
+
#
|
32
|
+
# subheader = "#{article.date} #{article.category}"
|
33
|
+
# puts subheader
|
34
|
+
# puts '-' * subheader.length
|
35
|
+
# puts
|
36
|
+
#
|
37
|
+
# puts article.description
|
38
|
+
# puts
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
# article = Article.find('Tech Investment the Wise Way')
|
42
|
+
#
|
43
|
+
# puts article.description
|
4
44
|
class WebScraper
|
45
|
+
##
|
46
|
+
# This error is raised when a user tries to call a class method
|
47
|
+
# when not all required attributes were defined.
|
5
48
|
class ConfigurationError < RuntimeError
|
6
49
|
def message
|
7
50
|
'resource, base, properties and key should be defined'
|
8
51
|
end
|
9
52
|
end
|
10
53
|
|
54
|
+
##
|
55
|
+
# This error is raised when a user tries to define the resource improperly.
|
11
56
|
class ResourceDefentitionError < RuntimeError
|
12
57
|
def message
|
13
58
|
'resource should be a string'
|
14
59
|
end
|
15
60
|
end
|
16
61
|
|
62
|
+
##
|
63
|
+
# This error is raised when a user tries to define the base improperly.
|
17
64
|
class BaseDefentitionError < RuntimeError
|
18
65
|
def message
|
19
66
|
'base should be a selector (:css|:xpath => String)'
|
20
67
|
end
|
21
68
|
end
|
22
69
|
|
70
|
+
##
|
71
|
+
# This error is raised when a user tries to define a property improperly.
|
23
72
|
class PropertyDefentitionError < RuntimeError
|
24
73
|
def message
|
25
74
|
'property is a name (with type optionally) ' +
|
@@ -27,6 +76,8 @@ class WebScraper
|
|
27
76
|
end
|
28
77
|
end
|
29
78
|
|
79
|
+
##
|
80
|
+
# This error is raised when a user tries to define the key improperly.
|
30
81
|
class KeyDefentitionError < RuntimeError
|
31
82
|
def message
|
32
83
|
'key should be a name of a defined property'
|
@@ -34,6 +85,12 @@ class WebScraper
|
|
34
85
|
end
|
35
86
|
|
36
87
|
class << self
|
88
|
+
##
|
89
|
+
# Loads html page, detects appropriate blocks,
|
90
|
+
# wraps them in objects.
|
91
|
+
# The result will be cached.
|
92
|
+
# @example
|
93
|
+
# articles = Article.all
|
37
94
|
def all
|
38
95
|
raise ConfigurationError unless valid?
|
39
96
|
|
@@ -41,18 +98,38 @@ class WebScraper
|
|
41
98
|
.send(*_base).map { |node| new(node) }
|
42
99
|
end
|
43
100
|
|
101
|
+
##
|
102
|
+
# Returns number of objects found.
|
103
|
+
# @example
|
104
|
+
# puts "#{Article.count} articles were found"
|
44
105
|
def count
|
45
106
|
all.size
|
46
107
|
end
|
47
108
|
|
48
|
-
|
109
|
+
##
|
110
|
+
# Resets cache of the html data.
|
111
|
+
# @example
|
112
|
+
# Article.reset
|
113
|
+
def reset
|
49
114
|
@all = nil
|
50
115
|
end
|
51
116
|
|
117
|
+
##
|
118
|
+
# Finds first object with required key.
|
119
|
+
# @example
|
120
|
+
# article = Article.find('Tech Investment the Wise Way')
|
52
121
|
def find(key)
|
53
122
|
all.find { |e| e.send(_key) == key }
|
54
123
|
end
|
55
124
|
|
125
|
+
##
|
126
|
+
# Defines resource -- url of the html page.
|
127
|
+
# @example
|
128
|
+
# class Article < WebScraper
|
129
|
+
# ...
|
130
|
+
# resource 'http://hbswk.hbs.edu/topics/it.html'
|
131
|
+
# ...
|
132
|
+
# end
|
56
133
|
def resource(_resource)
|
57
134
|
raise ResourceDefentitionError unless _resource.is_a? String
|
58
135
|
|
@@ -61,6 +138,15 @@ class WebScraper
|
|
61
138
|
|
62
139
|
attr_reader :_resource
|
63
140
|
|
141
|
+
##
|
142
|
+
# Defines base -- selector which determines blocks of content.
|
143
|
+
# You can use css or xpath selectors.
|
144
|
+
# @example
|
145
|
+
# class Article < WebScraper
|
146
|
+
# ...
|
147
|
+
# base css: '.tile-medium'
|
148
|
+
# ...
|
149
|
+
# end
|
64
150
|
def base(_base)
|
65
151
|
raise BaseDefentitionError unless valid_selector? _base
|
66
152
|
|
@@ -69,6 +155,19 @@ class WebScraper
|
|
69
155
|
|
70
156
|
attr_reader :_base
|
71
157
|
|
158
|
+
##
|
159
|
+
# Defines property -- name (and type optionally) and selector.
|
160
|
+
# You can use css or xpath selectors.
|
161
|
+
# Types determine returning values.
|
162
|
+
# Available types (default is string): string, integer, float, node.
|
163
|
+
# The node option means nokogiri node.
|
164
|
+
# @example
|
165
|
+
# class Article < WebScraper
|
166
|
+
# ...
|
167
|
+
# property :title, xpath: './/h4/a/text()'
|
168
|
+
# property views: :integer, xpath: './/h4/span/text()'
|
169
|
+
# ...
|
170
|
+
# end
|
72
171
|
def property(*args)
|
73
172
|
@properties ||= {}
|
74
173
|
|
@@ -101,6 +200,14 @@ class WebScraper
|
|
101
200
|
|
102
201
|
attr_reader :properties
|
103
202
|
|
203
|
+
##
|
204
|
+
# Defines key -- property which will be used in find method.
|
205
|
+
# @example
|
206
|
+
# class Article < WebScraper
|
207
|
+
# ...
|
208
|
+
# key :title
|
209
|
+
# ...
|
210
|
+
# end
|
104
211
|
def key(_key)
|
105
212
|
raise KeyDefentitionError unless properties.keys.include? _key
|
106
213
|
|
@@ -109,10 +216,14 @@ class WebScraper
|
|
109
216
|
|
110
217
|
attr_reader :_key
|
111
218
|
|
219
|
+
##
|
220
|
+
# Checks if all attributes were set.
|
112
221
|
def valid?
|
113
222
|
_resource && _base && _key
|
114
223
|
end
|
115
224
|
|
225
|
+
##
|
226
|
+
# Checks if selector was defined correctly.
|
116
227
|
def valid_selector?(selector)
|
117
228
|
(selector.is_a? Hash) &&
|
118
229
|
(selector.size == 1) &&
|
@@ -120,6 +231,8 @@ class WebScraper
|
|
120
231
|
(selector.values.first.is_a? String)
|
121
232
|
end
|
122
233
|
|
234
|
+
##
|
235
|
+
# Checks if property information (i.e. name and type) was defined correctly.
|
123
236
|
def valid_info?(info)
|
124
237
|
(info.is_a? Hash) &&
|
125
238
|
(info.size == 1) &&
|
@@ -130,20 +243,31 @@ class WebScraper
|
|
130
243
|
private :new
|
131
244
|
end
|
132
245
|
|
246
|
+
##
|
247
|
+
# Sets the nokogiri node. It's a private method.
|
133
248
|
def initialize(node)
|
134
249
|
@node = node
|
135
250
|
end
|
136
251
|
|
137
|
-
|
138
|
-
|
252
|
+
##
|
253
|
+
# Allows you to use nokogiri css method directly on your object.
|
254
|
+
# It proxies it to nokogiri node.
|
139
255
|
def css(*args)
|
140
|
-
node.css(*args)
|
256
|
+
@node.css(*args)
|
141
257
|
end
|
142
258
|
|
259
|
+
##
|
260
|
+
# Allows you to use nokogiri xpath method directly on your object.
|
261
|
+
# It proxies it to nokogiri node.
|
143
262
|
def xpath(*args)
|
144
|
-
node.xpath(*args)
|
263
|
+
@node.xpath(*args)
|
145
264
|
end
|
146
265
|
|
266
|
+
##
|
267
|
+
# Returns appropriate value for property if found.
|
268
|
+
# Converts it to the defined type.
|
269
|
+
# @example
|
270
|
+
# puts article.description
|
147
271
|
def method_missing(name, *args, &block)
|
148
272
|
if self.class.properties.key? name
|
149
273
|
property = self.class.properties[name]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Speransky Danil
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-03-
|
11
|
+
date: 2014-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,7 +24,8 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
-
description:
|
27
|
+
description: Web Scraper is a library to build APIs by scraping static sites and use
|
28
|
+
data as models.
|
28
29
|
email: speranskydanil@gmail.com
|
29
30
|
executables: []
|
30
31
|
extensions: []
|
@@ -54,6 +55,7 @@ rubyforge_project:
|
|
54
55
|
rubygems_version: 2.2.2
|
55
56
|
signing_key:
|
56
57
|
specification_version: 4
|
57
|
-
summary:
|
58
|
+
summary: Web Scraper is a library to build APIs by scraping static sites and use data
|
59
|
+
as models.
|
58
60
|
test_files: []
|
59
61
|
has_rdoc:
|