web_scraper 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/web_scraper.rb +165 -0
- metadata +59 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: a3e01f01d20813809f915bc0b97280f3d35c2153
|
4
|
+
data.tar.gz: e73cf357e0151f499414ba21ce178a8724271e9c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d38b93e448b86791f93918226d9dbadd6304667b4b7a2c1ef794802eb47cbc21d98331d2af86ccebb0d5011c36d0578bb861a7e8663c91e832c77a2356d0ba8d
|
7
|
+
data.tar.gz: 4c5213c3d410bb89ad093da6fc9f578c637e6bade5dbf94b5d4e93a891c1b94ef9ce6dbb9d35197bd0b728bb11e20076d3a21837e18120e6660a60d157cc150b
|
data/lib/web_scraper.rb
ADDED
@@ -0,0 +1,165 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
# Declarative HTML scraper built on Nokogiri.
#
# A subclass describes what to scrape with four class-level macros:
#
#   class Article < WebScraper
#     resource 'http://example.com/articles'   # what to open
#     base css: '.article'                     # one node per record
#     property :title, css: 'h2'               # name + selector (type :string)
#     property views: :integer, css: '.views'  # name => type plus selector
#     key :title                               # property used by .find
#   end
#
# Each record is an instance wrapping one matched node; declared properties
# are readable as instance methods (resolved through +method_missing+).
class WebScraper
  # Selector kinds accepted by +base+ and +property+.
  SELECTOR_KINDS = [:css, :xpath].freeze

  # Value types accepted by +property+.
  PROPERTY_TYPES = [:string, :integer, :float, :node].freeze

  # Raised by +all+ when resource, base or key is not configured.
  class ConfigurationError < RuntimeError
    def message
      'resource, base, properties and key should be defined'
    end
  end

  # Raised by +resource+ when the argument is not a String.
  class ResourceDefentitionError < RuntimeError
    def message
      'resource should be a string'
    end
  end

  # Raised by +base+ when the argument is not a valid selector.
  class BaseDefentitionError < RuntimeError
    def message
      'base should be a selector (:css|:xpath => String)'
    end
  end

  # Raised by +property+ when its arguments are malformed.
  class PropertyDefentitionError < RuntimeError
    def message
      'property is a name (with type optionally) ' +
        'and a selector (:css|:xpath => String)'
    end
  end

  # Raised by +key+ when the name is not a declared property.
  class KeyDefentitionError < RuntimeError
    def message
      'key should be a name of a defined property'
    end
  end

  # Correctly spelled aliases; the original (misspelled) names stay available
  # so existing +rescue+ clauses keep working.
  ResourceDefinitionError = ResourceDefentitionError
  BaseDefinitionError     = BaseDefentitionError
  PropertyDefinitionError = PropertyDefentitionError
  KeyDefinitionError      = KeyDefentitionError

  class << self
    # Opens the resource, parses it as HTML and wraps every node matched by
    # the base selector in an instance. The result is memoized until +expire+.
    #
    # @return [Array<WebScraper>]
    # @raise [ConfigurationError] unless resource, base and key are all set
    def all
      raise ConfigurationError unless valid?

      @all ||= begin
        # URI.open instead of Kernel#open: on Ruby 3.0+ Kernel#open no longer
        # handles URLs. The block form closes the handle after parsing.
        document = URI.open(_resource) { |io| Nokogiri::HTML(io) }
        document.public_send(*_base).map { |node| new(node) }
      end
    end

    # @return [Integer] number of records returned by +all+
    def count
      all.size
    end

    # Drops the memoized records so the next +all+ opens the resource again.
    def expire
      @all = nil
    end

    # Returns the first record whose key property equals +key+, or nil.
    def find(key)
      # public_send so a property named like a private Kernel method
      # (e.g. :print) still resolves through method_missing.
      all.find { |record| record.public_send(_key) == key }
    end

    # Sets the location passed to URI.open by +all+.
    #
    # @raise [ResourceDefentitionError] unless +_resource+ is a String
    def resource(_resource)
      raise ResourceDefentitionError unless _resource.is_a? String

      @_resource = _resource
    end

    attr_reader :_resource

    # Sets the selector that yields one node per record.
    #
    # @param _base [Hash] exactly one pair, :css or :xpath => String
    # @raise [BaseDefentitionError] when the selector is malformed
    def base(_base)
      raise BaseDefentitionError unless valid_selector? _base

      # Stored as [kind, expression] so it can be splatted into public_send.
      @_base = _base.to_a.flatten
    end

    attr_reader :_base

    # Declares a property. Two call shapes are accepted:
    #
    #   property :name, css: '...'           # type defaults to :string
    #   property name: :integer, css: '...'  # explicit type
    #
    # @raise [PropertyDefentitionError] on any other shape, an invalid
    #   selector, a non-Symbol name or an unknown type
    def property(*args)
      case args.length
      when 1
        params = args.first
        raise PropertyDefentitionError unless params.is_a? Hash

        # Split the single hash into the name/type pair and the selector pair.
        info     = params.reject { |name, _| SELECTOR_KINDS.include? name }
        selector = params.select { |name, _| SELECTOR_KINDS.include? name }
      when 2
        name, selector = args
        info = { name => :string }
      else
        raise PropertyDefentitionError
      end

      raise PropertyDefentitionError unless valid_selector? selector
      raise PropertyDefentitionError unless valid_info? info

      name, type = info.first
      properties[name] = { type: type, selector: selector.to_a.flatten }
    end

    # Declared properties, keyed by name. Defaults to an empty hash so that
    # +key+ and instance-level lookups work before any +property+ call.
    #
    # @return [Hash{Symbol => Hash}]
    def properties
      @properties ||= {}
    end

    # Chooses the property compared by +find+.
    #
    # @raise [KeyDefentitionError] unless +_key+ names a declared property
    def key(_key)
      raise KeyDefentitionError unless properties.key? _key

      @_key = _key
    end

    attr_reader :_key

    # Truthy when resource, base and key have all been configured.
    def valid?
      _resource && _base && _key
    end

    # True when +selector+ is a one-pair Hash of :css or :xpath => String.
    def valid_selector?(selector)
      selector.is_a?(Hash) &&
        selector.size == 1 &&
        SELECTOR_KINDS.include?(selector.keys.first) &&
        selector.values.first.is_a?(String)
    end

    # True when +info+ is a one-pair Hash of Symbol name => known type.
    def valid_info?(info)
      info.is_a?(Hash) &&
        info.size == 1 &&
        info.keys.first.is_a?(Symbol) &&
        PROPERTY_TYPES.include?(info.values.first)
    end

    # Instances are only built by +all+.
    private :new
  end

  # @param node the node this record reads its properties from
  def initialize(node)
    @node = node
  end

  attr_reader :node

  # Delegates a CSS query to the wrapped node.
  def css(*args)
    node.css(*args)
  end

  # Delegates an XPath query to the wrapped node.
  def xpath(*args)
    node.xpath(*args)
  end

  # Resolves declared properties: runs the property's selector against the
  # wrapped node and converts the result according to the declared type.
  # Any other name falls through to the default NoMethodError.
  def method_missing(name, *args, &block)
    property = self.class.properties[name]
    return super unless property

    value = @node.public_send(*property[:selector])

    case property[:type]
    when :string  then value.text.strip
    when :integer then value.text.to_i
    when :float   then value.text.to_f
    when :node    then value
    end
  end

  # Keeps +respond_to?+ consistent with +method_missing+.
  def respond_to_missing?(name, include_private = false)
    self.class.properties.key?(name) || super
  end
end
|
165
|
+
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: web_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Speransky Danil
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-03-16 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
description: ''
|
28
|
+
email: speranskydanil@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- lib/web_scraper.rb
|
34
|
+
homepage: http://speranskydanil.github.io/web_scraper/
|
35
|
+
licenses:
|
36
|
+
- MIT
|
37
|
+
metadata: {}
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options: []
|
40
|
+
require_paths:
|
41
|
+
- lib
|
42
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
48
|
+
requirements:
|
49
|
+
- - ">="
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '0'
|
52
|
+
requirements: []
|
53
|
+
rubyforge_project:
|
54
|
+
rubygems_version: 2.2.2
|
55
|
+
signing_key:
|
56
|
+
specification_version: 4
|
57
|
+
summary: ''
|
58
|
+
test_files: []
|
59
|
+
has_rdoc:
|