html_scraper 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +13 -0
- data/README.md +116 -12
- data/html_scraper.gemspec +3 -2
- data/lib/html_scraper/scraper.rb +27 -9
- data/lib/html_scraper/version.rb +1 -1
- metadata +19 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c21eb31e2e7f7b4b0a50b2beda393ae9326e64d3
|
4
|
+
data.tar.gz: 5e1213905ec2d05376a0d9bd9ad86eae90d2f29e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ead2bc0b28d5bb9ccf14f3931cdba4077edc0fcc92390d26598a1560942df5182fafc280c75b063d97de8e64b2f148d94ee8e1e8903d2adfff521fb59cc57aa9
|
7
|
+
data.tar.gz: cc778c4a4613213f2583ec4d7ae9d30a0d300ea8ba6a37e8ac52e90acf39bcd7af5dce812e788bbbc9d736e83fab6e452c44138459d40eb50ec3e3087e44365f
|
data/.gitignore
CHANGED
data/CHANGELOG.md
ADDED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# HtmlScraper
|
2
2
|
|
3
|
-
HtmlScraper is a ruby gem that
|
3
|
+
HtmlScraper is a ruby gem that allows parsing an html document to a json structure following a template
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -20,22 +20,24 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
|
24
|
-
|
25
|
-
Expressions sourrounded by `{{ }}` will be parsed as simple json attributes:
|
23
|
+
Define an html template matching the html document that will be parsed. In the blocks where data needs to be extracted, define the json attribute surrounded by `{{ }}`, and the data for that block will be assigned to that json attribute:
|
26
24
|
|
27
25
|
```ruby
|
28
26
|
template = '
|
29
|
-
|
30
|
-
|
31
|
-
|
27
|
+
<div id="people-list">
|
28
|
+
<div class="person" hs-repeat="people">
|
29
|
+
<a href="{{ link }}">{{ surname }}</a>
|
30
|
+
<p>{{ name }}</p>
|
32
31
|
</div>
|
32
|
+
</div>
|
33
33
|
'
|
34
|
+
|
34
35
|
html = '
|
35
36
|
<html>
|
36
37
|
<body>
|
38
|
+
<div id="people-list">
|
37
39
|
<div class="person">
|
38
|
-
<
|
40
|
+
<a href="/clint-eastwood">Eastwood</a>
|
39
41
|
<p>Clint</p>
|
40
42
|
</div>
|
41
43
|
</body>
|
@@ -45,13 +47,14 @@ html = '
|
|
45
47
|
```
|
46
48
|
|
47
49
|
The json result:
|
50
|
+
|
48
51
|
```
|
49
|
-
{:surname=>"Eastwood", :name=>"Clint"}
|
52
|
+
{:surname=>"Eastwood", :name=>"Clint", :link=>"/clint-eastwood"}
|
50
53
|
```
|
51
54
|
|
52
55
|
### Iterative data
|
53
56
|
|
54
|
-
To parse iterative structures define the attribute `hs-repeat` to the html node containing the iteration:
|
57
|
+
To parse iterative structures, add the attribute `hs-repeat` to the html node containing the iteration. The value of `hs-repeat` will be the name of the json attribute containing an array of the parsed subelements:
|
55
58
|
|
56
59
|
```ruby
|
57
60
|
template = '
|
@@ -88,13 +91,114 @@ json = HtmlScraper::Scraper.new(template: template).parse(html)
|
|
88
91
|
|
89
92
|
The json result:
|
90
93
|
|
91
|
-
|
94
|
+
```ruby
|
92
95
|
{:people=>
|
93
96
|
[{:surname=>"Eastwood", :name=>"Clint"},
|
94
97
|
{:surname=>"Woods", :name=>"James"},
|
95
98
|
{:surname=>"Kinski", :name=>"Klaus"}]}
|
96
99
|
```
|
97
100
|
|
101
|
+
### Regular expressions
|
102
|
+
|
103
|
+
Regular expressions can be used next to the attribute name (surrounded by `//`) to filter the parsed string that will be assigned to the attribute. The attribute value will be the first string matching the regular expression:
|
104
|
+
|
105
|
+
```ruby
|
106
|
+
template = '<div id="people-list">
|
107
|
+
<div class="person">
|
108
|
+
<h5>{{ surname }}</h5>
|
109
|
+
<p>{{ name }}</p>
|
110
|
+
<span>{{ birthday/\d+\.\d+\.\d+/ }}</span>
|
111
|
+
</div>
|
112
|
+
</div>
|
113
|
+
'
|
114
|
+
|
115
|
+
html = '
|
116
|
+
<html>
|
117
|
+
<body>
|
118
|
+
<div id="people-list">
|
119
|
+
<div class="person">
|
120
|
+
<h5>Eastwood</h5>
|
121
|
+
<p>Clint</p>
|
122
|
+
<span>Born on 31.05.1930</span>
|
123
|
+
</div>
|
124
|
+
</body>
|
125
|
+
</html>
|
126
|
+
'
|
127
|
+
json = HtmlScraper::Scraper.new(template: template).parse(html)
|
128
|
+
```
|
129
|
+
|
130
|
+
will result in:
|
131
|
+
|
132
|
+
|
133
|
+
```
|
134
|
+
{:surname=>"Eastwood", :name=>"Clint", :birthday=>"31.05.1930"}
|
135
|
+
```
|
136
|
+
|
137
|
+
### Ruby code evaluation
|
138
|
+
|
139
|
+
For more complex attribute evaluations, ruby code can be used to manipulate the parsed expression. After the attribute name and `=` a ruby block can follow, and its result will be assigned to the corresponding json attribute. Use the symbol `$` to reference the evaluated expression within the ruby block:
|
140
|
+
|
141
|
+
```ruby
|
142
|
+
template = '
|
143
|
+
<div id="people-list">
|
144
|
+
<div class="person">
|
145
|
+
<h5>{{ surname = $.upcase }}</h5>
|
146
|
+
</div>
|
147
|
+
'
|
148
|
+
|
149
|
+
html = '
|
150
|
+
<html>
|
151
|
+
<body>
|
152
|
+
<div id="people-list">
|
153
|
+
<div class="person">
|
154
|
+
<h5>Eastwood</h5>
|
155
|
+
<p>Clint</p>
|
156
|
+
<span>Born on 31.05.1930</span>
|
157
|
+
</div>
|
158
|
+
</body>
|
159
|
+
</html>
|
160
|
+
'
|
161
|
+
json = HtmlScraper::Scraper.new(template: template).parse(html)
|
162
|
+
```
|
163
|
+
|
164
|
+
will result in:
|
165
|
+
|
166
|
+
|
167
|
+
```
|
168
|
+
{:surname=>"EASTWOOD" }
|
169
|
+
```
|
170
|
+
|
171
|
+
Regular expressions and ruby code can be combined:
|
172
|
+
|
173
|
+
```ruby
|
174
|
+
template = '<div id="people-list">
|
175
|
+
<div class="person">
|
176
|
+
<h5>{{ surname/\w{4}/ = $.upcase }}</h5>
|
177
|
+
</div>
|
178
|
+
'
|
179
|
+
|
180
|
+
html = '
|
181
|
+
<html>
|
182
|
+
<body>
|
183
|
+
<div id="people-list">
|
184
|
+
<div class="person">
|
185
|
+
<h5>Eastwood</h5>
|
186
|
+
<p>Clint</p>
|
187
|
+
<span>Born on 31.05.1930</span>
|
188
|
+
</div>
|
189
|
+
</body>
|
190
|
+
</html>
|
191
|
+
'
|
192
|
+
json = HtmlScraper::Scraper.new(template: template).parse(html)
|
193
|
+
```
|
194
|
+
|
195
|
+
will result in:
|
196
|
+
|
197
|
+
|
198
|
+
```
|
199
|
+
{:surname=>"EAST" }
|
200
|
+
```
|
201
|
+
|
98
202
|
## Development
|
99
203
|
|
100
204
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
@@ -103,7 +207,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
103
207
|
|
104
208
|
## Contributing
|
105
209
|
|
106
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
210
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/bduran82/html_scraper.
|
107
211
|
|
108
212
|
|
109
213
|
## License
|
data/html_scraper.gemspec
CHANGED
@@ -8,8 +8,8 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = HtmlScraper::VERSION
|
9
9
|
spec.authors = ['Bernat Duran']
|
10
10
|
|
11
|
-
spec.summary = 'Parses
|
12
|
-
spec.homepage = 'https://github.com/bduran82/html_scraper
|
11
|
+
spec.summary = 'Parses an html document to a json structure following a template'
|
12
|
+
spec.homepage = 'https://github.com/bduran82/html_scraper'
|
13
13
|
spec.license = 'MIT'
|
14
14
|
|
15
15
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
@@ -24,4 +24,5 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_development_dependency 'rake', '~> 10.0'
|
25
25
|
spec.add_development_dependency 'minitest', '~> 5.0'
|
26
26
|
spec.add_development_dependency 'pry'
|
27
|
+
spec.add_development_dependency 'pry-nav'
|
27
28
|
end
|
data/lib/html_scraper/scraper.rb
CHANGED
@@ -12,9 +12,7 @@ module HtmlScraper
|
|
12
12
|
def parse(html)
|
13
13
|
html_template = Nokogiri::HTML(@template)
|
14
14
|
return {} if html_template.root.nil?
|
15
|
-
|
16
|
-
html_root = Nokogiri::HTML(html).root
|
17
|
-
return inspect(template_root, html_root)
|
15
|
+
return inspect(html_template.root, Nokogiri::HTML(html))
|
18
16
|
end
|
19
17
|
|
20
18
|
def inspect(template_node, html_node)
|
@@ -36,15 +34,28 @@ module HtmlScraper
|
|
36
34
|
end
|
37
35
|
|
38
36
|
def parse_node(template_node, html_node)
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
37
|
+
return [
|
38
|
+
evaluate_attributes(template_node, html_node),
|
39
|
+
evaluate_text(template_node, html_node),
|
40
|
+
template_node.children.map { |t_node| inspect(t_node, html_node) }.reduce({}, &:merge)
|
41
|
+
].reduce(&:merge)
|
43
42
|
end
|
44
43
|
private :parse_node
|
45
44
|
|
45
|
+
def evaluate_attributes(template_node, html_node)
|
46
|
+
return template_node.attributes.map do |name, attr|
|
47
|
+
evaluate_expressions(attr.value, html_node.attributes[name]&.value)
|
48
|
+
end.reduce({}, &:merge)
|
49
|
+
end
|
50
|
+
private :evaluate_attributes
|
51
|
+
|
52
|
+
def evaluate_text(template_node, html_node)
|
53
|
+
return evaluate_expressions(template_node.xpath('./text()').text, html_node.text)
|
54
|
+
end
|
55
|
+
private :evaluate_text
|
56
|
+
|
46
57
|
def evaluate_expressions(expression, text)
|
47
|
-
result = expression.scan(
|
58
|
+
result = expression.scan(expr_regexp).flatten.reduce({}) do |res, expr|
|
48
59
|
res.merge(Expression.new(expr).evaluate(text))
|
49
60
|
end
|
50
61
|
|
@@ -54,7 +65,9 @@ module HtmlScraper
|
|
54
65
|
|
55
66
|
def build_xpath(template_node)
|
56
67
|
xpath = ".//#{template_node.name}"
|
57
|
-
attributes = template_node.attributes.reject
|
68
|
+
attributes = template_node.attributes.reject do |name, attr|
|
69
|
+
name.start_with?('hs-') || attr.value =~ expr_regexp
|
70
|
+
end
|
58
71
|
if !attributes.blank?
|
59
72
|
selector = attributes.map { |k, v| attribute_selector(k, v) }.join
|
60
73
|
xpath = "#{xpath}#{selector}"
|
@@ -73,6 +86,11 @@ module HtmlScraper
|
|
73
86
|
end
|
74
87
|
private :attribute_selector
|
75
88
|
|
89
|
+
def expr_regexp
|
90
|
+
/^\s*{{(.*)}}\s*$/
|
91
|
+
end
|
92
|
+
private :expr_regexp
|
93
|
+
|
76
94
|
def log(text)
|
77
95
|
puts "#{' ' * @depth}#{text}" if @verbose
|
78
96
|
end
|
data/lib/html_scraper/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bernat Duran
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-08-
|
11
|
+
date: 2016-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: pry-nav
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
description:
|
98
112
|
email:
|
99
113
|
executables: []
|
@@ -103,6 +117,7 @@ files:
|
|
103
117
|
- ".gitignore"
|
104
118
|
- ".rubocop.yml"
|
105
119
|
- ".travis.yml"
|
120
|
+
- CHANGELOG.md
|
106
121
|
- Gemfile
|
107
122
|
- LICENSE.txt
|
108
123
|
- README.md
|
@@ -114,7 +129,7 @@ files:
|
|
114
129
|
- lib/html_scraper/expression.rb
|
115
130
|
- lib/html_scraper/scraper.rb
|
116
131
|
- lib/html_scraper/version.rb
|
117
|
-
homepage: https://github.com/bduran82/html_scraper
|
132
|
+
homepage: https://github.com/bduran82/html_scraper
|
118
133
|
licenses:
|
119
134
|
- MIT
|
120
135
|
metadata: {}
|
@@ -137,5 +152,5 @@ rubyforge_project:
|
|
137
152
|
rubygems_version: 2.5.1
|
138
153
|
signing_key:
|
139
154
|
specification_version: 4
|
140
|
-
summary: Parses
|
155
|
+
summary: Parses an html document to a json structure following a template
|
141
156
|
test_files: []
|